In [1]:
# Classifier imports
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.metrics import classification_report
from sklearn.metrics import RocCurveDisplay
from sklearn.neighbors import KNeighborsClassifier
#from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn import datasets, ensemble
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.datasets import make_classification

import os.path
import pylab as plt
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from joblib import dump, load

# 1. Read data

In [2]:
original_data = pd.read_csv('../data/LISS_example_input_data.csv', encoding='cp1252', low_memory = False)
outcome = pd.read_csv('../data/LISS_example_groundtruth_data.csv')

In [3]:
data = original_data.copy()

# 2. Clean data (remove NAs from target and select features of interest)

In [4]:
# Drop observations where the outcome is missing
y_isna = outcome['new_child'].isnull()
data = data.loc[~y_isna]
outcome = outcome.loc[~y_isna]

In [5]:
# Pre-pre process (keep some columns)
#desired_columns = [col for col in column_names if any(substring in col for substring in
column_names = data.columns
desired_columns = [col for col in column_names if col.startswith(('nomem_encr',
                                                                  'geslacht', #gender
                                                                  'positie', #position in the houshold
                                                                  'aantalhh', #number of houshold members
                                                                  'aantalki', #number of living at home children in the houshold
                                                                  'partner', #houshold head lives together with partner
                                                                  'burgstat', #civil status
                                                                  'woonvorm', #domestic situation of houshold head
                                                                  'woning', #type of dwelling houshold inhabits
                                                                  'sted', #urban character of place of recidence 
                                                                  'belbezig', #primary occupation
                                                                  'brutoink', #brutto income
                                                                  'brutoink_f',  #imputed brutto income
                                                                  'netinc', #persona net income
                                                                  'netinc', #brutto houshold income
                                                                  'brutohh_f', #netto houshold income
                                                                  'nettohh_f', #highest level of education irrespective of diploma
                                                                  'oplzon', #highest level of education with  diploma
                                                                  'oplmet', #houshold member participates in the panel
                                                                  'doetmee', #origin
                                                                  'herkomstgroep', #does the houshold have a simPC
                                                                  'simpc',
                                                                  'cr',
                                                                  'cs',
                                                                  'ch',
                                                                  'cf',
                                                                  'cw'))]



In [6]:
len(desired_columns)

20302

In [18]:
data_clean = data.loc[:, desired_columns]
data_clean.shape


In [20]:
# Delete all columns with a |
data_clean = data_clean.replace(r'.*\|.*', np.nan, regex=True)
# Convert all columns to strings
data_clean = data_clean.astype(str)

# Iterate over columns/variables in the dataset
for column in data_clean.columns:
    # Replace 'NA' with NaN
    data_clean[column] = data_clean[column].replace('NA', np.nan)

    # Convert strings containing only numbers to numeric values
    data_clean[column] = pd.to_numeric(data_clean[column], errors='coerce')

    # Convert NaN values back to strings
    data_clean[column] = data_clean[column].astype(object).where(data_clean[column].notna(), np.nan)


# 3. Split data into train and test
First thing always, otherwise you risk overfitting.

In [8]:
data_train, data_test, outcome_train, outcome_test = train_test_split(
    data_clean,
    outcome,
    test_size=0.2, random_state=2023)
outcome_train = outcome_train["new_child"]
outcome_test = outcome_test["new_child"]

# 4. Create transformers

In [12]:
# Create transformers
# Imputer are sometimes not necessary
categorical_transformer = Pipeline(steps=[
    ('encoder', OneHotEncoder(handle_unknown='infrequent_if_exist', min_frequency=50))])

numerical_transformer = Pipeline(steps=[
    #('imputer', IterativeImputer(max_iter=10, random_state=0)),
    #('imputer', SimpleImputer(strategy="mean")),
    ('scaler', StandardScaler())])

# Use ColumnTransformer to apply the transformations to the correct columns in the dataframe
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, selector(dtype_exclude=object)(data_train)),
        ('cat', categorical_transformer, selector(dtype_include=object)(data_train))])

# 5. Make pipeline, run pipeline, select best model

In [13]:
# Including feature selection
model = Pipeline([
                ("preprocess", preprocessor),
                ('selector', SelectKBest(score_func=f_classif, k=150)),
                ("classifier", ExtraTreesClassifier())
               ])

# Define the hyperparameters
parameters = [
    {
        'classifier': [ExtraTreesClassifier(n_estimators=100, criterion='gini', max_depth=None, min_samples_split=2, bootstrap=False)],
        'classifier__criterion': ('gini', 'entropy', 'log_loss'),
        'selector__k' : (100,125,150,175)

    }
]
# Perform hyperparameter tuning using cross-validation
grid_search = GridSearchCV(model, parameters, cv=5, n_jobs=-1, scoring="f1", verbose=3)
grid_search.fit(data_train, outcome_train)
# Keep best model
model = grid_search.best_estimator_
model


Fitting 5 folds for each of 12 candidates, totalling 60 fits


  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


[CV 3/5] END classifier=ExtraTreesClassifier(), classifier__criterion=gini, selector__k=125;, score=0.596 total time=   5.0s
[CV 2/5] END classifier=ExtraTreesClassifier(), classifier__criterion=gini, selector__k=100;, score=0.667 total time=   5.0s
[CV 5/5] END classifier=ExtraTreesClassifier(), classifier__criterion=gini, selector__k=100;, score=0.605 total time=   5.1s
[CV 1/5] END classifier=ExtraTreesClassifier(), classifier__criterion=gini, selector__k=125;, score=0.524 total time=   5.2s
[CV 1/5] END classifier=ExtraTreesClassifier(), classifier__criterion=gini, selector__k=100;, score=0.609 total time=   5.2s
[CV 3/5] END classifier=ExtraTreesClassifier(), classifier__criterion=gini, selector__k=100;, score=0.625 total time=   5.2s
[CV 2/5] END classifier=ExtraTreesClassifier(), classifier__criterion=gini, selector__k=125;, score=0.615 total time=   5.1s
[CV 4/5] END classifier=ExtraTreesClassifier(), classifier__criterion=gini, selector__k=100;, score=0.818 total time=   5.2s


  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


[CV 4/5] END classifier=ExtraTreesClassifier(), classifier__criterion=gini, selector__k=125;, score=0.844 total time=   3.8s
[CV 5/5] END classifier=ExtraTreesClassifier(), classifier__criterion=gini, selector__k=125;, score=0.634 total time=   3.8s
[CV 1/5] END classifier=ExtraTreesClassifier(), classifier__criterion=gini, selector__k=150;, score=0.571 total time=   4.0s
[CV 3/5] END classifier=ExtraTreesClassifier(), classifier__criterion=gini, selector__k=150;, score=0.596 total time=   3.9s
[CV 2/5] END classifier=ExtraTreesClassifier(), classifier__criterion=gini, selector__k=150;, score=0.600 total time=   4.1s
[CV 4/5] END classifier=ExtraTreesClassifier(), classifier__criterion=gini, selector__k=150;, score=0.791 total time=   4.0s
[CV 5/5] END classifier=ExtraTreesClassifier(), classifier__criterion=gini, selector__k=150;, score=0.667 total time=   4.0s
[CV 1/5] END classifier=ExtraTreesClassifier(), classifier__criterion=gini, selector__k=175;, score=0.579 total time=   3.9s


  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


[CV 2/5] END classifier=ExtraTreesClassifier(), classifier__criterion=gini, selector__k=175;, score=0.579 total time=   3.7s
[CV 3/5] END classifier=ExtraTreesClassifier(), classifier__criterion=gini, selector__k=175;, score=0.591 total time=   3.6s


  f = msb / msw


[CV 4/5] END classifier=ExtraTreesClassifier(), classifier__criterion=gini, selector__k=175;, score=0.791 total time=   3.8s
[CV 1/5] END classifier=ExtraTreesClassifier(), classifier__criterion=entropy, selector__k=100;, score=0.605 total time=   3.7s
[CV 5/5] END classifier=ExtraTreesClassifier(), classifier__criterion=gini, selector__k=175;, score=0.564 total time=   3.8s
[CV 3/5] END classifier=ExtraTreesClassifier(), classifier__criterion=entropy, selector__k=100;, score=0.652 total time=   3.7s
[CV 2/5] END classifier=ExtraTreesClassifier(), classifier__criterion=entropy, selector__k=100;, score=0.634 total time=   3.8s
[CV 4/5] END classifier=ExtraTreesClassifier(), classifier__criterion=entropy, selector__k=100;, score=0.818 total time=   3.6s


  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


[CV 5/5] END classifier=ExtraTreesClassifier(), classifier__criterion=entropy, selector__k=100;, score=0.636 total time=   3.5s


  f = msb / msw


[CV 1/5] END classifier=ExtraTreesClassifier(), classifier__criterion=entropy, selector__k=125;, score=0.585 total time=   3.6s


  f = msb / msw
  f = msb / msw


[CV 4/5] END classifier=ExtraTreesClassifier(), classifier__criterion=entropy, selector__k=125;, score=0.800 total time=   3.6s
[CV 3/5] END classifier=ExtraTreesClassifier(), classifier__criterion=entropy, selector__k=125;, score=0.565 total time=   3.8s
[CV 2/5] END classifier=ExtraTreesClassifier(), classifier__criterion=entropy, selector__k=125;, score=0.650 total time=   3.9s
[CV 5/5] END classifier=ExtraTreesClassifier(), classifier__criterion=entropy, selector__k=125;, score=0.682 total time=   3.7s
[CV 1/5] END classifier=ExtraTreesClassifier(), classifier__criterion=entropy, selector__k=150;, score=0.500 total time=   3.8s
[CV 2/5] END classifier=ExtraTreesClassifier(), classifier__criterion=entropy, selector__k=150;, score=0.667 total time=   3.5s


  f = msb / msw
  f = msb / msw


[CV 3/5] END classifier=ExtraTreesClassifier(), classifier__criterion=entropy, selector__k=150;, score=0.638 total time=   3.7s


  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


[CV 4/5] END classifier=ExtraTreesClassifier(), classifier__criterion=entropy, selector__k=150;, score=0.818 total time=   3.8s


  f = msb / msw


[CV 5/5] END classifier=ExtraTreesClassifier(), classifier__criterion=entropy, selector__k=150;, score=0.636 total time=   3.8s
[CV 2/5] END classifier=ExtraTreesClassifier(), classifier__criterion=entropy, selector__k=175;, score=0.667 total time=   3.7s
[CV 1/5] END classifier=ExtraTreesClassifier(), classifier__criterion=entropy, selector__k=175;, score=0.558 total time=   3.9s
[CV 4/5] END classifier=ExtraTreesClassifier(), classifier__criterion=entropy, selector__k=175;, score=0.791 total time=   3.6s
[CV 3/5] END classifier=ExtraTreesClassifier(), classifier__criterion=entropy, selector__k=175;, score=0.596 total time=   3.9s
[CV 5/5] END classifier=ExtraTreesClassifier(), classifier__criterion=entropy, selector__k=175;, score=0.698 total time=   3.7s


  f = msb / msw
  f = msb / msw


[CV 1/5] END classifier=ExtraTreesClassifier(), classifier__criterion=log_loss, selector__k=100;, score=0.591 total time=   3.7s


  f = msb / msw
  f = msb / msw


[CV 2/5] END classifier=ExtraTreesClassifier(), classifier__criterion=log_loss, selector__k=100;, score=0.634 total time=   3.5s


  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


[CV 3/5] END classifier=ExtraTreesClassifier(), classifier__criterion=log_loss, selector__k=100;, score=0.638 total time=   3.8s
[CV 4/5] END classifier=ExtraTreesClassifier(), classifier__criterion=log_loss, selector__k=100;, score=0.826 total time=   4.0s
[CV 5/5] END classifier=ExtraTreesClassifier(), classifier__criterion=log_loss, selector__k=100;, score=0.667 total time=   4.0s
[CV 1/5] END classifier=ExtraTreesClassifier(), classifier__criterion=log_loss, selector__k=125;, score=0.578 total time=   3.9s
[CV 2/5] END classifier=ExtraTreesClassifier(), classifier__criterion=log_loss, selector__k=125;, score=0.615 total time=   3.9s
[CV 3/5] END classifier=ExtraTreesClassifier(), classifier__criterion=log_loss, selector__k=125;, score=0.652 total time=   3.9s


  f = msb / msw
  f = msb / msw


[CV 4/5] END classifier=ExtraTreesClassifier(), classifier__criterion=log_loss, selector__k=125;, score=0.791 total time=   3.9s


  f = msb / msw
  f = msb / msw


[CV 5/5] END classifier=ExtraTreesClassifier(), classifier__criterion=log_loss, selector__k=125;, score=0.682 total time=   3.9s


  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


[CV 1/5] END classifier=ExtraTreesClassifier(), classifier__criterion=log_loss, selector__k=150;, score=0.615 total time=   4.1s
[CV 2/5] END classifier=ExtraTreesClassifier(), classifier__criterion=log_loss, selector__k=150;, score=0.667 total time=   4.1s
[CV 4/5] END classifier=ExtraTreesClassifier(), classifier__criterion=log_loss, selector__k=150;, score=0.844 total time=   3.8s
[CV 3/5] END classifier=ExtraTreesClassifier(), classifier__criterion=log_loss, selector__k=150;, score=0.622 total time=   4.0s
[CV 5/5] END classifier=ExtraTreesClassifier(), classifier__criterion=log_loss, selector__k=150;, score=0.619 total time=   4.0s
[CV 1/5] END classifier=ExtraTreesClassifier(), classifier__criterion=log_loss, selector__k=175;, score=0.634 total time=   3.8s


  f = msb / msw
  f = msb / msw


[CV 2/5] END classifier=ExtraTreesClassifier(), classifier__criterion=log_loss, selector__k=175;, score=0.619 total time=   2.9s
[CV 3/5] END classifier=ExtraTreesClassifier(), classifier__criterion=log_loss, selector__k=175;, score=0.591 total time=   2.6s


  f = msb / msw
  f = msb / msw


[CV 4/5] END classifier=ExtraTreesClassifier(), classifier__criterion=log_loss, selector__k=175;, score=0.762 total time=   2.0s
[CV 5/5] END classifier=ExtraTreesClassifier(), classifier__criterion=log_loss, selector__k=175;, score=0.564 total time=   2.0s


  f = msb / msw


# 6. Evaluate the model

In [14]:
# Create predictions
outcome_pred = model.predict(data_test)

# Report classification table
print(classification_report(outcome_test, outcome_pred))

              precision    recall  f1-score   support

         0.0       0.94      0.98      0.96       157
         1.0       0.82      0.58      0.68        24

    accuracy                           0.93       181
   macro avg       0.88      0.78      0.82       181
weighted avg       0.92      0.93      0.92       181



# 7. Save models 

In [15]:
import os
os.makedirs("../models", exist_ok=True)

# Dump model (don't change the name)
dump(model, "../models/model.joblib")

['../models/model.joblib']

# 8. How the submission needs to look like, 

In [16]:
def predict_outcomes(df):
    """Process the input data and write the predictions."""
    results = df[["nomem_encr"]]

    # Keep 
    column_names = df.columns
    column_names
    desired_columns = [col for col in column_names if col.startswith(('nomem_encr','leeftijd', 
                                                 'geslacht',
                                                 'positie',
                                                 'aantalhh',
                                                 'aantalki',
                                                 'partner',
                                                 'burgstat',
                                                 'woonvorm',
                                                 'woning',
                                                 'sted',
                                                 'belbezig', 
                                                 'brutoink',
                                                 'brutoink_f', 
                                                 'netinc',
                                                 'netinc',
                                                 'brutohh_f',
                                                 'nettohh_f',
                                                 'oplzon',
                                                 'oplmet',
                                                 'doetmee',
                                                 'herkomstgroep',
                                                 'simpc',
                                                 'cr',
                                                 'cs',
                                                 'ch',
                                                 'cf',
                                                 'cw'))]

    
    
    df = df.loc[:, desired_columns]
                            
    # Load your trained model from the models directory
    model_path = os.path.join(os.path.dirname(__file__), "..", "models", "model.joblib")
    model = load(model_path)

    # Use your trained model for prediction
    results.loc[:, "prediction"] = model.predict(df)

    #If you use predict_proba to get a probability and a different threshold
    #df["prediction"] = (df["prediction"] >= 0.5).astype(int)
    return results

In [None]:
#od = pd.read_csv('../data/LISS_example_input_data.csv', encoding='cp1252', low_memory = False)
#od = od[desired_columns]

In [17]:
__file__ = './' #this is not needed outside juypter notebooks
predict_outcomes(original_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results.loc[:, "prediction"] = model.predict(df)


Unnamed: 0,nomem_encr,prediction
0,800000.0,0.0
1,800018.0,0.0
2,800021.0,0.0
3,800033.0,1.0
4,800042.0,0.0
...,...,...
9454,899917.0,0.0
9455,899935.0,0.0
9456,899942.0,1.0
9457,899957.0,0.0


In [None]:
# Try to check feature importance 

In [None]:
from sklearn.inspection import permutation_importance


In [None]:
r = permutation_importance(model, data_test, outcome_test,
                            n_repeats=10,
                            random_state=0)

print_sorted_importance(r["importances_mean"], data_train.columns)

KeyboardInterrupt: 