In [129]:
import pandas as pd
import numpy as np
import csv
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

- `job`
- `education`
- `device`
- `outcome_old`

# import data

In [130]:
campaign_ad = pd.read_csv("MLUnige2023_subscriptions_train.csv", index_col="Id")
campaign_test = pd.read_csv("MLUnige2023_subscriptions_test.csv", index_col="Id")

# dummify marital and outcome_old

In [131]:
campaign_ad = pd.get_dummies(campaign_ad, columns=['marital', 'outcome_old'])

# train-valid-test split

In [132]:
X = campaign_ad.drop(columns='subscription')
y = campaign_ad['subscription']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=12)
X_valid, X_test, y_valid, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=46)

In [133]:
print('size of training set:', X_train.shape[0])
print('size of validation set:', X_valid.shape[0])
print('size of test set:', X_test.shape[0])

size of training set: 6266
size of validation set: 1343
size of test set: 1343


# imputation for JOB

In [134]:
X_tr_job = X_train.drop(columns=['device', 'education'])

In [135]:
X_test_job = X_tr_job[X_tr_job['job'] == 'na'].drop(columns='job')

In [136]:
X_tr_job = X_tr_job[X_tr_job['job'] != 'na']
y_tr_job = X_tr_job[['job']]
X_tr_job = X_tr_job.drop(columns='job')

In [137]:
enc = OrdinalEncoder()
y_tr_job = enc.fit_transform(y_tr_job)

In [138]:
rfc = RandomForestClassifier(n_estimators=100, random_state=59, n_jobs=-2)

In [139]:
rfc.fit(X_tr_job, y_tr_job)

  rfc.fit(X_tr_job, y_tr_job)


In [140]:
y_tr_job_pred = rfc.predict(X_tr_job)
accuracy_score(y_true=y_tr_job, y_pred=y_tr_job_pred)

1.0

In [141]:
X_v_job = X_valid.drop(columns=['device', 'education'])
X_test_job = X_v_job[X_v_job['job'] == 'na'].drop(columns='job')
X_v_job = X_v_job[X_v_job['job'] != 'na']
y_v_job = X_v_job[['job']]
X_v_job = X_v_job.drop(columns='job')
enc = OrdinalEncoder()
y_v_job = enc.fit_transform(y_v_job)


In [142]:
y_v_job_pred = rfc.predict(X_v_job)
accuracy_score(y_pred=y_v_job_pred, y_true=y_v_job)

0.29451540195341847

In [146]:
hyperparam_grid={"max_features":[3,4,5,6,7],
                 'min_samples_leaf':[1,3,5,7,9,11]}

best_score=0

# takes line 3 minutes to run!!!!
for g in ParameterGrid(hyperparam_grid):
    rfc.set_params(**g)
    #or rfc.set_params(min_samples_leaf=g['min_samples_leaf'], max_features=g['max_features'])
    rfc.fit(X_tr_job,y_tr_job)
    # save if best
    y_pred = rfc.predict(X=X_v_job)
    acc = accuracy_score(y_true=y_v_job, y_pred=y_pred)
    print(acc)
    if acc > best_score:
        best_score = acc
        best_params = g

print(f"Accuracy: %0.5f" % best_score)
print("Best parameters:", best_params)

  rfc.fit(X_tr_job,y_tr_job)


0.2930127723516153


  rfc.fit(X_tr_job,y_tr_job)


0.31555221637866265


  rfc.fit(X_tr_job,y_tr_job)


0.33809166040571


  rfc.fit(X_tr_job,y_tr_job)


0.3208114199849737


  rfc.fit(X_tr_job,y_tr_job)


0.3245679939894816


  rfc.fit(X_tr_job,y_tr_job)


0.32006010518407213


  rfc.fit(X_tr_job,y_tr_job)


0.29451540195341847


  rfc.fit(X_tr_job,y_tr_job)


0.33208114199849736


  rfc.fit(X_tr_job,y_tr_job)


0.3253193087903832


  rfc.fit(X_tr_job,y_tr_job)


0.3208114199849737


  rfc.fit(X_tr_job,y_tr_job)


0.3305785123966942


  rfc.fit(X_tr_job,y_tr_job)


0.3268219383921863


  rfc.fit(X_tr_job,y_tr_job)


0.3080390683696469


  rfc.fit(X_tr_job,y_tr_job)


0.3140495867768595


  rfc.fit(X_tr_job,y_tr_job)


0.3230653643876784


  rfc.fit(X_tr_job,y_tr_job)


0.3268219383921863


  rfc.fit(X_tr_job,y_tr_job)


0.32607062359128475


  rfc.fit(X_tr_job,y_tr_job)


0.3305785123966942


  rfc.fit(X_tr_job,y_tr_job)


0.29601803155522166


  rfc.fit(X_tr_job,y_tr_job)


0.3148009015777611


  rfc.fit(X_tr_job,y_tr_job)


0.3305785123966942


  rfc.fit(X_tr_job,y_tr_job)


0.3245679939894816


  rfc.fit(X_tr_job,y_tr_job)


0.32231404958677684


  rfc.fit(X_tr_job,y_tr_job)


0.32607062359128475


  rfc.fit(X_tr_job,y_tr_job)


0.3012772351615327


  rfc.fit(X_tr_job,y_tr_job)


0.31555221637866265


  rfc.fit(X_tr_job,y_tr_job)


0.318557475582269


  rfc.fit(X_tr_job,y_tr_job)


0.32832456799398946


  rfc.fit(X_tr_job,y_tr_job)


0.318557475582269


  rfc.fit(X_tr_job,y_tr_job)


0.3230653643876784
Accuracy: 0.33809
Best parameters: {'max_features': 3, 'min_samples_leaf': 5}
