In [1]:
import pandas as pd
import numpy as np
import csv
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

- `job`
- `education`
- `device`
- `outcome_old`

# import data

In [2]:
campaign_ad = pd.read_csv("MLUnige2023_subscriptions_train.csv", index_col="Id")
campaign_test = pd.read_csv("MLUnige2023_subscriptions_test.csv", index_col="Id")

In [3]:
campaign_test.groupby(by='job').size()

job
entrepreneur         114
freelance            121
housekeeper          100
industrial_worker    747
manager              852
na                    21
retired              242
salesman             341
student              120
teacher              436
technology           629
unemployed           114
dtype: int64

# dummify marital and outcome_old

In [4]:
campaign_ad = pd.get_dummies(campaign_ad, columns=['marital', 'outcome_old'])

# train-valid-test split

In [5]:
X = campaign_ad.drop(columns='subscription')
y = campaign_ad['subscription']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=12)
X_valid, X_test, y_valid, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=46)

In [6]:
print('size of training set:', X_train.shape[0])
print('size of validation set:', X_valid.shape[0])
print('size of test set:', X_test.shape[0])

size of training set: 6266
size of validation set: 1343
size of test set: 1343


# Imputation for Device 
## Train data - `device` 

In [7]:
X_tr_dev = X_train.drop(columns=['job', 'education'])
X_tr_imp_dev = X_tr_dev[X_tr_dev['device'] == 'na'].drop(columns='device') 
#this is the data set that we need to predict and fuse with actual Training data for future imputation of "education"
X_tr_imp_dev


Unnamed: 0_level_0,age,day,month,time_spent,banner_views,banner_views_old,days_elapsed_old,X1,X2,X3,X4,marital_divorced,marital_married,marital_single,outcome_old_failure,outcome_old_na,outcome_old_other,outcome_old_success
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
4359,44,17,6,24.05,1,0,-1,0,0,0,0.076980,0,1,0,0,1,0,0
5378,31,26,5,8.60,5,0,-1,0,0,1,0.068700,1,0,0,0,1,0,0
3095,49,5,6,0.45,3,0,-1,0,0,0,0.088029,0,1,0,0,1,0,0
8737,36,16,6,33.80,11,0,-1,0,0,1,0.075291,0,1,0,0,1,0,0
7745,32,12,5,13.95,3,0,-1,0,0,1,0.085014,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4510,32,6,6,29.80,5,0,-1,0,0,1,0.061255,0,1,0,0,1,0,0
4515,36,11,6,5.35,2,0,-1,0,0,0,0.095655,0,1,0,0,1,0,0
4030,29,6,5,12.90,2,0,-1,0,0,1,0.073194,0,1,0,0,1,0,0
5194,44,29,5,22.30,1,0,-1,0,0,1,0.073457,0,1,0,0,1,0,0


In [8]:
X_tr_dev = X_tr_dev[X_tr_dev['device'] != 'na'] # this is the data set that does not have NAs
y_tr_dev = X_tr_dev[['device']] 
X_tr_dev = X_tr_dev.drop(columns='device')


In [9]:
X_tr_dev

Unnamed: 0_level_0,age,day,month,time_spent,banner_views,banner_views_old,days_elapsed_old,X1,X2,X3,X4,marital_divorced,marital_married,marital_single,outcome_old_failure,outcome_old_na,outcome_old_other,outcome_old_success
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
4949,59,6,8,3.90,2,0,-1,0,0,0,0.073675,0,1,0,0,1,0,0
4298,50,5,8,27.15,1,0,-1,0,0,0,0.093803,0,1,0,0,1,0,0
6904,55,21,11,2.25,5,0,-1,1,0,1,0.085450,0,1,0,0,1,0,0
4642,44,19,11,17.65,2,0,-1,0,0,0,0.086812,0,1,0,0,1,0,0
2403,53,19,11,27.55,1,0,-1,0,0,0,0.078133,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8241,27,4,2,53.40,1,0,-1,0,0,1,0.081256,0,0,1,0,1,0,0
278,42,12,5,0.70,1,0,-1,0,0,0,0.072803,0,1,0,0,1,0,0
3714,39,13,5,98.25,5,0,-1,1,0,1,0.072803,0,0,1,0,1,0,0
3325,60,18,8,4.40,7,0,-1,1,0,1,0.146533,0,1,0,0,1,0,0


In [10]:

enc = OrdinalEncoder(categories=[['smartphone', 'desktop']])
y_tr_dev = enc.fit_transform(y_tr_dev)
y_tr_dev

array([[0.],
       [0.],
       [1.],
       ...,
       [0.],
       [0.],
       [0.]])

In [11]:
# changing shape of y_tr_dev so RandomForestClassifier stops complaining
print(y_tr_dev.shape)
y_tr_dev.shape = (y_tr_dev.shape[0],)
print(y_tr_dev.shape)

(4830, 1)
(4830,)


In [12]:
rfc_dev = RandomForestClassifier(n_estimators=100, random_state=59, n_jobs=-2)

In [13]:
rfc_dev.fit(X_tr_dev, y_tr_dev)

In [14]:
y_tr_dev_pred = rfc_dev.predict(X_tr_dev)
accuracy_score(y_true=y_tr_dev, y_pred=y_tr_dev_pred)
#np.array(y_tr_dev_pred.shape)

1.0

## Validation `device`

In [15]:
X_v_dev = X_valid.drop(columns=['job', 'education'])
X_v_imp_dev = X_v_dev[X_v_dev['device'] == 'na'].drop(columns='device')
X_v_dev = X_v_dev[X_v_dev['device'] != 'na']
y_v_dev = X_v_dev[['device']]
X_v_dev = X_v_dev.drop(columns='device')
enc = OrdinalEncoder(categories=[['smartphone', 'desktop']])
y_v_dev = enc.fit_transform(y_v_dev)

In [16]:
y_v_dev_pred = rfc_dev.predict(X_v_dev)
accuracy_score(y_pred=y_v_dev_pred, y_true=y_v_dev)

0.913926499032882

# Test `device`

In [17]:
X_te_dev = X_test.drop(columns=['job', 'education'])
X_te_imp_dev = X_te_dev[X_te_dev['device'] == 'na'].drop(columns='device')
X_te_dev = X_te_dev[X_te_dev['device'] != 'na']
y_te_dev = X_te_dev[['device']]
X_te_dev = X_te_dev.drop(columns='device')
enc = OrdinalEncoder(categories=[['smartphone', 'desktop']])
y_te_dev = enc.fit_transform(y_te_dev)

In [18]:
y_te_dev_pred = rfc_dev.predict(X_te_dev)
accuracy_score(y_pred=y_te_dev_pred, y_true=y_te_dev)

0.918111753371869

# Inserting device imputations in our sets

In [19]:
y_tr_imp_dev = rfc_dev.predict(X_tr_imp_dev)
y_v_imp_dev  = rfc_dev.predict(X_v_imp_dev)
y_te_imp_dev = rfc_dev.predict(X_te_imp_dev)

In [20]:
X_train.loc[X_train['device'] == 'smartphone', 'device'] = 0
X_train.loc[X_train['device'] == 'desktop', 'device'] = 1
X_train.loc[X_tr_imp_dev.index, 'device'] = y_tr_imp_dev
X_train

Unnamed: 0_level_0,age,job,education,device,day,month,time_spent,banner_views,banner_views_old,days_elapsed_old,...,X2,X3,X4,marital_divorced,marital_married,marital_single,outcome_old_failure,outcome_old_na,outcome_old_other,outcome_old_success
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4949,59,housekeeper,high_school,0,6,8,3.90,2,0,-1,...,0,0,0.073675,0,1,0,0,1,0,0
4298,50,technology,grad_school,0,5,8,27.15,1,0,-1,...,0,0,0.093803,0,1,0,0,1,0,0
4359,44,housekeeper,grad_school,0.0,17,6,24.05,1,0,-1,...,0,0,0.076980,0,1,0,0,1,0,0
5378,31,industrial_worker,university,0.0,26,5,8.60,5,0,-1,...,0,1,0.068700,1,0,0,0,1,0,0
3095,49,teacher,university,0.0,5,6,0.45,3,0,-1,...,0,0,0.088029,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
278,42,industrial_worker,high_school,1,12,5,0.70,1,0,-1,...,0,0,0.072803,0,1,0,0,1,0,0
3714,39,industrial_worker,high_school,0,13,5,98.25,5,0,-1,...,0,1,0.072803,0,0,1,0,1,0,0
7409,46,industrial_worker,high_school,0.0,18,6,6.25,3,0,-1,...,0,0,0.089354,0,1,0,0,1,0,0
3325,60,retired,university,0,18,8,4.40,7,0,-1,...,0,1,0.146533,0,1,0,0,1,0,0


In [21]:
X_valid.loc[X_valid['device'] == 'smartphone', 'device'] = 0
X_valid.loc[X_valid['device'] == 'desktop', 'device'] = 1
X_valid.loc[X_v_imp_dev.index, 'device'] = y_v_imp_dev
X_valid

Unnamed: 0_level_0,age,job,education,device,day,month,time_spent,banner_views,banner_views_old,days_elapsed_old,...,X2,X3,X4,marital_divorced,marital_married,marital_single,outcome_old_failure,outcome_old_na,outcome_old_other,outcome_old_success
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8392,53,manager,grad_school,0,27,8,4.25,6,0,-1,...,0,0,0.091815,0,1,0,0,1,0,0
2970,38,freelance,university,0.0,28,5,5.00,5,0,-1,...,0,1,0.075336,0,1,0,0,1,0,0
1269,73,retired,high_school,0,11,11,25.30,2,3,90,...,0,0,0.155966,0,1,0,0,0,1,0
5924,36,industrial_worker,high_school,0,13,5,6.00,2,0,-1,...,0,1,0.078051,0,1,0,0,1,0,0
4121,25,teacher,university,0,30,7,2.60,12,0,-1,...,0,1,0.075000,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3271,39,salesman,university,0,29,1,8.35,1,0,-1,...,0,1,0.084433,0,0,1,0,1,0,0
7176,61,retired,university,0,10,3,11.45,1,0,-1,...,0,1,0.098233,0,1,0,0,1,0,0
532,45,manager,grad_school,0,26,8,6.15,7,0,-1,...,0,0,0.072894,0,1,0,0,1,0,0
1680,41,manager,grad_school,0.0,20,6,0.15,3,0,-1,...,0,1,0.091243,0,1,0,0,1,0,0


In [22]:
X_test.loc[X_test['device'] == 'smartphone', 'device'] = 0
X_test.loc[X_test['device'] == 'desktop', 'device'] = 1
X_test.loc[X_te_imp_dev.index, 'device'] = y_te_imp_dev
X_test

Unnamed: 0_level_0,age,job,education,device,day,month,time_spent,banner_views,banner_views_old,days_elapsed_old,...,X2,X3,X4,marital_divorced,marital_married,marital_single,outcome_old_failure,outcome_old_na,outcome_old_other,outcome_old_success
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6475,39,industrial_worker,grad_school,0,9,2,5.75,2,2,183,...,0,1,0.074365,0,0,1,1,0,0,0
5568,38,technology,grad_school,0,14,5,14.20,2,0,-1,...,0,0,0.084669,1,0,0,0,1,0,0
1915,36,technology,grad_school,0,12,2,20.45,2,1,183,...,0,0,0.075899,0,1,0,0,0,0,1
1205,25,industrial_worker,high_school,0,7,7,5.30,1,0,-1,...,0,0,0.072803,0,1,0,0,1,0,0
3551,30,manager,grad_school,0,4,6,9.20,1,0,-1,...,0,1,0.076607,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
401,23,salesman,university,0,1,6,9.10,1,0,-1,...,0,0,0.073085,0,0,1,0,1,0,0
111,29,technology,university,0,28,1,14.30,1,0,-1,...,0,0,0.072803,0,0,1,0,1,0,0
3445,38,salesman,university,0,12,2,91.25,3,2,182,...,0,0,0.079839,0,1,0,0,0,0,1
6641,23,salesman,university,0,18,5,10.60,1,0,-1,...,0,1,0.075073,0,0,1,0,1,0,0
