In [62]:
import pandas as pd
import numpy as np
import csv
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [56]:
campaign_ad = pd.read_csv("MLUnige2023_subscriptions_train.csv", index_col="Id")
campaign_test = pd.read_csv("MLUnige2023_subscriptions_test.csv", index_col="Id")

In [50]:
campaign_ad.info()

cat_columns = ['job', 'marital', 'education', 'device', 'outcome_old']
num_columns = ['age', 'day', 'month', 'time_spent', 'banner_views', 'banner_views_old', 'days_elapsed_old', 'X4']

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8952 entries, 0 to 8951
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   age               8952 non-null   int64  
 1   job               8952 non-null   object 
 2   marital           8952 non-null   object 
 3   education         8952 non-null   object 
 4   device            8952 non-null   object 
 5   day               8952 non-null   int64  
 6   month             8952 non-null   int64  
 7   time_spent        8952 non-null   float64
 8   banner_views      8952 non-null   int64  
 9   banner_views_old  8952 non-null   int64  
 10  days_elapsed_old  8952 non-null   int64  
 11  outcome_old       8952 non-null   object 
 12  X1                8952 non-null   int64  
 13  X2                8952 non-null   int64  
 14  X3                8952 non-null   int64  
 15  X4                8952 non-null   float64
 16  subscription      8952 non-null   int64  


In [14]:
X = campaign_ad.drop(columns='subscription')
y = campaign_ad['subscription']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1952, random_state=12)

In [36]:
rf_pipe = Pipeline([
    ("ohe", OneHotEncoder(handle_unknown='ignore')),
    ("rfc", RandomForestClassifier(n_estimators=500, random_state=53, n_jobs=-2))
])

False

In [41]:
hyperparam_grid={"rfc__max_features":[3,4,5,6,7],
                 'rfc__min_samples_leaf':[1,3,5,7,9,11]}

best_score=0.5

# takes line 3 minutes to run!!!!
for g in ParameterGrid(hyperparam_grid):
    rf_pipe.set_params(**g)
    #or rfc.set_params(min_samples_leaf=g['min_samples_leaf'], max_features=g['max_features'])
    rf_pipe.fit(X_train,y_train)
    # save if best
    y_pred = rf_pipe.predict(X=X_test)
    acc = accuracy_score(y_true=y_test, y_pred=y_pred)
    if acc > best_score:
        best_score = acc
        best_params = g

print(f"Accuracy: %0.5f" % best_score)
print("Best parameters:", best_params)

OOB: 0.74180
Best parameters: {'rfc__max_features': 3, 'rfc__min_samples_leaf': 1}


In [37]:
rf_pipe.fit(X=X_train, y=y_train)

In [54]:
preprocessor = ColumnTransformer(transformers=
                                 [('num', StandardScaler(), num_columns),
                                  ('cat', OneHotEncoder(handle_unknown='ignore'), cat_columns)])

log_pipe = Pipeline([
    ("preprocessor", preprocessor),
    ("logistic", LogisticRegression(random_state=44))
])

log_pipe.fit(X_train,y_train)

In [55]:
y_pred = log_pipe.predict(X=X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8206967213114754

In [61]:
y_pred_answer = log_pipe.predict(X=campaign_test)
print(y_pred_answer)

[0 0 0 ... 0 0 1]


In [70]:
file = open('test_file.csv', 'w')
writer = csv.writer(file)
writer.writerow(['Id', 'subscription'])
for i in range(len(y_pred_answer)):
    writer.writerow([i, y_pred_answer[i]])
file.close()