# Challenge : predict conversions 🏆🏆

# Import libraries

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression, Ridge, Lasso
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import plotly.figure_factory as ff

# Read file with labels

In [None]:
# Load the labelled dataset (features plus the `converted` target) from the
# remote CSV and report its dimensions.
df = pd.read_csv(
    'https://julie-2-next-resources.s3.eu-west-3.amazonaws.com/full-stack-full-time/projects-supervised-machine-learning-ft/walmart-sales-ft/conversion_data_train.csv'
)
print('Set with labels (our train+test) :', df.shape)

Set with labels (our train+test) : (284580, 6)


In [None]:
# Preview the first five rows of the labelled data.
df.head(n=5)

Unnamed: 0,country,age,new_user,source,total_pages_visited,converted
0,China,22,1,Direct,2,0
1,UK,21,1,Ads,3,0
2,Germany,20,0,Seo,14,1
3,US,23,1,Seo,3,0
4,US,28,1,Direct,3,0


In [None]:
# Work on a random subsample of at most 50,000 rows (presumably to keep the
# grid searches below fast).
# - random_state pins the draw so every downstream score is reproducible after
#   a kernel restart.
# - min(...) keeps the cell safe to re-run or to use on a smaller frame:
#   asking for more rows than available (without replacement) raises.
df = df.sample(n=min(50000, len(df)), random_state=0)

## Model using ALL features

In [None]:
# Split the labelled frame into the feature matrix X and the target vector Y.
print("Separating labels from features...")
target_variable = "converted"
features_list = df.columns[:-1]  # every column except the last one

Y = df[target_variable]
X = df[features_list]

print("...Done.")
print()

# Quick look at the first rows of each piece.
print('Y : ')
print(Y.head())
print()
print('X :')
print(X.head())

Separating labels from features...
...Done.

Y : 
158287    0
242596    0
219211    0
197826    0
238836    0
Name: converted, dtype: int64

X :
       country  age  new_user  source  total_pages_visited
158287   China   18         1     Ads                    2
242596   China   35         0     Seo                    1
219211      US   22         1  Direct                    7
197826      US   38         1  Direct                    5
238836   China   19         0     Seo                    2


In [None]:
# Column groups: numeric columns and categorical columns are listed separately
# so that each group can be routed to its own preprocessing transformer.
categorical_features = [
    'country',
    'source',
]
numeric_features = [
    'age',
    'new_user',
    'total_pages_visited',
]

In [None]:
# Hold out 20% of the rows for testing. Stratifying on Y keeps the class
# proportions the same in both subsets; random_state fixes the shuffle.
print("Dividing into train and test sets...")
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=0,
)
print("...Done.")
print()

Dividing into train and test sets...
...Done.



In [None]:
# Numeric columns: fill missing values with the column mean, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode, dropping the first level of each feature.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first')),
])

# Route each group of columns to its own transformer.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

In [None]:
# Learn the preprocessing parameters on the training rows, then apply them to
# those same rows.
X_train = preprocessor.fit(X_train).transform(X_train)

In [None]:
# Fit the label encoder on the training target, then encode that target.
encoder = LabelEncoder().fit(Y_train)
Y_train = encoder.transform(Y_train)

In [None]:
# Apply the transformations that were fitted on the training set.
# Calling fit_transform here would re-learn the imputation values, scaling
# statistics, one-hot categories and label mapping from the test data: that
# leaks test information into the evaluation and leaves `preprocessor` fitted
# on the test split instead of the training split for any later transform.
X_test = preprocessor.transform(X_test)
Y_test = encoder.transform(Y_test)

In [None]:
# Tune a random forest with an exhaustive, 3-fold cross-validated grid search
# on the training set, then keep the tuned model for evaluation.
print("Grid search...")
# Fixed seed so the search (and the model it selects) is reproducible.
random_forest = RandomForestClassifier(random_state=0)

# Grid of values to be tested
params = {
    'max_depth': [4, 6, 8, 10],
    'min_samples_leaf': [1, 2, 4, 6, 8],
    # An integer min_samples_split must be >= 2, so 1 is not a valid candidate.
    'min_samples_split': [2, 4, 6, 8],
    'n_estimators': [2, 4, 6, 8, 10, 12]
}
print(params)
gridsearch = GridSearchCV(random_forest, param_grid = params, cv = 3, verbose = 1) # cv : the number of folds to be used for CV
# The training target is named Y_train (capital Y); `y_train` does not exist.
gridsearch.fit(X_train, Y_train)
print("...Done.")
print("Best hyperparameters : ", gridsearch.best_params_)
print("Best validation accuracy : ", gridsearch.best_score_)

# GridSearchCV refits the best parameter combination on the full training set
# by default; use that tuned model downstream rather than fitting the untuned
# default forest.
random_forest = gridsearch.best_estimator_

Grid search...
...Done.


In [None]:
# Predict with the random forest on both splits (training first, then test)
# and report the F1 score of each.
Y_train_pred, Y_test_pred = (random_forest.predict(split) for split in (X_train, X_test))
print("f1-score on training set : ", f1_score(y_true=Y_train, y_pred=Y_train_pred))
print("f1-score on test set : ", f1_score(y_true=Y_test, y_pred=Y_test_pred))

f1-score on training set :  0.777391304347826
f1-score on test set :  0.7686832740213523


In [None]:
# Tune a single decision tree with an exhaustive, 3-fold cross-validated grid
# search on the training set, then keep the tuned model for evaluation.
print("Grid search...")
# Fixed seed so the search (and the model it selects) is reproducible.
decision_tree = DecisionTreeClassifier(random_state=0)

# Grid of values to be tested
# ('n_estimators' is an ensemble parameter and is not accepted by
# DecisionTreeClassifier, so it must not appear in this grid.)
params = {
    'max_depth': [4, 6, 8, 10],
    'min_samples_leaf': [1, 2, 4, 6, 8],
    # An integer min_samples_split must be >= 2, so 1 is not a valid candidate.
    'min_samples_split': [2, 4, 6, 8]
}
print(params)
gridsearch = GridSearchCV(decision_tree, param_grid = params, cv = 3, verbose = 1) # cv : the number of folds to be used for CV
# The training target is named Y_train (capital Y); `y_train` does not exist.
gridsearch.fit(X_train, Y_train)
print("...Done.")
print("Best hyperparameters : ", gridsearch.best_params_)
print("Best validation accuracy : ", gridsearch.best_score_)

# GridSearchCV refits the best parameter combination on the full training set
# by default; use that tuned model downstream rather than fitting the untuned
# default tree.
decision_tree = gridsearch.best_estimator_

Grid search...


AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=7,
                                                         min_samples_split=3),
                   n_estimators=60)

In [None]:
# Predict with the decision tree on both splits (training first, then test)
# and report the F1 score of each.
Y_train_pred, Y_test_pred = (decision_tree.predict(split) for split in (X_train, X_test))
print("f1-score on training set : ", f1_score(y_true=Y_train, y_pred=Y_train_pred))
print("f1-score on test set : ", f1_score(y_true=Y_test, y_pred=Y_test_pred))

f1-score on training set :  0.8760597496972143
f1-score on test set :  0.7385103011093502


In [None]:
# Stack the preprocessed training and test rows back together (features
# row-wise, labels as one flat vector) and refit the random forest on all of
# the labelled data.
X = np.concatenate((X_train, X_test), axis=0)
Y = np.concatenate((Y_train, Y_test), axis=None)

random_forest.fit(X, Y)

RandomForestClassifier(max_depth=7, min_samples_split=3, n_estimators=60)

In [None]:
# Read the unlabelled prediction set and report its dimensions.
data_without_labels = pd.read_csv('conversion_data_test.csv')
print('Prediction set (without labels) :', data_without_labels.shape)

# Warning : the features must be the same as (and in the same order as) the
# ones used to train the best classifier. Selecting by name with features_list
# enforces this: a missing column raises a KeyError instead of silently feeding
# the wrong columns to the model, and extra columns in the file are ignored
# (a fixed-length boolean mask only works for exactly five columns).
X_without_labels = data_without_labels.loc[:, features_list]

Prediction set (without labels) : (31620, 5)


In [None]:
# Apply the already-fitted preprocessor to the unlabelled data; transform (not
# fit_transform) is used, so nothing is re-learned from this set.
X_without_labels = preprocessor.transform(X_without_labels)

In [None]:
# Predict the `converted` label for the unlabelled rows with the random forest
# and write the single-column result to CSV (without the index).
data = {'converted': random_forest.predict(X_without_labels)}

Y_predictions = pd.DataFrame(data, columns=['converted'])
Y_predictions.to_csv('conversion_data_test_predictions_MarieP.csv', index=False)