# Challenge : predict conversions 🏆🏆

# Import libraries

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression, Ridge, Lasso
from sklearn.metrics import accuracy_score, f1_score, ConfusionMatrixDisplay, RocCurveDisplay
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier, StackingClassifier

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import plotly.figure_factory as ff

# Read file with labels

In [None]:
# Location of the labelled dataset; per the message below it serves as both train and test data.
DATA_URL = 'https://julie-2-next-resources.s3.eu-west-3.amazonaws.com/full-stack-full-time/projects-supervised-machine-learning-ft/walmart-sales-ft/conversion_data_train.csv'

df = pd.read_csv(DATA_URL)
print('Set with labels (our train+test) :', df.shape)

Set with labels (our train+test) : (284580, 6)


In [None]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,country,age,new_user,source,total_pages_visited,converted
0,China,22,1,Direct,2,0
1,UK,21,1,Ads,3,0
2,Germany,20,0,Seo,14,1
3,US,23,1,Seo,3,0
4,US,28,1,Direct,3,0


## Feature engineering

In [None]:
# Polynomial and reciprocal terms derived from the two base columns.
# The mapping is built first and then written to df in insertion order,
# so the new columns are appended in the order listed here.
age_col = df['age']
pages_col = df['total_pages_visited']

derived_columns = {
    'age_2': age_col ** 2,
    'total_pages_visited_2': pages_col ** 2,
    'age_inverse': 1 / age_col,
    'total_pages_visited_inverse': 1 / pages_col,
    'age_3': age_col ** 3,
    'total_pages_visited_3': pages_col ** 3,
    'age_inverse_2': 1 / age_col ** 2,
    'total_pages_visited_inverse_2': 1 / pages_col ** 2,
    'age_4': age_col ** 4,
    'total_pages_visited_4': pages_col ** 4,
}

for derived_name, derived_values in derived_columns.items():
    df[derived_name] = derived_values

## Preprocessing

In [None]:
print("Separating labels from features...")

# Columns used as model inputs; the order here is the column order of X.
# NOTE(review): the loaded data also has a 'source' column that is not selected here — confirm this is intentional.
features_list = [
    'age',
    'new_user',
    'total_pages_visited',
    'age_2',
    'age_inverse',
    'total_pages_visited_2',
    'total_pages_visited_inverse',
    'country',
    'age_3',
    'age_inverse_2',
    'total_pages_visited_3',
    'total_pages_visited_inverse_2',
    'age_4',
    'total_pages_visited_4',
]
target_variable = "converted"

# Feature matrix and target vector taken from the engineered frame.
X = df[features_list]
Y = df[target_variable]

print("...Done.")
print()

# Quick look at the first rows of the target, then of the features.
print('Y : ')
print(Y.head())
print()
print('X :')
print(X.head())

Separating labels from features...
...Done.

Y : 
0    0
1    0
2    1
3    0
4    0
Name: converted, dtype: int64

X :
   age  new_user  total_pages_visited  age_2  age_inverse  \
0   22         1                    2    484     0.045455   
1   21         1                    3    441     0.047619   
2   20         0                   14    400     0.050000   
3   23         1                    3    529     0.043478   
4   28         1                    3    784     0.035714   

   total_pages_visited_2  total_pages_visited_inverse  country  age_3  \
0                      4                     0.500000    China  10648   
1                      9                     0.333333       UK   9261   
2                    196                     0.071429  Germany   8000   
3                      9                     0.333333       US  12167   
4                      9                     0.333333       US  21952   

   age_inverse_2  total_pages_visited_3  total_pages_visited_inverse_2  \


#### Separating numeric features from categorical features

In [None]:
# Columns routed to the numeric branch of the preprocessing.
numeric_features = [
    'age',
    'new_user',
    'total_pages_visited',
    'age_2',
    'age_inverse',
    'total_pages_visited_2',
    'total_pages_visited_inverse',
    'age_3',
    'age_inverse_2',
    'total_pages_visited_3',
    'total_pages_visited_inverse_2',
    'age_4',
    'total_pages_visited_4',
]
# Columns routed to the categorical branch of the preprocessing.
categorical_features = [
    'country',
]

#### Dividing into train and test sets

In [None]:
# Hold out 20% of the labelled rows for evaluation. The split is stratified on Y
# and seeded (random_state=0) so it is the same on every run.
# The progress messages restore the output saved with this cell, which the code
# no longer printed.
print("Dividing into train and test sets...")
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
    stratify=Y,
)
print("...Done.")
print()

Dividing into train and test sets...
...Done.



#### Preprocessing: imputation of missing values, standardization and one-hot encoding

In [None]:
# Numeric branch: fill missing values with the column mean, then apply StandardScaler.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: fill missing values with the most frequent value,
# then one-hot encode with drop='first'.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each group of columns to its own branch.
column_branches = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_branches)

# Encoder for the target labels.
encoder = LabelEncoder()

In [None]:
# Fit the preprocessing on the training split only, and rebind X_train to the
# transformed result.
# NOTE(review): X_train is overwritten, so re-running this cell alone feeds the
# already-transformed object back into a preprocessor that selects columns by
# name — presumably that fails; re-run the train/test split cell first.
X_train = preprocessor.fit_transform(X_train)
# Fit the label encoder on the training target and rebind Y_train to the encoded labels.
Y_train = encoder.fit_transform(Y_train)

In [None]:
# Apply the already-fitted preprocessing and label encoder to the test split
# (transform only, no refit).
# NOTE(review): X_test and Y_test are rebound to their transformed versions, so
# this cell is not meant to be re-run without re-running the split cell first.
X_test = preprocessor.transform(X_test)
Y_test = encoder.transform(Y_test)

## Training model

In [None]:
# Base learner for boosting: logistic regression with C=8.
logistic_regression = LogisticRegression(C=8)

# AdaBoost ensemble of 100 boosting rounds over the base learner.
# random_state is pinned to the same seed as the train/test split so the fitted
# ensemble is reproducible across runs.
# NOTE(review): with the base learner's default solver the seed may have no
# effect; it is kept so results stay reproducible if the solver changes.
adaboost_logreg = AdaBoostClassifier(
    logistic_regression,
    n_estimators=100,
    random_state=0,
)

# Fit on the preprocessed training split.
adaboost_logreg.fit(X_train, Y_train)
print("...Done.")

## Predictions

In [None]:
# Predict labels for the training split, then the test split, with the fitted ensemble.
Y_train_pred, Y_test_pred = (
    adaboost_logreg.predict(split_features)
    for split_features in (X_train, X_test)
)

## Performance Evaluation

In [None]:
# Report the F1 score on each split: training first, then test.
for score_label, true_labels, predicted_labels in (
    ("f1-score on training set : ", Y_train, Y_train_pred),
    ("f1-score on test set : ", Y_test, Y_test_pred),
):
    print(score_label, f1_score(true_labels, predicted_labels))