# Import libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import f1_score, confusion_matrix
from sklearn import datasets
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn import feature_extraction
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import preprocessing
from sklearn.preprocessing import PolynomialFeatures

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from IPython.display import display

# Read file with labels

In [2]:
data_init = pd.read_csv(r'G:\Mon Drive\Fichiers\2.Scolarité\1. Jedha_Data_Science\CERTIF_PROJECTS\ML_Engineer_Certification_Projects\04_SUPERVISED_ML\Conversion_rate\src\conversion_data_train.csv')


In [3]:
data_init.head()

Unnamed: 0,country,age,new_user,source,total_pages_visited,converted
0,China,22,1,Direct,2,0
1,UK,21,1,Ads,3,0
2,Germany,20,0,Seo,14,1
3,US,23,1,Seo,3,0
4,US,28,1,Direct,3,0


# Explore dataset

In [4]:
# The dataset is quite big : we create a sample 
data_sample = data_init.sample(10000)

In [5]:
print("general info : ")
display(data_sample.info())
print()

print("Number of rows : {}".format(data_sample.shape[0]))
print()

print("Display of dataset: ")
display(data_sample.head())
print()

print("Basics statistics: ")
data_desc = data_sample.describe(include='all')
display(data_desc)
print()

print("Percentage of missing values: ")
display(100*data_sample.isnull().sum()/data_sample.shape[0])

general info : 
<class 'pandas.core.frame.DataFrame'>
Index: 10000 entries, 77090 to 235440
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   country              10000 non-null  object
 1   age                  10000 non-null  int64 
 2   new_user             10000 non-null  int64 
 3   source               10000 non-null  object
 4   total_pages_visited  10000 non-null  int64 
 5   converted            10000 non-null  int64 
dtypes: int64(4), object(2)
memory usage: 546.9+ KB


None


Number of rows : 10000

Display of dataset: 


Unnamed: 0,country,age,new_user,source,total_pages_visited,converted
77090,China,42,1,Seo,4,0
31611,US,35,1,Direct,1,0
199891,China,28,1,Seo,3,0
242762,China,21,0,Ads,5,0
155445,China,37,1,Direct,7,0



Basics statistics: 


Unnamed: 0,country,age,new_user,source,total_pages_visited,converted
count,10000,10000.0,10000.0,10000,10000.0,10000.0
unique,4,,,3,,
top,US,,,Seo,,
freq,5687,,,4935,,
mean,,30.4375,0.6804,,4.9281,0.0331
std,,8.30177,0.466345,,3.414981,0.178907
min,,17.0,0.0,,1.0,0.0
25%,,24.0,0.0,,2.0,0.0
50%,,30.0,1.0,,4.0,0.0
75%,,36.0,1.0,,7.0,0.0



Percentage of missing values: 


country                0.0
age                    0.0
new_user               0.0
source                 0.0
total_pages_visited    0.0
converted              0.0
dtype: float64

In [6]:
data_sample['converted'].value_counts()

converted
0    9669
1     331
Name: count, dtype: int64

In [7]:
print("Separating labels from features...")
target_variable = "converted"

X = data_sample.drop(target_variable, axis = 1)
Y = data_sample.loc[:,target_variable]

print("...Done.")
print()

print('Y : ')
print(Y.head())
print()
print('X :')
print(X.head())

numeric_features = list(X.select_dtypes(include=['float', 'int']).columns)
categorical_features = list(X.select_dtypes(exclude=['float', 'int']).columns)

print('Found numeric features ', numeric_features)
print('Found categorical features ', categorical_features)

Separating labels from features...
...Done.

Y : 
77090     0
31611     0
199891    0
242762    0
155445    0
Name: converted, dtype: int64

X :
       country  age  new_user  source  total_pages_visited
77090    China   42         1     Seo                    4
31611       US   35         1  Direct                    1
199891   China   28         1     Seo                    3
242762   China   21         0     Ads                    5
155445   China   37         1  Direct                    7
Found numeric features  ['age', 'new_user', 'total_pages_visited']
Found categorical features  ['country', 'source']


In [8]:
# Distribution of each numeric variable
for f in numeric_features:
    fig = px.histogram(data_sample, f, color = 'converted', facet_row = 'converted', histnorm = 'probability')
    fig.show()

In [9]:
# Distribution of each categorical variable

for i in range(len(categorical_features)):
    fig = px.histogram(data_sample[categorical_features[i]])
    fig.show()

In [10]:
## Correlation matrix

corr_matrix = data_sample.corr(numeric_only = True).round(2)

import plotly.figure_factory as ff

fig = ff.create_annotated_heatmap(corr_matrix.values,
                                  x = corr_matrix.columns.tolist(),
                                  y = corr_matrix.index.tolist())


fig.show()

# Making our model

In [11]:
# Choose variables to use in the model, and create train and test sets
print("Separating labels from features...")
target_variable = "converted"

X = data_sample.drop(target_variable, axis = 1)
Y = data_sample.loc[:,target_variable]

print("...Done.")
print()

print('Y : ')
print(Y.head())
print()
print('X :')
print(X.head())

Separating labels from features...
...Done.

Y : 
77090     0
31611     0
199891    0
242762    0
155445    0
Name: converted, dtype: int64

X :
       country  age  new_user  source  total_pages_visited
77090    China   42         1     Seo                    4
31611       US   35         1  Direct                    1
199891   China   28         1     Seo                    3
242762   China   21         0     Ads                    5
155445   China   37         1  Direct                    7


In [12]:
# Divide dataset Train set & Test set
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0, stratify=Y)
print("...Done.")
print()

...Done.



In [13]:
# Identificate the column data type

numeric_features = list(X.select_dtypes(include=['float', 'int']).columns)
categorical_features = list(X.select_dtypes(exclude=['float', 'int']).columns)

print('Found numeric features ', numeric_features)
print('Found categorical features ', categorical_features)

Found numeric features  ['age', 'new_user', 'total_pages_visited']
Found categorical features  ['country', 'source']


## Training pipeline

In [14]:
numeric_transformer = Pipeline(
    steps=[ #liste de 2 tuples
        ("scaler",  StandardScaler()),  # put data of the column on scale for all columns where mean = 0, and value = std
    ]
)

## Create pipeline for columns with categorical features
categorical_transformer = Pipeline(
    steps=[
        ("encoder", OneHotEncoder(drop="first")),  # first column will be dropped to avoid creating correlations between features
    ]
)

## Use ColumnTransformer to make a preprocessor object that describes all the treatments to be done to the columns group selected
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])


## Preprocessings on train set
print("Performing preprocessings on train set...")
X_train = preprocessor.fit_transform(X_train)
print('...Done.')
print(X_train[0:5]) # MUST use this syntax because X_train is a numpy array and not a pandas DataFrame anymore
print()

## Preprocessings on test set
print("Performing preprocessings on test set...")
X_test = preprocessor.transform(X_test) # Don't fit again !!
print('...Done.')
print(X_test[0:5,:]) # MUST use this syntax because X_test is a numpy array and not a pandas DataFrame anymore
print()

## Encoding Y on train set (if Y is categorical in this example)
print("Encoding labels...")
encoder = LabelEncoder()
Y_train = encoder.fit_transform(Y_train)
print("...Done")
print(Y_train[0:5])

# Encoding Y on test set
print("Encoding labels...")
Y_test = encoder.transform(Y_test)
print("...Done")
print(Y_test[0:5])

Performing preprocessings on train set...
...Done.
[[-1.01233462 -1.45815677 -1.15105122  1.          0.          0.
   0.          1.        ]
 [ 1.02826933  0.68579731 -0.85786446  0.          0.          1.
   0.          1.        ]
 [-0.53219251  0.68579731 -1.15105122  0.          1.          0.
   0.          1.        ]
 [-0.41215698  0.68579731 -0.5646777   0.          0.          1.
   0.          1.        ]
 [-1.37244119  0.68579731 -0.27149094  0.          0.          1.
   0.          1.        ]]

Performing preprocessings on test set...
...Done.
[[-1.01233462  0.68579731  1.48762962  0.          1.          0.
   0.          1.        ]
 [-0.77226356  0.68579731  1.19444286  0.          0.          1.
   0.          0.        ]
 [-0.29212146  0.68579731 -1.15105122  0.          0.          1.
   0.          0.        ]
 [-0.53219251  0.68579731 -0.85786446  0.          0.          1.
   0.          0.        ]
 [ 2.82880222  0.68579731  0.31488258  0.          0.       

In [26]:
# Train model
print("Train model...")
pipe = Pipeline(steps=[
    ("poly", PolynomialFeatures()),
    ("logit", LogisticRegression())
])

params = {
    "poly__degree" : [1,2,3],
    "logit__C" : [1e-20, 1e-9, 1e-7, 1e-5, 1e-3],
    "logit__penalty" : ["l2", None]
}


gridsearch = GridSearchCV(pipe, param_grid = params, cv = 3)
gridsearch.fit(X_train, Y_train)

print("...Done.")


Train model...



Setting penalty=None will ignore the C and l1_ratio parameters


Setting penalty=None will ignore the C and l1_ratio parameters


Setting penalty=None will ignore the C and l1_ratio parameters


Setting penalty=None will ignore the C and l1_ratio parameters


Setting penalty=None will ignore the C and l1_ratio parameters


Setting penalty=None will ignore the C and l1_ratio parameters


Setting penalty=None will ignore the C and l1_ratio parameters


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Setting penalty=None will ignore the C and l1_ratio parameters


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number

...Done.



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Setting penalty=None will ignore the C and l1_ratio parameters



In [27]:
print(gridsearch.cv_results_)
print()
# Finding out best parameters and score
print("Best hyperparameters : ", gridsearch.best_params_)
print("Best score : ", gridsearch.best_score_)

{'mean_fit_time': array([0.01177247, 0.04114151, 0.04488842, 0.02531743, 0.08749048,
       0.30611714, 0.01841823, 0.02687581, 0.08376209, 0.03233218,
       0.08984192, 0.28743855, 0.015107  , 0.02428985, 0.05471714,
       0.03059085, 0.08304962, 0.28488072, 0.02339681, 0.02487477,
       0.05863992, 0.02849436, 0.08230543, 0.28617422, 0.01667968,
       0.0326701 , 0.06516568, 0.03059538, 0.08352788, 0.3004206 ]), 'std_fit_time': array([0.00400955, 0.01754897, 0.01460609, 0.00091426, 0.01339162,
       0.02160561, 0.00254036, 0.00255791, 0.01350966, 0.00636715,
       0.00885211, 0.02375998, 0.0010273 , 0.00178576, 0.00477087,
       0.00251711, 0.01166936, 0.00404866, 0.00138474, 0.00404863,
       0.00281062, 0.00080027, 0.01078821, 0.01721282, 0.00086156,
       0.00289412, 0.01159419, 0.00326342, 0.00899044, 0.01571632]), 'mean_score_time': array([0.00319314, 0.00472546, 0.0162758 , 0.00119789, 0.0024368 ,
       0.00890724, 0.00132934, 0.00155703, 0.00689689, 0.00270367,
     

In [29]:
print("Train final model...")
gridsearch_best1 = Pipeline(steps=[
    ("poly", PolynomialFeatures(degree=1)),
    ("logit", LogisticRegression(penalty=None))
])

gridsearch_best1.fit(X_train, Y_train)

print("...Done.")

Train model...
...Done.


In [30]:
# Predictions on training set
print("Predictions on training set...")
Y_train_pred = gridsearch_best1.predict(X_train)
print("...Done.")
print(Y_train_pred)
print()

Predictions on training set...
...Done.
[0 0 0 ... 0 0 0]



## Test pipeline

In [31]:
# Predictions on test set
print("Predictions on test set...")
Y_test_pred = gridsearch_best1.predict(X_test)
print("...Done.")
print(Y_test_pred)
print()

Predictions on test set...
...Done.
[0 0 0 ... 0 0 0]



## Performance assessment

In [32]:
# Here, the f1-score will be used to assess the performances on the leaderboard
print("f1-score on train set : ", f1_score(Y_train, Y_train_pred))
print("f1-score on test set : ", f1_score(Y_test, Y_test_pred))

f1-score on train set :  0.7845528455284553
f1-score on test set :  0.7575757575757576


In [33]:
print("Confusion matrix on train set : ")
print(confusion_matrix(Y_train, Y_train_pred))
print()
print("Confusion matrix on test set : ")
print(confusion_matrix(Y_test, Y_test_pred))
print()

Confusion matrix on train set : 
[[7701   34]
 [  72  193]]

Confusion matrix on test set : 
[[1918   16]
 [  16   50]]



In [34]:
# Concatenate our train and test set to train your best classifier on all data with labels
X = np.append(X_train,X_test,axis=0)
Y = np.append(Y_train,Y_test)

gridsearch_best1.fit(X,Y)

In [None]:
# Read data without labels
data_without_labels = pd.read_csv(r'G:\Mon Drive\Fichiers\2.Scolarité\1. Jedha_Data_Science\PROJETS\04_SUPERVISED_ML\Conversion rate\conversion_data_test.csv')
print('Prediction set (without labels) :', data_without_labels.shape)

# Warning : check consistency of features_list (must be the same than the features
# used by your best classifier)
X_without_labels = data_without_labels#.loc[:, features_list]

print("Encoding categorical features and standardizing numerical features...")
X_without_labels = preprocessor.transform(X_without_labels)
print("...Done")
print(X_without_labels[0:5,:])

Prediction set (without labels) : (31620, 5)
Encoding categorical features and standardizing numerical features...
...Done
[[-0.29131775 -1.49497068  3.298816    0.          1.          0.
   0.          1.        ]
 [-1.01999163  0.66890944  0.02872478  0.          1.          0.
   1.          0.        ]
 [ 0.19446484  0.66890944 -1.1603993   0.          0.          0.
   0.          1.        ]
 [ 0.19446484  0.66890944  0.3260058   0.          0.          1.
   0.          0.        ]
 [-0.65565469 -1.49497068 -0.56583726  0.          0.          0.
   0.          1.        ]]


In [None]:
# data = {
#     'converted': gridsearch_best1.predict(X_without_labels)
# }

# Y_predictions = pd.DataFrame(columns=['converted'],data=data)
# Y_predictions.to_csv(r'G:\Mon Drive\Fichiers\2.Scolarité\1. Jedha_Data_Science\PROJETS\04_SUPERVISED_ML\Conversion rate\conversion_data_test_predictions_MATHIAS_SAMSON_model_2.csv', index=False)
