# Import libraries

In [226]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import f1_score, confusion_matrix
from sklearn import datasets
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn import feature_extraction
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import preprocessing
from sklearn.preprocessing import PolynomialFeatures

import warnings

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from IPython.display import display

# Read file with labels

In [201]:
data_init = pd.read_csv(r'G:\Mon Drive\Fichiers\2.Scolarité\1. Jedha_Data_Science\PROJETS\04_SUPERVISED_ML\Conversion rate\conversion_data_train.csv')


In [202]:
data_init.head()

Unnamed: 0,country,age,new_user,source,total_pages_visited,converted
0,China,22,1,Direct,2,0
1,UK,21,1,Ads,3,0
2,Germany,20,0,Seo,14,1
3,US,23,1,Seo,3,0
4,US,28,1,Direct,3,0


# Explore dataset

In [203]:
# The dataset is quite big : we create a sample 
data_sample = data_init.sample(10000)

In [204]:
print("general info : ")
display(data_sample.info())
print()

print("Number of rows : {}".format(data_sample.shape[0]))
print()

print("Display of dataset: ")
display(data_sample.head())
print()

print("Basics statistics: ")
data_desc = data_sample.describe(include='all')
display(data_desc)
print()

print("Percentage of missing values: ")
display(100*data_sample.isnull().sum()/data_sample.shape[0])

general info : 
<class 'pandas.core.frame.DataFrame'>
Index: 10000 entries, 93908 to 277988
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   country              10000 non-null  object
 1   age                  10000 non-null  int64 
 2   new_user             10000 non-null  int64 
 3   source               10000 non-null  object
 4   total_pages_visited  10000 non-null  int64 
 5   converted            10000 non-null  int64 
dtypes: int64(4), object(2)
memory usage: 546.9+ KB


None


Number of rows : 10000

Display of dataset: 


Unnamed: 0,country,age,new_user,source,total_pages_visited,converted
93908,US,40,0,Direct,3,0
234691,UK,37,1,Direct,5,0
210916,UK,41,1,Ads,5,0
133750,US,23,1,Ads,1,0
95121,China,31,1,Seo,6,0



Basics statistics: 


Unnamed: 0,country,age,new_user,source,total_pages_visited,converted
count,10000,10000.0,10000.0,10000,10000.0,10000.0
unique,4,,,3,,
top,US,,,Seo,,
freq,5606,,,4880,,
mean,,30.4336,0.6882,,4.8735,0.0325
std,,8.228546,0.463252,,3.345746,0.177333
min,,17.0,0.0,,1.0,0.0
25%,,24.0,0.0,,2.0,0.0
50%,,30.0,1.0,,4.0,0.0
75%,,36.0,1.0,,7.0,0.0



Percentage of missing values: 


country                0.0
age                    0.0
new_user               0.0
source                 0.0
total_pages_visited    0.0
converted              0.0
dtype: float64

In [205]:
data_sample['converted'].value_counts()

converted
0    9675
1     325
Name: count, dtype: int64

In [206]:
print("Separating labels from features...")
target_variable = "converted"

X = data_sample.drop(target_variable, axis = 1)
Y = data_sample.loc[:,target_variable]

print("...Done.")
print()

print('Y : ')
print(Y.head())
print()
print('X :')
print(X.head())

numeric_features = list(X.select_dtypes(include=['float', 'int']).columns)
categorical_features = list(X.select_dtypes(exclude=['float', 'int']).columns)

print('Found numeric features ', numeric_features)
print('Found categorical features ', categorical_features)

Separating labels from features...
...Done.

Y : 
93908     0
234691    0
210916    0
133750    0
95121     0
Name: converted, dtype: int64

X :
       country  age  new_user  source  total_pages_visited
93908       US   40         0  Direct                    3
234691      UK   37         1  Direct                    5
210916      UK   41         1     Ads                    5
133750      US   23         1     Ads                    1
95121    China   31         1     Seo                    6
Found numeric features  ['age', 'new_user', 'total_pages_visited']
Found categorical features  ['country', 'source']


In [207]:
# Distribution of each numeric variable
for f in numeric_features:
    fig = px.histogram(data_sample, f, color = 'converted', facet_row = 'converted', histnorm = 'probability')
    fig.show()

In [208]:
# Distribution of each categorical variable

for i in range(len(categorical_features)):
    fig = px.histogram(data_sample[categorical_features[i]])
    fig.show()

In [209]:
## Correlation matrix

corr_matrix = data_sample.corr(numeric_only = True).round(2)

import plotly.figure_factory as ff

fig = ff.create_annotated_heatmap(corr_matrix.values,
                                  x = corr_matrix.columns.tolist(),
                                  y = corr_matrix.index.tolist())


fig.show()

# Making our model

In [210]:
# Choose variables to use in the model, and create train and test sets
print("Separating labels from features...")
target_variable = "converted"

X = data_sample.drop(target_variable, axis = 1)
Y = data_sample.loc[:,target_variable]

print("...Done.")
print()

print('Y : ')
print(Y.head())
print()
print('X :')
print(X.head())

Separating labels from features...
...Done.

Y : 
93908     0
234691    0
210916    0
133750    0
95121     0
Name: converted, dtype: int64

X :
       country  age  new_user  source  total_pages_visited
93908       US   40         0  Direct                    3
234691      UK   37         1  Direct                    5
210916      UK   41         1     Ads                    5
133750      US   23         1     Ads                    1
95121    China   31         1     Seo                    6


In [211]:
# Divide dataset Train set & Test set
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0, stratify=Y)
print("...Done.")
print()

...Done.



In [212]:
# Identificate the column data type

numeric_features = list(X.select_dtypes(include=['float', 'int']).columns)
categorical_features = list(X.select_dtypes(exclude=['float', 'int']).columns)

print('Found numeric features ', numeric_features)
print('Found categorical features ', categorical_features)

Found numeric features  ['age', 'new_user', 'total_pages_visited']
Found categorical features  ['country', 'source']


## Training pipeline

In [213]:
numeric_transformer = Pipeline(
    steps=[ #liste de 2 tuples
        ("scaler",  StandardScaler()),  # put data of the column on scale for all columns where mean = 0, and value = std
    ]
)

## Create pipeline for columns with categorical features
categorical_transformer = Pipeline(
    steps=[
        ("encoder", OneHotEncoder(drop="first")),  # first column will be dropped to avoid creating correlations between features
    ]
)

## Use ColumnTransformer to make a preprocessor object that describes all the treatments to be done to the columns group selected
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])


## Preprocessings on train set
print("Performing preprocessings on train set...")
X_train = preprocessor.fit_transform(X_train)
print('...Done.')
print(X_train[0:5]) # MUST use this syntax because X_train is a numpy array and not a pandas DataFrame anymore
print()

## Preprocessings on test set
print("Performing preprocessings on test set...")
X_test = preprocessor.transform(X_test) # Don't fit again !!
print('...Done.')
print(X_test[0:5,:]) # MUST use this syntax because X_test is a numpy array and not a pandas DataFrame anymore
print()

## Encoding Y on train set (if Y is categorical in this example)
print("Encoding labels...")
encoder = LabelEncoder()
Y_train = encoder.fit_transform(Y_train)
print("...Done")
print(Y_train[0:5])

# Encoding Y on test set
print("Encoding labels...")
Y_test = encoder.transform(Y_test)
print("...Done")
print(Y_test[0:5])

Performing preprocessings on train set...
...Done.
[[-1.14143728 -1.49497068 -1.1603993   0.          0.          0.
   0.          0.        ]
 [-0.04842645  0.66890944 -0.26855624  0.          0.          0.
   0.          1.        ]
 [-1.26288293  0.66890944  1.21784886  0.          0.          0.
   0.          0.        ]
 [ 0.55880179 -1.49497068  0.92056784  0.          1.          0.
   0.          0.        ]
 [ 0.68024743  0.66890944  2.10969192  0.          1.          0.
   1.          0.        ]]

Performing preprocessings on test set...
...Done.
[[-0.77710034 -1.49497068 -0.86311828  0.          1.          0.
   0.          1.        ]
 [-0.77710034  0.66890944  0.02872478  0.          0.          1.
   1.          0.        ]
 [-0.77710034  0.66890944  0.3260058   0.          0.          1.
   1.          0.        ]
 [ 1.16603002 -1.49497068 -0.56583726  0.          0.          1.
   0.          0.        ]
 [-1.14143728  0.66890944  0.3260058   0.          0.       

In [246]:
# Train model
print("Train model...")
pipe = Pipeline(steps=[
    ("poly", PolynomialFeatures()),
    ("logit", LogisticRegression())
])

params = {
    "poly__degree" : [1,2,3],
    "logit__C" : [1e-9, 1e-7, 1e-5, 1e-3],
    "logit__penalty" : ["l1","none"]
}


gridsearch = GridSearchCV(pipe, param_grid = params, cv = 3)
gridsearch.fit(X_train, Y_train)

print("...Done.")


Train model...



`penalty='none'`has been deprecated in 1.2 and will be removed in 1.4. To keep the past behaviour, set `penalty=None`.


Setting penalty=None will ignore the C and l1_ratio parameters


`penalty='none'`has been deprecated in 1.2 and will be removed in 1.4. To keep the past behaviour, set `penalty=None`.


Setting penalty=None will ignore the C and l1_ratio parameters


`penalty='none'`has been deprecated in 1.2 and will be removed in 1.4. To keep the past behaviour, set `penalty=None`.


Setting penalty=None will ignore the C and l1_ratio parameters


`penalty='none'`has been deprecated in 1.2 and will be removed in 1.4. To keep the past behaviour, set `penalty=None`.


Setting penalty=None will ignore the C and l1_ratio parameters


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the document

...Done.



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



63 fits failed out of a total of 126.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
63 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\PC\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\PC\anaconda3\Lib\site-

In [247]:
print(gridsearch.cv_results_)
print()
# Finding out best parameters and score
print("Best hyperparameters : ", gridsearch.best_params_)
print("Best score : ", gridsearch.best_score_)

{'mean_fit_time': array([0.00203029, 0.00398715, 0.01163403, 0.02542146, 0.05218355,
       0.08035787, 0.00099667, 0.00199318, 0.0063084 , 0.0146695 ,
       0.06557512, 0.10685873, 0.00099691, 0.00371408, 0.00822012,
       0.01230899, 0.05139995, 0.08803868, 0.00099333, 0.00265368,
       0.00631166, 0.01211484, 0.04591107, 0.07626367, 0.00066566,
       0.00316437, 0.00664647, 0.01121545, 0.04257528, 0.07177679,
       0.0006392 , 0.00304453, 0.00654173, 0.01171883, 0.04365381,
       0.07154234, 0.00099476, 0.00299263, 0.00630927, 0.01190448,
       0.04538353, 0.07187374]), 'std_fit_time': array([5.36677815e-05, 6.25769923e-07, 2.35791949e-03, 1.63016415e-02,
       2.67094611e-03, 8.03912420e-03, 2.97360213e-07, 8.92080638e-07,
       4.64570672e-04, 1.99562211e-03, 7.21288273e-03, 3.50143819e-02,
       3.81635259e-06, 3.38056079e-04, 8.86279079e-04, 2.49058390e-03,
       3.10986287e-03, 2.31422087e-03, 8.09530998e-04, 9.40839798e-04,
       9.39381566e-04, 2.82908905e-03, 3.9

In [250]:
print("Train model...")
pipe = Pipeline(steps=[
    ("poly", PolynomialFeatures()),
    ("logit", LogisticRegression())
])

params = {
    "poly__degree" : [1],
    "logit__C" : [1e-20],
    "logit__penalty" : ["none"]
}


gridsearch_best1 = GridSearchCV(pipe, param_grid = params, cv = 3)
gridsearch_best1.fit(X_train, Y_train)

print("...Done.")

Train model...
...Done.



`penalty='none'`has been deprecated in 1.2 and will be removed in 1.4. To keep the past behaviour, set `penalty=None`.


Setting penalty=None will ignore the C and l1_ratio parameters


`penalty='none'`has been deprecated in 1.2 and will be removed in 1.4. To keep the past behaviour, set `penalty=None`.


Setting penalty=None will ignore the C and l1_ratio parameters


`penalty='none'`has been deprecated in 1.2 and will be removed in 1.4. To keep the past behaviour, set `penalty=None`.


Setting penalty=None will ignore the C and l1_ratio parameters


`penalty='none'`has been deprecated in 1.2 and will be removed in 1.4. To keep the past behaviour, set `penalty=None`.


Setting penalty=None will ignore the C and l1_ratio parameters



In [251]:
# Predictions on training set
print("Predictions on training set...")
Y_train_pred = gridsearch_best1.predict(X_train)
print("...Done.")
print(Y_train_pred)
print()

Predictions on training set...
...Done.
[0 0 0 ... 0 0 0]



## Test pipeline

In [252]:
# Predictions on test set
print("Predictions on test set...")
Y_test_pred = gridsearch_best1.predict(X_test)
print("...Done.")
print(Y_test_pred)
print()

Predictions on test set...
...Done.
[0 0 0 ... 0 0 0]



## Performance assessment

In [253]:
# Here, the f1-score will be used to assess the performances on the leaderboard
print("f1-score on train set : ", f1_score(Y_train, Y_train_pred))
print("f1-score on test set : ", f1_score(Y_test, Y_test_pred))

f1-score on train set :  0.7573221757322175
f1-score on test set :  0.7706422018348624


In [254]:
print("Confusion matrix on train set : ")
print(confusion_matrix(Y_train, Y_train_pred))
print()
print("Confusion matrix on test set : ")
print(confusion_matrix(Y_test, Y_test_pred))
print()

Confusion matrix on train set : 
[[7703   37]
 [  79  181]]

Confusion matrix on test set : 
[[1933    2]
 [  23   42]]



In [255]:
# Concatenate our train and test set to train your best classifier on all data with labels
X = np.append(X_train,X_test,axis=0)
Y = np.append(Y_train,Y_test)

gridsearch_best1.fit(X,Y)


`penalty='none'`has been deprecated in 1.2 and will be removed in 1.4. To keep the past behaviour, set `penalty=None`.


Setting penalty=None will ignore the C and l1_ratio parameters


`penalty='none'`has been deprecated in 1.2 and will be removed in 1.4. To keep the past behaviour, set `penalty=None`.


Setting penalty=None will ignore the C and l1_ratio parameters


`penalty='none'`has been deprecated in 1.2 and will be removed in 1.4. To keep the past behaviour, set `penalty=None`.


Setting penalty=None will ignore the C and l1_ratio parameters


`penalty='none'`has been deprecated in 1.2 and will be removed in 1.4. To keep the past behaviour, set `penalty=None`.


Setting penalty=None will ignore the C and l1_ratio parameters



In [256]:
# Read data without labels
data_without_labels = pd.read_csv(r'G:\Mon Drive\Fichiers\2.Scolarité\1. Jedha_Data_Science\PROJETS\04_SUPERVISED_ML\Conversion rate\conversion_data_test.csv')
print('Prediction set (without labels) :', data_without_labels.shape)

# Warning : check consistency of features_list (must be the same than the features
# used by your best classifier)
X_without_labels = data_without_labels#.loc[:, features_list]

print("Encoding categorical features and standardizing numerical features...")
X_without_labels = preprocessor.transform(X_without_labels)
print("...Done")
print(X_without_labels[0:5,:])

Prediction set (without labels) : (31620, 5)
Encoding categorical features and standardizing numerical features...
...Done
[[-0.29131775 -1.49497068  3.298816    0.          1.          0.
   0.          1.        ]
 [-1.01999163  0.66890944  0.02872478  0.          1.          0.
   1.          0.        ]
 [ 0.19446484  0.66890944 -1.1603993   0.          0.          0.
   0.          1.        ]
 [ 0.19446484  0.66890944  0.3260058   0.          0.          1.
   0.          0.        ]
 [-0.65565469 -1.49497068 -0.56583726  0.          0.          0.
   0.          1.        ]]


In [257]:
# data = {
#     'converted': gridsearch_best1.predict(X_without_labels)
# }

# Y_predictions = pd.DataFrame(columns=['converted'],data=data)
# Y_predictions.to_csv(r'G:\Mon Drive\Fichiers\2.Scolarité\1. Jedha_Data_Science\PROJETS\04_SUPERVISED_ML\Conversion rate\conversion_data_test_predictions_MATHIAS_SAMSON_model_2.csv', index=False)
