In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
import numpy as np
import matplotlib.pyplot as plt
import pickle
from IPython.display import display
%matplotlib inline

In [2]:
# Read the training DataFrame from 'new_train.pkl' and preview its first 15 rows.
train = pd.read_pickle('new_train.pkl')
train.head(15)

Unnamed: 0,timestamp,device_id,user_id,won_price,clicked
0,1430252301,452439,316644,89000000,0
1,1430174639,94548,18299,173000000,0
2,1430253256,234183,167948,91000000,0
3,1430196621,240271,233483,128000000,0
4,1430214959,62190,105502,154000000,0
5,1430251111,255660,255364,28000000,1
6,1430208827,411813,281945,240000000,1
7,1430169254,255493,448034,188000000,0
8,1430223121,359788,17832,260000000,1
9,1430223138,251517,118937,116000000,0


In [3]:
# Boolean mask that is True where 'clicked' equals 0; the counts below give the
# class balance (True = rows with clicked == 0, False = every other row).
t = train['clicked'].eq(0)
t.value_counts()

True     724386
False    605648
Name: clicked, dtype: int64

In [4]:
# Select the features and the target by column name instead of by position, so
# a change in column order cannot silently move the label into the features.
# NOTE(review): device_id and user_id look like raw identifiers fed in as
# numeric features — confirm this is intended.
feature_columns = ['timestamp', 'device_id', 'user_id', 'won_price']
data = train[feature_columns]
labels = train['clicked']
data.head()

Unnamed: 0,timestamp,device_id,user_id,won_price
0,1430252301,452439,316644,89000000
1,1430174639,94548,18299,173000000
2,1430253256,234183,167948,91000000
3,1430196621,240271,233483,128000000
4,1430214959,62190,105502,154000000


In [5]:
# Hold out 40% of the rows for validation; the fixed seed keeps the split repeatable.
train_x, valid_x, train_y, valid_y = train_test_split(
    data,
    labels,
    test_size=0.4,
    random_state=50,
)
# Display the shape of each of the four pieces as a quick sanity check.
tuple(part.shape for part in (train_x, valid_x, train_y, valid_y))

((798020, 4), (532014, 4), (798020,), (532014,))

In [5]:
# 5-fold cross-validated score of this forest configuration.
# random_state pins the forest's bootstrap samples and feature draws, so
# re-running the cell reproduces the same fold scores.
# NOTE(review): this runs on `data`/`labels`, which include the rows held out
# as valid_x/valid_y — confirm that is intended.
clf = RandomForestClassifier(n_estimators=101, n_jobs=-1, min_samples_split=30, random_state=50)
scores = cross_val_score(clf, data, labels, cv=5)
scores

array([0.98417717, 0.98345532, 0.98400042, 0.9839139 , 0.98356428])

In [6]:
clf = RandomForestClassifier(n_estimators=101, n_jobs=-1, min_samples_split=30)
%time _ = clf.fit(train_x, train_y) #apprentissage

CPU times: user 4min 43s, sys: 344 ms, total: 4min 43s
Wall time: 1min 17s


In [7]:
# Display the maximum depth reached by each fitted tree in the forest.
tree_depths = [fitted_tree.tree_.max_depth for fitted_tree in clf.estimators_]
print(tree_depths)

[74, 72, 70, 66, 77, 67, 66, 71, 67, 70, 74, 69, 65, 83, 59, 65, 60, 68, 59, 86, 62, 63, 67, 64, 57, 66, 86, 58, 83, 65, 67, 61, 72, 70, 56, 70, 71, 66, 61, 83, 65, 69, 67, 60, 64, 66, 72, 65, 64, 66, 73, 68, 63, 67, 68, 63, 70, 65, 80, 62, 71, 62, 70, 72, 75, 71, 76, 65, 64, 65, 68, 68, 63, 83, 59, 76, 68, 68, 65, 63, 67, 80, 71, 62, 70, 63, 63, 73, 70, 73, 71, 79, 73, 67, 68, 68, 65, 65, 76, 61, 86]


In [8]:
# Score on the training split, then on the validation split, one value per line.
print(
    clf.score(train_x, train_y),
    clf.score(valid_x, valid_y),  # validation
    sep='\n',
)

0.9893686875015664
0.9749611852319676


In [9]:
# ROC AUC on the validation split, scored with column 1 of predict_proba
# (the probability assigned to the second entry of clf.classes_).
valid_proba = clf.predict_proba(valid_x)
roc_auc_score(valid_y, valid_proba[:, 1])

0.9948420709258081

In [10]:
# Confusion matrix on the validation split, with rows/columns ordered as clf.classes_.
# `labels` must be passed by keyword: current scikit-learn releases accept only
# y_true and y_pred positionally and raise TypeError for a third positional argument.
confusion_matrix(valid_y, clf.predict(valid_x), labels=clf.classes_)

array([[277899,  11721],
       [  1600, 240794]])

In [11]:
# Save the fitted model. The context manager flushes and closes the file handle
# even if pickling fails part-way, instead of leaving it to the garbage collector.
with open('final_model.sav', 'wb') as model_file:
    pickle.dump(clf, model_file)

In [12]:
# Load the model back, closing the file handle as soon as it has been read.
# NOTE: unpickling can execute arbitrary code — only load files from a trusted source.
with open('final_model.sav', 'rb') as model_file:
    clf = pickle.load(model_file)

In [13]:
# Print the training score, the per-tree maximum depths, and the validation
# score, one per line, for the model currently bound to `clf`.
print(
    clf.score(train_x, train_y),
    [fitted_tree.tree_.max_depth for fitted_tree in clf.estimators_],
    clf.score(valid_x, valid_y),
    sep='\n',
)

0.9893686875015664
[74, 72, 70, 66, 77, 67, 66, 71, 67, 70, 74, 69, 65, 83, 59, 65, 60, 68, 59, 86, 62, 63, 67, 64, 57, 66, 86, 58, 83, 65, 67, 61, 72, 70, 56, 70, 71, 66, 61, 83, 65, 69, 67, 60, 64, 66, 72, 65, 64, 66, 73, 68, 63, 67, 68, 63, 70, 65, 80, 62, 71, 62, 70, 72, 75, 71, 76, 65, 64, 65, 68, 68, 63, 83, 59, 76, 68, 68, 65, 63, 67, 80, 71, 62, 70, 63, 63, 73, 70, 73, 71, 79, 73, 67, 68, 68, 65, 65, 76, 61, 86]
0.9749611852319676
