In [2]:
import re
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.utils import resample

# Modelling

### 1. RandomForest

Формирую датасет для обучения и делю его на train и test

In [3]:
df_prep = pd.read_csv('data/final_prepared.csv')

In [30]:
target_0 = df_prep[df_prep.target_action == 0]
target_1 = df_prep[df_prep.target_action == 1]

target_0_downsampled = resample(
    target_0,
    replace = False,
    n_samples = len(target_1),
    random_state = 27,
    )


downsampled = pd.concat([target_0_downsampled, target_1])
downsampled

Unnamed: 0,session_id,target_action,utm_source_BHcvLfOaCWvWTykYqHVe,utm_source_BKeImrJuRDZcHiSSTdzm,utm_source_DnEUulZAecfGPvdtZBYS,utm_source_GpAkIXsclxDGyILfNlrR,utm_source_ISrKoXQCxqqYvAZICvjs,utm_source_IZEXUFLARCUMynmHNBGo,utm_source_KgicpPxiEQfzPlPwQZJq,utm_source_MvfHsxITijuriZxsqZqt,...,geo_city_Yekaterinburg,geo_city_Yuzhno-Sakhalinsk,geo_city_Zheleznodorozhny,geo_city_Zhukovskiy,geo_city_other_city,visit_number_std,month_std,day_std,hour_std,screen_square_std
78308,1128117812396260435.1621861009.1621861111,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.031966,-1.937681,0.785158,0.226205,-0.511824
386446,26117023457888840.1640778316.1640778316,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.139817,1.231346,1.353871,0.063092,-0.828355
715957,4192513816316069945.1634715705.1634715705,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.139817,0.325910,0.330187,-0.589360,-0.509734
550597,3395355048526924528.1640422003.1640422003,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.053926,1.231346,0.898900,-0.426247,-0.975865
40990,949066458394112298.1624355115.1624355115,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.139817,-1.484963,0.557673,-0.263134,-1.068826
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1732152,9054912872702758664.1636781836.1636781836,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.139817,0.778628,-0.466011,-0.915586,-0.680845
1732225,9055248417020050011.1634073188.1634073188,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.139817,0.325910,-0.466011,-2.220490,-0.754600
1732227,9055248417020050011.1634075541.1634075541,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.031966,0.325910,-0.466011,-2.220490,-0.754600
1732254,9055376699099939975.1630766214.1630766214,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.139817,-0.126808,-1.489694,0.552431,1.708504


In [31]:
downsampled['target_action'].value_counts()

target_action
0    50314
1    50314
Name: count, dtype: int64

In [42]:
df_forest = downsampled.copy()
df_forest = df_forest.drop(['session_id'], axis=1)

x = df_forest.drop(['target_action'], axis=1)
y = df_forest['target_action']

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42
)

Обучаю модель RandomForest

In [43]:
rf_clf = RandomForestClassifier(
    n_estimators=150, 
    min_samples_split=3, 
    bootstrap=False, 
    random_state=42, 
    max_depth=80,
    # class_weight='balanced'
)

rf_clf.fit(x_train, y_train)

In [44]:
predicted_train_rf = rf_clf.predict(x_train)
predicted_test_rf = rf_clf.predict(x_test)

print(accuracy_score(y_train, predicted_train_rf))
print(accuracy_score(y_test, predicted_test_rf))

0.9907153707463195
0.6127397396402663


In [46]:
roc_auc_score(y_test, rf_clf.predict_proba(x_test)[:, 1])

0.658539249040162

Кросс-валидация

In [None]:
cross_validate(rf_clf, x, y, cv=5)

KeyboardInterrupt: 

In [None]:
cv_score = cross_val_score(rf_clf, x, y, cv=5)

In [None]:
print(cv_score.mean())
print(cv_score.std())

0.7685304330534013
0.0093588363279649


### 2. LogisticRegression

Формирую датасет для обучения и делю его на train и test

In [4]:
logreg_df = df_prep.copy()
x = logreg_df.drop(['target_action', 'session_id'], axis=1)
y = logreg_df['target_action']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

Обучаю модель логистической регрессии

In [29]:
lr = LogisticRegression(
    C=4,
    max_iter=150,
    random_state=42,
    solver='liblinear',
    class_weight='balanced'
    )
lr.fit(x_train, y_train)

In [28]:
roc_auc_score(y_train, lr.predict_proba(x_train)[:, 1])

0.6677683830054905

In [26]:
roc_auc_score(y_test, lr.predict_proba(x_test)[:, 1])

0.6653865790842489

In [22]:
cf = list(zip(lr.feature_names_in_, lr.coef_[0]))
cf

[('utm_source_BHcvLfOaCWvWTykYqHVe', 1.403212650822588),
 ('utm_source_BKeImrJuRDZcHiSSTdzm', 1.1678270680447125),
 ('utm_source_DnEUulZAecfGPvdtZBYS', 1.0974918666495053),
 ('utm_source_GpAkIXsclxDGyILfNlrR', -0.21871747828899518),
 ('utm_source_ISrKoXQCxqqYvAZICvjs', 0.19355664974935727),
 ('utm_source_IZEXUFLARCUMynmHNBGo', 0.2914054215611115),
 ('utm_source_KgicpPxiEQfzPlPwQZJq', 0.5799898863389056),
 ('utm_source_MvfHsxITijuriZxsqZqt', 0.12624733911733169),
 ('utm_source_PlbkrSYoHuZBWfYjYnfw', -1.730630477404642),
 ('utm_source_QxAxdyPLuQMEcrdZWdWb', 1.8470216874994083),
 ('utm_source_RmEBuqrriAfAVsLQQmhk', -0.4864083519985542),
 ('utm_source_SzZERoLMmrEUEhDaYcyN', 0.442078599957388),
 ('utm_source_TxKUcPpthBDPieTGmVhx', -4.553389235030803),
 ('utm_source_YlsczTIyBSwTLNtuDkCd', 0.02448653210400662),
 ('utm_source_ZpYIoDJMcFzVoPFsHGJL', 1.1468150892097129),
 ('utm_source_aXQzDWsJuGXeBXexNHjc', 1.1974598565724757),
 ('utm_source_bByPQxmDaMXgpHeypKSM', 1.3305739436783712),
 ('utm_sou

In [23]:
logreg_pred_train = lr.predict(x_train)
print('accuracy train', accuracy_score(y_train, logreg_pred_train))

logreg_pred_test = lr.predict(x_test)
print('accuracy test' ,accuracy_score(y_test, logreg_pred_test))
print('confusion_matrix', confusion_matrix(y_test, logreg_pred_test))

accuracy train 0.5950744936853963
accuracy test 0.5934786791871921
confusion_matrix [[298697 205840]
 [  5421   9722]]


In [None]:
from sklearn.model_selection import GridSearchCV
param_grid = {
   'max_iter': list(range(150, 501, 50)),
   'solver': ['liblinear', 'lbfgs', 'saga', 'newton-cg'],
   'C': list(range(1, 10)),
   'random_state': list(range(10, 101, 10)),
   # 'verbose': [1, 0]
}
new_lr = LogisticRegression(random_state=42)
grid_search_lr = GridSearchCV(
   estimator=new_lr,
   param_grid=param_grid,
   scoring='accuracy',
   verbose=1,
   n_jobs=-1
)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

grid_search_lr.fit(x_train, y_train)

best_params = grid_search_lr.best_params_
best_params

Fitting 5 folds for each of 2880 candidates, totalling 14400 fits




{'C': 4, 'max_iter': 150, 'random_state': 10, 'solver': 'lbfgs'}

In [None]:
cv_score = cross_val_score(lr, x, y, cv=5)
print(cv_score.mean())
print(cv_score.std())

0.6144214790916603
0.004649910510920084


### 3. Многослойный персептрон

Формирую датасет для обучения и делю его на train и test

In [32]:
mlp_df = downsampled.copy()
x = mlp_df.drop(['target_action', 'session_id'], axis=1)
y = mlp_df['target_action']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [33]:
mlp = MLPClassifier(
    random_state=42, 
    max_iter=500, 
    hidden_layer_sizes=(100,40),
    )
mlp.fit(x_train, y_train)

In [34]:
mlp_pred_train = mlp.predict(x_train)
print('accuracy train' ,accuracy_score(y_train, mlp_pred_train))

mlp_pred_test = mlp.predict(x_test)
print('accuracy test' ,accuracy_score(y_test, mlp_pred_test))

accuracy train 0.7853887761041468
accuracy test 0.598761138162907


In [36]:
roc_auc_score(y_test, mlp.predict_proba(x_test)[:, 1])

0.6309822131462819

In [None]:
cv_score = cross_val_score(mlp, x, y, cv=5)
print(cv_score.mean())
print(cv_score.std())

0.7411609787934876
0.012725531046048847


# Results

В моем случае лучше всего показала себя модель random forest.
По результатам кросс-вализации:
1. accuracy = 0.76
2. std = 0.009

Обучение модели на всем датасете

In [None]:
df_forest = df_prep.copy()
df_forest = df_forest.drop(['id', 'price'], axis=1)

x = df_forest.drop(['price_category'], axis=1)
y = df_forest['price_category']

In [None]:
rf_clf = RandomForestClassifier(
    n_estimators=150, 
    min_samples_split=3, 
    bootstrap=False, 
    random_state=42, 
    max_depth=80,
)

rf_clf.fit(x, y)

Сохраняю полученню модель

In [None]:
with open('model.pickle', 'wb') as file:
    pickle.dump(rf_clf, file)