In [1]:
import re
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.utils import resample

# Modelling

### 1. RandomForest

Формирую датасет для обучения и делю его на train и test

In [2]:
df_prep = pd.read_csv('data/final_prepared.csv')

In [3]:
target_0 = df_prep[df_prep.target_action == 0]
target_1 = df_prep[df_prep.target_action == 1]

target_0_downsampled = resample(
    target_0,
    replace = False,
    n_samples = len(target_1),
    random_state = 27,
    )


downsampled = pd.concat([target_0_downsampled, target_1])
downsampled

Unnamed: 0,session_id,target_action,utm_source_BHcvLfOaCWvWTykYqHVe,utm_source_BKeImrJuRDZcHiSSTdzm,utm_source_DnEUulZAecfGPvdtZBYS,utm_source_GpAkIXsclxDGyILfNlrR,utm_source_ISrKoXQCxqqYvAZICvjs,utm_source_IZEXUFLARCUMynmHNBGo,utm_source_KgicpPxiEQfzPlPwQZJq,utm_source_MvfHsxITijuriZxsqZqt,...,geo_city_Yekaterinburg,geo_city_Yuzhno-Sakhalinsk,geo_city_Zheleznodorozhny,geo_city_Zhukovskiy,geo_city_other_city,visit_number_std,month_std,day_std,hour_std,screen_square_std
78308,1128117812396260435.1621861009.1621861111,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.031966,-1.937681,0.785158,0.226205,-0.511824
386446,26117023457888840.1640778316.1640778316,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.139817,1.231346,1.353871,0.063092,-0.828355
715957,4192513816316069945.1634715705.1634715705,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.139817,0.325910,0.330187,-0.589360,-0.509734
550597,3395355048526924528.1640422003.1640422003,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.053926,1.231346,0.898900,-0.426247,-0.975865
40990,949066458394112298.1624355115.1624355115,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.139817,-1.484963,0.557673,-0.263134,-1.068826
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1732152,9054912872702758664.1636781836.1636781836,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.139817,0.778628,-0.466011,-0.915586,-0.680845
1732225,9055248417020050011.1634073188.1634073188,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.139817,0.325910,-0.466011,-2.220490,-0.754600
1732227,9055248417020050011.1634075541.1634075541,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.031966,0.325910,-0.466011,-2.220490,-0.754600
1732254,9055376699099939975.1630766214.1630766214,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.139817,-0.126808,-1.489694,0.552431,1.708504


In [31]:
downsampled['target_action'].value_counts()

target_action
0    50314
1    50314
Name: count, dtype: int64

In [4]:
df_forest = downsampled.copy()
df_forest = df_forest.drop(['session_id'], axis=1)

x = df_forest.drop(['target_action'], axis=1)
y = df_forest['target_action']

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42
)

In [None]:
from sklearn.model_selection import GridSearchCV
param_grid = {
   'n_estimators': list(range(150, 501, 50)),
   'min_samples_split': list(range(1, 11)),
   'bootstrap': [False ,True],
   'random_state': list(range(10, 101, 10)),
   'max_depth': list(range(10, 200, 20)),
   # 'verbose': [1, 0]
}
rf_clf = RandomForestClassifier()
grid_search_rf = GridSearchCV(
   estimator=rf_clf,
   param_grid=param_grid,
   scoring='roc_auc',
   # verbose=1,
   n_jobs=-1
)
# x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

grid_search_rf.fit(x_train, y_train)

best_params = grid_search_rf.best_params_
best_params

Обучаю модель RandomForest

In [6]:
rf_clf = RandomForestClassifier(
    n_estimators=150, 
    min_samples_split=3, 
    # bootstrap=False,
    random_state=42, 
    max_depth=80,
    # class_weight='balanced'
)

rf_clf.fit(x_train, y_train)

In [7]:
predicted_train_rf = rf_clf.predict(x_train)
predicted_test_rf = rf_clf.predict(x_test)

print(accuracy_score(y_train, predicted_train_rf))
print(accuracy_score(y_test, predicted_test_rf))

0.9907153707463195
0.6125741164000132


In [8]:
roc_auc_score(y_test, rf_clf.predict_proba(x_test)[:, 1])

0.6586105360620775

Кросс-валидация

In [None]:
cross_validate(rf_clf, x, y, cv=5)

KeyboardInterrupt: 

In [None]:
cv_score = cross_val_score(rf_clf, x, y, cv=5)

In [None]:
print(cv_score.mean())
print(cv_score.std())

0.7685304330534013
0.0093588363279649


### 2. LogisticRegression

Формирую датасет для обучения и делю его на train и test

In [4]:
# logreg_df = df_prep.copy()
# x = logreg_df.drop(['target_action', 'session_id'], axis=1)
# y = logreg_df['target_action']
#
# x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

Обучаю модель логистической регрессии

In [26]:
lr = LogisticRegression(
    C=1,
    max_iter=150,
    random_state=42,
    solver='newton-cg',
    # class_weight='balanced'
    )
lr.fit(x_train, y_train)

In [27]:
roc_auc_score(y_train, lr.predict_proba(x_train)[:, 1])

0.67104060547112

In [28]:
roc_auc_score(y_test, lr.predict_proba(x_test)[:, 1])

0.6654918623683124

In [12]:
cf = list(zip(lr.feature_names_in_, lr.coef_[0]))
cf

[('utm_source_BHcvLfOaCWvWTykYqHVe', 1.2840175916981038),
 ('utm_source_BKeImrJuRDZcHiSSTdzm', 1.2745057990878974),
 ('utm_source_DnEUulZAecfGPvdtZBYS', 0.9754686289175221),
 ('utm_source_GpAkIXsclxDGyILfNlrR', -0.55104992383298),
 ('utm_source_ISrKoXQCxqqYvAZICvjs', -0.016417011362737245),
 ('utm_source_IZEXUFLARCUMynmHNBGo', -0.16912605565699054),
 ('utm_source_KgicpPxiEQfzPlPwQZJq', 0.8019570631258209),
 ('utm_source_MvfHsxITijuriZxsqZqt', -0.05499644786353818),
 ('utm_source_PlbkrSYoHuZBWfYjYnfw', -1.7806807176286044),
 ('utm_source_QxAxdyPLuQMEcrdZWdWb', 1.6770787981344264),
 ('utm_source_RmEBuqrriAfAVsLQQmhk', -0.7758355395988668),
 ('utm_source_SzZERoLMmrEUEhDaYcyN', 0.5698341336056385),
 ('utm_source_TxKUcPpthBDPieTGmVhx', -2.550760586210838),
 ('utm_source_YlsczTIyBSwTLNtuDkCd', -0.2173192051931357),
 ('utm_source_ZpYIoDJMcFzVoPFsHGJL', 0.8457134406556994),
 ('utm_source_aXQzDWsJuGXeBXexNHjc', 1.1488920648944876),
 ('utm_source_bByPQxmDaMXgpHeypKSM', 1.3537033942602095),
 ('ut

In [13]:
logreg_pred_train = lr.predict(x_train)
print('accuracy train', accuracy_score(y_train, logreg_pred_train))

logreg_pred_test = lr.predict(x_test)
print('accuracy test' ,accuracy_score(y_test, logreg_pred_test))
print('confusion_matrix', confusion_matrix(y_test, logreg_pred_test))

accuracy train 0.6191882337909397
accuracy test 0.61293848752857
confusion_matrix [[8731 6430]
 [5255 9773]]


In [22]:
from sklearn.model_selection import GridSearchCV
param_grid = {
   'max_iter': list(range(150, 501, 50)),
   'solver': ['liblinear', 'lbfgs', 'saga', 'newton-cg'],
   'C': list(range(1, 10)),
   'random_state': list(range(10, 101, 10)),
   # 'verbose': [1, 0]
}
new_lr = LogisticRegression()
grid_search_lr = GridSearchCV(
   estimator=new_lr,
   param_grid=param_grid,
   scoring='roc_auc',
   # verbose=1,
   n_jobs=-1
)
# x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

grid_search_lr.fit(x_train, y_train)

best_params = grid_search_lr.best_params_
best_params

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

{'C': 1, 'max_iter': 150, 'random_state': 10, 'solver': 'newton-cg'}

In [None]:
cv_score = cross_val_score(lr, x, y, cv=5)
print(cv_score.mean())
print(cv_score.std())

0.6144214790916603
0.004649910510920084


### 3. Многослойный персептрон

Формирую датасет для обучения и делю его на train и test

In [32]:
# mlp_df = downsampled.copy()
# x = mlp_df.drop(['target_action', 'session_id'], axis=1)
# y = mlp_df['target_action']
#
# x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [14]:
mlp = MLPClassifier(
    random_state=42, 
    max_iter=500, 
    hidden_layer_sizes=(100,40),
    )
mlp.fit(x_train, y_train)

In [15]:
mlp_pred_train = mlp.predict(x_train)
print('accuracy train' ,accuracy_score(y_train, mlp_pred_train))

mlp_pred_test = mlp.predict(x_test)
print('accuracy test' ,accuracy_score(y_test, mlp_pred_test))

accuracy train 0.7853887761041468
accuracy test 0.598761138162907


In [16]:
roc_auc_score(y_test, mlp.predict_proba(x_test)[:, 1])

0.6309822175353363

In [None]:
cv_score = cross_val_score(mlp, x, y, cv=5)
print(cv_score.mean())
print(cv_score.std())

0.7411609787934876
0.012725531046048847


# Results

В моем случае лучше всего показала себя модель random forest.
По результатам кросс-вализации:
1. accuracy = 0.76
2. std = 0.009

Обучение модели на всем датасете

In [None]:
df_forest = df_prep.copy()
df_forest = df_forest.drop(['id', 'price'], axis=1)

x = df_forest.drop(['price_category'], axis=1)
y = df_forest['price_category']

In [None]:
rf_clf = RandomForestClassifier(
    n_estimators=150, 
    min_samples_split=3, 
    bootstrap=False, 
    random_state=42, 
    max_depth=80,
)

rf_clf.fit(x, y)

Сохраняю полученню модель

In [None]:
with open('model.pickle', 'wb') as file:
    pickle.dump(rf_clf, file)