### Чтение данных

In [1]:
import pandas as pd
import dask.dataframe as dd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
import datetime as dtm
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import f1_score, make_scorer
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
import warnings
warnings.filterwarnings("ignore")
import pickle

In [None]:
# Read the train and test tables from CSV files located in the current working directory.
data_train = pd.read_csv('data_train.csv')
data_test = pd.read_csv('data_test.csv')

In [None]:
%%time
# NOTE(review): absolute, machine-specific path — the notebook cannot run on another machine
# without editing this line; consider a configurable data directory.
path = 'D:/Geekbrains/Мегафон/features.csv/'
# Tab-separated file opened through dask; the rows are only materialised by the later
# .head() / .compute() calls.
df_features = dd.read_csv(path + 'features.csv', sep='\t')

### Визуальный просмотр данных и их размерностей

In [None]:
df_features.head()

In [None]:
data_train.head()

In [None]:
data_train['buy_time'].unique()

In [None]:
df_features['buy_time'].drop_duplicates().compute()

In [None]:
data_train.shape

In [None]:
%%time
df_features.shape[0].compute()

### Слияние данных

In [None]:
# Plain Python lists of the ids present in each sample (used by the isin() filters).
id_in_train = data_train['id'].tolist()
id_in_test = data_test['id'].tolist()

In [None]:
print(len(id_in_train))
print(len(id_in_test))

In [None]:
df_features.shape

In [None]:
%%time
# Keep only the feature rows whose id occurs in the training sample (still a dask frame).
df_features_for_train = df_features[df_features.id.isin(id_in_train)]

In [None]:
%%time
# Run the dask computation and bring the filtered training features into memory.
df_features_for_train_in_pd = df_features_for_train.compute()

In [None]:
df_features_for_train_in_pd.shape

In [None]:
%%time
# Keep only the feature rows whose id occurs in the test sample (still a dask frame).
df_features_for_test = df_features[df_features.id.isin(id_in_test)]

In [None]:
%%time
# Run the dask computation and bring the filtered test features into memory.
df_features_for_test_in_pd = df_features_for_test.compute()

In [None]:
df_features_for_test_in_pd.shape

In [None]:
data_train.head()

In [None]:
# Order both training-side frames by id ahead of the merge step.
sort_data_train = data_train.sort_values('id')
sort_df_features_for_train_in_pd = df_features_for_train_in_pd.sort_values('id')

In [None]:
# Order both test-side frames by id ahead of the merge step.
sort_data_test = data_test.sort_values('id')
sort_df_features_for_test_in_pd = df_features_for_test_in_pd.sort_values('id')

In [None]:
%%time
# Attach to every training row the feature snapshot of the SAME user (by='id') whose
# buy_time is closest to the row's buy_time (on='buy_time', direction='nearest').
# With the keys the other way round (on='id', by='buy_time') merge_asof matches rows that
# share a buy_time and picks the nearest *id*, i.e. it can attach another user's features.
# merge_asof requires both inputs to be sorted by the `on` key, hence the re-sort here.
full_train = (
    pd.merge_asof(
        sort_data_train.sort_values(by='buy_time'),
        sort_df_features_for_train_in_pd.sort_values(by='buy_time'),
        on='buy_time',
        by='id',
        direction='nearest',
    )
    # Return to the id ordering (with a fresh 0..n-1 index) that the merged frame had before.
    .sort_values(by='id')
    .reset_index(drop=True)
)

In [None]:
full_train.shape

In [None]:
%%time
# Attach to every test row the feature snapshot of the SAME user (by='id') whose
# buy_time is closest to the row's buy_time (on='buy_time', direction='nearest').
# With the keys the other way round (on='id', by='buy_time') merge_asof matches rows that
# share a buy_time and picks the nearest *id*, i.e. it can attach another user's features.
# merge_asof requires both inputs to be sorted by the `on` key, hence the re-sort here.
full_test = (
    pd.merge_asof(
        sort_data_test.sort_values(by='buy_time'),
        sort_df_features_for_test_in_pd.sort_values(by='buy_time'),
        on='buy_time',
        by='id',
        direction='nearest',
    )
    # Return to the id ordering (with a fresh 0..n-1 index) that the merged frame had before.
    .sort_values(by='id')
    .reset_index(drop=True)
)

In [None]:
full_test.shape

In [None]:
full_test.head()

In [None]:
# %%time
# full_train.reset_index(drop=True).to_csv('full_train.csv')
# full_test.reset_index(drop=True).to_csv('full_test.csv')

In [None]:
# Drop the references to the intermediate frames that were used to build
# full_train and full_test so their memory can be reclaimed.
del (
    df_features,
    data_train,
    df_features_for_train,
    df_features_for_test,
    df_features_for_train_in_pd,
    df_features_for_test_in_pd,
    sort_data_train,
    sort_df_features_for_train_in_pd,
    sort_data_test,
    sort_df_features_for_test_in_pd,
)

### Подготовка данных

In [2]:
# %%time
# full_train = pd.read_csv('full_train.csv')
# full_test = pd.read_csv('full_test.csv')

In [None]:
full_train.info()

In [None]:
full_train.describe()

In [3]:
# 'Unnamed: 0_x' and 'Unnamed: 0_y' are not features: they only reflect row order
# (leftover indices), so remove them from both merged frames in place.
index_leftovers = ['Unnamed: 0_x', 'Unnamed: 0_y']
for frame in (full_train, full_test):
    frame.drop(columns=index_leftovers, inplace=True)

In [None]:
# Column labels of the merged training frame as a plain list.
name_columns_in_train = full_train.columns.tolist()
name_columns_in_train

In [None]:
# Report every column of full_train that contains missing values.
# Series.count() returns the number of non-null entries, so the difference to the
# total row count is the number of missing values in that column.
full_len = full_train.shape[0]
for column_name in name_columns_in_train:
    count_nan = full_len - full_train[column_name].count()
    if count_nan:
        # The `f` prefix is required: without it the placeholders are printed literally
        # instead of the column name and the number of gaps.
        print(f'В столбце {column_name} пропущено {count_nan} значений')

В тренировочном датасете нет пропущенных данных

In [None]:
full_train.target.value_counts()

Наблюдается сильный дисбаланс классов

In [4]:
# Order the merged training data by buy_time; the time-based split below works on this frame.
sort_train = full_train.sort_values(by='buy_time')

In [None]:
# Show the earliest and the latest buy_time as human-readable (local-time) datetimes.
for bound in (min, max):
    print(dtm.datetime.fromtimestamp(bound(sort_train.buy_time)))

In [None]:
max(sort_train.buy_time)

In [None]:
int(dtm.datetime.strptime('01.12.2018 00:00:00', '%d.%m.%Y %H:%M:%S').timestamp())

In [5]:
# Time-based hold-out: rows bought before 1 December 2018 form the training part,
# rows from that moment on form the validation part.
# NOTE(review): strptime() yields a naive datetime, so .timestamp() interprets it in the
# machine's local timezone — confirm this matches how buy_time was generated.
split_moment = dtm.datetime.strptime('01.12.2018 00:00:00', '%d.%m.%Y %H:%M:%S')
timestamp_separation = int(split_moment.timestamp())
bought_before_split = sort_train.buy_time < timestamp_separation
bought_from_split = sort_train.buy_time >= timestamp_separation
df_train = sort_train[bought_before_split]
df_test = sort_train[bought_from_split]

In [None]:
df_train.target.value_counts()

In [None]:
df_test.target.value_counts()

In [6]:
# Separate the label column from the remaining columns for both parts of the split.
y_train = df_train['target']
X_train = df_train.drop(columns='target')
y_test_real = df_test['target']
X_test = df_test.drop(columns='target')

In [7]:
# The split frames are no longer needed once X/y have been extracted.
del df_train, df_test

In [None]:
# Every column after the first three is treated as a feature to be scaled
# (presumably the first three are id, vas_id and buy_time — confirm against X_train.columns).
list_name_features = X_train.columns[3:].tolist()
list_name_features

In [None]:
# Helper that trains a model, produces predictions and computes the metric
def model_preds(model, X_train, y_train, X_test, y_test_real):
    """Fit ``model`` on the training data and score its predictions on the test data.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
        The classifier to train; it is fitted in place.
    X_train, y_train
        Training features and labels passed to ``model.fit``.
    X_test
        Features passed to ``model.predict``.
    y_test_real
        True labels the predictions are compared against.

    Returns
    -------
    tuple
        ``(fitted model, predictions for X_test, macro-averaged F1 score)``.
    """
    # Train the estimator supplied by the caller. Constructing a fixed class-weighted
    # RandomForestClassifier here instead would silently ignore the `model` argument and
    # make every call evaluate the same model.
    model.fit(X_train, y_train)
    y_preds = model.predict(X_test)
    metrics = f1_score(y_test_real, y_preds, average='macro')
    return model, y_preds, metrics

In [None]:
# Results table: starts empty, one row is appended per model family
columns_name = [
    'model',
    'standart',
    'with_std',
    'with_PCA_and_std',
    'for_slice',
    'for_slice_with_std',
    'for_slice_with_std_and_PCA',
]
df_metrics_for_models = pd.DataFrame(columns=columns_name)
df_metrics_for_models

#### Рассмотрим модель Random Forest без дополнительных параметров

In [None]:
%%time
# Build a RandomForestClassifier with only the seed fixed and hand it to model_preds,
# which returns the model, its predictions for X_test and the macro F1 score.
model_rf = RandomForestClassifier(random_state=21)
trained_model_rf, y_preds_rf, metrics_rf = model_preds(model_rf, X_train, y_train, X_test, y_test_real)
metrics_rf

Проверка модели Random Forest с весами для классов

In [None]:
%%time
# Random forest with class 1 weighted ten times heavier than class 0.
# NOTE(review): despite the `_with_std` suffix no standardisation is applied in this cell;
# the unscaled X_train / X_test are passed.
model_rf_with_std = RandomForestClassifier(random_state=21, class_weight={0: 1, 1: 10})
(trained_model_rf_with_std,
 y_preds_rf_with_std,
 metrics_rf_with_std) = model_preds(model_rf_with_std, X_train, y_train, X_test, y_test_real)
metrics_rf_with_std

Стандартизируем данные

In [None]:
# Fit the scaler on the training feature columns and store the scaled values in a copy,
# so X_train itself stays unscaled.
standard_scaler = StandardScaler()
X_std_train = X_train.copy()
X_std_train[list_name_features] = standard_scaler.fit_transform(X_std_train[list_name_features])

In [None]:
# Scale the test features with the statistics learned on the training set.
# transform() (not fit_transform()) is essential: re-fitting on the test data would scale
# train and test differently and overwrite the training statistics held by standard_scaler.
X_std_test = X_test.copy()
X_std_test[list_name_features] = standard_scaler.transform(X_std_test[list_name_features])

In [None]:
%%time
# Class-weighted random forest trained and evaluated on the standardised copies.
# The fitted model is kept: its feature_importances_ are read in the next section.
model_rf_with_weight = RandomForestClassifier(random_state=21, class_weight={0.0 : 1, 1.0 : 10})
model_rf_with_weight.fit(X_std_train, y_train)
y_preds_rf_with_weight = model_rf_with_weight.predict(X_std_test)
# Last expression of the cell: macro-averaged F1 on the hold-out part.
f1_score(y_test_real, y_preds_rf_with_weight, average='macro')

### При помощи модели Random Forest выявим наиболее важные признаки

In [None]:
X_std_train.columns

In [None]:
# Pair every column of the training frame with the importance assigned to it by
# model_rf_with_weight (which was fitted on exactly these columns).
df_feature_importance = pd.DataFrame({'feature': X_std_train.columns, 
                                      'importance': model_rf_with_weight.feature_importances_})

In [None]:
df_feature_importance.plot()

In [None]:
df_feature_importance.head()

In [None]:
df_feature_importance.sort_values(by='importance', ascending=False).head(100)

In [None]:
X_std_train[list(df_feature_importance[df_feature_importance.importance > 0.002].feature.values)]

Построим модель random forest для части признаков

In [None]:
%%time
# Columns whose importance exceeds 0.002 according to model_rf_with_weight.
important_features = list(df_feature_importance[df_feature_importance.importance > 0.002].feature.values)
model_rf_for_slice = RandomForestClassifier(random_state=21)
# Train and evaluate on the same (unscaled) representation. Fitting on X_std_train while
# predicting on the raw X_test would feed the forest values on a different scale than the
# one its split thresholds were learned on.
(trained_model_rf_for_slice,
 y_preds_rf_for_slice,
 metrics_rf_for_slice) = model_preds(model_rf_for_slice,
                                     X_train[important_features], y_train,
                                     X_test[important_features], y_test_real)
metrics_rf_for_slice

Модель random forest для части признаков с переопределением весов целевой переменной

In [None]:
%%time
# Columns whose importance exceeds 0.002 according to model_rf_with_weight.
important_features = list(df_feature_importance[df_feature_importance.importance > 0.002].feature.values)
# Class-weighted variant (class 1 weighted ten times heavier than class 0).
model_rf_for_slice_with_std = RandomForestClassifier(random_state=21, class_weight={0 : 1, 1 : 10})
# Train and evaluate on the same (unscaled) representation. Fitting on X_std_train while
# predicting on the raw X_test would feed the forest values on a different scale than the
# one its split thresholds were learned on.
(trained_model_rf_for_slice_with_std,
 y_preds_rf_for_slice_with_std,
 metrics_rf_for_slice_with_std) = model_preds(model_rf_for_slice_with_std,
                                              X_train[important_features], y_train,
                                              X_test[important_features], y_test_real)
metrics_rf_for_slice_with_std

In [None]:
# Helper that trains a pipeline, produces predictions and computes the metric
def pipeline_preds(pipeline, X_train, y_train, X_test, y_test_real):
    """Fit ``pipeline`` on the training data and evaluate it on the test data.

    Returns a tuple ``(pipeline, predictions for X_test, macro-averaged F1 score)``;
    the returned pipeline is the same object that was passed in, now fitted.
    """
    pipeline.fit(X_train, y_train)
    predictions = pipeline.predict(X_test)
    macro_f1 = f1_score(y_test_real, predictions, average='macro')
    return pipeline, predictions, macro_f1

Создадим pipeline для модели random forest 

In [None]:
# %%time
# pipeline_rf_with_std = Pipeline([('standard_scaler', StandardScaler()), 
#                         ('random_forest', RandomForestClassifier(random_state=21))])
# trained_model_rf_with_std, y_preds_rf_with_std, metrics_rf_with_std = pipeline_preds(pipeline_rf_with_std, 
#                                          X_train, y_train, X_test, y_test_real)
# metrics_rf_with_std

Добавим PCA в pipeline для random forest

In [None]:
# %%time
# pipeline_rf_with_pca = Pipeline([('pca', PCA()),
#                         ('standard_scaler', StandardScaler()), 
#                         ('random_forest', RandomForestClassifier(random_state=21))])
# trained_model_rf_with_std_and_PCA, y_preds_rf_with_std_and_PCA, metrics_rf_with_std_and_PCA = pipeline_preds(pipeline_rf_with_pca, X_train, y_train, X_test, y_test_real)
# metrics_rf_with_std_and_PCA

Добавим PCA в pipeline для random forest для части признаков

In [None]:
# Unscaled train/test frames restricted to the columns with importance above 0.002.
selected_features = df_feature_importance.loc[df_feature_importance.importance > 0.002, 'feature'].tolist()
X_train_slice = X_train[selected_features]
X_test_slice = X_test[selected_features]

In [None]:
# %%time
# pipeline_rf_slice_with_pca = Pipeline([('pca', PCA()),
#                         ('standard_scaler', StandardScaler()), 
#                         ('random_forest', RandomForestClassifier(random_state=21))])
# trained_model_rf_slice_with_std_and_PCA, y_preds_rf_slice_with_std_and_PCA, metrics_rf_slice_with_std_and_PCA = pipeline_preds(pipeline_rf_slice_with_pca, 
#                         X_train_slice, y_train, X_test_slice, y_test_real)
# metrics_rf_slice_with_std_and_PCA

Добавим результаты вычисления метрик в общий датафрейм

In [None]:
# The two PCA variants for random forest are commented out above, so NaN is recorded for them.
metrics_rf_with_std_and_PCA = np.nan
metrics_rf_slice_with_std_and_PCA = np.nan
# Row layout follows columns_name: model label first, then one metric per variant.
result_for_rf = ['Random Forest', metrics_rf, metrics_rf_with_std, metrics_rf_with_std_and_PCA, 
                metrics_rf_for_slice, metrics_rf_for_slice_with_std, metrics_rf_slice_with_std_and_PCA]
df_result_for_rf = pd.DataFrame([result_for_rf], columns=columns_name)
# NOTE(review): re-running this cell appends the same row again.
df_metrics_for_models = pd.concat([df_metrics_for_models, df_result_for_rf], axis=0, ignore_index=True)
df_metrics_for_models

#### Рассмотрим алгоритм logistic regression

In [None]:
%%time
# Logistic regression on the unscaled features, wrapped in a one-step pipeline.
pipeline_lr = Pipeline(steps=[('logistic_regression', LogisticRegression(random_state=21))])
(trained_model_lr,
 y_preds_lr,
 metrics_lr) = pipeline_preds(pipeline_lr, X_train, y_train, X_test, y_test_real)
metrics_lr

Добавим в logistic regression стандартизацию

In [None]:
%%time
# Logistic regression preceded by feature standardisation.
pipeline_lr_with_std = Pipeline(steps=[
    ('standard_scaler', StandardScaler()),
    ('logistic_regression', LogisticRegression(random_state=21)),
])
(trained_model_lr_with_std,
 y_preds_lr_with_std,
 metrics_lr_with_std) = pipeline_preds(pipeline_lr_with_std, X_train, y_train, X_test, y_test_real)
metrics_lr_with_std

Добавим в logistic regression стандартизацию и PCA

In [None]:
%%time
# Standardise first, then apply PCA. PCA is driven by variance, so running it on unscaled
# columns lets the columns with the largest numeric range dominate the components;
# scaling afterwards does not undo that.
pipeline_lr_with_std_pca = Pipeline([('standard_scaler', StandardScaler()),
                                     ('pca', PCA()),
                                     ('logistic_regression', LogisticRegression(random_state=21))])
trained_model_lr_with_std_pca, y_preds_lr_with_std_pca, metrics_lr_with_std_pca = pipeline_preds(
    pipeline_lr_with_std_pca, X_train, y_train, X_test, y_test_real)
metrics_lr_with_std_pca

Рассмотрим logistic regression для ограниченного набора признаков

In [None]:
%%time
# Logistic regression on the reduced, unscaled feature set.
pipeline_lr_slice = Pipeline(steps=[('logistic_regression', LogisticRegression(random_state=21))])
(trained_model_lr_slice,
 y_preds_lr_slice,
 metrics_lr_slice) = pipeline_preds(pipeline_lr_slice, X_train_slice, y_train, X_test_slice, y_test_real)
metrics_lr_slice

Добавим в logistic regression для ограниченного набора признаков стандартизацию

In [None]:
%%time
# Logistic regression with standardisation on the reduced feature set.
pipeline_lr_slice_with_std = Pipeline(steps=[
    ('standard_scaler', StandardScaler()),
    ('logistic_regression', LogisticRegression(random_state=21)),
])
(trained_model_lr_slice_with_std,
 y_preds_lr_slice_with_std,
 metrics_lr_slice_with_std) = pipeline_preds(pipeline_lr_slice_with_std,
                                             X_train_slice, y_train, X_test_slice, y_test_real)
metrics_lr_slice_with_std

Добавим в logistic regression для ограниченного набора признаков стандартизацию и PCA

In [None]:
%%time
# Standardise first, then apply PCA. PCA is driven by variance, so running it on unscaled
# columns lets the columns with the largest numeric range dominate the components;
# scaling afterwards does not undo that.
pipeline_lr_slice_with_std_pca = Pipeline([('standard_scaler', StandardScaler()),
                                           ('pca', PCA()),
                                           ('logistic_regression', LogisticRegression(random_state=21))])
trained_model_lr_slice_with_std_pca, y_preds_lr_slice_with_std_pca, metrics_lr_slice_with_std_pca = pipeline_preds(
    pipeline_lr_slice_with_std_pca, X_train_slice, y_train, X_test_slice, y_test_real)
metrics_lr_slice_with_std_pca

In [None]:
# Append the logistic-regression row; values are listed in the order of columns_name.
result_for_logreg = [
    'Logistic Regression',
    metrics_lr,
    metrics_lr_with_std,
    metrics_lr_with_std_pca,
    metrics_lr_slice,
    metrics_lr_slice_with_std,
    metrics_lr_slice_with_std_pca,
]
df_result_for_logreg = pd.DataFrame(data=[result_for_logreg], columns=columns_name)
df_metrics_for_models = pd.concat(
    [df_metrics_for_models, df_result_for_logreg],
    axis=0,
    ignore_index=True,
)
df_metrics_for_models

#### Рассмотрим модель Gradient Boosting

In [None]:
%%time
# Gradient boosting on the unscaled features, wrapped in a one-step pipeline.
pipeline_gb = Pipeline(steps=[('gradient_boosting', GradientBoostingClassifier(random_state=21))])
(trained_model_gb,
 y_preds_gb,
 metrics_gb) = pipeline_preds(pipeline_gb, X_train, y_train, X_test, y_test_real)
metrics_gb

Добавим стандартизацию данных в модель Gradient Boosting

In [None]:
%%time
# Gradient boosting preceded by feature standardisation.
pipeline_gb_with_std = Pipeline(steps=[
    ('standard_scaler', StandardScaler()),
    ('gradient_boosting', GradientBoostingClassifier(random_state=21)),
])
(trained_model_gb_with_std,
 y_preds_gb_with_std,
 metrics_gb_with_std) = pipeline_preds(pipeline_gb_with_std, X_train, y_train, X_test, y_test_real)
metrics_gb_with_std

Добавим стандартизацию данных и PCA в модель Gradient Boosting

In [None]:
# %%time
# pipeline_gb_with_std_pca = Pipeline([('pca', PCA()),
#                                 ('standard_scaler', StandardScaler()), 
#                                 ('gradient_boosting', GradientBoostingClassifier(random_state=21))])
# trained_model_gb_with_std_pca, y_preds_gb_with_std_pca, metrics_gb_with_std_pca = pipeline_preds(pipeline_gb_with_std_pca, 
#                                     X_train, y_train, X_test, y_test_real)
# metrics_gb_with_std_pca

Рассмотрим модель Gradient Boosting для отобранной части признаков

In [None]:
%%time
# Gradient boosting on the reduced, unscaled feature set.
pipeline_gb_slice = Pipeline(steps=[('gradient_boosting', GradientBoostingClassifier(random_state=21))])
(trained_model_gb_slice,
 y_preds_gb_slice,
 metrics_gb_slice) = pipeline_preds(pipeline_gb_slice, X_train_slice, y_train, X_test_slice, y_test_real)
metrics_gb_slice

Рассмотрим модель Gradient Boosting со стандартизацией данных для отобранной части признаков

In [None]:
%%time
# Gradient boosting with standardisation on the reduced feature set.
pipeline_gb_slice_with_std = Pipeline(steps=[
    ('standard_scaler', StandardScaler()),
    ('gradient_boosting', GradientBoostingClassifier(random_state=21)),
])
(trained_model_gb_slice_with_std,
 y_preds_gb_slice_with_std,
 metrics_gb_slice_with_std) = pipeline_preds(pipeline_gb_slice_with_std,
                                             X_train_slice, y_train, X_test_slice, y_test_real)
metrics_gb_slice_with_std

Рассмотрим модель Gradient Boosting со стандартизацией данных и PCA для отобранной части признаков

In [None]:
# %%time
# pipeline_gb_with_slice_std_pca = Pipeline([('pca', PCA()),
#                                 ('standard_scaler', StandardScaler()), 
#                                 ('gradient_boosting', GradientBoostingClassifier(random_state=21))])
# trained_model_gb_slice_with_std_pca, y_preds_gb_slice_with_std_pca, metrics_gb_slice_with_std_pca = pipeline_preds(pipeline_gb_slice_with_std_pca, 
#                                     X_train_slice, y_train, X_test_slice, y_test_real)
# metrics_gb_slice_with_std_pca

In [None]:
# The two PCA variants for gradient boosting are commented out above, so NaN is recorded for them.
metrics_gb_with_std_pca = np.nan
metrics_gb_slice_with_std_pca = np.nan
# Row layout follows columns_name: model label first, then one metric per variant.
result_for_gb = ['Gradient Boosting', metrics_gb, metrics_gb_with_std, metrics_gb_with_std_pca, 
                metrics_gb_slice, metrics_gb_slice_with_std, metrics_gb_slice_with_std_pca]
df_result_for_gb = pd.DataFrame([result_for_gb], columns=columns_name)
# NOTE(review): re-running this cell appends the same row again.
df_metrics_for_models = pd.concat([df_metrics_for_models, df_result_for_gb], axis=0, ignore_index=True)
df_metrics_for_models

#### Рассмотрим алгоритм knn

In [None]:
%%time
# k-nearest-neighbours with default settings on the unscaled features.
pipeline_knn = Pipeline(steps=[('knn', KNeighborsClassifier())])
(trained_model_knn,
 y_preds_knn,
 metrics_knn) = pipeline_preds(pipeline_knn, X_train, y_train, X_test, y_test_real)
metrics_knn

Добавим к алгоритму knn стандартизацию

In [None]:
%%time
# k-nearest-neighbours preceded by feature standardisation.
pipeline_knn_std = Pipeline(steps=[
    ('standard_scaler', StandardScaler()),
    ('knn', KNeighborsClassifier()),
])
(trained_model_knn_std,
 y_preds_knn_with_std,
 metrics_knn_with_std) = pipeline_preds(pipeline_knn_std, X_train, y_train, X_test, y_test_real)
metrics_knn_with_std

Добавим к алгоритму knn стандартизацию и PCA

In [None]:
# %%time
# pipeline_knn_std_pca = Pipeline([('pca', PCA()),
#                                 ('standard_scaler', StandardScaler()), 
#                                 ('knn', KNeighborsClassifier())])
# trained_model_knn_with_std_pca, y_preds_knn_with_std_pca, metrics_knn_with_std_pca = pipeline_preds(pipeline_knn_with_std_pca, 
#                                     X_train, y_train, X_test, y_test_real)
# metrics_knn_with_std_pca

Применим алгоритм knn к ограниченному набору признаков

In [None]:
%%time
# k-nearest-neighbours on the reduced, unscaled feature set.
pipeline_knn_with_slice = Pipeline(steps=[('knn', KNeighborsClassifier())])
(trained_model_knn_slice,
 y_preds_knn_slice,
 metrics_knn_slice) = pipeline_preds(pipeline_knn_with_slice,
                                     X_train_slice, y_train, X_test_slice, y_test_real)
metrics_knn_slice

Добавим к алгоритму knn для ограниченного набора признаков стандартизацию

In [None]:
# %%time
# pipeline_knn_with_slice_std = Pipeline([('standard_scaler', StandardScaler()), 
#                                 ('knn', KNeighborsClassifier())])
# trained_model_knn_slice_with_std, y_preds_knn_slice_with_std, metrics_knn_slice_with_std = pipeline_preds(pipeline_knn_with_slice_std, 
#                                     X_train_slice, y_train, X_test_slice, y_test_real)
# metrics_knn_slice_with_std

Рассмотрим модель knn со стандартизацией данных и PCA для отобранной части признаков

In [None]:
# %%time
# pipeline_knn_with_slice_std_pca = Pipeline([('pca', PCA()),
#                                 ('standard_scaler', StandardScaler()), 
#                                 ('knn', KNeighborsClassifier())])
# trained_model_knn_slice_with_std_pca, y_preds_knn_slice_with_std_pca, metrics_knn_slice_with_std_pca = pipeline_preds(pipeline_knn_with_slice_std_pca, 
#                                     X_train_slice, y_train, X_test_slice, y_test_real)
# metrics_knn_slice_with_std_pca

In [None]:
# Three kNN variants are commented out above, so NaN is recorded for them.
metrics_knn_with_std_pca = np.nan
metrics_knn_slice_with_std = np.nan
metrics_knn_slice_with_std_pca = np.nan
# Row layout follows columns_name: model label first, then one metric per variant.
result_for_knn = ['K Neighbors Classifier', metrics_knn, metrics_knn_with_std, metrics_knn_with_std_pca, 
                metrics_knn_slice, metrics_knn_slice_with_std, metrics_knn_slice_with_std_pca]
df_result_for_knn = pd.DataFrame([result_for_knn], columns=columns_name)
# NOTE(review): re-running this cell appends the same row again.
df_metrics_for_models = pd.concat([df_metrics_for_models, df_result_for_knn], axis=0, ignore_index=True)
df_metrics_for_models

Наилучший результат дала модель GradientBoostingClassifier без стандартизации. Подберем гиперпараметры для этой модели

In [None]:
%%time
# Exhaustive search over tree count and depth for gradient boosting, scored by macro F1.
# NOTE(review): no `cv` is passed, so GridSearchCV's default splitter is used on rows that
# are ordered by buy_time — confirm that this is the intended validation scheme.
parameters = dict(n_estimators=[100, 200, 300], max_depth=[1, 3, 5, 10])
f1 = make_scorer(f1_score, average='macro')
model_gb = GradientBoostingClassifier(random_state=21)
grid_search_for_gb = GridSearchCV(estimator=model_gb, param_grid=parameters, scoring=f1)
grid_search_for_gb.fit(X_train, y_train)

In [None]:
grid_search_for_gb.best_params_

In [None]:
grid_search_for_gb.cv_results_

### Финальное обучение модели, сохранение модели и выполнение предсказания

In [None]:
# For the final fit use every merged training row, not only the pre-split part.
y_train_final = full_train['target']
X_train_final = full_train.drop(columns='target')

In [None]:
del full_train

In [None]:
# Final model with hard-coded hyper-parameters.
# NOTE(review): presumably these mirror grid_search_for_gb.best_params_ — confirm, or read
# them from the search result so the two cannot drift apart.
best_model = GradientBoostingClassifier(random_state=21, max_depth=3, n_estimators=100)

In [None]:
best_model.fit(X_train_final, y_train_final)

In [None]:
# Persist the fitted final model next to the notebook.
with open('best_model.pkl', mode='wb') as model_file:
    pickle.dump(best_model, model_file)

In [None]:
# Predict the target for the merged test frame; full_test is passed as is, so it must carry
# exactly the columns the model was fitted on (X_train_final).
y_predict_final = best_model.predict(full_test)

In [None]:
# Create the result frame before assigning to it: without this line the name
# full_test_result does not exist and the assignment raises NameError.
# A copy is used so that full_test itself keeps exactly the model's input columns and can
# still be passed to predict() later on.
full_test_result = full_test.copy()
full_test_result['target'] = y_predict_final

In [None]:
# Keep only the identifying columns plus the prediction, with a fresh 0..n-1 index.
answer_columns = ['id', 'vas_id', 'buy_time', 'target']
result = full_test_result.loc[:, answer_columns].reset_index(drop=True)

In [None]:
result.head()

In [None]:
# Write the predictions to disk.
# NOTE(review): the index is written as an unnamed first column; pass index=False if the
# expected answer format has no index column.
result.to_csv('answers_test.csv')

In [None]:
# Round-trip check: load the model back from the file written earlier in this notebook.
# pickle.load can execute arbitrary code, so it must only be used on trusted files.
with open('best_model.pkl', mode='rb') as model_file:
    pickle_model = pickle.load(model_file)

In [None]:
pickle_model.predict(full_test)