In [None]:
!pip3 install -q numpy==1.22.4
!pip3 install -q pandas==1.5.3
!pip3 install -q scikit-learn
!pip3 install joblib

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit, train_test_split
from sklearn.metrics import f1_score, roc_auc_score
import joblib 

import time
import os

---

Чтение данных

In [None]:
# Relative path of the training CSV that is loaded at the end of this cell.
path1 = 'datasets/tree/train_tree.csv'

def read_file(path):
    """Load a comma-separated file into a DataFrame.

    The path is tried exactly as given first. If nothing exists there, the
    same path with its first character removed is tried (e.g. a leading
    ``/`` is dropped so ``'/datasets/x.csv'`` falls back to
    ``'datasets/x.csv'``).

    Parameters
    ----------
    path : str or os.PathLike
        Location of the CSV file.

    Returns
    -------
    pandas.DataFrame
        Contents of the first candidate path that exists.

    Raises
    ------
    FileNotFoundError
        If neither candidate exists. The message names the requested path.
    """
    path = os.fspath(path)  # allow pathlib.Path; slicing below needs a str

    for candidate in (path, path[1:]):
        if os.path.exists(candidate):
            return pd.read_csv(candidate, sep=',')

    # The exception carries the message; a separate print would duplicate it.
    raise FileNotFoundError(f'No such file or directory: {path!r}')

# Load the training table; read_file raises FileNotFoundError if the CSV is missing.
df_train = read_file(path1)

---

## Разбиение тренировочной выборки на тренировочную и валидационную

In [None]:
# Split the loaded table into the feature matrix and the 'Transported' target.
X_tree = df_train.drop(columns='Transported')
y_tree = df_train['Transported']

# Number of labelled rows available before the train/validation split.
print(len(y_tree))

In [None]:
# Hold out 30% of the rows for validation; the fixed seed keeps the split reproducible.
X_train_tree, X_val_tree, y_train_tree, y_val_tree = train_test_split(
    X_tree,
    y_tree,
    test_size=0.3,
    shuffle=True,
    random_state=42,
)

print(y_train_tree.shape, y_val_tree.shape)

In [None]:
# Output directory for the validation split files.
new_directory = 'datasets/for_comparison'

# exist_ok=True keeps this cell safe to re-run when the directory already exists.
os.makedirs(new_directory, exist_ok=True)

In [None]:
# Write the validation features and target to CSV; index=False omits the row index.
X_val_tree.to_csv('datasets/for_comparison/X_val_tree.csv', index=False)
y_val_tree.to_csv('datasets/for_comparison/y_val_tree.csv', index=False)

---

# Вспомогательные функции

In [None]:
def grid_search_cv(model, cv, param_grid, X_train_set, y_train_set, scores, refit=True):
    """Run a timed ``GridSearchCV`` fit and report how long it took.

    Parameters
    ----------
    model : estimator
        Passed to ``GridSearchCV`` as ``estimator``.
    cv : cross-validation splitter or int
        Passed to ``GridSearchCV`` as ``cv``.
    param_grid : dict or list of dict
        Passed to ``GridSearchCV`` as ``param_grid``.
    X_train_set, y_train_set
        Data handed to ``GridSearchCV.fit``.
    scores : str, list or dict
        Passed to ``GridSearchCV`` as ``scoring``.
    refit : bool, str or callable, default True
        Passed to ``GridSearchCV`` as ``refit``.

    Returns
    -------
    GridSearchCV
        The search object after ``fit`` has been called.
    """
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid,
                               scoring=scores, cv=cv, refit=refit)

    # perf_counter is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments during a long search.
    start_time = time.perf_counter()
    grid_search.fit(X_train_set, y_train_set)
    total_time = time.perf_counter() - start_time

    minutes, seconds = divmod(total_time, 60)
    # Adjacent literals instead of a backslash continuation inside the string,
    # which would embed the next line's indentation in the message.
    print(f"Total time spent on grid search (wall-clock): {total_time:.2f} seconds or "
          f"{minutes:.0f} minutes {seconds:.2f} seconds")

    return grid_search

In [None]:
def custom_refit(cv_results):
    """Return the index of the candidate with the best mean F1.

    Candidates are ordered by descending ``mean_test_f1``; ties are broken by
    descending ``mean_test_roc_auc``.

    Parameters
    ----------
    cv_results : mapping
        Must provide array-likes under ``"mean_test_f1"`` and
        ``"mean_test_roc_auc"`` that support unary negation.

    Returns
    -------
    integer
        Position of the top-ranked candidate.
    """
    tie_breaker = -cv_results["mean_test_roc_auc"]
    primary_key = -cv_results["mean_test_f1"]
    # np.lexsort treats the LAST key in the tuple as the most significant one.
    ranking = np.lexsort((tie_breaker, primary_key))
    return ranking[0]

In [None]:
def print_top_low(grid_search, show_columns, sort_by=None, n_rows=10):
    """Display the best and worst rows of a grid search result table.

    Parameters
    ----------
    grid_search : object
        Anything exposing a ``cv_results_`` mapping (e.g. a fitted
        ``GridSearchCV``).
    show_columns : list of str
        Columns of the result table to display.
    sort_by : list of str, optional
        Sort keys; ``'index'`` refers to the original row order of
        ``cv_results_``. Defaults to ``['rank_test_score', 'index']``.
    n_rows : int, default 10
        Number of rows shown from each end of the sorted table.

    Returns
    -------
    pandas.DataFrame
        The full sorted table with a fresh 0..n-1 index.
    """
    # None sentinel instead of a mutable list default.
    if sort_by is None:
        sort_by = ['rank_test_score', 'index']

    df_results = pd.DataFrame(grid_search.cv_results_)
    # Naming the index lets 'index' be used as a sort key (stable tie-break).
    df_results.index.name = 'index'
    df_results = df_results.sort_values(by=sort_by).reset_index(drop=True)

    print('Top ' + str(n_rows))
    display(df_results.head(n_rows)[show_columns])

    print('Low ' + str(n_rows))
    # tail(0) is empty, whereas the slice [-0:] would return every row.
    display(df_results.tail(n_rows)[show_columns])

    return df_results

In [None]:
def custom_threshold_scorer(estimator, X, y, threshold=0.5, scorer=f1_score):
    """Score an estimator after binarising its probabilities at ``threshold``.

    Parameters
    ----------
    estimator : object
        Must implement ``predict_proba(X)`` returning a 2-D array; column 1
        is used as the score of the positive class.
    X, y
        Features and true labels.
    threshold : float, default 0.5
        Rows with a column-1 probability ``>= threshold`` are labelled 1,
        all others 0.
    scorer : callable, default f1_score
        Called as ``scorer(y, predictions)``.

    Returns
    -------
    Whatever ``scorer`` returns.
    """
    positive_proba = estimator.predict_proba(X)[:, 1]
    hard_labels = (positive_proba >= threshold).astype(int)
    return scorer(y, hard_labels)

In [None]:
def findBestThreshold(model, features, target):
    """Scan decision thresholds 0.00, 0.05, ..., 1.00 and keep the best F1.

    Parameters
    ----------
    model : object
        Fitted estimator implementing ``predict_proba``; column 1 of its
        output is used as the positive-class probability.
    features, target
        Data on which the thresholds are evaluated.

    Returns
    -------
    best_threshold : float
        First threshold that reached the highest F1 (0 if F1 never exceeded 0).
    best_result : dict
        ``{'f1': ..., 'roc_auc': ...}`` for that threshold.
    """
    # Neither the probabilities nor ROC AUC depend on the threshold, so they
    # are computed once instead of on every loop iteration.
    positive_proba = model.predict_proba(features)[:, 1]
    roc_auc = roc_auc_score(target, positive_proba)

    best_result = {'f1': 0, 'roc_auc': 0}
    best_threshold = 0

    for threshold in np.arange(0, 1.05, 0.05):
        print(f'threshold: {threshold:.2f}')

        predictions = (positive_proba >= threshold).astype(int)
        f1 = f1_score(target, predictions)
        print(f"f1 score: {f1:.6f}")

        # Strict '>' keeps the lowest threshold among equally good ones.
        if f1 > best_result['f1']:
            best_result['f1'] = f1
            best_result['roc_auc'] = roc_auc
            best_threshold = threshold

    return best_threshold, best_result

In [None]:
def findBestEstimators(model, features, target, threshold=0.5,
                       n_estimators_grid=range(10, 151, 10),
                       eval_features=None, eval_target=None):
    """Refit ``model`` for several ``n_estimators`` values and keep the best F1.

    Parameters
    ----------
    model : estimator
        Template estimator. Its class is re-instantiated with
        ``model.get_params()`` and ``n_estimators`` overridden.
    features, target
        Data every candidate is fitted on.
    threshold : float, default 0.5
        Probability cut-off: column 1 of ``predict_proba`` ``>= threshold``
        is labelled 1.
    n_estimators_grid : iterable of int, default range(10, 151, 10)
        Candidate values for ``n_estimators``.
    eval_features, eval_target : optional
        Data the candidates are scored on. When omitted, candidates are
        scored on ``features``/``target``, i.e. the data they were fitted on.

    Returns
    -------
    best_params : dict
        ``model.get_params()`` with ``n_estimators`` set to the best candidate
        (left at the template's value if no candidate reached F1 > 0).
    best_result : dict
        ``{'f1': ..., 'roc_auc': ...}`` of the best candidate.

    Raises
    ------
    ValueError
        If only one of ``eval_features`` / ``eval_target`` is given.
    """
    if (eval_features is None) != (eval_target is None):
        raise ValueError('eval_features and eval_target must be passed together')
    if eval_features is None:
        eval_features, eval_target = features, target

    best_result = {'f1': 0, 'roc_auc': 0}
    best_params = model.get_params()
    cur_params = best_params.copy()

    for estimators in n_estimators_grid:
        print('estimators:', estimators)
        cur_params['n_estimators'] = estimators

        new_model = model.__class__(**cur_params)
        new_model.fit(features, target)

        # One predict_proba call feeds both metrics.
        positive_proba = new_model.predict_proba(eval_features)[:, 1]
        predictions = (positive_proba >= threshold).astype(int)
        f1 = f1_score(eval_target, predictions)
        roc_auc = roc_auc_score(eval_target, positive_proba)

        print(f"f1 score: {f1:.6f}")

        # Strict '>' keeps the smallest forest among equally good ones.
        if f1 > best_result['f1']:
            best_result['f1'] = f1
            best_result['roc_auc'] = roc_auc
            best_params['n_estimators'] = estimators

    return best_params, best_result

In [None]:
from sklearn.exceptions import NotFittedError
from sklearn.utils.validation import check_is_fitted

def is_fitted(estimator, X):
    """Report whether ``estimator`` has been fitted.

    Uses scikit-learn's ``check_is_fitted`` instead of running a full
    prediction, so the check no longer depends on the size of ``X``.

    Parameters
    ----------
    estimator : estimator
        Object to inspect.
    X : any
        Unused; kept so existing calls ``is_fitted(estimator, X)`` still work.

    Returns
    -------
    bool
        True if fitted; False (after printing the error) otherwise.
    """
    try:
        check_is_fitted(estimator)
        return True
    except NotFittedError as e:
        print(repr(e))
        return False

# Обучение модели

In [None]:
# Hyper-parameter grid explored by the grid search.
param_grid = [
    {
        "max_depth": np.arange(3, 11, 1),
        # 'log_loss' is omitted: scikit-learn treats it as the same criterion
        # as 'entropy', so including both only repeats identical fits.
        "criterion": ['gini', 'entropy'],
        "max_features": [None, 'sqrt', 'log2'],
    }
]

# Base estimator and CV splitter; fixed seeds keep the search reproducible.
rfc = RandomForestClassifier(n_estimators=30, n_jobs=4, random_state=42)
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

# Metrics evaluated for every candidate (list form and equivalent dict form).
scores = ['f1', 'roc_auc']
scoring = {'f1': 'f1',
           'roc_auc': 'roc_auc'
          }

# Columns of cv_results_ shown in the summary tables. Every varied parameter
# is listed so that rows can be told apart.
columns = ['param_criterion', 'param_max_depth', 'param_max_features',
           'mean_test_f1', 'mean_test_roc_auc', 'rank_test_f1', 'rank_test_roc_auc']

In [None]:
# Run the timed grid search on the training split; custom_refit is passed as
# `refit`, so it decides which candidate becomes best_estimator_.
grid_search = grid_search_cv(rfc, sss, param_grid, X_train_tree, y_train_tree, scores, custom_refit)

In [None]:
# Show the best and worst candidates ordered by F1 rank (ties keep grid order).
df_results = print_top_low(grid_search, columns, ['rank_test_f1', 'index'])

In [None]:
# Sanity check that the refit estimator is usable; the boolean is shown as cell output.
is_fitted(grid_search.best_estimator_, X_train_tree)

In [None]:
# Result is a tuple (best_params, best_result), not an estimator.
# NOTE(review): candidates are fitted and scored on the same training data here.
best_estimator = findBestEstimators(grid_search.best_estimator_, X_train_tree, y_train_tree)

In [None]:
# %%pipeline
# Result is a tuple (best_threshold, best_result).
# NOTE(review): thresholds are evaluated on the training split passed here.
best_rfc_threshold = findBestThreshold(grid_search.best_estimator_, X_train_tree, y_train_tree)

In [None]:
# Output directory for serialized models.
new_directory = 'models/'

# exist_ok=True keeps this cell safe to re-run when the directory already exists.
os.makedirs(new_directory, exist_ok=True)

In [None]:
# Build the final forest from the selected parameters (first tuple element)
# and train it on the training split.
rfc_model = RandomForestClassifier(**best_estimator[0])
rfc_model.fit(X_train_tree, y_train_tree)

# Serialize the trained model to disk.
joblib_logr = 'models/random_forest.pkl'
joblib.dump(rfc_model, joblib_logr)