In [1]:
!pip3 install -q numpy==1.22.4
!pip3 install -q pandas==1.5.3
!pip3 install -q scikit-learn
!pip3 install joblib

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit, train_test_split
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
import joblib 

import time
import os

---

Чтение данных

In [3]:
# Relative locations of the training and test tables.
path1, path2 = (
    'datasets/regression/train_log.csv',
    'datasets/regression/test_log.csv',
)

def read_file(path):
    """Load a comma-separated file into a DataFrame.

    The path is tried as given first. If nothing exists there, the same path
    without its first character is tried (so '/datasets/x.csv' falls back to
    the relative 'datasets/x.csv').

    Parameters
    ----------
    path : str
        Location of the CSV file.

    Returns
    -------
    pandas.DataFrame
        Contents of the first candidate path that exists.

    Raises
    ------
    FileNotFoundError
        If neither candidate exists. The message names the requested path.
    """
    for candidate in (path, path[1:]):
        if os.path.exists(candidate):
            return pd.read_csv(candidate, sep=',')

    # The exception already carries the message, so no separate print is needed.
    raise FileNotFoundError(f'No such file or directory: {path!r}')

# Read both tables through the same loader; the order is (train, test).
df_train, df_test = map(read_file, (path1, path2))

---

## Разбиение тренировочной выборки на тренировочную и валидационную

In [4]:
# Separate the target column from the feature matrix.
# `columns=` already fixes the axis, so no extra `axis` argument is required.
y_log = df_train['Transported']
X_log = df_train.drop(columns=['Transported'])

# Number of labelled rows available before the train/validation split.
print(y_log.shape[0])

8673


In [5]:
# Hold out 30% of the rows for validation; the fixed seed keeps the split reproducible.
X_train_log, X_val_log, y_train_log, y_val_log = train_test_split(
    X_log,
    y_log,
    test_size=0.3,
    shuffle=True,
    random_state=42,
)

print(y_train_log.shape, y_val_log.shape)

(6071,) (2602,)


---

In [6]:
# Show the first five rows of the test table to check its columns before scaling.
df_test.head(n=5)

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,has_services,HomePlanet_Europa,HomePlanet_Mars,Cabin_S,Destination_PSO J318.5-22,Destination_TRAPPIST-1e
0,1,27.0,0,0.0,0.0,0.0,0.0,0.0,0,False,False,True,False,True
1,0,19.0,0,0.0,9.0,0.0,2823.0,0.0,1,False,False,True,False,True
2,1,31.0,0,0.0,0.0,0.0,0.0,0.0,0,True,False,True,False,False
3,0,38.0,0,0.0,6652.0,0.0,181.0,585.0,1,True,False,True,False,True
4,0,20.0,0,10.0,0.0,635.0,0.0,0.0,1,False,False,True,False,True


Скалирование

In [7]:
# Fit the scaler on the training part only, then apply the same transformation
# to every split so that no statistics leak from validation/test data.
scaler = StandardScaler().fit(X_train_log)

# NOTE: X_train_log and X_val_log are overwritten here, so re-running this cell
# on its own would scale data that has already been scaled.
X_train_log, X_val_log, X_test_log = (
    scaler.transform(split) for split in (X_train_log, X_val_log, df_test)
)

Сразу сохраняем масштабированные тестовую и валидационную выборки (а также целевую переменную валидационной выборки) для следующего шага — выбора лучшей модели.

In [8]:
# Folder that collects the artefacts handed over to the next (model comparison) step.
new_directory = 'datasets/for_comparison'

# exist_ok=True keeps the cell re-runnable when the folder already exists.
os.makedirs(new_directory, exist_ok=True)

In [9]:
# Persist the scaled test and validation features (NumPy binary format)
# and the validation labels (CSV without the index column).
np.save('datasets/for_comparison/X_test_log.npy', X_test_log)
np.save('datasets/for_comparison/X_val_log.npy', X_val_log)
y_val_log.to_csv('datasets/for_comparison/y_val_log.csv', index=False)

# The raw test frame is not referenced again in the visible cells; release it.
# NOTE: after this line the scaling cell cannot be re-run without reloading the data.
del df_test

---

# Вспомогательные функции

In [10]:
def grid_search_cv(model, cv, param_grid, X_train_set, y_train_set, scores, refit=True):
    """Run an exhaustive grid search and report how long the fit took.

    Parameters
    ----------
    model : estimator
        Estimator whose hyper-parameters are searched.
    cv : cross-validation splitter or int
        Passed to ``GridSearchCV`` as ``cv``.
    param_grid : dict or list of dict
        Passed to ``GridSearchCV`` as ``param_grid``.
    X_train_set, y_train_set : array-like
        Data the search is fitted on.
    scores : str, list or dict
        Passed to ``GridSearchCV`` as ``scoring``.
    refit : bool, str or callable, default True
        Passed to ``GridSearchCV`` as ``refit``.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scores, cv=cv, refit=refit)

    # perf_counter is monotonic, so the interval is not skewed by system clock adjustments.
    start_time = time.perf_counter()
    grid_search.fit(X_train_set, y_train_set)
    total_time = time.perf_counter() - start_time

    # Adjacent f-strings are used instead of a backslash continuation inside the
    # literal, which used to embed the source indentation into the message.
    minutes, seconds = divmod(total_time, 60)
    print(
        f"Total time spent on grid search (wall-clock): {total_time:.2f} seconds or "
        f"{minutes:.0f} minutes {seconds:.2f} seconds"
    )

    return grid_search

In [11]:
def custom_refit(cv_results):
    """Choose the winning candidate of a multi-metric grid search.

    Candidates are ordered by mean test F1 (highest first); ties are broken by
    mean test ROC AUC (highest first).

    Parameters
    ----------
    cv_results : mapping
        Must provide the arrays "mean_test_f1" and "mean_test_roc_auc".

    Returns
    -------
    integer
        Position of the best candidate within the result arrays.
    """
    # np.lexsort treats the LAST key as the primary one, hence F1 is passed last.
    # Negation turns the ascending sort into a descending one.
    auc_descending = -cv_results["mean_test_roc_auc"]
    f1_descending = -cv_results["mean_test_f1"]
    ranking = np.lexsort((auc_descending, f1_descending))
    return ranking[0]

In [12]:
def print_top_low(grid_search, show_columns, sort_by = ['rank_test_score', 'index'], n_rows = 10):
    """Display the best and worst candidates of a fitted grid search.

    Parameters
    ----------
    grid_search : object with a ``cv_results_`` attribute
        Fitted search whose results are tabulated.
    show_columns : list of str
        Columns of the result table to display.
    sort_by : list of str
        Sort keys. 'index' refers to the original candidate order and can be
        used as a tie-breaker.
    n_rows : int, default 10
        Number of rows shown from each end of the sorted table.

    Returns
    -------
    pandas.DataFrame
        The full result table, sorted by ``sort_by`` with a fresh 0..n-1 index.

    Notes
    -----
    ``display`` is not imported in this notebook; it is expected to be
    provided by the IPython/Jupyter session.
    """
    df_results = pd.DataFrame(grid_search.cv_results_)
    # Naming the index lets sort_values use it as a key under the label 'index'.
    df_results.index.name = 'index'
    df_results = df_results.sort_values(by=sort_by).reset_index(drop=True)

    print('Top ' + str(n_rows))
    display(df_results.head(n_rows)[show_columns])

    print('Low ' + str(n_rows))
    # tail() returns an empty frame for n_rows == 0, whereas the slice [-0:]
    # would select every row.
    display(df_results.tail(n_rows)[show_columns])

    return df_results

In [13]:
def custom_threshold_scorer(estimator, X, y, threshold=0.5, scorer=f1_score):
    """Score an estimator after applying a custom decision threshold.

    Parameters
    ----------
    estimator : fitted classifier exposing ``predict_proba``
    X : array-like
        Features to score on.
    y : array-like
        True labels.
    threshold : float, default 0.5
        A sample is labelled 1 when the probability in column 1 of
        ``predict_proba`` is greater than or equal to this value.
    scorer : callable, default f1_score
        Called as ``scorer(y, predictions)``.

    Returns
    -------
    The value returned by ``scorer``.
    """
    # Column 1 of predict_proba is taken as the positive-class probability.
    positive_proba = estimator.predict_proba(X)[:, 1]
    hard_labels = (positive_proba >= threshold).astype(int)
    return scorer(y, hard_labels)

In [14]:
def findBestThreshold(model, features, target):
    """Scan decision thresholds 0.00 .. 1.00 (step 0.05) and keep the best F1.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba``
    features : array-like
        Samples to evaluate on.
    target : array-like
        True labels for ``features``.

    Returns
    -------
    best_threshold : float
        First threshold that reaches the highest F1 (0 if F1 never exceeds 0).
    best_result : dict
        ``{'f1': ..., 'roc_auc': ...}`` for that threshold.

    Notes
    -----
    The threshold and the F1 score of every step are printed.
    """
    # Neither the probabilities nor ROC AUC depend on the threshold, so both
    # are computed once instead of on every iteration.
    positive_proba = model.predict_proba(features)[:, 1]
    roc_auc = roc_auc_score(target, positive_proba)

    best_result = {'f1': 0, 'roc_auc': 0}
    best_threshold = 0

    # 21 evenly spaced cut-offs; rounding removes float noise so that the
    # returned threshold is exactly 0.15 rather than 0.15000000000000002.
    for threshold in np.linspace(0.0, 1.0, 21).round(2):
        print(f'threshold: {threshold:.2f}')

        predictions = (positive_proba >= threshold).astype(int)
        f1 = f1_score(target, predictions)
        print(f"f1 score: {f1:.6f}")

        # Strict comparison: on ties the lowest threshold wins.
        if f1 > best_result['f1']:
            best_result['f1'] = f1
            best_result['roc_auc'] = roc_auc
            best_threshold = threshold

    return best_threshold, best_result

In [15]:
from sklearn.exceptions import NotFittedError

def is_fitted(estimator, X):
    """Tell whether ``estimator`` can already predict on ``X``.

    The check simply attempts a prediction.

    Parameters
    ----------
    estimator : object exposing ``predict``
    X : array-like
        Samples handed to ``estimator.predict``.

    Returns
    -------
    bool
        True if the prediction succeeds; False if it raises NotFittedError
        (the error is printed). Any other exception propagates.
    """
    try:
        estimator.predict(X)
    except NotFittedError as not_fitted:
        print(repr(not_fitted))
        return False
    else:
        return True

# Обучение модели

In [16]:
# Search space: L2-regularised logistic regression over 20 log-spaced values
# of C (1e-5 .. 1e5) combined with three iteration budgets.
param_grid = [
    {
        "n_jobs": [4],
        "penalty": ['l2'],
        "C": np.logspace(-5, 5, 20),
        "max_iter": [100, 1000, 10000]
    }
]

# Five stratified random 80/20 resamples, seeded for reproducibility.
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

# Metrics evaluated for every candidate.
scores = ['f1', 'roc_auc']

# cv_results_ columns shown in the result tables.
columns = [
    'mean_fit_time', 'param_max_iter', 'param_penalty', 'param_C',
    'mean_test_f1', 'mean_test_roc_auc', 'rank_test_f1', 'rank_test_roc_auc',
]

# `multi_class='ovr'` is intentionally not passed: the parameter is deprecated
# in recent scikit-learn releases (the version is not pinned in this notebook),
# and for the binary target used here the default behaves identically.
log_r = LogisticRegression(random_state=42, solver='lbfgs')

In [17]:
# Multi-metric search; custom_refit decides which candidate is refitted as best_estimator_.
grid_search = grid_search_cv(
    model=log_r,
    cv=sss,
    param_grid=param_grid,
    X_train_set=X_train_log,
    y_train_set=y_train_log,
    scores=scores,
    refit=custom_refit,
)

Total time spent on grid search (wall-clock): 8.02 seconds or             0 minutes 8.02 seconds


In [18]:
# Rank candidates by F1; the original candidate order breaks ties.
df_results = print_top_low(
    grid_search,
    show_columns=columns,
    sort_by=['rank_test_f1', 'index'],
)

Top 10


Unnamed: 0,mean_fit_time,param_max_iter,param_penalty,param_C,mean_test_f1,mean_test_roc_auc,rank_test_f1,rank_test_roc_auc
0,0.013729,100,l2,69.51928,0.798911,0.875774,1,13
1,0.012858,1000,l2,69.51928,0.798911,0.875774,1,13
2,0.013343,10000,l2,69.51928,0.798911,0.875774,1,13
3,0.012087,100,l2,233.572147,0.798911,0.875776,1,1
4,0.012823,1000,l2,233.572147,0.798911,0.875776,1,1
5,0.013085,10000,l2,233.572147,0.798911,0.875776,1,1
6,0.013014,100,l2,784.75997,0.798911,0.875776,1,1
7,0.014362,1000,l2,784.75997,0.798911,0.875776,1,1
8,0.013215,10000,l2,784.75997,0.798911,0.875776,1,1
9,0.01313,100,l2,2636.650899,0.798911,0.875775,1,7


Low 10


Unnamed: 0,mean_fit_time,param_max_iter,param_penalty,param_C,mean_test_f1,mean_test_roc_auc,rank_test_f1,rank_test_roc_auc
50,0.008261,10000,l2,0.001274,0.760851,0.85566,49,46
51,0.008622,100,l2,0.000379,0.750685,0.848637,52,49
52,0.008916,1000,l2,0.000379,0.750685,0.848637,52,49
53,0.008482,10000,l2,0.000379,0.750685,0.848637,52,49
54,0.009051,100,l2,0.000113,0.743187,0.841567,55,52
55,0.008891,1000,l2,0.000113,0.743187,0.841567,55,52
56,0.008782,10000,l2,0.000113,0.743187,0.841567,55,52
57,0.537815,100,l2,1e-05,0.688155,0.834999,58,58
58,0.009418,1000,l2,1e-05,0.688155,0.834999,58,58
59,0.008263,10000,l2,1e-05,0.688155,0.834999,58,58


In [19]:
# Sanity check: the refitted best estimator must be able to predict.
is_fitted(estimator=grid_search.best_estimator_, X=X_train_log)

True

In [20]:
# %%parameters
# The result is a (threshold, scores) tuple, not a bare number.
# NOTE(review): the threshold is tuned on the same training data the best
# estimator was refitted on — confirm this is intended.
best_threshold_log = findBestThreshold(
    model=grid_search.best_estimator_,
    features=X_train_log,
    target=y_train_log,
)

threshold: 0.00
f1 score: 0.675466
threshold: 0.05
f1 score: 0.716067
threshold: 0.10
f1 score: 0.734388
threshold: 0.15
f1 score: 0.758446
threshold: 0.20
f1 score: 0.778515
threshold: 0.25
f1 score: 0.793790
threshold: 0.30
f1 score: 0.802998
threshold: 0.35
f1 score: 0.810372
threshold: 0.40
f1 score: 0.811437
threshold: 0.45
f1 score: 0.810260
threshold: 0.50
f1 score: 0.805433
threshold: 0.55
f1 score: 0.791763
threshold: 0.60
f1 score: 0.776732
threshold: 0.65
f1 score: 0.765372
threshold: 0.70


f1 score: 0.724967
threshold: 0.75


f1 score: 0.717032
threshold: 0.80
f1 score: 0.612601
threshold: 0.85
f1 score: 0.560256
threshold: 0.90
f1 score: 0.437798
threshold: 0.95
f1 score: 0.336464
threshold: 1.00


f1 score: 0.000000


In [21]:
# Output folder for serialised models.
new_directory = 'models/'

# exist_ok=True makes the cell safe to re-run.
os.makedirs(new_directory, exist_ok=True)

In [22]:
# Serialise the refitted best estimator. joblib.dump returns the list of
# written file names, which the cell displays as its output.
# NOTE(review): the fitted StandardScaler is not saved in the visible cells —
# confirm that downstream steps only consume pre-scaled inputs.
joblib_logr = 'models/logistic_regression.pkl'
joblib.dump(grid_search.best_estimator_, joblib_logr)

['models/logistic_regression.pkl']