In [58]:
import pandas as pd
import numpy as np
import sys
sys.path.append( 'C:\Machine Learning\mylib' )
import LassoRegression as LS
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from math import sqrt
from sklearn.metrics import classification_report

In [59]:
df_regress = pd.read_csv('../datasets/trip_duration_task_pred.csv')
df_classif = pd.read_csv('../datasets/csgo_task_pred.csv')

In [60]:
X_regress, y_regress = df_regress.drop(["trip_duration"], axis=1), df_regress["trip_duration"]
X_classif, y_classif = df_classif.drop(["bomb_planted"], axis=1), df_classif["bomb_planted"]

In [72]:
pd.DataFrame(y_regress).count()

trip_duration    729322
dtype: int64

In [62]:
from sklearn.preprocessing import MinMaxScaler
min_max_scaler = MinMaxScaler()
X_regress = pd.DataFrame(min_max_scaler.fit_transform(X_regress))
X_classif = pd.DataFrame(min_max_scaler.fit_transform(X_classif))

In [89]:
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=42)
X_classif, y_classif = ros.fit_resample(X_classif, y_classif)
y_classif.shape

(217074,)

In [64]:
def check_regress(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2,random_state = 42)
    ls = Lasso(alpha=0.9).fit(X_train, y_train)
    y_pred = ls.predict(X_test)
    print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
    print(f'MSE: {mean_squared_error(y_test, y_pred)}')
    print(f'RMSE: {sqrt(mean_squared_error(y_test, y_pred))}')
    print(f'MAPE: {sqrt(mean_absolute_percentage_error(y_test, y_pred))}')
    print(f'R^2: {ls.score(X_test, y_test)}')

In [65]:
def check_classif(X,y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2,random_state = 42)
    lr = LogisticRegression(random_state=42, C=0.5, solver='liblinear').fit(X_train, y_train)
    y_pred = lr.predict(X_test)
    print(classification_report(y_test, y_pred))

In [66]:
check_regress(X_regress, y_regress)

MAE: 627.7963278566577
MSE: 34944969.55392936
RMSE: 5911.427031938173
MAPE: 1.2566166668679035
R^2: 0.0002160738286709485


In [67]:
check_classif(X_classif, y_classif)

              precision    recall  f1-score   support

           0       0.97      0.92      0.94     21714
           1       0.92      0.98      0.95     21701

    accuracy                           0.95     43415
   macro avg       0.95      0.95      0.95     43415
weighted avg       0.95      0.95      0.95     43415



In [76]:
X_regress.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8
count,729322.0,729322.0,729322.0,729322.0,729322.0,729322.0,729322.0,729322.0,729322.0
mean,0.5,0.53677,0.502584,0.499998,0.184673,0.855876,0.351724,0.855877,0.730044
std,0.288676,0.498646,0.283273,0.281794,0.145827,0.001245,0.001955,0.001242,0.00307
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.25,0.0,0.262537,0.261199,0.111111,0.855549,0.350936,0.855558,0.728694
50%,0.5,1.0,0.504007,0.501415,0.111111,0.855729,0.351907,0.855764,0.730277
75%,0.75,1.0,0.74341,0.739602,0.222222,0.855986,0.352734,0.856063,0.731574
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [90]:
X_regress.shape[1]

9

In [81]:
X_classif.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
count,217074.0,217074.0,217074.0,217074.0,217074.0,217074.0,217074.0,217074.0,217074.0,217074.0,...,217074.0,217074.0,217074.0,217074.0,217074.0,217074.0,217074.0,217074.0,217074.0,217074.0
mean,0.498694,0.372835,0.206878,0.203478,0.621826,0.543809,0.509281,0.554733,0.089428,0.117036,...,0.668719,0.61027,0.000972,0.18683,0.208252,0.151911,0.145927,0.10853,0.107456,0.090121
std,0.289015,0.317869,0.150666,0.144222,0.333898,0.27262,0.323309,0.32544,0.118966,0.135802,...,0.311593,0.251556,0.031162,0.389776,0.406059,0.358936,0.353034,0.311049,0.309693,0.286356
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.249704,0.133951,0.09375,0.090909,0.33,0.331667,0.2,0.334,0.010625,0.0175,...,0.4,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.49797,0.211784,0.1875,0.181818,0.6,0.583333,0.4,0.588,0.040625,0.06375,...,0.6,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.747698,0.542545,0.3125,0.30303,1.0,0.833333,0.8,0.8,0.125625,0.1725,...,1.0,0.833333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [91]:
X_classif.shape[1]

23

Отбор через дисперсию

In [78]:
from sklearn.feature_selection import VarianceThreshold
vt = VarianceThreshold(0.2)
X_regress_vt = vt.fit_transform(X_regress)
X_regress_vt.shape

(729322, 1)

In [86]:
check_regress(X_regress_vt, y_regress)

MAE: 628.7979717292304
MSE: 34945424.441685766
RMSE: 5911.465507104458
MAPE: 1.2577842790132197
R^2: 0.00020305938130216195


In [104]:
vt = VarianceThreshold(0.10) #меньше 10 признаков сильно проседает
X_classif_vt = vt.fit_transform(X_classif)
X_classif_vt.shape

(217074, 10)

In [105]:
check_classif(X_classif_vt, y_classif)

              precision    recall  f1-score   support

           0       0.98      0.91      0.94     21714
           1       0.92      0.98      0.95     21701

    accuracy                           0.95     43415
   macro avg       0.95      0.95      0.95     43415
weighted avg       0.95      0.95      0.95     43415



FutureSelection

In [107]:
from sklearn.feature_selection import SelectKBest

In [124]:
skb = SelectKBest(k=6)
X_regress_skb = skb.fit_transform(X_regress, y_regress)

In [125]:
check_regress(X_regress_skb, y_regress)

MAE: 622.0201184558114
MSE: 34952322.68292834
RMSE: 5912.048941181757
MAPE: 1.261458291543036
R^2: 5.6990515834298705e-06


In [120]:
skb = SelectKBest(k=3) #на 2 уже сильно проседает
X_classif_skb = skb.fit_transform(X_classif, y_classif)

In [121]:
check_classif(X_classif_skb, y_classif)

              precision    recall  f1-score   support

           0       0.97      0.91      0.94     21714
           1       0.91      0.98      0.94     21701

    accuracy                           0.94     43415
   macro avg       0.94      0.94      0.94     43415
weighted avg       0.94      0.94      0.94     43415



Рекурсивынй отбор

In [126]:
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor

In [127]:
tree_cls = DecisionTreeClassifier().fit(X_classif, y_classif)
tree_reg = DecisionTreeRegressor().fit(X_regress, y_regress)

In [130]:
rfe = RFE(estimator=tree_reg, n_features_to_select=4, step=1).fit(X_regress, y_regress)
X_regress_rfe = pd.DataFrame(rfe.transform(X_regress), columns=rfe.get_feature_names_out())

In [131]:
check_regress(X_regress_rfe, y_regress)

MAE: 622.0201184558114
MSE: 34952322.68292834
RMSE: 5912.048941181757
MAPE: 1.261458291543036
R^2: 5.6990515834298705e-06


In [136]:
rfe = RFE(estimator=tree_cls, n_features_to_select=1, step=1).fit(X_classif, y_classif)
X_classif_rfe = pd.DataFrame(rfe.transform(X_classif), columns=rfe.get_feature_names_out())

In [137]:
check_classif(X_classif_rfe, y_classif)

              precision    recall  f1-score   support

           0       1.00      0.90      0.95     21714
           1       0.91      1.00      0.95     21701

    accuracy                           0.95     43415
   macro avg       0.95      0.95      0.95     43415
weighted avg       0.95      0.95      0.95     43415

