In [3]:
import sys
sys.path.append('./modules/')
import warnings
warnings.simplefilter("ignore")

import numpy as np
import pandas as pd
from logistic_regression import LogisticRegression

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, accuracy_score, roc_auc_score

In [4]:
def cross_val_score(model, X,y):
    acc_scores = []
    prec_scores = []
    roc_scores = []
    
    kf = KFold(n_splits=5)
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model.fit(X_train,y_train)
        y_pred = model.predict_classes(X_test)
        acc_score = accuracy_score(y_test,y_pred)
        prec_score = precision_score(y_test,y_pred)
        roc_score = roc_auc_score(y_test,y_pred)
        acc_scores.append(acc_score)
        prec_scores.append(prec_score)
        roc_scores.append(roc_score)

    score = np.mean(acc_scores), np.mean(prec_scores), np.mean(roc_scores)
    return score
    


In [5]:
data = pd.read_csv('./data/hotel_data_review.csv')
X = data.drop(['is_canceled'],axis=1).values
y = data['is_canceled'].values
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state= 42)

# Implemented Logistic Regression

In [6]:
def check_retrain(model,X_train,X_test,y_train,y_test):
    model.fit(X_train,y_train)
    y_pred_test = model.predict_classes(X_test)
    y_pred_train = model.predict_classes(X_train)
    acc = accuracy_score(y_pred_train,y_train), accuracy_score(y_pred_test,y_test)
    prec = precision_score(y_pred_train,y_train), precision_score(y_pred_test,y_test)
    roc= roc_auc_score(y_pred_train,y_train),roc_auc_score(y_pred_test,y_test)
    print('Ошибки на Train --- Test выборках')
    print(f'accuracy = {acc[0]} --- {acc[1]}')
    print(f'precision = {prec[0]} --- {prec[1]}' )
    print(f'roc auc = {roc[0]} --- {roc[1]}')
    
def print_error_validation(accuracy,precission,roc_auc):
    print('Ошибки на валидации')
    print(f'accuracy = {accuracy}')
    print(f'precision = {precision}')
    print(f'roc auc = {roc_auc}')
def search_parametrs_C(penalty,C_list):
    best_C = None
    best_roc = -10
    
    for C in C_list:
        lr = LogisticRegression(penalty=penalty, C=C)
        accuracy,precision,roc_auc = cross_val_score(lr,X,y)
        if roc_auc > best_roc:
            best_C = C
    return best_C

## Without regularization

In [9]:
lr = LogisticRegression()

In [11]:
accuracy,precision,roc_auc = cross_val_score(lr,X,y)
print_error_validation(accuracy,precision,roc_auc)

Ошибки на валидации
accuracy = 0.6338100909322868
precision = 0.584378671285857
roc auc = 0.5692628608657975


In [10]:
check_retrain(lr,X_train,X_test, y_train, y_test)

Ошибки на Train --- Test выборках
accuracy = 0.7244918809120302 --- 0.7234802048027692
precision = 0.3806966299192814 --- 0.3814297760704582
roc auc = 0.7502104845361435 --- 0.7500780201265398


## L1 regularization

### Настройка параметров

In [15]:
C = search_parametrs_C(penalty='l1', C_list=np.arange(0.05,1.05,0.05))

In [16]:
print(f'C = {C}')

C = 1.0


### Обучение

In [12]:
lr_l1 = LogisticRegression(penalty='l1',C=С)

In [9]:
accuracy,precision,roc_auc = cross_val_score(lr_l1,X,y)
print_error_validation(accuracy,precision,roc_auc)

Ошибки на валидации
accuracy = 0.5780922833004926
precision = 0.5042834802615656
roc auc = 0.6039025344644553


In [13]:
check_retrain(lr_l1,X_train,X_test, y_train, y_test)


Ошибки на Train --- Test выборках
accuracy = 0.6726402961574057 --- 0.6690704550371386
precision = 0.21913376676403154 --- 0.2154970486273775
roc auc = 0.7047294546034382 --- 0.7003878294633096


## L2 regularization

### Настройка параметров

In [17]:
C = search_parametrs_C(penalty='l2', C_list=np.arange(0.05,1.05,0.05))

In [18]:
print(f'C = {C}')

C = 1.0


### Обучение

In [16]:
lr_l2 = LogisticRegression(penalty='l2',C=С)


In [12]:
accuracy,precision,roc_auc = cross_val_score(lr_l1,X,y)
print_error_validation(accuracy,precision,roc_auc)

Ошибки на валидации
accuracy = 0.6162504865352665
precision = 0.5184112372925739
roc auc = 0.6100648645608078


In [17]:
check_retrain(lr_l2,X_train,X_test, y_train, y_test)

Ошибки на Train --- Test выборках
accuracy = 0.6284931309270544 --- 0.6296964015288095
precision = 0.8235811426238261 --- 0.819450950997845
roc auc = 0.6658751793238991 --- 0.6653836279059693


# Sklearn Logistic Regression

In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
scoring = ['precision_macro', 'roc_auc','accuracy']
scoring_test = ['test_accuracy','test_precision_macro','test_roc_auc']

In [26]:
def check_retrain(model,X_train,X_test,y_train,y_test):
    model.fit(X_train,y_train)
    y_pred_test = model.predict(X_test)
    y_pred_train = model.predict(X_train)
    acc = accuracy_score(y_pred_train,y_train), accuracy_score(y_pred_test,y_test)
    prec = precision_score(y_pred_train,y_train), precision_score(y_pred_test,y_test)
    roc= roc_auc_score(y_pred_train,y_train),roc_auc_score(y_pred_test,y_test)
    print('Ошибки на Train --- Test выборках')
    print(f'accuracy = {acc[0]} --- {acc[1]}')
    print(f'precision = {prec[0]} --- {prec[1]}' )
    print(f'roc auc = {roc[0]} --- {roc[1]}')
    

## Without regularization

In [27]:
clf = LogisticRegression(penalty='none',random_state=42)
scores = cross_validate(clf,X,y,cv=5,scoring=scoring)

In [28]:
print('Ошибки на валидации')
for score in scoring_test:
    print(f'{score} = {np.mean(scores[score])}')

Ошибки на валидации
test_accuracy = 0.6762188692443609
test_precision_macro = 0.6627387972147134
test_roc_auc = 0.7317061087095794


In [29]:
check_retrain(clf,X_train,X_test, y_train, y_test)

Ошибки на Train --- Test выборках
accuracy = 0.7281818291085229 --- 0.7276988533929473
precision = 0.4978799585414115 --- 0.49554951747399983
roc auc = 0.7209326461509085 --- 0.7218719739255142


## L1 regularization

### Настройка параметров

In [30]:
clf_l1 = LogisticRegression(penalty='l1',random_state=42,solver='saga')
parameters = {'C':np.arange(0.05,1,0.05)}
clf_cv = GridSearchCV(clf_l1, parameters)


In [31]:
clf_cv.fit(X,y)

GridSearchCV(cv=None, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l1',
                                          random_state=42, solver='saga',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': array([0.05, 0.1 , 0.15, 0.2 , 0.25, 0.3 , 0.35, 0.4 , 0.45, 0.5 , 0.55,
       0.6 , 0.65, 0.7 , 0.75, 0.8 , 0.85, 0.9 , 0.95])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [32]:
clf_cv.best_estimator_

LogisticRegression(C=0.5, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l1',
                   random_state=42, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False)

In [33]:
C = 0.5

### Обучение

In [34]:
clf_l1 = LogisticRegression(penalty='l1',C = C, random_state=42,solver='saga')
scores = cross_validate(clf,X,y,cv=5,scoring=scoring)

In [35]:
print('Ошибки на валидации')
for score in scoring_test:
    print(f'{score} = {np.mean(scores[score])}')

Ошибки на валидации
test_accuracy = 0.6762188692443609
test_precision_macro = 0.6627387972147134
test_roc_auc = 0.7317061087095794


In [36]:
check_retrain(clf_l1,X_train,X_test, y_train, y_test)

Ошибки на Train --- Test выборках
accuracy = 0.7279294222286325 --- 0.7291411264152304
precision = 0.5057947799868087 --- 0.5047315656329054
roc auc = 0.719308471615182 --- 0.7224474054654223


In [37]:
С = 0.5

## L2 regularization

### Настройка параметров

In [38]:
clf_l2 = LogisticRegression(penalty='l2',random_state=42)
parameters = {'C':np.arange(0.05,1,0.05)}
clf_cv = GridSearchCV(clf_l2, parameters)


In [39]:
clf_cv.fit(X,y)

GridSearchCV(cv=None, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=42, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': array([0.05, 0.1 , 0.15, 0.2 , 0.25, 0.3 , 0.35, 0.4 , 0.45, 0.5 , 0.55,
       0.6 , 0.65, 0.7 , 0.75, 0.8 , 0.85, 0.9 , 0.95])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [40]:
clf_cv.best_estimator_

LogisticRegression(C=0.5, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

### Обучение

In [41]:
clf_l2 = LogisticRegression(penalty='l2',C = С,random_state=42)
scores = cross_validate(clf,X,y,cv=5,scoring=scoring)

In [42]:
print('Ошибки на валидации')
for score in scoring_test:
    print(f'{score} = {np.mean(scores[score])}')

Ошибки на валидации
test_accuracy = 0.6762188692443609
test_precision_macro = 0.6627387972147134
test_roc_auc = 0.7317061087095794


In [43]:
check_retrain(clf_l2,X_train,X_test, y_train, y_test)

Ошибки на Train --- Test выборках
accuracy = 0.7316794673012896 --- 0.7316651042042258
precision = 0.5029052419988065 --- 0.5009837908741684
roc auc = 0.7252877933793803 --- 0.7268640168250026
