In [1]:
import sys
sys.path.append('./modules/')
import warnings
warnings.simplefilter("ignore")


In [2]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import cosine
from svm import SVM
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, accuracy_score, roc_auc_score

In [3]:
data = pd.read_csv('./data/hotel_data_review.csv')
X = data.drop(['is_canceled'],axis=1).values
y = data['is_canceled'].values
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state= 42)

# Implemented SVM

In [4]:
def cross_val_score(model, X, y):
    acc_scores = []
    prec_scores = []
    roc_scores = []

    kf = KFold(n_splits=5)
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        acc_score = accuracy_score(y_test, y_pred)
        prec_score = precision_score(y_test, y_pred)
        roc_score = roc_auc_score(y_test, y_pred)
        acc_scores.append(acc_score)
        prec_scores.append(prec_score)
        roc_scores.append(roc_score)

    score = np.mean(acc_scores), np.mean(prec_scores), np.mean(roc_scores)
    return score
def search_parametrs_C(X,y,C_list):
    best_C = None
    best_roc = -10
    
    for C in C_list:
        lr = SVM(C=C)
        accuracy,precision,roc_auc = cross_val_score(lr,X,y)
        if roc_auc > best_roc:
            best_C = C
    return best_C

def check_retrain(model,X_train,X_test,y_train,y_test):
    model.fit(X_train,y_train)
    y_pred_test = model.predict(X_test)
    y_pred_train = model.predict(X_train)
    acc = accuracy_score(y_pred_train,y_train), accuracy_score(y_pred_test,y_test)
    prec = precision_score(y_pred_train,y_train), precision_score(y_pred_test,y_test)
    roc= roc_auc_score(y_pred_train,y_train),roc_auc_score(y_pred_test,y_test)
    print('Ошибки на Train --- Test выборках')
    print(f'accuracy = {acc[0]} --- {acc[1]}')
    print(f'precision = {prec[0]} --- {prec[1]}' )
    print(f'roc auc = {roc[0]} --- {roc[1]}')
    
def print_error_validation(accuracy,precission,roc_auc):
    print('Ошибки на валидации')
    print(f'accuracy = {accuracy}')
    print(f'precision = {precision}')
    print(f'roc auc = {roc_auc}')

### Настройка параметров

In [5]:
C = search_parametrs_C(X,y,C_list=np.arange(1,30,5))

In [6]:
print(C)

26


### Обучение

In [7]:
model = SVM(C = C)


In [8]:
accuracy,precision,roc_auc = cross_val_score(model,X,y)

In [9]:
print_error_validation(accuracy,precision,roc_auc)

Ошибки на валидации
accuracy = 0.6269506210885225
precision = 0.6022078848635231
roc auc = 0.5017249056251604


In [10]:
check_retrain(model,X_train,X_test, y_train, y_test)

Ошибки на Train --- Test выборках
accuracy = 0.6221469008040962 --- 0.6192399221172568
precision = 0.016614843431012282 --- 0.016209125831537524
roc auc = 0.7135424961801137 --- 0.68034248618037


# Sklear SVM

In [11]:
from sklearn import svm
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
scoring = ['precision_macro', 'roc_auc','accuracy']
scoring_test = ['test_accuracy','test_precision_macro','test_roc_auc']

### Настройка параметров

In [12]:
clf = svm.SVC()
parameters = {'C':np.arange(1,30,5)}
clf_gs = GridSearchCV(clf, parameters)

In [13]:
clf_gs.fit(X,y)

GridSearchCV(cv=None, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': array([ 1,  6, 11, 16, 21, 26])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [14]:
clf_gs.best_estimator_

SVC(C=6, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [15]:
C = 6

### Обучение

In [16]:
clf = svm.SVC(C=C)


In [17]:
scores = cross_validate(clf,X,y,cv=5,scoring=scoring)
print('Ошибки на валидации')
for score in scoring_test:
    print(f'{score} = {np.mean(scores[score])}')

Ошибки на валидации
test_accuracy = 0.6274415852849835
test_precision_macro = 0.5931191534298028
test_roc_auc = 0.6449447954475727


In [18]:
check_retrain(clf,X_train,X_test, y_train, y_test)

Ошибки на Train --- Test выборках
accuracy = 0.70935948749384 --- 0.7070743491742987
precision = 0.36204026508370235 --- 0.3554764358662044
roc auc = 0.724563598058007 --- 0.7249103784925696
