In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report


# prepering modal data

In [2]:
df = pd.read_csv('results_data//model_data.csv')
drop = ['game_id', 'season', 'date', 'arena', 'H_team', 'H_team_id', 'A_team',
       'A_team_id', 'H_score', 'A_score', 'ref1', 'ref2', 'ref3', 'ties',
       'leadChange', 'nugget','H_game_id','A_game_id']

clean_df = df.drop(drop,axis=1)
clean_df.dropna(inplace=True)

In [10]:
rand_st=8

# Prepering the data
X = clean_df.drop(columns = 'result')
y = clean_df['result']

# Split to train set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=rand_st)

# Standard Scaling
sc_x = StandardScaler()
X_train = sc_x.fit_transform(X_train)
X_test = sc_x.transform(X_test)

# comparing models

In [11]:
def run_exps(X_train, y_train, X_test, y_test):
    
    dfs = [] 
    models = [
          ('LogReg', LogisticRegression()), 
          ('RF', RandomForestClassifier()),
          ('KNN', KNeighborsClassifier()),
          ('SVM', SVC()), 
          ('GNB', GaussianNB()),
          ('XGB', XGBClassifier())
        ]
    
    for name, model in models:
        
        kfold = KFold(n_splits=10, random_state=rand_st, shuffle=True)
        cv_results = cross_validate(model, X_train, y_train, cv=kfold, scoring=['accuracy', 'roc_auc'])

        this_df = pd.DataFrame(cv_results)
        this_df['model'] = name
        dfs.append(this_df)
        
    final = pd.concat(dfs, ignore_index=True)
    
    return final

final = run_exps(X_train, y_train, X_test, y_test)
final_results = final.iloc[:,2:].groupby('model').mean()
final_results.sort_values(by='test_accuracy', ascending=False)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Unnamed: 0_level_0,test_accuracy,test_roc_auc
model,Unnamed: 1_level_1,Unnamed: 2_level_1
LogReg,0.637956,0.676677
GNB,0.620155,0.649148
SVM,0.618286,0.654841
RF,0.607071,0.636142
XGB,0.57243,0.597698
KNN,0.559372,0.59211


# diving into logistic regression

In [12]:
def classification_model (name, model): 
    
    kfold = KFold(n_splits=10, random_state=rand_st, shuffle=True)
    cv_results = cross_validate(model, X_train, y_train, cv=kfold, scoring=['accuracy', 'roc_auc'])
    clf = model.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    
    

    print(name)
    print(classification_report(y_test, y_pred, target_names=['loss', 'win']))
    print('Cross Validation Results:')
    print(pd.DataFrame(cv_results).iloc[:,2:].mean())
    return clf

log_reg = classification_model('LogReg', LogisticRegression())


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogReg
              precision    recall  f1-score   support

        loss       0.59      0.48      0.53       159
         win       0.64      0.73      0.68       198

    accuracy                           0.62       357
   macro avg       0.61      0.61      0.60       357
weighted avg       0.62      0.62      0.61       357

Cross Validation Results:
test_accuracy    0.637956
test_roc_auc     0.676677
dtype: float64


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [13]:
coefficients = pd.DataFrame.from_dict({'feature':X.columns,'coef':log_reg.coef_[0].tolist()})
coefficients.sort_values(by='coef', key=abs, ascending = False)[0:20]

Unnamed: 0,feature,coef
44,A_mean_tpm,-0.803688
45,A_mean_tpa,0.682107
1,H_Team_Elo_Before,0.628
46,A_mean_tpp,0.573831
4,H_mean_pointsInPaint,-0.519807
12,H_mean_fgp,0.458633
42,A_mean_fta,-0.39052
33,A_mean_biggestLead,-0.389811
39,A_mean_fga,-0.336951
5,H_mean_biggestLead,0.335824
