In [1]:
#depenencies

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import RFE, SelectKBest, chi2
from sklearn import svm
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegressionCV

In [2]:
#import data
training_data = pd.read_csv('data/training_data.csv')

training_data

Unnamed: 0,dates,results,locations,game_nums,opponent,date_diff,rest,opp_date_diff,opp_rest,rest_diff,abbrv
0,2017-10-18,Win,Away,1,BOS,10,9,10.0,9.0,0.0,MIL
1,2017-10-20,Loss,Home,2,CLE,2,1,3.0,2.0,-1.0,MIL
2,2017-10-21,Win,Home,3,POR,1,0,1.0,0.0,0.0,MIL
3,2017-10-23,Win,Home,4,CHO,2,1,2.0,1.0,0.0,MIL
4,2017-10-26,Loss,Home,5,BOS,3,2,2.0,1.0,1.0,MIL
...,...,...,...,...,...,...,...,...,...,...,...
7375,2016-04-05,Win,Away,78,MIL,2,1,2.0,1.0,0.0,CLE
7376,2016-04-06,Loss,Away,79,IND,1,0,2.0,1.0,-1.0,CLE
7377,2016-04-09,Loss,Away,80,CHI,3,2,2.0,1.0,1.0,CLE
7378,2016-04-11,Win,Home,81,ATL,2,1,2.0,1.0,0.0,CLE


In [3]:
#feature engineering

# creating instance of labelencoder
labelencoder = LabelEncoder()
# Assigning numerical values and storing in another column

training_data['opponent_lab'] = labelencoder.fit_transform(training_data['opponent'])
training_data['locations_lab'] = labelencoder.fit_transform(training_data['locations'])
training_data['abbrv_lab'] = labelencoder.fit_transform(training_data['abbrv'])
training_data['results_lab'] = labelencoder.fit_transform(training_data['results'])

training_data

Unnamed: 0,dates,results,locations,game_nums,opponent,date_diff,rest,opp_date_diff,opp_rest,rest_diff,abbrv,opponent_lab,locations_lab,abbrv_lab,results_lab
0,2017-10-18,Win,Away,1,BOS,10,9,10.0,9.0,0.0,MIL,1,0,16,1
1,2017-10-20,Loss,Home,2,CLE,2,1,3.0,2.0,-1.0,MIL,5,1,16,0
2,2017-10-21,Win,Home,3,POR,1,0,1.0,0.0,0.0,MIL,24,1,16,1
3,2017-10-23,Win,Home,4,CHO,2,1,2.0,1.0,0.0,MIL,4,1,16,1
4,2017-10-26,Loss,Home,5,BOS,3,2,2.0,1.0,1.0,MIL,1,1,16,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7375,2016-04-05,Win,Away,78,MIL,2,1,2.0,1.0,0.0,CLE,16,0,5,1
7376,2016-04-06,Loss,Away,79,IND,1,0,2.0,1.0,-1.0,CLE,11,0,5,0
7377,2016-04-09,Loss,Away,80,CHI,3,2,2.0,1.0,1.0,CLE,3,0,5,0
7378,2016-04-11,Win,Home,81,ATL,2,1,2.0,1.0,0.0,CLE,0,1,5,1


In [4]:
training_data.columns

Index(['dates', 'results', 'locations', 'game_nums', 'opponent', 'date_diff',
       'rest', 'opp_date_diff', 'opp_rest', 'rest_diff', 'abbrv',
       'opponent_lab', 'locations_lab', 'abbrv_lab', 'results_lab'],
      dtype='object')

In [5]:
#separate x and y
data = training_data[['game_nums', 'date_diff',
       'rest', 'opp_date_diff', 'opp_rest', 'rest_diff',
       'opponent_lab', 'locations_lab', 'abbrv_lab', 'results_lab']]

X = data.iloc[:,0:8]
y = data.iloc[:,9].to_list()

In [6]:
#svm estimator and recursive feature selection

svm_estimator = svm.SVC(kernel='linear') 


rfe = RFE(estimator= svm_estimator, 
          verbose=1, 
          n_features_to_select= 4
         )

rfe.fit(X, y) 


Fitting estimator with 8 features.
Fitting estimator with 7 features.
Fitting estimator with 6 features.
Fitting estimator with 5 features.


RFE(estimator=SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None,
                  coef0=0.0, decision_function_shape='ovr', degree=3,
                  gamma='scale', kernel='linear', max_iter=-1,
                  probability=False, random_state=None, shrinking=True,
                  tol=0.001, verbose=False),
    n_features_to_select=4, step=1, verbose=1)

In [7]:
#top 4 features
cols = rfe.get_support(indices=True)
X_new = X.iloc[:,cols]
X_new.columns

Index(['rest', 'opp_rest', 'rest_diff', 'locations_lab'], dtype='object')

In [8]:
#Standardize the input features
scaler = MinMaxScaler(feature_range=(.1,.9) #TODO: provide the min and max values for the features
                     )

X_stand = scaler.fit_transform(X)

In [9]:
#standardized rfe

svm_estimator = svm.SVC(kernel='linear') #initialize the SVM estimator

#TODO: Recursive Feature Elimination
rfe = RFE(estimator= svm_estimator, #TODO: provide the SVM estimator 
          verbose=1, 
          n_features_to_select= 4#TODO: enter the number of required features
         )

rfe.fit(X_stand, y) #Note: this may take a few minutes to complete


Fitting estimator with 8 features.
Fitting estimator with 7 features.
Fitting estimator with 6 features.
Fitting estimator with 5 features.


RFE(estimator=SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None,
                  coef0=0.0, decision_function_shape='ovr', degree=3,
                  gamma='scale', kernel='linear', max_iter=-1,
                  probability=False, random_state=None, shrinking=True,
                  tol=0.001, verbose=False),
    n_features_to_select=4, step=1, verbose=1)

In [10]:
#TODO: display the names of the selected features below
cols = rfe.get_support(indices=True)
X_new = X.iloc[:,cols]
X_new.columns

Index(['game_nums', 'rest', 'opponent_lab', 'locations_lab'], dtype='object')

In [13]:
from sklearn.model_selection import cross_val_score
clf = svm.SVC(kernel='linear', C=1, random_state=42)
scores = cross_val_score(clf, X, y, cv=5)
scores

array([0.56097561, 0.59823848, 0.58197832, 0.58604336, 0.59146341])

In [26]:
#random forest ensemble
#separate x and y
data = training_data[['game_nums', 'date_diff',
       'rest', 'opp_date_diff', 'opp_rest', 'rest_diff',
       'opponent_lab', 'locations_lab', 'abbrv_lab', 'results_lab']]

X = data.iloc[:,0:8].to_numpy()
y = data.iloc[:,9].to_list()

In [36]:
#splitting into training and test sets
seed = 42

#Standardize the input features
scaler = MinMaxScaler(feature_range=(.0,.99) #TODO: provide the min and max values for the features
                     )

X = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X,y,
                                                    test_size = .35, 
                                                    random_state = seed,
                                                    stratify = y
                                                   ) 

In [37]:
#Create the RandomForestClassifier

rf_classifier = RandomForestClassifier(criterion= 'gini', 
                                    max_depth= 15,     
                                    max_features= "auto",  
                                    min_samples_split= 10, 
                                    n_estimators= 100, 
                                    random_state= 42 
                                    )

#Train classifer using the `fit` method
rf_classifier.fit(X_train,y_train)

#Make predictions with the test set
y_pred = rf_classifier.predict(X_test)

#evaluate the accuracy
round(accuracy_score(y_test,y_pred),4)

0.585

In [38]:
#log res

clf = LogisticRegressionCV(cv=5, random_state=42).fit(X_train, y_train)
y_pred = clf.predict(X_test)

# clf.score(X, y)
round(accuracy_score(y_test,y_pred),4)

0.583

In [39]:
from sklearn.model_selection import cross_val_score
svc = svm.SVC(kernel='linear', C=1, random_state=42)
scores = cross_val_score(svc, X_train, y_train, cv=5)
scores

array([0.57291667, 0.584375  , 0.58498436, 0.58602711, 0.59228363])

In [35]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
clf = make_pipeline(SVC(gamma='auto'))
clf.fit(X_train, y_train)
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svc', SVC(gamma='auto'))])

NameError: name 'Pipeline' is not defined