<h1 align="center"><font color='green'>NBA Predictions</font></h1>

### <font color='289C4E'>Table of contents<font><a class='anchor' id='top'></a>
- [Processing Data Cleaning](#1)
- [Feature Selection](#2)
- [Modelling](#3)
- [Conclusion](#4)

<h2 align="center"> <font color='grey'>Processing Data Cleaning</font></h2>

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.svm import SVC
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import RandomizedSearchCV as RSCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SequentialFeatureSelector


 # setting output lengths for panda DFs
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_row',50)


In [4]:
df = pd.read_csv('Model.csv').set_index('date').sort_index()

In [7]:
X=df.drop('won_next',axis=1)
y = df['won_next']

In [8]:
y_train, y_test =None,None
X_train, X_test = None,None
tss = TimeSeriesSplit(n_splits = 3)
for train_index, test_index in tss.split(X):
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index,:]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

In [9]:
logreg=LogisticRegression(class_weight='balanced',solver='lbfgs',random_state=42,n_jobs=-1,max_iter=500)
logreg.fit(X_train,y_train)
y_pred = logreg.predict(X_train)
train_acc = accuracy_score(y_train,y_pred)
print("Train accuracy :",train_acc)
y_pred = logreg.predict(X_test)
test_acc = accuracy_score(y_test,y_pred)
print("Testing accuracy :",test_acc)

Train accuracy : 0.5770220292651552
Testing accuracy : 0.5656054027978775


In [10]:
logreg=RidgeClassifier(random_state=42,alpha=1)
logreg.fit(X_train,y_train)
y_pred = logreg.predict(X_train)
train_acc = accuracy_score(y_train,y_pred)
print("Train accuracy :",train_acc)
y_pred = logreg.predict(X_test)
test_acc = accuracy_score(y_test,y_pred)
print("Testing accuracy :",test_acc)

Train accuracy : 0.6002572760894035
Testing accuracy : 0.5824891461649783


In [11]:
pca = PCA(.95)
prin = pca.fit(X_train)
train_pca = pca.transform(X_train)
test_pca = pca.transform(X_test)
logreg.fit(train_pca,y_train)
y_pred = logreg.predict(train_pca)
train_acc = accuracy_score(y_train,y_pred)
print("Train accuracy :",train_acc)
y_pred = logreg.predict(test_pca)
test_acc = accuracy_score(y_test,y_pred)
print("Testing accuracy :",test_acc)

Train accuracy : 0.578951599935681
Testing accuracy : 0.5653642064640617


In [None]:
pca.n_components_

8

In [17]:
rf = RandomForestClassifier(max_depth=3,n_estimators=200,random_state=42)
rf.fit(X_train,y_train)
y_pred = rf.predict(X_train)
train_acc = accuracy_score(y_train,y_pred)
print("Train accuracy :",train_acc)
y_pred = rf.predict(X_test)
test_acc = accuracy_score(y_test,y_pred)
print("Testing accuracy :",test_acc)

Train accuracy : 0.5784692072680495
Testing accuracy : 0.5670525808007718


In [14]:
param_grid = {'n_estimators':np.arange(50,200,15),
              'max_features':np.arange(0.1, 1, 0.1),
              'max_depth': [3, 5, 7, 9],
              'max_samples': [0.3, 0.5, 0.8]}

model = RSCV(RandomForestClassifier(), param_grid, n_iter = 15).fit(X_train, y_train)
model = model.best_estimator_

In [16]:
model

RandomForestClassifier(max_depth=7, max_features=0.8, max_samples=0.5,
                       n_estimators=50)

In [15]:
model.fit(X_train,y_train)
y_pred = model.predict(X_train)
train_acc = accuracy_score(y_train,y_pred)
print("Train accuracy :",train_acc)
y_pred = model.predict(X_test)
test_acc = accuracy_score(y_test,y_pred)
print("Testing accuracy :",test_acc)

Train accuracy : 0.6777616980221901
Testing accuracy : 0.5670525808007718
