In [46]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

from sklearn.metrics import f1_score, accuracy_score, roc_auc_score

In [47]:
df = pd.read_csv("../Data/volley_for_nba_prepro.csv")

In [48]:
df = df.drop(["away_team","home_team"], axis=1)

In [49]:
df.head()

Unnamed: 0,home_Pct,away_Pct,home_form,away_form,Roadwon,Roadlost,Homewon,Homelost,result
0,100.0,0.0,0.4,-0.2,1.0,0.0,1.0,0.0,0
1,100.0,0.0,0.2,-0.4,1.0,0.0,1.0,0.0,1
2,100.0,50.0,0.2,-0.28,0.0,2.0,1.0,0.0,1
3,0.0,0.0,-0.2,-0.68,1.0,1.0,0.0,1.0,0
4,0.0,66.666667,-1.064,0.104,0.0,1.0,0.0,2.0,1


In [50]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1235 entries, 0 to 1234
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   home_Pct   1235 non-null   float64
 1   away_Pct   1235 non-null   float64
 2   home_form  1235 non-null   float64
 3   away_form  1235 non-null   float64
 4   Roadwon    1235 non-null   float64
 5   Roadlost   1235 non-null   float64
 6   Homewon    1235 non-null   float64
 7   Homelost   1235 non-null   float64
 8   result     1235 non-null   int64  
dtypes: float64(8), int64(1)
memory usage: 87.0 KB


In [51]:
y = df["result"]
X = df.drop(["result"], axis=1)

X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state = 121)

In [52]:
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)
scaled_X_train = scaler.transform(X_train)
scaled_X_test = scaler.transform(X_test)


scaled_X_train = pd.DataFrame(data=scaled_X_train, columns=X_train.columns)
scaled_X_test = pd.DataFrame(data=scaled_X_test, columns=X_test.columns)

# decision trees


In [53]:
# # Define the decision tree classifier

# dt_classifier = DecisionTreeClassifier()

# # Define the grid search parameters
# param_grid = {
#     'criterion': ['gini', 'entropy'],
#     'splitter': ['best', 'random'],
#     'max_depth': [None, 5, 10, 15, 20],
#     'min_samples_split': [2, 5, 10, 15],
#     'min_samples_leaf': [1, 2, 4, 8],
#     'max_features': ['auto', 'sqrt', 'log2', None],
#     'class_weight': [None, 'balanced'],
#     'min_impurity_decrease': [0.0, 0.1, 0.2, 0.3]
# }

# # Perform grid search with cross-validation
# grid_search = GridSearchCV(estimator=dt_classifier, param_grid=param_grid, cv=5)
# grid_search.fit(scaled_X_train, y_train)

# # Get the best parameters and best score
# best_params = grid_search.best_params_
# best_score = grid_search.best_score_

best_params =  {'class_weight': None, 'criterion': 'entropy', 'max_depth': 10, 'max_features': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 2, 'min_samples_split': 5, 'splitter': 'random'}


In [54]:
# Train the model with the best parameters
best_dt_classifier = DecisionTreeClassifier(**best_params)
best_dt_classifier.fit(scaled_X_train, y_train)


y_pred = best_dt_classifier.predict(scaled_X_test)


test_accuracy = accuracy_score(y_pred,y_test)
test_f1 = f1_score(y_pred, y_test)
test_auc = roc_auc_score(y_pred,y_test)

print("Best Parameters:", best_params)
# print("Best Score:", best_score)
print("Test Accuracy:", test_accuracy)
print("Test f1 score:", test_f1)
print("Test roc auc :", test_auc)

Best Parameters: {'class_weight': None, 'criterion': 'entropy', 'max_depth': 10, 'max_features': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 2, 'min_samples_split': 5, 'splitter': 'random'}
Test Accuracy: 0.6437246963562753
Test f1 score: 0.7066666666666667
Test roc auc : 0.6299789621318374


# k nearest neighbors


In [55]:
# Define the K-Nearest Neighbors classifier

# knn_classifier = KNeighborsClassifier()

# # Define the grid search parameters
# param_grid = {
#     'n_neighbors': [3, 5, 7, 9, 11],  # Number of neighbors to consider
#     'weights': ['uniform', 'distance'],  # Weight function used in prediction
#     'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],  # Algorithm used to compute the nearest neighbors
#     'leaf_size': [10, 20, 30, 40],  # Leaf size passed to BallTree or KDTree
#     # 'p': [1, 2, 3],  # Power parameter for the Minkowski metric
#     # 'metric': ['euclidean', 'manhattan', 'chebyshev']  # Distance metric to use for the tree
# }

# # Perform grid search with cross-validation
# grid_search = GridSearchCV(estimator=knn_classifier, param_grid=param_grid, cv=5)
# grid_search.fit(scaled_X_train, y_train)

# # Get the best parameters and best score
# best_params = grid_search.best_params_
# best_score = grid_search.best_score_

best_params = {'algorithm': 'auto', 'leaf_size': 10, 'n_neighbors': 11, 'weights': 'uniform'}

In [56]:
# Train the model with the best parameters
best_knn_classifier = KNeighborsClassifier(**best_params)
best_knn_classifier.fit(scaled_X_train, y_train)



y_pred = best_knn_classifier.predict(scaled_X_test)


test_accuracy = accuracy_score(y_pred,y_test)
test_f1 = f1_score(y_pred, y_test)
test_auc = roc_auc_score(y_pred,y_test)

print("Best Parameters:", best_params)
# print("Best Score:", best_score)
print("Test Accuracy:", test_accuracy)
print("Test f1 score:", test_f1)
print("Test roc auc :", test_auc)

Best Parameters: {'algorithm': 'auto', 'leaf_size': 10, 'n_neighbors': 11, 'weights': 'uniform'}
Test Accuracy: 0.6720647773279352
Test f1 score: 0.7344262295081967
Test roc auc : 0.6603448275862069


# Naive Bayes 


In [57]:
# # Define the Gaussian Naive Bayes classifier

# nb_classifier = GaussianNB()

# # Define the grid search parameters
# param_grid = {
#     'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5],
#     'priors': [None, [0.5, 0.5], [0.3, 0.7], [0.7, 0.3]]  # Prior probabilities for each class
# }

# # Perform grid search with cross-validation
# grid_search = GridSearchCV(estimator=nb_classifier, param_grid=param_grid, cv=5)
# grid_search.fit(scaled_X_train, y_train)

# # Get the best parameters and best score
# best_params = grid_search.best_params_
# best_score = grid_search.best_score_

best_params = {'priors': None, 'var_smoothing': 1e-09}

In [58]:
# Train the model with the best parameters
best_nb_classifier = GaussianNB(**best_params)
best_nb_classifier.fit(scaled_X_train, y_train)

y_pred = best_nb_classifier.predict(scaled_X_test)


test_accuracy = accuracy_score(y_pred,y_test)
test_f1 = f1_score(y_pred, y_test)
test_auc = roc_auc_score(y_pred,y_test)

print("Best Parameters:", best_params)
# print("Best Score:", best_score)
print("Test Accuracy:", test_accuracy)
print("Test f1 score:", test_f1)
print("Test roc auc :", test_auc)

Best Parameters: {'priors': None, 'var_smoothing': 1e-09}
Test Accuracy: 0.659919028340081
Test f1 score: 0.7142857142857143
Test roc auc : 0.6482673606355294


# support vector machines

In [59]:
# # Define the Support Vector Machine classifier

# svm_classifier = SVC()

# # Define the grid search parameters
# param_grid = {
#     'C': [0.1, 1, 10, 100],
#     'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
#     # 'degree': [2, 3, 4],  # Only for polynomial kernel
#     # 'gamma': ['scale', 'auto', 0.1, 1, 10]  # 'scale' and 'auto' are default, adding more values for gamma
# }

# # Perform grid search with cross-validation
# grid_search = GridSearchCV(estimator=svm_classifier, param_grid=param_grid, cv=5)
# grid_search.fit(scaled_X_train, y_train)

# # Get the best parameters and best score
# best_params = grid_search.best_params_
# best_score = grid_search.best_score_

best_params = {'C': 10, 'kernel': 'poly'}

In [60]:
# Train the model with the best parameters






best_svm_classifier = SVC(**best_params)
best_svm_classifier.fit(scaled_X_train, y_train)

# Evaluate the model on the test set

y_pred = best_svm_classifier.predict(scaled_X_test)


test_accuracy = accuracy_score(y_pred,y_test)
test_f1 = f1_score(y_pred, y_test)
test_auc = roc_auc_score(y_pred,y_test)

print("Best Parameters:", best_params)
# print("Best Score:", best_score)
print("Test Accuracy:", test_accuracy)
print("Test f1 score:", test_f1)
print("Test roc auc :", test_auc)

Best Parameters: {'C': 10, 'kernel': 'poly'}
Test Accuracy: 0.6558704453441295
Test f1 score: 0.7157190635451506
Test roc auc : 0.6431015221337801
