<a href="https://colab.research.google.com/github/JPT35/CS430_Projects/blob/main/FinalProject_Regression_JPT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Linear Regression and SVM Models

# Imports:

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC, SVR, LinearSVC, LinearSVR

from sklearn.pipeline import Pipeline
from sklearn import set_config
set_config(display='diagram')

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

In [4]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc

# Function used to plot the confusion matrix:

In [5]:
def plot_cm(y_test, y_pred):
    cm = confusion_matrix(y_test,y_pred)
    fig = plt.figure(figsize=(10,10))
    heatmap = sns.heatmap(cm, annot=True, fmt='.2f', cmap='RdYlGn')
    plt.ylabel('True label')
    plt.xlabel('Predicted Label')

# Function for plotting the roc_curve:

In [6]:
def plot_roc_curve(fpr,tpr):
  plt.plot(fpr, tpr, linewidth=2)
  plt.plot([0,1],[0,1], 'k--')
  plt.xlim([0.0, 1.0])
  plt.ylim([0.0, 1.0])

# Linear Regression

In [16]:
from sklearn.linear_model import LinearRegression

In [17]:
df = pd.read_csv('/content/gdrive/MyDrive/CS-430/regseason2223clean.csv')

In [18]:
df.head()

Unnamed: 0,Matchup,HomeSpread,Pdmargin,Pdspread,Osmarg,Osspread,Kpmar,Kpsp,Tomarg,blocks,...,fta,orbs,NF,BPI,TR,OSP,Nfp,BPIp,Kpp,marg
0,#6 Gonzaga vs. #15 St Marys,3.5,-1.719617,1.780383,-3.8,-0.3,1,4.5,0.35,-0.55,...,-2.05,0.6,-0.29,-1.7,-5.0,-5.29,0.496,0.441,51.0,-26
1,#127 Notre Dame vs. #56 VA Tech,-6.5,4.992462,-1.507538,19.1,12.6,7,0.5,-0.75,0.1,...,2.25,1.1,7.47,7.7,4.8,30.88,0.783,0.827,75.0,3
2,#158 NC-Wilmgton vs. #71 Col Charlestn,-10.0,11.470956,1.470956,11.3,1.3,8,-2.0,0.65,0.0,...,2.4,1.85,7.76,7.0,7.2,26.73,0.716,0.794,76.0,5
3,#130 GA Tech vs. #154 Florida St,2.0,-6.560043,-4.560043,-2.1,-0.1,-2,0.0,1.75,0.5,...,3.3,-1.85,-2.6,-0.9,0.6,-33.35,0.418,0.448,42.0,-1
4,#164 U Mass vs. #145 Richmond,-2.5,-5.439945,-7.939945,4.0,1.5,1,-1.5,-0.5,-1.0,...,1.45,-1.55,0.72,1.2,-3.5,-9.16,0.552,0.552,53.0,33


In [19]:
statistics = df.describe()

print(statistics)

        HomeSpread     Pdmargin     Pdspread       Osmarg     Osspread  \
count  3508.000000  3508.000000  3508.000000  3508.000000  3508.000000   
mean     -3.512258     9.302367     5.790109     0.355844    -3.156471   
std       7.559723     9.803201     5.949606     9.842773     5.501565   
min     -39.500000   -29.276972   -29.276972   -28.800000   -26.800000   
25%      -8.000000     2.900866     2.125262    -6.400000    -6.400000   
50%      -3.500000     9.116699     5.546606     0.100000    -3.300000   
75%       2.000000    15.408211     9.342969     7.200000     0.300000   
max      21.000000    57.967747    47.272536    39.000000    26.900000   

             Kpmar         Kpsp       Tomarg       blocks        steal  ...  \
count  3508.000000  3508.000000  3508.000000  3508.000000  3508.000000  ...   
mean      3.842075     0.329818    -0.861759     0.775086     0.417588  ...   
std       7.440257     1.779145     1.796320     1.025580     1.324433  ...   
min     -20.00000

# Splitting into test and train: 

In [20]:
X = df.drop(['Matchup', 'Pdmargin'], axis=1)
y = df['Pdmargin']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Creating column transformer:

In [21]:
num_cols = ['HomeSpread', 'Pdspread', 'Osmarg', 'Osspread', 'Kpmar', 'Kpsp', 'Tomarg', 'blocks', 'steal', 'fgp', 'fta', 'orbs', 'NF', 'BPI', 'TR', 'OSP', 'Nfp', 'BPIp', 'Kpp', 'marg']
cat_cols = []

col_transf = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), num_cols),
        ('cat', OneHotEncoder(), cat_cols)
    ])

# Creating pipeline

In [22]:
pipeline = Pipeline([
    ('transform', col_transf),
    ('mlr_model', LinearRegression())
])

In [23]:
set_config(display = 'diagram')
pipeline

# Running fit into pipeline

In [24]:
pipeline.fit(X_train, y_train)

In [25]:
y_pred = pipeline.predict(X_test)

# R2 score:

In [26]:
r2_score(y_pred, y_test)

1.0

# Interpretation: 

The R2 score of 1.o would suggest that this model is perfectly able to classify the data presented in the dataset.  I would interpret this to mean that the data being tested has not been fitted to the model correctly, or that some overfitting has occured in this model. 

# SVM Regression

# Accessing drive: 

In [8]:
df = pd.read_csv('/content/gdrive/MyDrive/CS-430/regseason2223clean.csv')

# Splitting data into Train/test:

In [9]:
X = df.drop(['Matchup', 'Pdmargin'], axis=1)
y = df['Pdmargin']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Building the pipeline:

In [10]:
num_cols = ['HomeSpread', 'Pdspread', 'Osmarg', 'Osspread', 'Kpmar', 'Kpsp', 'Tomarg', 'blocks', 'steal', 'fgp', 'fta', 'orbs', 'NF', 'BPI', 'TR', 'OSP', 'Nfp', 'BPIp', 'Kpp', 'marg']
cat_cols = []

In [11]:
col_transf = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), num_cols),
        ('cat', OneHotEncoder(), cat_cols)
    ])

In [12]:
pipeline = Pipeline(steps=[('col_transf', col_transf),
                           ('reg_model', SVR(kernel='rbf'))])


# Fitting pipeline:

In [13]:
pipeline.fit(X_train, y_train)

# Evaluating model:

In [14]:
y_pred_train = pipeline.predict(X_train)
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))

y_pred_test = pipeline.predict(X_test)
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))

In [15]:
print("RMSE for training set:", rmse_train)
print("RMSE for test set:", rmse_test)

RMSE for training set: 2.0080261997543523
RMSE for test set: 2.118220343637086


# Interpretation of reults:

Using an SVM regression model, the RMSE for the training and test sets are within a range of ~.1 of each other. I interpret this to mean that the model is working as intended. I believe the RMSE is indicative of an average error of around 2.0 for the predicted margin of victory, which would be extremely close for a basketball game. 