In [None]:
import pandas_gbq
from pandas.io import gbq
from google.cloud import bigquery
from google.oauth2 import service_account
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [None]:
# BigQuery project settings.
gbq_proj_id = "capstone-ncaa-bball"
gbq_dataset_id = ""

# Service-account key location.
# NOTE(review): this is an absolute, machine-specific directory; it has to be
# edited before the notebook can run anywhere else.
service_path = "/Users/liamhettinger/Documents/Spring_MSBA/Capstone/"
service_file = "capstone-ncaa-bball-a453a0e2a7f2.json"

# Full path to the key file (the directory string already ends with a slash).
private_key = f"{service_path}{service_file}"

In [None]:
# Load the service-account credentials from the key file; `private_key` is the
# already-joined service_path + service_file.
credentials = service_account.Credentials.from_service_account_file(private_key)

In [None]:
# BigQuery client for the capstone project, authenticated with the
# service-account credentials.
client = bigquery.Client(
    project=gbq_proj_id,
    credentials=credentials,
)

In [None]:
sql ="""
SELECT *
FROM `capstone-ncaa-bball.boxscores.BS*`
"""

In [None]:
# Run the query and load the result into a DataFrame.
# The service-account credentials are passed explicitly so the read uses the
# key loaded by this notebook; without them pandas-gbq falls back to whatever
# default credentials it can find (or an interactive login).
fulldf = pandas_gbq.read_gbq(sql, project_id=gbq_proj_id, credentials=credentials)

In [None]:
# Drop rows with missing values. These rows come from D1 vs. non-D1 games, for
# which advanced stats are not stored. The two ranking columns are excluded
# from the check, so a missing ranking alone does not remove a row.
columns_no_rank = [
    col for col in fulldf.columns
    if col not in ('away_ranking', 'home_ranking')
]
fulldf.dropna(axis='index', how='any', subset=columns_no_rank, inplace=True)

In [None]:
# Away-team view: remove every column whose name matches 'home'.
awaydf = fulldf.drop(columns=list(fulldf.filter(regex='home')))

# Target is the away score; identifier/outcome columns are removed from the
# feature matrix along with the target itself.
y_away = awaydf['away_points']
X_away = awaydf.drop(
    columns=[
        'away_points', 'date', 'location', 'losing_abbr', 'losing_name',
        'winner', 'winning_abbr', 'winning_name',
    ]
)
# Categorical feature: 'away_ranking'

In [None]:
# One-hot encode the away ranking; every other column passes through as-is.
processor = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), ['away_ranking'])],
    remainder='passthrough',
)

In [None]:
# Preprocessing step followed by an ordinary least-squares regression.
pipe = Pipeline(steps=[('processor', processor), ('reg', LinearRegression())])

In [None]:
# Fit the away-points model.
pipe.fit(X_away, y=y_away)

In [None]:
# Feature columns that are not one-hot encoded (everything but the ranking).
numerical_away_x = X_away.drop(columns=['away_ranking'])

In [None]:
# Build labels for the regression coefficients: the one-hot columns come first,
# followed by the passthrough columns in their original order.
# OneHotEncoder.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2, so use get_feature_names_out() when it exists and fall back
# to the old method only on older releases. (The newer method labels columns
# with the input column name rather than a generic x0 prefix.)
encoder = pipe['processor'].named_transformers_['encoder']
if hasattr(encoder, 'get_feature_names_out'):
    encoded_names = encoder.get_feature_names_out()
else:
    encoded_names = encoder.get_feature_names()
feature_names = list(encoded_names) + list(numerical_away_x)

In [None]:
# Fitted weights of the regression step of the pipeline.
coefficients = pipe.named_steps['reg'].coef_
coefficients

In [None]:
# One row per feature, indexed by feature name, shown from largest to smallest.
coeff_df = pd.DataFrame({'coefficient': coefficients}, index=feature_names)
coeff_df.coefficient.sort_values(ascending=False)

In [None]:
# Horizontal bar chart of the coefficients on an explicitly created axes.
fig, ax = plt.subplots(figsize=(12, 10))
coeff_df['coefficient'].plot(kind='barh', ax=ax)

In [None]:
# Home-team view: remove every column whose name matches 'away'.
homedf = fulldf.drop(columns=list(fulldf.filter(regex='away')))

# Target is the home score; identifier/outcome columns are removed from the
# feature matrix along with the target itself.
Y_home = homedf['home_points']
X_home = homedf.drop(
    columns=[
        'home_points', 'date', 'location', 'losing_abbr', 'losing_name',
        'winner', 'winning_abbr', 'winning_name',
    ]
)


In [None]:
# Keep only fully populated rows. This check includes the ranking columns and
# mutates fulldf in place, so every later cell sees the reduced frame.
fulldf.dropna(axis='index', how='any', inplace=True)

# Target is the combined score of both teams; both scores and the
# identifier/outcome columns are removed from the features.
y = fulldf['away_points'].add(fulldf['home_points'])
X = fulldf.drop(
    columns=[
        'away_points', 'home_points', 'date', 'location', 'losing_abbr',
        'losing_name', 'winner', 'winning_abbr', 'winning_name',
    ]
)

In [None]:
# One-hot encode both ranking columns; every other column passes through as-is.
processor = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), ['away_ranking', 'home_ranking'])],
    remainder='passthrough',
)

In [None]:
# Rebuild the pipeline around the two-ranking preprocessor.
pipe = Pipeline(steps=[('processor', processor), ('reg', LinearRegression())])

In [None]:
# Fit the total-points model.
pipe.fit(X, y=y)

In [None]:
# Feature columns that are not one-hot encoded.
numerical_full_x = X.drop(columns=['away_ranking', 'home_ranking'])

# Coefficient labels: one-hot columns first, then the passthrough columns in
# their original order.
# OneHotEncoder.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2, so use get_feature_names_out() when it exists and fall back
# to the old method only on older releases.
encoder = pipe['processor'].named_transformers_['encoder']
if hasattr(encoder, 'get_feature_names_out'):
    encoded_names = encoder.get_feature_names_out()
else:
    encoded_names = encoder.get_feature_names()
feature_names = list(encoded_names) + list(numerical_full_x)

# One row per feature, shown from largest to smallest coefficient.
coefficients = pipe['reg'].coef_
coeff_df = pd.DataFrame(coefficients, feature_names, columns=['coefficient'])
coeff_df['coefficient'].sort_values(ascending=False)

In [None]:
# Draw on an explicitly created tall figure. Passing figsize to Series.plot
# while a figure is already current resizes that figure, so the (15, 50)
# canvas requested here previously ended up as (12, 10), which is too short
# for the number of one-hot ranking labels in this model.
fig, ax = plt.subplots(figsize=(15, 50))
coeff_df['coefficient'].plot(kind='barh', ax=ax)

In [None]:
# Correlation of every numeric column with the two score columns, ordered by
# correlation with away_points (top 30 shown).
# The frame is restricted to numeric/bool columns first: DataFrame.corr() on a
# frame that still holds string columns raises on pandas >= 2.0, and this
# selection behaves the same on older releases.
(
    fulldf.select_dtypes(include=['number', 'bool'])
    .corr()[['away_points', 'home_points']]
    .sort_values('away_points', ascending=False)
    .head(30)
)

In [None]:
# The 30 numeric columns most positively correlated with home_points.
# home_points itself is removed before ranking: it always correlates 1.0 with
# itself, so it used to head this list and leak the target into any feature
# matrix built from it.
# Only numeric/bool columns are correlated, because DataFrame.corr() raises on
# string columns with pandas >= 2.0.
home_corr = (
    fulldf.select_dtypes(include=['number', 'bool'])
    .corr()['home_points']
    .drop('home_points')
    .sort_values(ascending=False)
)
top_30_cor_home = list(home_corr.head(30).index)

In [None]:
# Keep only fully populated rows (in place), then build the home-points target
# and a feature matrix limited to the correlation-selected columns.
fulldf.dropna(axis='index', how='any', inplace=True)
y = fulldf['home_points']
X = fulldf.loc[:, top_30_cor_home]

In [None]:
# Categorical candidates, and the subset of them that is actually present in X
# (despite its name, num_cat_features is a set of column names, not a count).
cat_features = ['away_ranking', 'home_ranking']
num_cat_features = set(cat_features).intersection(X.columns)

In [None]:
# Encode only the ranking columns that are actually present in X.
# Previously both ranking columns were requested whenever at least one was
# present (failing at fit time if the other was missing), and when none were
# present `processor` was left untouched, so the next cell silently reused the
# stale two-ranking preprocessor from the earlier model.
present_cat_features = [col for col in cat_features if col in num_cat_features]
processor = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), present_cat_features)],
    remainder='passthrough',
)
if not present_cat_features:
    # Nothing to encode: every column is passed through unchanged.
    print('all good')

In [None]:
# Pipeline for the home-points model: current preprocessor, then OLS.
pipe = Pipeline(steps=[('processor', processor), ('reg', LinearRegression())])

In [None]:
# Fit the home-points model on the correlation-selected features.
pipe.fit(X, y=y)

In [None]:
# The 30 numeric columns most positively correlated with away_points.
# away_points itself is removed before ranking: it always correlates 1.0 with
# itself, so it used to head this list.
# NOTE(review): presumably this list feeds an away_points model the same way
# top_30_cor_home feeds the home_points model — confirm against later cells.
# Only numeric/bool columns are correlated, because DataFrame.corr() raises on
# string columns with pandas >= 2.0.
away_corr = (
    fulldf.select_dtypes(include=['number', 'bool'])
    .corr()['away_points']
    .drop('away_points')
    .sort_values(ascending=False)
)
top_30_cor_away = list(away_corr.head(30).index)