In [None]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Read each season's sheet from the workbook into a dictionary keyed by the sheet name (the year
# as a string). Every frame gets a numeric 'Year' column, then rows with any missing value are dropped.
sheets = ['2019', '2018', '2017', '2016', '2015', '2014']
dat = {}
for s in sheets:
    dat[s] = (
        pd.read_excel('March Madness.xlsx', sheet_name=s)
        .assign(Year=int(s))
        .dropna()
    )

In [None]:
# Preview the first rows of the last sheet that was loaded
dat[sheets[-1]].head()

In [None]:
# Combine the per-year frames into a single dataframe with a fresh 0..n-1 index, then delete the
# original dictionary of dataframes.
# pd.concat is used instead of DataFrame.append, which was deprecated in pandas 1.4 and removed
# in pandas 2.0; one concat also avoids re-copying the accumulated frame for every sheet.
df = pd.concat([dat[year] for year in sheets], ignore_index=True)
del dat

In [None]:
# Graphing parameters shared by the plotting helper
figs = (5, 13)      # figure size handed to plt.figure(figsize=...)
label_angle = 90    # rotation applied to the x tick labels

# Functions

In [None]:
def getScore(data):
    """Score a dataframe according to the order its rows are currently in.

    Only the first 10 rows (by position) contribute. The row in position 1 earns
    10 points per tournament win, position 2 earns 9 points per win, ... and
    position 10 earns 1 point per win. Every one of those rows whose 'Cinderella'
    value is greater than 0 adds a flat 5 points.

    data must contain the columns 'Number of Tournament Wins' and 'Cinderella'.
    Frames with fewer than 10 rows are scored on the rows that exist (an empty
    frame scores 0) instead of raising an IndexError.

    Returns the total score as a number.
    """
    # head() is positional, so the incoming index does not need to be reset
    top = data.head(10)

    # weights 10, 9, 8, ... one per available row
    weights = np.arange(10, 10 - len(top), -1)
    score = (weights * top['Number of Tournament Wins'].to_numpy()).sum()

    # account for cinderella teams
    score += 5 * int((top['Cinderella'] > 0).sum())
    return score

In [None]:
def sortingResults(data, sorting_keys,ascending = True):
    """Sort a dataframe on the given columns and return the score of that ordering.

    data is the dataframe to be sorted (a sorted copy is scored; data itself is not reordered).
    sorting_keys is a list of columns in data that the sorting will happen on.
    ascending is a boolean for sorting based on ascending or descending values.

    Returns whatever getScore returns for the sorted frame.
    """
    ranked = data.sort_values(by=sorting_keys, ascending=ascending)
    return getScore(ranked)
    

In [None]:
def graphDict(data, num_sorting = 0, ascending = 'Ascending'):
    """Quickly display a horizontal bar chart with preset values and format.

    data (dict): data.keys() are the yticks, data.values() are the horizontal
        lengths of the bars.
    num_sorting (int): the number of variables used to sort the data; shown in the title.
    ascending (str): whether the plot is for ascending or descending sorting; shown in the title.

    The figure size and x tick rotation come from the module-level `figs` and `label_angle`.
    """
    plt.figure(figsize=figs)
    plt.barh(list(data.keys()), data.values())
    plt.xticks(rotation=label_angle)

    # dashed vertical reference lines: black at the odd multiples of 50, red at the multiples of 100
    guide_lines = (
        ('black', (50, 150, 250, 350)),
        ('red', (100, 200, 300, 400)),
    )
    for line_color, positions in guide_lines:
        for x_pos in positions:
            plt.axvline(x=x_pos, linewidth=0.8, dashes=[10, 15], color=line_color)

    plt.title("Sorting with " + str(num_sorting) + " variable(s) " + ascending)
    plt.show()

In [None]:
# Summarize the combined dataframe: column names, dtypes and non-null counts
df.info()

# Data Prep

In [None]:
# Replace the 'Region' column with one indicator (dummy) column per region value.
# The dummy columns are appended after the existing columns.
dums = pd.get_dummies(df['Region'])
df = pd.concat([df.drop(columns=['Region']), dums], axis=1)

In [None]:
# Count the distinct values in the 'Conference' column
df['Conference'].nunique()

In [None]:
# Preview the first rows of the prepared dataframe
df.head()

# Sorting

In [None]:
# Score the dataframe when it is sorted on each single column, once ascending and once descending.
# Both dictionaries map a column name to the score returned by sortingResults for that ordering.
oneVarAscending = {
    col: sortingResults(data=df, sorting_keys=[col], ascending=True)
    for col in df.columns
}
oneVarDescending = {
    col: sortingResults(data=df, sorting_keys=[col], ascending=False)
    for col in df.columns
}

# Graph results from oneVarAscending/Descending
#graphDict(data = oneVarAscending, num_sorting = 1, ascending = 'Ascending')
#graphDict(data = oneVarDescending, num_sorting = 1, ascending = 'Descending')

In [None]:
# Score the dataframe for every ordered pair of sorting columns (first key, second key).
# The outer dictionaries are keyed by the first column; each inner dictionary is keyed by
# 'first * second' and holds the score for sorting on that pair.
twoVarOverallAsc = {}
twoVarOverallDes = {}
for c in df.columns:
    twoVarOverallAsc[c] = {
        str(c + ' * ' + d): sortingResults(data=df, sorting_keys=[c, d], ascending=True)
        for d in df.columns
    }
    twoVarOverallDes[c] = {
        str(c + ' * ' + d): sortingResults(data=df, sorting_keys=[c, d], ascending=False)
        for d in df.columns
    }

    #graphDict(data = twoVarOverallAsc[c], num_sorting = 2, ascending = 'Ascending')
    #graphDict(data = twoVarOverallDes[c], num_sorting = 2, ascending = 'Descending')
# end for
 

In [None]:
# Build one square dataframe per direction holding the two-variable sorting scores.
# Each column is a first sorting key; the rows down that column are the second sorting keys,
# in the same order as the columns.

# Ascending
asc_labels = list(twoVarOverallAsc.keys())
twoAsc = pd.DataFrame(0, index=asc_labels, columns=asc_labels)
for first_key, pair_scores in twoVarOverallAsc.items():
    twoAsc[first_key] = list(pair_scores.values())

# Descending
des_labels = list(twoVarOverallDes.keys())
twoDes = pd.DataFrame(0, index=des_labels, columns=des_labels)
for first_key, pair_scores in twoVarOverallDes.items():
    twoDes[first_key] = list(pair_scores.values())

In [None]:
# Print the highest score found in each column of twoAsc and twoDes
print("Ascending:")
for col_name, col_scores in twoAsc.items():
    print(col_name, col_scores.max())

print("\n\nDescending")
for col_name, col_scores in twoDes.items():
    print(col_name, col_scores.max())

# Linear Correlation

In [None]:
# get a matrix of how each column is correlated to each other column
# Only numeric and boolean columns are passed to corr(): pandas >= 2.0 no longer drops
# non-numeric columns silently and raises on them instead, while older versions ignored them,
# so selecting the dtypes explicitly gives the same matrix on every pandas version.
cor = df.select_dtypes(include=['number', 'bool']).corr()

In [None]:
# R value: correlation of every column in the matrix with 'Number of Tournament Wins'
cor['Number of Tournament Wins']

In [None]:
# R-squared: square each column's correlation with 'Number of Tournament Wins'
r2 = cor['Number of Tournament Wins'].pow(2)

In [None]:
# Get rid of 'Number of Tournament Wins' itself, then sort the remaining R-squared values
# from highest to lowest.
# The response is removed by label instead of slicing off the first sorted entry, so the
# result stays correct if another column ties at 1.0 and the cell can be re-run without
# dropping an additional column each time.
r2 = r2[r2.index != 'Number of Tournament Wins'].sort_values(ascending=False)

In [None]:
# Display the sorted R-squared values
r2

In [None]:
# Keep the R-Squared values that are at least 10%
r2_10 = r2.loc[r2 >= 0.10]
r2_10

In [None]:
# Column names whose R-squared passed the 10% threshold
r2_10.index

In [None]:
# Keep the R-Squared values that are at least 5%
r2_5 = r2.loc[r2 >= 0.05]
r2_5

# Linear Regression
No significant effort was spent on optimizing the linear model; I used it as a rough baseline for the performance I could expect when attempting to predict the number of tournament wins for each team.

In [None]:
# import libraries
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [None]:
def linReg(X,y):
    """Train-test-split X and y, fit a linear model, and return its score.

    X is a dataframe of predictor values.
    y holds the response values.

    The split holds out 30% of the rows with a fixed random_state of 33.
    Returns LinearRegression.score evaluated on the held-out rows.
    """
    train_X, test_X, train_y, test_y = train_test_split(
        X, y, test_size=0.30, random_state=33
    )
    model = LinearRegression().fit(train_X, train_y)
    return model.score(test_X, test_y)

In [None]:
# LinReg on all data: every column except the response and the 'Team' / 'Conference' columns
X = df.drop(columns=['Number of Tournament Wins', 'Team', 'Conference'])
y = df['Number of Tournament Wins']
print(linReg(X, y))

In [None]:
# LinReg restricted to the columns kept in r2_10 (the 10% R-Squared threshold)
y = df['Number of Tournament Wins']
X = df.loc[:, r2_10.index]
print(linReg(X, y))

In [None]:
# LinReg restricted to the columns kept in r2_5 (the 5% R-Squared threshold)
y = df['Number of Tournament Wins']
X = df.loc[:, r2_5.index]
print(linReg(X, y))

# Neural Network
No significant effort was spent on optimizing the neural network; I used it as a rough baseline for the performance I could expect when attempting to predict the number of tournament wins for each team.

In [None]:
# import libraries
from sklearn.neural_network import MLPClassifier

In [None]:
def N_Net(X, y, random_state=33):
    """Train-test-split X and y, train a neural network classifier, and return its score.

    X is a dataframe of predictor values.
    y holds the response values; MLPClassifier treats each distinct value as a class label.
    random_state seeds the network (weight initialisation), so repeated runs give the same
        score. It defaults to 33, the same seed the train/test split uses.

    The split holds out 30% of the rows with a fixed random_state of 33.
    Returns MLPClassifier.score evaluated on the held-out rows.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=33, test_size=0.30
    )
    # Without a seed the initial weights differ on every run, which makes the printed
    # baseline scores impossible to reproduce.
    nn = MLPClassifier(solver='lbfgs', max_iter=100000, random_state=random_state)
    nn.fit(X_train, y_train)
    return nn.score(X_test, y_test)

In [None]:
# Testing NNet on all columns except the response and the 'Team' / 'Conference' columns
X = df.drop(columns=['Number of Tournament Wins', 'Team', 'Conference'])
y = df['Number of Tournament Wins']
print(N_Net(X, y))

In [None]:
# Testing NNet on the columns kept in r2_10 (the 10% R-Squared threshold)
X = df.loc[:, r2_10.index]
y = df['Number of Tournament Wins']
print(N_Net(X, y))

In [None]:
# Testing NNet on the columns kept in r2_5 (the 5% R-Squared threshold)
X = df.loc[:, r2_5.index]
y = df['Number of Tournament Wins']
print(N_Net(X, y))