In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
#import dataframe and add year column and values
sheets = ['2019','2018','2017','2016','2015','2014']
dat = {}
for s in sheets:
    dat[s] = pd.read_excel('March Madness.xlsx',sheet_name=s)
    dat[s]['Year'] = int(s)
    dat[s] = dat[s].dropna()

In [3]:
dat[s].head()

Unnamed: 0,Team,Conference,Region,Cinderella,Conference Tournament Champion,Number of Tournament Wins,Made Tournament Previous Year,Game Count,Wins,Losses,...,ESPN Strength of Schedule,Wins Against Top 25 RPI Teams,Losses Against Top 25 RPI Teams,Total Points,Average PPG,Total Opp Points,Average Opp PPG,Total Scoring Differential,Scoring Differential Per Game,Year
0,Albany (NY),America East,South,1.0,1.0,0.0,1.0,32.0,18.0,14.0,...,287.0,0.0,0.0,2113.0,66.0,2043.0,63.8,70.0,2.2,2014
1,American,Patriot,West,1.0,1.0,0.0,0.0,32.0,20.0,12.0,...,234.0,0.0,1.0,2046.0,63.9,1876.0,58.6,170.0,5.3,2014
2,Arizona,Pac-12,West,0.0,0.0,3.0,1.0,34.0,30.0,4.0,...,5.0,4.0,1.0,2484.0,73.1,1977.0,58.1,507.0,15.0,2014
3,Arizona St.,Pac-12,Midwest,1.0,0.0,0.0,0.0,32.0,21.0,11.0,...,58.0,1.0,3.0,2401.0,75.0,2204.0,68.9,197.0,6.1,2014
4,Baylor,Big 12,West,0.0,0.0,2.0,0.0,35.0,24.0,11.0,...,10.0,2.0,5.0,2632.0,75.2,2393.0,68.4,239.0,6.8,2014


In [4]:
# get all the data into the same dataframe and delete original dictionary of dataframes
df = dat[sheets[0]]
for s in sheets[1:]:
    df = df.append(dat[s])
df = df.reset_index()
df = df.drop(['index'],axis=1)
del(dat)

In [5]:
# Graphing parameters
label_angle = 90
figs = (5,13)

In [6]:
# this is the function that should be used to get the results for a given algorithm/sorting of the data
def getScore(data):
    # get the top 10 rows
    top10 = data.iloc[0:10,:]
    score = 0
    # calculate the score
    for i in range(0,len(top10)):
        # calculate for top 10
        score += abs(10-i) * top10.iloc[i]['Number of Tournament Wins']
        # calculate for cinderella
        if top10.iloc[i]['Cinderella'] > 0:
            score += 5
    return score

In [7]:
# output results from combination of given variables for sorting
# data is the dataframe to be sorted
# sorting_keys is a list of columns in data that the sorting will happen on
# ascending is a boolean for sorting based on ascending or descending values

def sortingResults(data, sorting_keys,ascending = True):
    data = data.sort_values(sorting_keys,ascending = ascending)
    return getScore(data)
    

In [8]:
# quickly display graph with preset values and format
# data(dict)
    # data.keys() are the yticks
    # data.values() are the horizontal length of the bars
# num_sorting(int) is the number of variables used to sort the data
# ascending(str) is if the plot is ascending or descending
def graphDict(data, num_sorting = 0, ascending = 'Ascending'):
    plt.figure(figsize=figs)
    plt.barh(list(data.keys()),data.values())
    plt.xticks(rotation = label_angle)
    plt.axvline(x=50, linewidth = 0.8, dashes = [10,15],color='black')
    plt.axvline(x=150, linewidth = 0.8, dashes = [10,15],color='black')
    plt.axvline(x=250, linewidth = 0.8, dashes = [10,15],color='black')
    plt.axvline(x=350, linewidth = 0.8, dashes = [10,15],color='black')
    plt.axvline(x=100, linewidth = 0.8, dashes = [10,15],color='red')
    plt.axvline(x=200, linewidth = 0.8, dashes = [10,15],color='red')
    plt.axvline(x=300, linewidth = 0.8, dashes = [10,15],color='red')
    plt.axvline(x=400, linewidth = 0.8, dashes = [10,15],color='red')
    plt.title("".join(["Sorting with ",str(num_sorting), " variable(s) ", ascending]))
    plt.show()

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 410 entries, 0 to 409
Data columns (total 33 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Team                             410 non-null    object 
 1   Conference                       410 non-null    object 
 2   Region                           410 non-null    object 
 3   Cinderella                       410 non-null    float64
 4   Conference Tournament Champion   410 non-null    float64
 5   Number of Tournament Wins        410 non-null    float64
 6   Made Tournament Previous Year    410 non-null    float64
 7   Game Count                       410 non-null    float64
 8   Wins                             410 non-null    float64
 9   Losses                           410 non-null    float64
 10  3-Pointers Made                  410 non-null    float64
 11  3-Pointers Attempted             410 non-null    float64
 12  3-Point Percentage    

In [10]:
# dummies for Region
dums = pd.get_dummies(df['Region'])
df[dums.columns] = dums
df = df.drop(['Region'], axis=1)

In [11]:
# conference
df['Conference'].nunique()

46

In [12]:
df.head()

Unnamed: 0,Team,Conference,Cinderella,Conference Tournament Champion,Number of Tournament Wins,Made Tournament Previous Year,Game Count,Wins,Losses,3-Pointers Made,...,Average PPG,Total Opp Points,Average Opp PPG,Total Scoring Differential,Scoring Differential Per Game,Year,East,Midwest,South,West
0,Florida,SEC,1.0,0.0,1.0,1.0,34.0,19.0,15.0,277.0,...,68.3,2164.0,63.6,157.0,4.6,2019,0,0,0,1
1,Ohio St.,Big Ten,1.0,0.0,1.0,1.0,33.0,19.0,14.0,249.0,...,69.6,2185.0,66.2,113.0,3.4,2019,0,1,0,0
2,Oklahoma,Big 12,1.0,0.0,1.0,1.0,32.0,19.0,13.0,212.0,...,71.2,2183.0,68.2,94.0,2.9,2019,0,0,1,0
3,Louisville,ACC,0.0,0.0,0.0,0.0,33.0,20.0,13.0,285.0,...,74.5,2238.0,67.8,222.0,6.7,2019,1,0,0,0
4,Minnesota,Big Ten,1.0,0.0,1.0,0.0,34.0,21.0,13.0,178.0,...,70.8,2352.0,69.2,55.0,1.6,2019,1,0,0,0


In [13]:
# Sorting based on each variable
oneVarAscending = {}
oneVarDescending = {}
# for each column
for c in df.columns:
    oneVarAscending[c] = sortingResults(data=df,sorting_keys=[c],ascending=True)
    oneVarDescending[c] = sortingResults(data=df,sorting_keys=[c],ascending=False)
    
# Graph results from oneVarAscending/Descending
#graphDict(data = oneVarAscending, num_sorting = 1, ascending = 'Ascending')
#graphDict(data = oneVarDescending, num_sorting = 1, ascending = 'Descending')

In [14]:
# Sorting with 2 variables
twoVarOverallAsc = {}
twoVarOverallDes = {}
for c in df.columns:
    twoVarAscending = {}
    twoVarDescending = {}
    for d in df.columns:
        twoVarAscending[str(c+' * '+d)] = sortingResults(data=df,sorting_keys=[c,d],ascending=True)
        twoVarDescending[str(c+' * '+d)] = sortingResults(data=df,sorting_keys=[c,d],ascending=False)
    #graphDict(data = twoVarAscending, num_sorting = 2, ascending = 'Ascending')
    #graphDict(data = twoVarDescending, num_sorting = 2, ascending = 'Descending')
    twoVarOverallAsc[c] = twoVarAscending
    twoVarOverallDes[c] = twoVarDescending
    del twoVarAscending
    del twoVarDescending
 

In [15]:
# Create a dataframe for ascending/descending two variable sorting that contains the scores for
# sorting by (column, row)

# Ascending
twoAsc = pd.DataFrame(0, columns=twoVarOverallAsc.keys(), index=twoVarOverallAsc.keys())
for c in twoAsc.columns:
    twoAsc[c] = twoVarOverallAsc[c].values()

# Descending
twoDes = pd.DataFrame(0, columns=twoVarOverallDes.keys(), index=twoVarOverallDes.keys())
for c in twoDes.columns:
    twoDes[c] = twoVarOverallDes[c].values()

In [16]:
# Look at max for each column of twoAsc and twoDes
print("Ascending:")
for c in twoAsc.columns:
    print(c,twoAsc[c].max())
print("\n\nDescending")
for c in twoDes.columns:
    print(c,twoDes[c].max())

Ascending:
Team 63.0
Conference 98.0
Cinderella 155.0
Conference Tournament Champion 140.0
Number of Tournament Wins 50.0
Made Tournament Previous Year 87.0
Game Count 73.0
Wins 50.0
Losses 156.0
3-Pointers Made 41.0
3-Pointers Attempted 50.0
3-Point Percentage 75.0
Free Throws Made 43.0
Free Throws Attempted 59.0
Free Throw Percentage 83.0
Rebounds 64.0
Opponent's Rebounds 74.0
Rebound Differential 55.0
Offensive Rebounds 91.0
Assists 53.0
Turnovers 148.0
Assist to Turnover Ratio 54.0
ESPN Strength of Schedule 104.0
Wins Against Top 25 RPI Teams 90.0
Losses Against Top 25 RPI Teams 165.0
Total Points 52.0
Average PPG 51.0
Total Opp Points 93.0
Average Opp PPG 98.0
Total Scoring Differential 50.0
Scoring Differential Per Game 50.0
Year 142.0
East 162.0
Midwest 151.0
South 138.0
West 128.0


Descending
Team 93.0
Conference 88.0
Cinderella 224.0
Conference Tournament Champion 275.0
Number of Tournament Wins 320.0
Made Tournament Previous Year 315.0
Game Count 142.0
Wins 174.0
Losses 53.0

In [17]:
cor = df.corr()

In [18]:
# R value
cor['Number of Tournament Wins']

Cinderella                        -0.474215
Conference Tournament Champion    -0.181267
Number of Tournament Wins          1.000000
Made Tournament Previous Year      0.324842
Game Count                         0.152163
Wins                               0.384278
Losses                            -0.358408
3-Pointers Made                    0.091400
3-Pointers Attempted               0.052335
3-Point Percentage                 0.129300
Free Throws Made                  -0.018188
Free Throws Attempted             -0.047565
Free Throw Percentage              0.090790
Rebounds                           0.153141
Opponent's Rebounds               -0.117758
Rebound Differential               0.223794
Offensive Rebounds                 0.105593
Assists                            0.212540
Turnovers                         -0.195152
Assist to Turnover Ratio           0.306475
ESPN Strength of Schedule         -0.405170
Wins Against Top 25 RPI Teams      0.430109
Losses Against Top 25 RPI Teams 

In [19]:
# R^2
r2 = cor['Number of Tournament Wins']**2

In [20]:
# Sort values and get rid of 'Number of Tournament Wins'
r2 = r2.sort_values(ascending=False)[1:]

In [21]:
r2

Cinderella                         2.248802e-01
Total Scoring Differential         1.993482e-01
Scoring Differential Per Game      1.918516e-01
Wins Against Top 25 RPI Teams      1.849940e-01
ESPN Strength of Schedule          1.641624e-01
Wins                               1.476699e-01
Losses                             1.284560e-01
Made Tournament Previous Year      1.055225e-01
Assist to Turnover Ratio           9.392674e-02
Average Opp PPG                    5.167243e-02
Rebound Differential               5.008368e-02
Assists                            4.517324e-02
Turnovers                          3.808444e-02
Total Points                       3.794208e-02
Conference Tournament Champion     3.285781e-02
Rebounds                           2.345208e-02
Game Count                         2.315359e-02
Total Opp Points                   2.174504e-02
Average PPG                        2.088043e-02
3-Point Percentage                 1.671855e-02
Losses Against Top 25 RPI Teams    1.494

In [22]:
# 10% R^2 values
r2_10 = r2[r2.values >= 0.10]
r2_10

Cinderella                       0.224880
Total Scoring Differential       0.199348
Scoring Differential Per Game    0.191852
Wins Against Top 25 RPI Teams    0.184994
ESPN Strength of Schedule        0.164162
Wins                             0.147670
Losses                           0.128456
Made Tournament Previous Year    0.105523
Name: Number of Tournament Wins, dtype: float64

In [23]:
r2_10.index

Index(['Cinderella', 'Total Scoring Differential',
       'Scoring Differential Per Game', 'Wins Against Top 25 RPI Teams',
       'ESPN Strength of Schedule', 'Wins', 'Losses',
       'Made Tournament Previous Year'],
      dtype='object')

In [24]:
# 5% R^2 values
r2_5 = r2[r2.values >= 0.05]
r2_5

Cinderella                       0.224880
Total Scoring Differential       0.199348
Scoring Differential Per Game    0.191852
Wins Against Top 25 RPI Teams    0.184994
ESPN Strength of Schedule        0.164162
Wins                             0.147670
Losses                           0.128456
Made Tournament Previous Year    0.105523
Assist to Turnover Ratio         0.093927
Average Opp PPG                  0.051672
Rebound Differential             0.050084
Name: Number of Tournament Wins, dtype: float64

In [25]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [26]:
# Function for train test splitting and getting score of 2 dataframes
# X is a dataframe of predictor values
# y is a dataframe of response values

def linReg(X,y):
    X_train, X_test, y_train, y_test = train_test_split(X,y,random_state = 33, test_size = 0.30)
    lm = LinearRegression()
    lm.fit(X_train,y_train)
    return lm.score(X_test,y_test)

In [27]:
# LinReg on all data

y = df['Number of Tournament Wins']
X = df.drop(['Number of Tournament Wins','Team','Conference'],axis=1)
print(linReg(X,y))

0.342194792800263


In [28]:
X = df[r2_10.index]
y = df['Number of Tournament Wins']
print(linReg(X,y))

0.38485781699991095


In [29]:
X = df[r2_5.index]
y = df['Number of Tournament Wins']
print(linReg(X,y))

0.3823726453303451


In [30]:
from sklearn.neural_network import MLPClassifier

In [31]:
def N_Net(X,y):
    X_train, X_test, y_train, y_test = train_test_split(X,y,random_state = 33, test_size = 0.30)
    nn = MLPClassifier(solver='lbfgs', max_iter=100000)
    nn.fit(X_train,y_train)
    return nn.score(X_test,y_test)

In [36]:
y = df['Number of Tournament Wins']
X = df.drop(['Number of Tournament Wins','Team','Conference'],axis=1)
print(N_Net(X,y))

0.5691056910569106


ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [33]:
y = df['Number of Tournament Wins']
X = df[r2_10.index]
print(N_Net(X,y))

0.5121951219512195


In [35]:
y = df['Number of Tournament Wins']
X = df[r2_5.index]
print(N_Net(X,y))

0.4959349593495935
