# Weighted Averages for March Madness




In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler,StandardScaler

In [3]:
#import dataframe and add year column and values
sheets = ['2019','2018','2017','2016','2015','2014']
dat = {}
for s in sheets:
    dat[s] = pd.read_excel('March Madness.xlsx',sheet_name=s)
    dat[s]['Year'] = int(s)
    dat[s] = dat[s].dropna()

In [4]:
# get all the data into the same dataframe and delete original dictionary of dataframes
df = dat[sheets[0]]
for s in sheets[1:]:
    df = df.append(dat[s])
df = df.reset_index()
df = df.drop(['index'],axis=1)
del(dat)

In [5]:
df.head()

Unnamed: 0,Team,Conference,Region,Cinderella,Conference Tournament Champion,Number of Tournament Wins,Made Tournament Previous Year,Game Count,Wins,Losses,...,ESPN Strength of Schedule,Wins Against Top 25 RPI Teams,Losses Against Top 25 RPI Teams,Total Points,Average PPG,Total Opp Points,Average Opp PPG,Total Scoring Differential,Scoring Differential Per Game,Year
0,Virginia,ACC,South,0,0,6,1,32,29,3,...,16,13,3,2299,71.8,1762,55.1,537,16.8,2019
1,Texas Tech,Big 12,West,0,0,5,1,32,26,6,...,52,11,5,2339,73.1,1897,59.3,442,13.8,2019
2,Auburn,SEC,Midwest,0,1,4,1,35,26,9,...,43,6,8,2785,79.6,2384,68.1,401,11.5,2019
3,Michigan St.,Big Ten,East,0,1,4,1,34,28,6,...,3,17,5,2680,78.8,2228,65.5,452,13.3,2019
4,Purdue,Big Ten,South,0,0,3,1,32,23,9,...,10,13,6,2438,76.2,2138,66.8,300,9.4,2019


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 410 entries, 0 to 409
Data columns (total 33 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Team                             410 non-null    object 
 1   Conference                       410 non-null    object 
 2   Region                           410 non-null    object 
 3   Cinderella                       410 non-null    int64  
 4   Conference Tournament Champion   410 non-null    int64  
 5   Number of Tournament Wins        410 non-null    int64  
 6   Made Tournament Previous Year    410 non-null    int64  
 7   Game Count                       410 non-null    int64  
 8   Wins                             410 non-null    int64  
 9   Losses                           410 non-null    int64  
 10  3-Pointers Made                  410 non-null    int64  
 11  3-Pointers Attempted             410 non-null    int64  
 12  3-Point Percentage    

In [7]:
# this is the function that should be used to get the results for a given algorithm/sorting of the data
def getScore(data):
    # get the top 10 rows
    data = data.reset_index()
    top10 = data.iloc[0:10,:]
    score = 0
    # calculate the score
    for i in range(0,len(top10)):
        # calculate for top 10
        score += abs(10-i) * top10.iloc[i]['Number of Tournament Wins']
        # calculate for cinderella
        if top10.iloc[i,-1] > 0:
            score += 5
    return score

In [8]:
# get list of column names of top x correlations with column cor in dataframe df
def topCorrelations(x,df,cor):
    top = df.corr()[cor]
    top = top**2
    return list(top.sort_values(ascending = False)[1:x+1].index)

In [9]:
# scale data in passed in columns based on specified scaling method
def scaleData(df, cols, kind):
    if kind == "Std":
        for c in cols:
            df[c] = StandardScaler().fit_transform(df[[c]])
    elif kind == "MinMax":
        for c in cols:
            df[c] = MinMaxScaler((0,1)).fit_transform(df[[c]])
    return df

In [10]:
# Recursive function for iterating through all possible coefficients of each column 
# and storing results in the results dataframe 
def weightTests(df, curr_coefs, coef_range, n_columns, results):
    print(curr_coefs)
    # is final coef to be set
    if len(curr_coefs) == n_columns:

        # calculate new coefficients
        df.loc[:,'Score'] = 0
        for cl in range(0,n_columns):
            df.loc[:,'Score'] += df.iloc[:,cl] * curr_coefs[cl]
        # end for

        # calculate new score
        df = df.sort_values(['Score'],ascending=False)
        endScore = getScore(data = df)

        # create new results entry
        res = {"Score":endScore}
        for cl in range(n_columns):
            res[df.columns[cl]] = curr_coefs[cl]
        #end for
        print(res)

        results = results.append(res, ignore_index = True)
    # end if
    
    # needs to set more coefficients
    else:
        for c in coef_range:
            curr_coefs.append(c)
            weightTests(df = df, curr_coefs = curr_coefs,
                        coef_range = coef_range,results = results, 
                        n_columns = n_columns)
        #end for
    #end else

    return results

In [11]:
# run the weighted averages model based on top number of correlations, dataframe data, and a range of weights
def runModel(topCorr, weightRange, data, corVar):
    # Get top correlation columns
    cols = topCorrelations(x=topCorr,df=data,cor=corVar)
    cols.append('Score')
    print(cols)
    
    # Create results dataframe
    results = pd.DataFrame(columns = cols)
    
    # Scale data Std
    df_s = data.copy()
    df_s = scaleData(df = df_s,cols = cols[:-1],kind = "Std")
    # Scale data MinMax
    df_m = data.copy()
    df_m = scaleData(df = df_m, cols = cols[:-1],kind = "MinMax")
    
    cols.append(corVar)
    cols.append('Cinderella')
    data = data[cols]
    df_s = df_s[cols]
    df_m = df_m[cols]
    
    
    #print("Normal data\n",data.head())
    #print("Std scaled data\n",df_s.head())
    #print("Std scaled data\n",df_m.head())
    
    # normal tests
    print("Normal Tests")
    results = results.append(weightTests(df = data, curr_coefs = [], coef_range = weightRange, 
                                         results = results, n_columns = topCorr))
    # std tests
    #print("Std Tests")
    #results = results.append(weightTests(df = df_s, curr_coefs = [], coef_range = weightRange, 
    #                                     results = results, columns = topCorr))
    # minmax tests
    #print("MinMax Tests")
    #results = results.append(weightTests(df = df_m, curr_coefs = [], coef_range = weightRange, 
    #                                     results = results, columns = topCorr))
    #print("Results:",results)
    return results

In [12]:
# Model parameters
ranges = [0,0.25,0.5,0.75,1]
topCorr = 5
corrTo = "Number of Tournament Wins"

In [None]:
results = pd.DataFrame()
df['Score'] = 0
results = results.append(runModel(topCorr = topCorr, weightRange = ranges, data = df.copy(), corVar = corrTo).copy())

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [None]:
results

In [None]:
3 *5**5