In [2]:
import os
import pandas as pd
import numpy as np
import sklearn.decomposition as skl

In [11]:
def get_game_data(relative=None, stats_filter=None, datasets_dir=None, games_file_name=None, stats_file_name=None):
    """
    Combine game data and regular season statistics to form a DataFrame where each row represents one game from an NCAA
    tournament. Ensure the Team1 and Team2 values are randomly swapped in games_file_name so that the first team is not
    always the higher seed. (Done already for provided data).
    :param relative: If True, use relative change in each category [(Team A - Team B)/avg(Team A, Team B)]. If False,
        use absolute change. Defaults to True.
    :param stats_filter: A function to filter which statistics are kept. Takes a list of stat names and should return
        the filtered list. If None, keep all stats. Defaults to None.
    :param datasets_dir: Name of the directory in cwd containing the CSVs with games_file and stats_file. Defaults to
        'datasets'.
    :param games_file_name: Name of the CSV file containing game info (which teams played in each game).
    :param stats_file_name: Name of the CSV file containing regular season statistics for each team.
    :return: pandas DataFrame containing data for each game detailed in games_file_name. Each row represents one game
        from an NCAA tournament, with statistics in the form Team A - Team B (or relative difference if relative=True).
    """
    #relative = True if relative is None else relative
    datasets_dir = 'datasets' if datasets_dir is None else datasets_dir
    games_file_name = 'NCAA tournament games 2010-2018.csv' if games_file_name is None else games_file_name
    if games_file_name[-4:] != '.csv':
        raise Warning('games_file_name must end in .csv. A CSV file is required.')
    stats_file_name = 'Stats by team and year 2010-2018.csv' if stats_file_name is None else stats_file_name
    if stats_file_name[-4:] != '.csv':
        raise Warning('stats_file_name must end in .csv. A CSV file is required.')
    games_file_path = os.path.join(datasets_dir, games_file_name)
    stats_file_path = os.path.join(datasets_dir, stats_file_name)

    df_games = pd.read_csv(games_file_path, sep=',')
    df_games = df_games.loc[:, ~df_games.columns.str.contains('^Unnamed')]
    df_stats = pd.read_csv(stats_file_path, sep=',')
    df_stats = df_stats.loc[:, ~df_stats.columns.str.contains('^Unnamed')]
    if stats_filter is not None:
        stats = list(df_stats.columns.drop(['School ID', 'Year']))
        stats = stats_filter(stats)
        stats.insert(0, 'School ID')
        df_stats = df_stats[stats]
    else:
        df_stats = df_stats.drop('Year', axis=1)

    df_1 = df_games.join(df_stats.set_index('School ID'), on='Team1 ID')
    df_2 = df_games.join(df_stats.set_index('School ID'), on='Team2 ID')
    df_stats_1 = df_1.loc[:, 'Game Num':].drop('Game Num', axis=1)
    df_stats_2 = df_2.loc[:, 'Game Num':].drop('Game Num', axis=1)
    #if relative:
     #   df_game_data = df_games.join(((df_stats_1 - df_stats_2) / ((df_stats_1 + df_stats_2) / 2)).add_suffix(' Diff'))
    #else:
     #   df_game_data = df_games.join((df_stats_1 - df_stats_2).add_suffix(' Diff'))
    return df_game_data

In [12]:
# The first step is getting all the data from Game Results and Team Stats in a Pandas DataFrame
game_data = get_game_data(relative=None, stats_filter=None, datasets_dir=None, games_file_name='NCAA tournament games 2010-2018.csv', stats_file_name='Stats by team and year 2010-2018.csv')
game_data

NameError: name 'df_game_data' is not defined

In [9]:
# However, for unsupervised learning, we are attempting to predict tournament seeding based on the team stats for that year
# We need to isolate the stats for the year 2018
game_data_2018 = game_data.loc[game_data['Year']==2018]
game_data_2018

Unnamed: 0,Team1 ID,Team2 ID,Round,Round Num,Team1 result,Year,Game Num,W-L% Diff,SOS Diff,ORtg Diff,...,Seed Norm Diff,WL% * SOS Norm Diff,3P% Norm Diff,FT% Norm Diff,PPG Norm Diff,OPPG Norm Diff,APG Norm Diff,TOVPG Norm Diff,Adjusted WL% Norm Diff,Adjusted Margin Norm Diff
504,Virginia/2018,UMBC/2018,Round of 64,1,loss,2018,0,0.271482,4.936902,0.044444,...,2.000000,1.371931,-0.059674,1.894494,-1.711400,0.709476,-0.471405,0.730935,0.640890,0.893000
505,Creighton/2018,Kansas State/2018,Round of 64,1,loss,2018,1,-0.060976,-0.076010,0.067273,...,0.133200,-0.070717,0.805140,0.142823,1.167665,-0.634449,0.787148,0.046543,-0.337014,0.557264
506,Kentucky/2018,Davidson/2018,Round of 64,1,win,2018,2,0.100075,1.227068,-0.053286,...,0.933200,0.605598,-0.581734,-0.952465,0.042887,-0.187281,-0.813174,-0.822108,0.112456,-0.254531
507,Arizona/2018,Buffalo/2018,Round of 64,1,loss,2018,3,0.027613,1.858956,0.020096,...,1.200000,0.643986,-0.019410,0.705134,-0.258977,0.577239,-0.271344,0.069026,0.007698,0.095238
508,Miami (FL)/2018,Loyola-Chicago/2018,Round of 64,1,loss,2018,4,-0.201307,1.174458,-0.004623,...,0.666800,0.392140,-0.516081,-1.247391,0.360040,-0.325370,-0.632632,0.273492,-0.685993,-0.368139
509,Tennessee/2018,Wright State/2018,Round of 64,1,win,2018,5,0.039808,4.246914,0.074988,...,1.466800,1.326441,1.061579,0.451667,0.401200,0.010824,0.685166,0.372042,0.029782,0.333333
510,Nevada/2018,Texas/2018,Round of 64,1,win,2018,6,0.335071,-0.735897,0.105743,...,0.400000,-0.208285,1.894779,1.278872,0.985426,-0.397945,1.429479,0.440609,1.903217,0.723222
511,Cincinnati/2018,Georgia State/2018,Round of 64,1,win,2018,7,0.226244,16.826667,0.031689,...,1.733200,0.862600,-0.627600,0.333272,-0.055556,0.508012,0.757356,0.015204,0.590385,0.705262
512,UMBC/2018,Kansas State/2018,Round of 32,2,loss,2018,32,0.026277,-5.134694,-0.006607,...,-2.000000,-1.262637,1.000000,-1.872053,0.364405,-0.109546,0.369893,-0.219370,0.313626,0.055074
513,Kentucky/2018,Buffalo/2018,Round of 32,2,win,2018,33,-0.064694,1.908925,-0.033199,...,1.142827,0.814324,-0.285586,-0.018154,-0.560953,0.698221,-0.797234,-0.212680,-0.350042,-0.153780


In [10]:
game_data_2018.iloc[0,:]

Team1 ID                     Virginia/2018
Team2 ID                         UMBC/2018
Round                          Round of 64
Round Num                                1
Team1 result                          loss
Year                                  2018
Game Num                                 0
W-L% Diff                         0.271482
SOS Diff                            4.9369
ORtg Diff                        0.0444444
Drtg Diff                        -0.108626
FG% Diff                         0.0376523
Seed Diff                         -1.76471
WL% * SOS Diff                     3.90126
3P% Diff                        -0.0103896
FT% Diff                          0.154286
PPG Diff                        -0.0842851
OPPG Diff                        -0.236735
APG Diff                         -0.120272
TOVPG Diff                       -0.355905
Adjusted WL% Diff                 0.234401
Adjusted Margin Diff               1.06383
W-L% Norm Diff                    0.607222
SOS Norm Di