# Generating Team Data
In this notebook, we will use the raw sportsipy data to create one CSV file for each team.

# Imports

In [1]:
import numpy as np
import os
import pandas as pd
import shutil
from sportsipy.ncaab.teams import Teams
from sportsipy.ncaab.schedule import Schedule
from tqdm import tqdm

# Remove the column limit so wide frames are displayed with every column
pd.set_option("display.max_columns", None)

# Utils

In [2]:
def clean_dir(path):
    """Make sure ``path`` exists as an empty directory.

    Creates the directory, including any missing parent directories, when it
    does not exist yet. Otherwise every file, symlink and sub-folder inside it
    is removed. A failure to delete one entry is printed and the remaining
    entries are still processed.

    Args:
        path (string): path to the directory to create or empty
    """

    # makedirs builds missing parents too and is a no-op for an existing directory
    os.makedirs(path, exist_ok=True)

    for filename in os.listdir(path):
        file_path = os.path.join(path, filename)
        try:
            if os.path.isfile(file_path) or os.path.islink(file_path):
                # Plain files and symlinks are unlinked (a symlink's target is left alone)
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                # Real directories are removed together with their contents
                shutil.rmtree(file_path)
        except Exception as e:
            # Best effort: report the entry that could not be removed and carry on
            print(f"Failed to delete {file_path}. Reason: {e}")

# Load Example CSV
Load one raw file to see the structure of the data.

In [3]:
# Show the first rows of the 2010 raw file
pd.read_csv("../data/raw_data/2010_data.csv").head()

Unnamed: 0,away_assist_percentage,away_assists,away_block_percentage,away_blocks,away_defensive_rating,away_defensive_rebound_percentage,away_defensive_rebounds,away_effective_field_goal_percentage,away_field_goal_attempts,away_field_goal_percentage,away_field_goals,away_free_throw_attempt_rate,away_free_throw_attempts,away_free_throw_percentage,away_free_throws,away_losses,away_minutes_played,away_offensive_rating,away_offensive_rebound_percentage,away_offensive_rebounds,away_personal_fouls,away_points,away_ranking,away_steal_percentage,away_steals,away_three_point_attempt_rate,away_three_point_field_goal_attempts,away_three_point_field_goal_percentage,away_three_point_field_goals,away_total_rebound_percentage,away_total_rebounds,away_true_shooting_percentage,away_turnover_percentage,away_turnovers,away_two_point_field_goal_attempts,away_two_point_field_goal_percentage,away_two_point_field_goals,away_win_percentage,away_wins,date,home_assist_percentage,home_assists,home_block_percentage,home_blocks,home_defensive_rating,home_defensive_rebound_percentage,home_defensive_rebounds,home_effective_field_goal_percentage,home_field_goal_attempts,home_field_goal_percentage,home_field_goals,home_free_throw_attempt_rate,home_free_throw_attempts,home_free_throw_percentage,home_free_throws,home_losses,home_minutes_played,home_offensive_rating,home_offensive_rebound_percentage,home_offensive_rebounds,home_personal_fouls,home_points,home_ranking,home_steal_percentage,home_steals,home_three_point_attempt_rate,home_three_point_field_goal_attempts,home_three_point_field_goal_percentage,home_three_point_field_goals,home_total_rebound_percentage,home_total_rebounds,home_true_shooting_percentage,home_turnover_percentage,home_turnovers,home_two_point_field_goal_attempts,home_two_point_field_goal_percentage,home_two_point_field_goals,home_win_percentage,home_wins,location,losing_abbr,losing_name,pace,winner,winning_abbr,winning_name
0,50.0,7,8.2,2,125.7,22.2,11,0.328,49,0.327,16,0.164,10,0.7,7,0,125,63.5,27.3,7,11,42,,9.5,5,0.279,13,0.231,3,25.4,18,0.357,23.4,12,36,0.361,13,0.0,0,"November 16, 2009",61.8,21,22.7,10,63.5,72.7,32,0.572,69,0.493,34,0.275,19,0.737,14,0,,125.7,77.8,21,14,93,,17.6,13,0.29,20,0.55,11,74.6,53,0.596,18.9,17,49,0.469,23,1.0,1,"Convocation Center, Jonesboro, Arkansas",MacMurray\n\t\t\t,MacMurray\n\t\t\t,,Home,ARKANSAS-STATE,Arkansas State
1,50.0,10,8.5,4,69.0,83.8,31,0.489,45,0.444,20,0.956,43,0.674,29,0,200,102.8,22.2,6,20,73,,11.3,8,0.289,13,0.308,4,57.8,37,0.558,15.8,12,32,0.5,16,1.0,2,"November 18, 2009",35.3,6,6.3,2,102.8,77.8,23,0.324,54,0.315,17,0.426,23,0.609,14,2,200.0,69.0,16.2,4,29,49,,11.3,8,0.13,7,0.143,1,42.2,27,0.377,15.8,12,47,0.34,16,0.0,0,"Show Me Center, Cape Girardeau, Missouri",SOUTHEAST-MISSOURI-STATE,Southeast Missouri State,71.2,Away,ARKANSAS-STATE,Arkansas State
2,68.4,13,6.4,3,113.9,72.7,19,0.375,56,0.339,19,0.482,27,0.519,14,1,200,77.8,34.1,11,20,56,,2.8,2,0.375,21,0.19,4,47.6,30,0.407,21.0,18,35,0.429,15,0.667,2,"November 21, 2009",64.5,20,11.4,4,77.8,65.9,28,0.573,55,0.564,31,0.455,25,0.76,19,0,200.0,113.9,27.3,5,25,82,,13.9,10,0.145,8,0.125,1,52.4,33,0.613,13.2,10,47,0.638,30,1.0,3,"Don Haskins Center, El Paso, Texas",ARKANSAS-STATE,Arkansas State,71.9,Home,TEXAS-EL-PASO,UTEP
3,32.0,8,7.7,3,112.7,51.6,21,0.418,61,0.41,25,0.393,24,0.625,15,3,200,93.0,42.1,11,21,66,,11.3,8,0.197,12,0.083,1,46.4,32,0.456,17.3,15,49,0.49,24,0.25,1,"November 25, 2009",55.6,15,16.3,8,93.0,57.9,26,0.525,61,0.443,27,0.377,23,0.696,16,1,200.0,112.7,48.4,11,18,80,,9.9,7,0.361,22,0.455,10,53.6,37,0.556,16.4,14,39,0.436,17,0.75,3,"Convocation Center, Jonesboro, Arkansas",TENNESSEE-MARTIN,UT-Martin,71.2,Home,ARKANSAS-STATE,Arkansas State
4,44.0,11,12.2,5,104.3,68.3,30,0.582,49,0.51,25,0.51,25,0.8,20,2,200,111.6,47.6,8,26,77,,4.3,3,0.286,14,0.5,7,61.3,38,0.632,24.1,18,35,0.514,18,0.714,5,"November 29, 2009",48.0,12,5.7,2,111.6,52.4,14,0.46,62,0.403,25,0.468,29,0.517,15,2,200.0,104.3,31.7,10,18,72,,11.6,8,0.339,21,0.333,7,38.7,24,0.475,7.4,6,41,0.439,18,0.6,3,"Convocation Center, Jonesboro, Arkansas",ARKANSAS-STATE,Arkansas State,69.3,Away,INDIANA-STATE,Indiana State


# Compile CSVs into One CSV
Compile all the separate CSVs into one. **Note: this step does not prune entries that contain null values, so further data cleaning is still required.**

In [4]:
# Make a list of all the CSVs to be compiled.
# Only *.csv entries are read, so stray files or folders inside the raw-data
# directory (e.g. notebook checkpoints) do not break the load.
raw_dir = "../data/raw_data/"
csvs = [pd.read_csv(os.path.join(raw_dir, file))
        for file in sorted(os.listdir(raw_dir))
        if file.endswith(".csv")]

# Stack all files and drop rows that are exact duplicates.
# NOTE(review): rows only count as duplicates when every column matches,
# including any saved index column in the raw files - confirm this is enough
# to remove games that are listed more than once.
data = pd.concat(csvs).drop_duplicates()

# Get rid of any preceding/trailing whitespace in the following fields.
# The .str accessor leaves missing values as NaN instead of raising on them.
for label in ["losing_abbr", "losing_name", "winning_abbr", "winning_name"]:
    data[label] = data[label].str.strip()

# Make columns for home team and away team: the side named in "winner" gets
# the winning abbreviation, the other side gets the losing abbreviation
data["away"] = np.where(data["winner"] == "Away",
                        data["winning_abbr"],
                        data["losing_abbr"])

data["home"] = np.where(data["winner"] == "Home",
                        data["winning_abbr"],
                        data["losing_abbr"])

# Drop the following labels
data = data.drop(columns=["away_ranking", "home_ranking"])

# Convert the date to datetime, then store it as a YYYY-MM-DD string
# (that format sorts chronologically as plain text)
data["date"] = pd.to_datetime(data["date"]).dt.strftime("%Y-%m-%d")

# sort_values returns a new frame, so the result has to be assigned
data = data.sort_values(by=["date"]).reset_index(drop=True)

# Column order used for the compiled frame: the game-level fields first,
# then every stat for the away side, then the same stats for the home side.
reordered_labels = (
    ['date',
     'location',
     'losing_abbr',
     'losing_name',
     'pace',
     'winning_abbr',
     'winning_name',
     'away',
     'home',
     'winner']
    + [f'{side}_{stat}'
       for side in ('away', 'home')
       for stat in ('assist_percentage',
                    'assists',
                    'block_percentage',
                    'blocks',
                    'defensive_rating',
                    'defensive_rebound_percentage',
                    'defensive_rebounds',
                    'effective_field_goal_percentage',
                    'field_goal_attempts',
                    'field_goal_percentage',
                    'field_goals',
                    'free_throw_attempt_rate',
                    'free_throw_attempts',
                    'free_throw_percentage',
                    'free_throws',
                    'losses',
                    'minutes_played',
                    'offensive_rating',
                    'offensive_rebound_percentage',
                    'offensive_rebounds',
                    'personal_fouls',
                    'points',
                    'steal_percentage',
                    'steals',
                    'three_point_attempt_rate',
                    'three_point_field_goal_attempts',
                    'three_point_field_goal_percentage',
                    'three_point_field_goals',
                    'total_rebound_percentage',
                    'total_rebounds',
                    'true_shooting_percentage',
                    'turnover_percentage',
                    'turnovers',
                    'two_point_field_goal_attempts',
                    'two_point_field_goal_percentage',
                    'two_point_field_goals',
                    'win_percentage',
                    'wins')]
)

# Keep exactly the columns named in reordered_labels, in that order
data = data.loc[:, reordered_labels]

# Empty (or create) the output directory, then write the compiled frame
# without its index
compiled_csv = os.path.join("../data/raw_compiled/", "data.csv")
clean_dir("../data/raw_compiled/")
data.to_csv(compiled_csv, index=False)

# Separate Stats on Team Basis

In [5]:
# Labels applied to a single team's rows: game context first, followed by
# the stat names without any home_/away_ prefix.
new_labels = (
    ['date', 'location', 'away', 'home', 'winner']
    + ['assist_percentage',
       'assists',
       'block_percentage',
       'blocks',
       'defensive_rating',
       'defensive_rebound_percentage',
       'defensive_rebounds',
       'effective_field_goal_percentage',
       'field_goal_attempts',
       'field_goal_percentage',
       'field_goals',
       'free_throw_attempt_rate',
       'free_throw_attempts',
       'free_throw_percentage',
       'free_throws',
       'losses',
       'minutes_played',
       'offensive_rating',
       'offensive_rebound_percentage',
       'offensive_rebounds',
       'personal_fouls',
       'points',
       'steal_percentage',
       'steals',
       'three_point_attempt_rate',
       'three_point_field_goal_attempts',
       'three_point_field_goal_percentage',
       'three_point_field_goals',
       'total_rebound_percentage',
       'total_rebounds',
       'true_shooting_percentage',
       'turnover_percentage',
       'turnovers',
       'two_point_field_goal_attempts',
       'two_point_field_goal_percentage',
       'two_point_field_goals',
       'win_percentage',
       'wins']
)

def _games_for_side(dataframe, team, side):
    """Return the games ``team`` played as ``side``, keeping only that side's stats.

    Args:
        dataframe (pandas.Dataframe): dataframe that contains all the games
        team (string): value to look for in the ``side`` column
        side (string): either "home" or "away"

    Returns:
        pandas.DataFrame: the matching rows, relabelled with ``new_labels``
    """
    other_side = "home" if side == "away" else "away"
    games = dataframe.loc[dataframe[side] == team]

    # Anchor the pattern so only labels that start with the other side's
    # prefix are removed, together with the game-level winner/loser fields
    other_side_stats = list(games.filter(regex=f"^{other_side}_").columns)
    games = games.drop(columns=other_side_stats + ['losing_abbr',
                                                   'losing_name',
                                                   'pace',
                                                   'winning_abbr',
                                                   'winning_name'])

    # Positional relabel: the remaining columns must already be in the same
    # order as new_labels; pandas raises ValueError if the lengths differ
    games.columns = new_labels
    return games


def generate_team_stats(dataframe, teams, folder='../data/team_data/'):
    """For each team, generate the games that they have played

    The folder is emptied first. For every team, one ``<team>.csv`` file is
    written that holds its home and away games with only that team's own
    stats, sorted by date. Duplicate rows and rows with any missing value
    are dropped before saving.

    Args:
        dataframe (pandas.Dataframe): dataframe that contains all the games
        teams (list): a list of all the teams
        folder (string): path to where the files will be stored
    """

    # Clean the folder
    clean_dir(folder)

    # Loop through all of the teams
    for team in tqdm(teams, unit="teams"):
        path = os.path.join(folder, f'{team}.csv')

        # Both frames come back labelled with new_labels, so their shapes
        # always match and they can be stacked directly
        home_games = _games_for_side(dataframe, team, "home")
        away_games = _games_for_side(dataframe, team, "away")

        # Join the home games and away games, sort by date
        team_stats = (
            pd.concat([home_games, away_games])
            .drop_duplicates()
            .dropna()
            .sort_values(by=["date"])
        )

        # Save the stats without the index column
        team_stats.to_csv(path, index=False)

In [6]:
# Collect the abbreviation of every team that sportsipy's Teams() yields
# (presumably the Division I schools - confirm; no tournament filter is applied here)
teams = []
for school in Teams():
    teams.append(school.abbreviation)

In [7]:
# Write one CSV per team abbreviation into the default folder
generate_team_stats(data, teams)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 356/356 [00:06<00:00, 54.24teams/s]


In [8]:
# Show the first rows of the file generated for CONNECTICUT
pd.read_csv("../data/team_data/CONNECTICUT.csv").head()

Unnamed: 0,date,location,away,home,winner,assist_percentage,assists,block_percentage,blocks,defensive_rating,defensive_rebound_percentage,defensive_rebounds,effective_field_goal_percentage,field_goal_attempts,field_goal_percentage,field_goals,free_throw_attempt_rate,free_throw_attempts,free_throw_percentage,free_throws,losses,minutes_played,offensive_rating,offensive_rebound_percentage,offensive_rebounds,personal_fouls,points,steal_percentage,steals,three_point_attempt_rate,three_point_field_goal_attempts,three_point_field_goal_percentage,three_point_field_goals,total_rebound_percentage,total_rebounds,true_shooting_percentage,turnover_percentage,turnovers,two_point_field_goal_attempts,two_point_field_goal_percentage,two_point_field_goals,win_percentage,wins
0,2009-11-13,"Harry A. Gampel Pavilion, Storrs, Connecticut",WILLIAM-MARY,CONNECTICUT,Home,55.2,16,26.9,7,101.5,59.4,20,0.583,54,0.537,29,0.389,21,0.571,12,0,200.0,115.4,40.0,9,10,75,18.5,12,0.296,16.0,0.313,5.0,50.9,29,0.586,13.7,10,38.0,0.632,24.0,1.0,1
1,2009-11-16,"Harry A. Gampel Pavilion, Storrs, Connecticut",COLGATE,CONNECTICUT,Home,62.5,20,21.6,8,100.0,42.9,13,0.621,58,0.552,32,0.172,10,0.5,5,0,200.0,122.2,54.2,9,10,77,11.1,7,0.328,19.0,0.421,8.0,48.9,22,0.614,17.2,13,39.0,0.615,24.0,1.0,2
2,2009-11-17,"Harry A. Gampel Pavilion, Storrs, Connecticut",HOFSTRA,CONNECTICUT,Home,54.5,12,28.3,13,94.4,57.8,28,0.434,53,0.415,22,0.679,36,0.833,30,0,200.0,107.0,31.0,7,13,76,8.5,6,0.151,8.0,0.25,2.0,47.3,35,0.542,12.7,10,45.0,0.444,20.0,1.0,3
3,2009-11-25,"Madison Square Garden (IV), New York, New York",CONNECTICUT,LOUISIANA-STATE,Away,63.3,19,25.0,13,79.7,69.2,29,0.5,65,0.462,30,0.354,23,0.696,16,0,200.0,117.4,55.9,17,17,81,13.0,9,0.154,10.0,0.5,5.0,63.0,46,0.533,13.8,12,55.0,0.455,25.0,1.0,4
4,2009-11-27,"Madison Square Garden (IV), New York, New York",DUKE,CONNECTICUT,Away,59.1,13,16.1,9,91.9,46.8,26,0.373,59,0.373,22,0.475,28,0.536,15,1,200.0,79.7,37.8,10,21,59,5.4,4,0.068,4.0,0.0,0.0,42.9,36,0.408,18.3,16,55.0,0.4,22.0,0.8,4
