In [17]:
import numpy as np
import pandas as pd
import tensorflow as tf

In [18]:
print('Numpy version: {}'.format(np.__version__))
print('Pandas version: {}'.format(pd.__version__))
print('Tensorflow version: {}'.format(tf.__version__))

Numpy version: 1.19.5
Pandas version: 1.1.5
Tensorflow version: 2.5.0


In [50]:
# read csv into dfs
games = pd.read_csv('./drive/MyDrive/Datasets/BasketballScores/Games.csv', names=['date', 'home', 'home score', 'away', 'away score'], parse_dates=['date'])
teams = pd.read_csv('./drive/MyDrive/Datasets/BasketballScores/Teams.csv', names=['conference', 'team'])

In [57]:
# check df
games

Unnamed: 0,date,home,home score,away,away score
0,2015-11-13,Hawaii,87,Montana St.,76
1,2015-11-13,Eastern Mich.,70,Vermont,50
2,2015-11-13,Columbia,107,Kean,62
3,2015-11-13,La.-Monroe,88,McMurry,43
4,2015-11-13,Yale,70,Fairfield,57
...,...,...,...,...,...
23488,2019-03-21,Hampton,81,St. Francis (B&#039;klyn),72
23489,2019-03-21,La.-Monroe,87,Kent State,77
23490,2019-03-21,CSU Fullerton,58,CSU Bakersfield,66
23491,2019-03-22,Southern Utah,80,Drake,73


In [52]:
teams.head(5)

Unnamed: 0,conference,team
0,America East,Vermont
1,America East,Stony Brook
2,America East,UMBC
3,America East,Hartford
4,America East,Albany


In [56]:
# clean data
games = games.dropna()
teams = teams.dropna()

# remove rows without game score
games = games.drop(games[(games['home score'] == 0)].index)
games = games.drop(games[(games['away score'] == 0)].index)

games.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23290 entries, 0 to 23492
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   date        23290 non-null  datetime64[ns]
 1   home        23290 non-null  object        
 2   home score  23290 non-null  int64         
 3   away        23290 non-null  object        
 4   away score  23290 non-null  int64         
dtypes: datetime64[ns](1), int64(2), object(2)
memory usage: 1.7+ MB


In [44]:
teams.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 353 entries, 0 to 352
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   home    353 non-null    object
 1   away    353 non-null    object
dtypes: object(2)
memory usage: 8.3+ KB


In [66]:
# break df into seasons
season2015 = games[(games['date'] > '2015-11-01') & (games['date'] < '2016-04-15')].copy()
season2016 = games[(games['date'] > '2016-11-01') & (games['date'] < '2017-04-15')].copy()
season2017 = games[(games['date'] > '2017-11-01') & (games['date'] < '2018-04-15')].copy()
season2018 = games[(games['date'] > '2018-11-01') & (games['date'] < '2019-04-15')].copy()

In [67]:
print('2015 ({}): {} to {}'.format(season2015.shape[0], np.min(season2015['date']), np.max(season2015['date'])))
print('2016 ({}): {} to {}'.format(season2016.shape[0], np.min(season2016['date']), np.max(season2016['date'])))
print('2017 ({}): {} to {}'.format(season2017.shape[0], np.min(season2017['date']), np.max(season2017['date'])))
print('2018 ({}): {} to {}'.format(season2018.shape[0], np.min(season2018['date']), np.max(season2018['date'])))

2015 (5881): 2015-11-13 00:00:00 to 2016-04-01 00:00:00
2016 (5904): 2016-11-11 00:00:00 to 2017-03-31 00:00:00
2017 (5612): 2017-11-10 00:00:00 to 2018-03-31 00:00:00
2018 (5893): 2018-11-06 00:00:00 to 2019-03-22 00:00:00


In [62]:
season2015['date']

0      2015-11-13
1      2015-11-13
2      2015-11-13
3      2015-11-13
4      2015-11-13
          ...    
5878   2016-03-29
5879   2016-03-30
5880   2016-03-30
5881   2016-03-31
5882   2016-04-01
Name: date, Length: 5881, dtype: datetime64[ns]

In [71]:
# rename teams to match entry
def rename_teams(df_games, column_name):
  df_games.loc[ df_games[column_name] == 'A&M-Corpus Chris'		, column_name ] = 		'Texas A&M-CC'	
  df_games.loc[ df_games[column_name] == 'Alabama St.'		, column_name ] = 		'Alabama State'		
  df_games.loc[ df_games[column_name] == 'Albany (NY)'		, column_name ] = 		'Albany'				
  df_games.loc[ df_games[column_name] == 'Alcorn St.'			, column_name ] = 		'Alcorn State'		
  df_games.loc[ df_games[column_name] == 'American'			, column_name ] = 		'American University'
  df_games.loc[ df_games[column_name] == 'Appalachian St.'			, column_name ] = 		'Appalachian State'	
  df_games.loc[ df_games[column_name] == 'Arizona St.'		, column_name ] = 		'Arizona State'						
  df_games.loc[ df_games[column_name] == 'Army West Point'		, column_name ] = 		'Army'					
  df_games.loc[ df_games[column_name] == 'Ark.-Pine Bluff'		, column_name ] = 		'Arkansas-Pine Bluff'
  df_games.loc[ df_games[column_name] == 'UALR'				, column_name ] = 		'Arkansas-Little Rock'	
  df_games.loc[ df_games[column_name] == 'Little Rock'				, column_name ] = 		'Arkansas-Little Rock'			
  df_games.loc[ df_games[column_name] == 'Arkansas St.'		, column_name ] = 		'Arkansas State'		
  df_games.loc[ df_games[column_name] == 'Ball St.'			, column_name ] = 		'Ball State'			
  df_games.loc[ df_games[column_name] == 'Boise St.'			, column_name ] = 		'Boise State'		
  df_games.loc[ df_games[column_name] == 'Boston U.'			, column_name ] = 		'Boston University'			
  df_games.loc[ df_games[column_name] == 'Cal Baptist'	, column_name ] = 		'California Baptist'			
  df_games.loc[ df_games[column_name] == 'Charleston So.'	, column_name ] = 		'Charleston Southern'			
  df_games.loc[ df_games[column_name] == 'Cent. Conn. St.'	, column_name ] = 		'Central Connecticut State'	
  df_games.loc[ df_games[column_name] == 'Central Conn. St.'	, column_name ] = 		'Central Connecticut State'	
  df_games.loc[ df_games[column_name] == 'Central Mich.'	, column_name ] = 		'Central Michigan'	
  df_games.loc[ df_games[column_name] == 'Col. of Charleston'	, column_name ] = 		'Charleston'			
  df_games.loc[ df_games[column_name] == 'Chicago St.'		, column_name ] = 		'Chicago State'		
  df_games.loc[ df_games[column_name] == 'Cleveland St.'		, column_name ] = 		'Cleveland State'		
  df_games.loc[ df_games[column_name] == 'Coastal Caro.'		, column_name ] = 		'Coastal Carolina'				
  df_games.loc[ df_games[column_name] == 'Colorado St.'		, column_name ] = 		'Colorado State'	
  df_games.loc[ df_games[column_name] == 'Coppin St.'			, column_name ] = 		'Coppin State'			
  df_games.loc[ df_games[column_name] == 'Bakersfield'		, column_name ] = 		'Cal State Bakersfield'	
  df_games.loc[ df_games[column_name] == 'CSU Bakersfield'		, column_name ] = 		'Cal State Bakersfield'		
  df_games.loc[ df_games[column_name] == 'Bryant'		, column_name ] = 		'Bryant University'	
  df_games.loc[ df_games[column_name] == 'Cal St. Fullerton'	, column_name ] = 		'Cal State Fullerton'
  df_games.loc[ df_games[column_name] == 'CSU Fullerton'	, column_name ] = 		'Cal State Fullerton'		
  df_games.loc[ df_games[column_name] == 'CSUN'	, column_name ] = 		'Cal State Northridge'	
  df_games.loc[ df_games[column_name] == 'Cal St. Northridge'	, column_name ] = 		'Cal State Northridge'						
  df_games.loc[ df_games[column_name] == 'Central Ark.'		, column_name ] = 		'Central Arkansas'						
  df_games.loc[ df_games[column_name] == 'Delaware St.'		, column_name ] = 		'Delaware State'		
  df_games.loc[ df_games[column_name] == 'Detroit'			, column_name ] = 		'Detroit Mercy'		
  df_games.loc[ df_games[column_name] == 'East Tenn. St.'		, column_name ] = 		'East Tennessee State'
  df_games.loc[ df_games[column_name] == 'Eastern Ill.'		, column_name ] = 		'Eastern Illinois'		
  df_games.loc[ df_games[column_name] == 'Eastern Ky.'		, column_name ] = 		'Eastern Kentucky'		
  df_games.loc[ df_games[column_name] == 'Eastern Mich.'		, column_name ] = 		'Eastern Michigan'	
  df_games.loc[ df_games[column_name] == 'Eastern Wash.'		, column_name ] = 		'Eastern Washington'
  df_games.loc[ df_games[column_name] == "Fairleigh D'son"		, column_name ] = 		'Fairleigh Dickinson'				
  df_games.loc[ df_games[column_name] == 'FGCU'		, column_name ] = 		'Florida Gulf Coast'						
  df_games.loc[ df_games[column_name] == 'FIU'				, column_name ] = 		'Florida International'					
  df_games.loc[ df_games[column_name] == 'Fla. Atlantic'		, column_name ] = 		'Florida Atlantic'
  df_games.loc[ df_games[column_name] == 'Florida St.'		, column_name ] = 		'Florida State'			
  df_games.loc[ df_games[column_name] == 'Fresno St.'			, column_name ] = 		'Fresno State'		
  df_games.loc[ df_games[column_name] == 'Fort Wayne'		, column_name ] = 		'Purdue Fort Wayne'		
  df_games.loc[ df_games[column_name] == 'IPFW'				, column_name ] = 		'Purdue Fort Wayne'				
  df_games.loc[ df_games[column_name] == 'Ga. Southern'		, column_name ] = 		'Georgia Southern'			
  df_games.loc[ df_games[column_name] == 'Georgia St.'		, column_name ] = 		'Georgia State'			
  df_games.loc[ df_games[column_name] == 'Geo. Washington'		, column_name ] = 		'George Washington'				
  df_games.loc[ df_games[column_name] == 'Grambling'		, column_name ] = 		'Grambling State'		
  df_games.loc[ df_games[column_name] == 'Humboldt St.'		, column_name ] = 		'Humboldt State'		
  df_games.loc[ df_games[column_name] == 'Idaho St.'			, column_name ] = 		'Idaho State'			
  df_games.loc[ df_games[column_name] == 'Illinois St.'		, column_name ] = 		'Illinois State'		
  df_games.loc[ df_games[column_name] == 'Iowa St.'			, column_name ] = 		'Iowa State'			
  df_games.loc[ df_games[column_name] == 'Indiana St.'		, column_name ] = 		'Indiana State'		
  df_games.loc[ df_games[column_name] == 'Jackson St.'		, column_name ] = 		'Jackson State'		
  df_games.loc[ df_games[column_name] == 'Jacksonville St.'		, column_name ] = 		'Jacksonville State'			
  df_games.loc[ df_games[column_name] == 'Kansas St.'		, column_name ] = 		'Kansas State'
  df_games.loc[ df_games[column_name] == 'Kennesaw St.'		, column_name ] = 		'Kennesaw State'		
  df_games.loc[ df_games[column_name] == 'Kent St.'			, column_name ] = 		'Kent State'			
  df_games.loc[ df_games[column_name] == 'Louisiana'		, column_name ] = 		'Louisiana-Lafayette'
  df_games.loc[ df_games[column_name] == 'Lamar University'		, column_name ] = 		'Lamar'	
  df_games.loc[ df_games[column_name] == 'La.-Monroe'		, column_name ] = 		'Louisiana-Monroe'		
  df_games.loc[ df_games[column_name] == 'Long Beach St.'		, column_name ] = 		'Long Beach State'	
  df_games.loc[ df_games[column_name] == 'Long Island'		, column_name ] = 		'LIU Brooklyn'
  df_games.loc[ df_games[column_name] == 'LMU'	, column_name ] = 		'Loyola Marymount'					
  df_games.loc[ df_games[column_name] == 'Loyola Chicago'	, column_name ] = 		'Loyola (IL)'			
  df_games.loc[ df_games[column_name] == 'Loyola Maryland'	, column_name ] = 		'Loyola (MD)'			
  df_games.loc[ df_games[column_name] == 'Loyola (Md.)'	, column_name ] = 		'Loyola (MD)'		
  df_games.loc[ df_games[column_name] == 'UMES'		, column_name ] = 		'Maryland-Eastern Shore'
  df_games.loc[ df_games[column_name] == 'Miami (Fla.)'		, column_name ] = 		'Miami (FL)'
  df_games.loc[ df_games[column_name] == 'Miami (Ohio)'		, column_name ] = 		'Miami (OH)'
  df_games.loc[ df_games[column_name] == "Mt. St. Mary's"		, column_name ] = 		"Mount St Mary's"			
  df_games.loc[ df_games[column_name] == 'Mass.-Lowell'		, column_name ] = 		'Massachusetts-Lowell'				
  df_games.loc[ df_games[column_name] == 'McNeese'		, column_name ] = 		'McNeese State'										
  df_games.loc[ df_games[column_name] == 'McNeese '		, column_name ] = 		'McNeese State'						
  df_games.loc[ df_games[column_name] == 'McNeese St.'		, column_name ] = 		'McNeese State'			
  df_games.loc[ df_games[column_name] == 'Middle Tenn.'		, column_name ] = 		'Middle Tennessee'		
  df_games.loc[ df_games[column_name] == 'Mississippi St.'		, column_name ] = 		'Mississippi State'	
  df_games.loc[ df_games[column_name] == 'Mississippi Val.'		, column_name ] = 		'Mississippi Valley State'			
  df_games.loc[ df_games[column_name] == 'Mich. St. '		, column_name ] = 		'Michigan State'		
  df_games.loc[ df_games[column_name] == 'Michigan St.'		, column_name ] = 		'Michigan State'			
  df_games.loc[ df_games[column_name] == 'Mississippi'			, column_name ] = 		'Ole Miss'			
  df_games.loc[ df_games[column_name] == 'Missouri St.'		, column_name ] = 		'Missouri State'	
  df_games.loc[ df_games[column_name] == 'Montana St.'		, column_name ] = 		'Montana State'		
  df_games.loc[ df_games[column_name] == 'Morehead St.'		, column_name ] = 		'Morehead State'		
  df_games.loc[ df_games[column_name] == 'Morgan St.'			, column_name ] = 		'Morgan State'			
  df_games.loc[ df_games[column_name] == 'Murray St.'			, column_name ] = 		'Murray State'		
  df_games.loc[ df_games[column_name] == 'N.C. A&T'	, column_name ] = 		'North Carolina A&T'
  df_games.loc[ df_games[column_name] == 'N.C. Central'	, column_name ] = 		'North Carolina Central'
  df_games.loc[ df_games[column_name] == 'New Mexico St.'	, column_name ] = 		'New Mexico State'				
  df_games.loc[ df_games[column_name] == 'NC State'	, column_name ] = 		'North Carolina State'
  df_games.loc[ df_games[column_name] == 'North Carolina St.'	, column_name ] = 		'North Carolina State'	
  df_games.loc[ df_games[column_name] == 'North Dakota St.'	, column_name ] = 		'North Dakota State'				
  df_games.loc[ df_games[column_name] == 'Northern Ariz.'	, column_name ] = 		'Northern Arizona'				
  df_games.loc[ df_games[column_name] == 'Northern Colo.'	, column_name ] = 		'Northern Colorado'		
  df_games.loc[ df_games[column_name] == 'Northern Ill.'	, column_name ] = 		'Northern Illinois'			
  df_games.loc[ df_games[column_name] == "N'western St."	, column_name ] = 		"Northwestern State"	
  df_games.loc[ df_games[column_name] == 'Northwestern St.'	, column_name ] = 		"Northwestern State"				
  df_games.loc[ df_games[column_name] == 'Nicholls St.'		, column_name ] = 		'Nicholls State'		
  df_games.loc[ df_games[column_name] == 'Norfolk St.'		, column_name ] = 		'Norfolk State'		
  df_games.loc[ df_games[column_name] == 'Northern Ky.'		, column_name ] = 		'Northern Kentucky'	
  df_games.loc[ df_games[column_name] == 'Ohio St.'			, column_name ] = 		'Ohio State'			
  df_games.loc[ df_games[column_name] == 'Ohio St. '			, column_name ] = 		'Ohio State'			
  df_games.loc[ df_games[column_name] == 'Oklahoma St.'		, column_name ] = 		'Oklahoma State'	
  df_games.loc[ df_games[column_name] == 'Oregon St.'		, column_name ] = 		'Oregon State'	
  df_games.loc[ df_games[column_name] == 'Neb. Omaha'			, column_name ] = 		'Nebraska-Omaha'	
  df_games.loc[ df_games[column_name] == 'Omaha'			, column_name ] = 		'Nebraska-Omaha'					
  df_games.loc[ df_games[column_name] == 'Penn'			, column_name ] = 		'Pennsylvania'								
  df_games.loc[ df_games[column_name] == 'Penn St.'			, column_name ] = 		'Penn State'				
  df_games.loc[ df_games[column_name] == 'Prairie View'		, column_name ] = 		'Prairie View A&M'		
  df_games.loc[ df_games[column_name] == 'Portland St.'		, column_name ] = 		'Portland State'		
  df_games.loc[ df_games[column_name] == 'S.C. Upstate'	, column_name ] = 		'USC Upstate'	
  df_games.loc[ df_games[column_name] == 'S. Carolina St.'	, column_name ] = 		'South Carolina State'
  df_games.loc[ df_games[column_name] == 'South Carolina St.'	, column_name ] = 		'South Carolina State'			
  df_games.loc[ df_games[column_name] == 'Sacramento St.'		, column_name ] = 		'Sacramento State'			
  df_games.loc[ df_games[column_name] == 'Sam Houston St.'		, column_name ] = 		'Sam Houston State'				
  df_games.loc[ df_games[column_name] == 'San Diego St.'		, column_name ] = 		'San Diego State'		
  df_games.loc[ df_games[column_name] == 'San Jose St.'		, column_name ] = 		'San Jose State'		
  df_games.loc[ df_games[column_name] == 'Savannah St.'		, column_name ] = 		'Savannah State'		
  df_games.loc[ df_games[column_name] == 'Seattle U'			, column_name ] = 		'Seattle'			
  df_games.loc[ df_games[column_name] == 'SFA'	, column_name ] = 		'Stephen F Austin'
  df_games.loc[ df_games[column_name] == 'Stephen F. Austin'	, column_name ] = 		'Stephen F Austin'					
  df_games.loc[ df_games[column_name] == 'SIU Edwardsville'	, column_name ] = 		'SIU-Edwardsville'						
  df_games.loc[ df_games[column_name] == 'SIUE'	, column_name ] = 		'SIU-Edwardsville'					
  df_games.loc[ df_games[column_name] == 'South Ala.'			, column_name ] = 		'South Alabama'			
  df_games.loc[ df_games[column_name] == 'South Dakota St.'			, column_name ] = 		'South Dakota State'	
  df_games.loc[ df_games[column_name] == 'South Fla.'			, column_name ] = 		'South Florida'		
  df_games.loc[ df_games[column_name] == 'Southern Ill.'		, column_name ] = 		'Southern Illinois'	
  df_games.loc[ df_games[column_name] == 'Southeast Mo. St.'		, column_name ] = 		'Southeast Missouri State'
  df_games.loc[ df_games[column_name] == 'Southeastern La.'		, column_name ] = 		'Southeastern Louisiana'		
  df_games.loc[ df_games[column_name] == 'Southern Miss.'		, column_name ] = 		'Southern Miss'		
  df_games.loc[ df_games[column_name] == 'Southern U.'		, column_name ] = 		'Southern University'	
  df_games.loc[ df_games[column_name] == 'Southern Univ.'		, column_name ] = 		'Southern University'	
  df_games.loc[ df_games[column_name] == "St. Bonaventure"	, column_name ] = 		"St Bonaventure"			
  df_games.loc[ df_games[column_name] == "St. Francis (B'klyn)"	, column_name ] = 		"St Francis (BKN)"			
  df_games.loc[ df_games[column_name] == 'St. Francis (NY)'	, column_name ] = 		"St Francis (BKN)"
  df_games.loc[ df_games[column_name] == 'St. Francis (PA)'	, column_name ] = 		"St Francis (PA)"	
  df_games.loc[ df_games[column_name] == 'St. Francis (Pa.)'	, column_name ] = 		"St Francis (PA)"	
  df_games.loc[ df_games[column_name] == "Saint Joseph's"	, column_name ] = 		"Saint Joseph's (PA)"
  df_games.loc[ df_games[column_name] == "St. Mary's (CA)"	, column_name ] = 		"Saint Mary's"	
  df_games.loc[ df_games[column_name] == "St. Mary's (Cal.)"	, column_name ] = 		"Saint Mary's"	
  df_games.loc[ df_games[column_name] == "St. Peter's"	, column_name ] = 		"St Peter's"	
  df_games.loc[ df_games[column_name] == "St. John's (NY)"	, column_name ] = 		"St John's"		
  df_games.loc[ df_games[column_name] == "St. John's "	, column_name ] = 		"St John's"				
  df_games.loc[ df_games[column_name] == 'Tennessee St.'		, column_name ] = 		'Tennessee State'		
  df_games.loc[ df_games[column_name] == 'Texas A&M-C.C.'			, column_name ] = 		'Texas A&M-CC'			
  df_games.loc[ df_games[column_name] == 'Texas St.'			, column_name ] = 		'Texas State'			
  df_games.loc[ df_games[column_name] == 'UC Santa Barbara'	, column_name ] = 		'UC Santa Barb.'		
  df_games.loc[ df_games[column_name] == 'Ill.-Chicago'		, column_name ] = 		'UIC'					
  df_games.loc[ df_games[column_name] == 'Md.-East. Shore'	, column_name ] = 		'Maryland-Eastern Shore'					
  df_games.loc[ df_games[column_name] == 'UNCG'		, column_name ] = 		'UNC Greensboro'					
  df_games.loc[ df_games[column_name] == 'UNCW'		, column_name ] = 		'North Carolina-Wilmington'					
  df_games.loc[ df_games[column_name] == 'UNC Wilmington'		, column_name ] = 		'North Carolina-Wilmington'					
  df_games.loc[ df_games[column_name] == 'Southern California', column_name ] = 		'USC'					
  df_games.loc[ df_games[column_name] == 'UConn'			, column_name ] = 		'Connecticut'					
  df_games.loc[ df_games[column_name] == 'UC Santa Barb.'			, column_name ] = 		'UC Santa Barbara'						
  df_games.loc[ df_games[column_name] == 'UIC'			, column_name ] = 		'Illinois-Chicago'						
  df_games.loc[ df_games[column_name] == 'UNI'			, column_name ] = 		'Northern Iowa'							
  df_games.loc[ df_games[column_name] == 'UT Arlington'			, column_name ] = 		'Texas-Arlington'		
  df_games.loc[ df_games[column_name] == 'UT Arlington '			, column_name ] = 		'Texas-Arlington'		
  df_games.loc[ df_games[column_name] == 'UT Martin'			, column_name ] = 		'Tennessee-Martin'		
  df_games.loc[ df_games[column_name] == 'UTRGV'			, column_name ] = 		'Texas Rio Grande Valley'		
  df_games.loc[ df_games[column_name] == 'Utah St.'			, column_name ] = 		'Utah State'				
  df_games.loc[ df_games[column_name] == 'VCU'			, column_name ] = 		'Virginia Commonwealth'		
  df_games.loc[ df_games[column_name] == 'VMI'			, column_name ] = 		'Virginia Military'		
  df_games.loc[ df_games[column_name] == 'Washington St.'			, column_name ] = 		'Washington State'			
  df_games.loc[ df_games[column_name] == 'Weber St.'			, column_name ] = 		'Weber State'			
  df_games.loc[ df_games[column_name] == 'Western Caro.'		, column_name ] = 		'Western Carolina'
  df_games.loc[ df_games[column_name] == 'Western Ill.'		, column_name ] = 		'Western Illinois'	
  df_games.loc[ df_games[column_name] == 'Western Ky.'		, column_name ] = 		'Western Kentucky'	
  df_games.loc[ df_games[column_name] == 'Western Mich.'		, column_name ] = 		'Western Michigan'		
  df_games.loc[ df_games[column_name] == 'Wichita St.'		, column_name ] = 		'Wichita State'		
  df_games.loc[ df_games[column_name] == 'Wright St.'			, column_name ] = 		'Wright State'	
  df_games.loc[ df_games[column_name] == 'Youngstown St.'			, column_name ] = 		'Youngstown State'

In [72]:
rename_teams(season2015, 'home')
rename_teams(season2015, 'away')
rename_teams(season2016, 'home')
rename_teams(season2016, 'away')
rename_teams(season2017, 'home')
rename_teams(season2017, 'away')
rename_teams(season2018, 'home')
rename_teams(season2018, 'away')

In [78]:
# calculate team's: 1) scoring avg at home, 2) scoring avg away, 3) home defensive avg, 4) away defensive avg
def calculate_scores(df):
  for index, row in teams.iterrows():
    # take rolling means for home scores and defence
    rolling_home_score = df[df['home'] == row['team']]['home score'].expanding().mean()
    df.loc[df['home'] == row['team'], 'home score average'] = rolling_home_score

    # defence score is the opposing team
    rolling_home_defence = df[df['home'] == row['team']]['away score'].expanding().mean()
    df.loc[df['home'] == row['team'], 'home defence average'] = rolling_home_defence

    # take rolling means for away scores and defence
    rolling_away_score = df[df['away'] == row['team']]['away score'].expanding().mean()
    df.loc[df['away'] == row['team'], 'away score average'] = rolling_away_score

    rolling_away_defence = df[df['away'] == row['team']]['home score'].expanding().mean()
    df.loc[df['away'] == row['team'], 'average away defence'] = rolling_away_defence


In [79]:
calculate_scores(season2015)
calculate_scores(season2016)
calculate_scores(season2017)
calculate_scores(season2018)

In [80]:
season2015

Unnamed: 0,date,home,home score,away,away score,home score average,home defence average,away score average,average away defence
0,2015-11-13,Hawaii,87,Montana State,76,87.000000,76.000000,76.000000,87.000000
1,2015-11-13,Eastern Michigan,70,Vermont,50,70.000000,50.000000,50.000000,70.000000
2,2015-11-13,Columbia,107,Kean,62,107.000000,62.000000,,
3,2015-11-13,Louisiana-Monroe,88,McMurry,43,88.000000,43.000000,,
4,2015-11-13,Yale,70,Fairfield,57,70.000000,57.000000,57.000000,70.000000
...,...,...,...,...,...,...,...,...,...
5878,2016-03-29,Oakland,104,East Tennessee State,81,88.750000,81.625000,72.052632,75.263158
5879,2016-03-30,Nevada,77,Morehead State,68,78.736842,68.000000,70.105263,71.631579
5880,2016-03-30,Oakland,67,Old Dominion,68,87.470588,80.823529,65.294118,65.411765
5881,2016-03-31,Valparaiso,60,George Washington,76,76.809524,61.952381,71.666667,68.333333


In [81]:
season2015[season2015['home score'] != season2015['home score average']]

Unnamed: 0,date,home,home score,away,away score,home score average,home defence average,away score average,average away defence
13,2015-11-13,St. John&#039;s (NY),66,Wagner,57,,,57.000000,66.000000
43,2015-11-13,Florida A&amp;M,103,Fla. Christian,71,,,,
79,2015-11-13,Saint Joseph&#039;s,82,Drexel,81,,,81.000000,82.000000
106,2015-11-13,St. Peter&#039;s,77,Brown,65,,,65.000000,77.000000
107,2015-11-13,A&amp;M-Corpus Chris,94,Our Lady Of The Lake,72,,,,
...,...,...,...,...,...,...,...,...,...
5878,2016-03-29,Oakland,104,East Tennessee State,81,88.750000,81.625000,72.052632,75.263158
5879,2016-03-30,Nevada,77,Morehead State,68,78.736842,68.000000,70.105263,71.631579
5880,2016-03-30,Oakland,67,Old Dominion,68,87.470588,80.823529,65.294118,65.411765
5881,2016-03-31,Valparaiso,60,George Washington,76,76.809524,61.952381,71.666667,68.333333


In [85]:
# combine all dfs
all_seasons = pd.concat([season2015, season2016, season2017, season2018])

# remove nans
print('Size before dropping nans: {}'.format(len(all_seasons)))
all_seasons.dropna(inplace=True)
print('Size after dropping nans: {}'.format(len(all_seasons)))

Size before dropping nans: 23290
Size after dropping nans: 20160


In [86]:
all_seasons

Unnamed: 0,date,home,home score,away,away score,home score average,home defence average,away score average,average away defence
0,2015-11-13,Hawaii,87,Montana State,76,87.000000,76.000000,76.000000,87.000000
1,2015-11-13,Eastern Michigan,70,Vermont,50,70.000000,50.000000,50.000000,70.000000
4,2015-11-13,Yale,70,Fairfield,57,70.000000,57.000000,57.000000,70.000000
6,2015-11-13,Central Michigan,89,Jacksonville State,83,89.000000,83.000000,83.000000,89.000000
8,2015-11-13,Iowa State,68,Colorado,62,68.000000,62.000000,62.000000,68.000000
...,...,...,...,...,...,...,...,...,...
23487,2019-03-21,Charleston Southern,68,Florida Atlantic,66,80.058824,63.529412,67.777778,70.666667
23489,2019-03-21,Louisiana-Monroe,87,Kent State,77,85.062500,74.562500,71.466667,75.200000
23490,2019-03-21,Cal State Fullerton,58,Cal State Bakersfield,66,71.437500,65.500000,67.388889,72.666667
23491,2019-03-22,Southern Utah,80,Drake,73,80.666667,72.333333,70.200000,70.066667


In [87]:
# save df
file_name = 'clean_games.csv'

all_seasons.to_csv(file_name, header=False, index=False)
