In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from sportsipy.ncaab.boxscore import Boxscore
from sportsipy.ncaab.boxscore import Boxscores
from sportsipy.ncaab.schedule import Schedule

In [2]:
# Load the raw game-level dataset. low_memory=False asks pandas to parse the
# file in one pass instead of in chunks, which avoids mixed-dtype columns.
df = pd.read_csv("NCAABasketballData.csv", low_memory=False)

In [3]:
# Print every column label on its own line (the default Index repr truncates
# long listings).
for column_name in df.columns.tolist():
    print(column_name)

game_id
season
status
coverage
neutral_site
scheduled_date
gametime
conference_game
tournament
tournament_type
tournament_round
tournament_game_no
attendance
lead_changes
times_tied
periods
possession_arrow
venue_id
venue_city
venue_state
venue_address
venue_zip
venue_country
venue_name
venue_capacity
h_name
h_market
h_id
h_alias
h_league_id
h_league_name
h_league_alias
h_conf_id
h_conf_name
h_conf_alias
h_division_id
h_division_name
h_division_alias
h_logo_large
h_logo_medium
h_logo_small
h_points_game
h_rank
h_minutes
h_field_goals_made
h_field_goals_att
h_field_goals_pct
h_three_points_made
h_three_points_att
h_three_points_pct
h_two_points_made
h_two_points_att
h_two_points_pct
h_blocked_att
h_free_throws_made
h_free_throws_att
h_free_throws_pct
h_offensive_rebounds
h_defensive_rebounds
h_rebounds
h_assists
h_turnovers
h_steals
h_blocks
h_assists_turnover_ratio
h_personal_fouls
h_ejections
h_foulouts
h_points
h_fast_break_pts
h_second_chance_pts
h_team_turnovers
h_points_off_turnov

In [4]:
# Identifier, venue, branding and bookkeeping columns to remove from the frame.
# Each label is listed exactly once (the original list repeated 'created').
# NOTE(review): 'h_league_id' is listed but there is no matching 'a_league_id'
# entry - confirm whether the away-side column should be dropped as well.
cols_to_drop = [
    # game bookkeeping
    'game_id', 'status', 'coverage', 'possession_arrow',
    # venue details
    'venue_id', 'venue_city', 'venue_state', 'venue_address', 'venue_zip',
    'venue_country', 'venue_name',
    'created',
    # home-team league / conference / division / logo fields
    'h_league_id', 'h_league_name', 'h_league_alias',
    'h_conf_id', 'h_conf_name', 'h_conf_alias',
    'h_division_id', 'h_division_name', 'h_division_alias',
    'h_logo_large', 'h_logo_medium', 'h_logo_small',
    # away-team league / conference / division / logo fields
    'a_league_name', 'a_league_alias',
    'a_conf_id', 'a_conf_name', 'a_conf_alias',
    'a_division_id', 'a_division_name', 'a_division_alias',
    'a_logo_large', 'a_logo_medium', 'a_logo_small',
]

In [3]:
# Number of games per tournament label (value_counts skips missing values).
df['tournament'].value_counts()

Conference    1448
NCAA           712
Name: tournament, dtype: int64

In [6]:
# Display the column index of df.
# NOTE(review): the recorded output no longer contains the labels listed in
# cols_to_drop (e.g. 'game_id'), yet no cell in view performs the drop -
# confirm the drop step still exists so the notebook survives Restart & Run All.
df.columns

Index(['season', 'neutral_site', 'scheduled_date', 'gametime',
       'conference_game', 'tournament', 'tournament_type', 'tournament_round',
       'tournament_game_no', 'attendance', 'lead_changes', 'times_tied',
       'periods', 'venue_capacity', 'h_name', 'h_market', 'h_id', 'h_alias',
       'h_points_game', 'h_rank', 'h_minutes', 'h_field_goals_made',
       'h_field_goals_att', 'h_field_goals_pct', 'h_three_points_made',
       'h_three_points_att', 'h_three_points_pct', 'h_two_points_made',
       'h_two_points_att', 'h_two_points_pct', 'h_blocked_att',
       'h_free_throws_made', 'h_free_throws_att', 'h_free_throws_pct',
       'h_offensive_rebounds', 'h_defensive_rebounds', 'h_rebounds',
       'h_assists', 'h_turnovers', 'h_steals', 'h_blocks',
       'h_assists_turnover_ratio', 'h_personal_fouls', 'h_ejections',
       'h_foulouts', 'h_points', 'h_fast_break_pts', 'h_second_chance_pts',
       'h_team_turnovers', 'h_points_off_turnovers', 'h_team_rebounds',
       'h_flag

In [7]:
# Target label: True when the home team scored strictly more than the away team.
# A strict comparison is used because equal scores are not a win; the previous
# >= comparison labelled any equal-score row as a home win.
# NOTE(review): rows with a missing score compare as False and are therefore
# labelled as home losses - confirm such rows are filtered out upstream.
df['home_win'] = df['h_points_game'].gt(df['a_points_game'])

In [8]:
df.home_win.value_counts(normalize = True)

True     0.647822
False    0.352178
Name: home_win, dtype: float64

In [9]:
# Per-column averages split by conference vs. non-conference games.
# numeric_only=True restricts the aggregation to numeric columns; without it
# pandas >= 2.0 raises on the remaining text columns (team names, dates, ...)
# instead of silently skipping them as older versions did.
df.groupby('conference_game').mean(numeric_only=True)

Unnamed: 0_level_0,season,attendance,lead_changes,times_tied,periods,venue_capacity,h_points_game,h_rank,h_field_goals_made,h_field_goals_att,...,a_fast_break_pts,a_second_chance_pts,a_team_turnovers,a_points_off_turnovers,a_team_rebounds,a_flagrant_fouls,a_player_tech_fouls,a_team_tech_fouls,a_coach_tech_fouls,home_win
conference_game,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
False,2015.324397,6414.726427,3.650558,2.815741,2.063804,10320.10019,76.102547,1.480429,26.684662,57.834076,...,5.441703,9.561206,0.392489,12.683844,3.321043,0.030383,0.097919,0.034337,0.035093,0.704692
True,2015.348477,5479.099369,4.363689,3.330643,2.089003,8170.156981,73.320411,0.944633,25.701276,56.992686,...,5.339686,10.071336,0.376939,13.116928,3.100373,0.020077,0.12448,0.053624,0.032854,0.601835


In [10]:
# Correlation between away-team personal fouls and away-team foul-outs
# (Series.corr defaults to the Pearson coefficient).
df['a_personal_fouls'].corr(df['a_foulouts'])

0.28734198430175684

In [11]:
# Box-score feature columns: the same set of fields for the home side ('h_')
# followed by the away side ('a_'), 34 fields each.
# NOTE(review): the list includes 'h_points' / 'a_points' and the made-shot
# counts, which come from the same game whose winner is being predicted -
# confirm that using them as model features is intended.
stats = [
    f'{side}_{field}'
    for side in ('h', 'a')
    for field in (
        'field_goals_made', 'field_goals_att', 'field_goals_pct',
        'three_points_made', 'three_points_att', 'three_points_pct',
        'two_points_made', 'two_points_att', 'two_points_pct',
        'blocked_att',
        'free_throws_made', 'free_throws_att', 'free_throws_pct',
        'offensive_rebounds', 'defensive_rebounds', 'rebounds',
        'assists', 'turnovers', 'steals', 'blocks',
        'assists_turnover_ratio', 'personal_fouls', 'ejections', 'foulouts',
        'points', 'fast_break_pts', 'second_chance_pts',
        'team_turnovers', 'points_off_turnovers', 'team_rebounds',
        'flagrant_fouls', 'player_tech_fouls', 'team_tech_fouls',
        'coach_tech_fouls',
    )
]

In [12]:
# Replace the list of column names with the matching slice of df.
# list(stats) yields the column labels whether `stats` is still the name list
# or already the DataFrame from an earlier run of this cell, so re-running the
# cell no longer fails on the rebound name.
stats = df[list(stats)]

In [13]:
# Replace every missing statistic with 0 so the feature frame has no NaNs.
stats = stats.fillna(value=0)

In [14]:
# Summarise the feature frame: per-column non-null counts and dtypes.
stats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27594 entries, 0 to 27593
Data columns (total 68 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   h_field_goals_made        27594 non-null  float64
 1   h_field_goals_att         27594 non-null  float64
 2   h_field_goals_pct         27594 non-null  float64
 3   h_three_points_made       27594 non-null  float64
 4   h_three_points_att        27594 non-null  float64
 5   h_three_points_pct        27594 non-null  float64
 6   h_two_points_made         27594 non-null  float64
 7   h_two_points_att          27594 non-null  float64
 8   h_two_points_pct          27594 non-null  float64
 9   h_blocked_att             27594 non-null  float64
 10  h_free_throws_made        27594 non-null  float64
 11  h_free_throws_att         27594 non-null  float64
 12  h_free_throws_pct         27594 non-null  float64
 13  h_offensive_rebounds      27594 non-null  float64
 14  h_defe

In [15]:
from sklearn.model_selection import train_test_split

# Features are the box-score frame; the target is the home_win flag.
X = stats
y = df['home_win']

# Hold out a quarter of the rows for evaluation; the fixed seed makes the
# shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.25, random_state=42
)

In [16]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression baseline with a fixed seed and a raised iteration cap.
logreg = LogisticRegression(max_iter=10000, random_state=42)
logreg.fit(X_train, y_train)

# fit() returns the estimator itself, so `model` is the same fitted object.
model = logreg

In [17]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Hard class predictions and class probabilities for the held-out rows.
y_pred = logreg.predict(X_test)
y_prob = logreg.predict_proba(X_test)

# A cell only echoes its final expression, so the accuracy is printed
# explicitly; previously it was computed and silently discarded.
print(f"Logistic regression accuracy: {accuracy_score(y_test, y_pred):.4f}")
confusion_matrix(y_test, y_pred)

array([[2040,  352],
       [   3, 4504]], dtype=int64)

In [21]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with scikit-learn's default settings.
knnmodel = KNeighborsClassifier()
knnmodel.fit(X_train, y_train)
kn_y_pred = knnmodel.predict(X_test)

# A cell only echoes its final expression, so the accuracy is printed
# explicitly; previously it was computed and silently discarded.
print(f"KNN accuracy: {accuracy_score(y_test, kn_y_pred):.4f}")
confusion_matrix(y_test, kn_y_pred)

array([[1766,  626],
       [ 147, 4360]], dtype=int64)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision-tree classifier; seeded like the split and the logistic regression
# so repeated runs build the same tree.
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# Predict with the tree that was just fitted. The original called `knn`, a
# name that is never defined in this notebook (the KNN estimator is `knnmodel`).
dt_y_pred = dt_model.predict(X_test)