In [None]:
# Dependencies and Setup
import pandas as pd


import matplotlib.pyplot as plt
import scipy.stats as st
pd.set_option("display.precision", 2)
import numpy as np
from scipy import stats
import os
import datetime as dt
import random
from scipy.stats import linregress
import scipy.stats as stats
import seaborn as sns


In [None]:
# Set the working directory to the project folder when it exists on this machine, so the
# relative data path below and the figures saved later resolve against it.
PROJECT_DIR = "C://Users//ksaville//Desktop/NFL-Betting-Analysis/"
if os.path.isdir(PROJECT_DIR):
    os.chdir(PROJECT_DIR)
else:
    # Running somewhere else: stay in the directory the notebook was started from
    # instead of failing with FileNotFoundError.
    print(f"{PROJECT_DIR} not found; using current directory {os.getcwd()}")

# Load the raw game/betting/weather data
nfl = pd.read_csv("raw_data/spreadspoke_scores.csv")

# Introduction:  

The data for this analysis was obtained from https://www.kaggle.com/tobycrabtree/nfl-scores-and-betting-data. Below is the description of this data from Kaggle.

National Football League (NFL) game results since 1966 with betting odds information since 1979. Dataset was created from a variety of sources including games and scores from a variety of public websites such as ESPN, NFL.com, and Pro Football Reference. Weather information is from NOAA data with NFLweather.com a good cross reference. Betting data was used from http://www.repole.com/sun4cast/data.html for 1978-2013 seasons. Pro-football-reference.com data was then cross referenced for betting lines and odds as well as weather data. From 2013 on betting data reflects lines available at sportsline.com.

Our goal is to investigate the performance of teams relative to the betting spread and over/under line as well as to investigate the potential effects of weather on overall scoring.

# Exploring the Data

In [None]:
nfl.shape

# The output shows there are 17 columns and 12934 rows of data

In [None]:
nfl.head()

# Shows that the first games are from 1966. (Note: betting info is only available starting in 1979.)

In [None]:
nfl.columns
# Shows the names of all the columns

In [None]:
# Check data types
nfl.dtypes
# Note - schedule date, week, over/under line, and humidity are object type.
# These may need to be converted (e.g. to date or numeric format) in subsequent analyses

In [None]:
# Check to see if there are null values in the data
# NOTE: this expression is not the last line of the cell, so its result is not displayed;
# the counts recorded below come from an earlier run.
nfl.isnull().sum()
# Yes, there are: 137 in score_home and score_away,
# 2616 in team favorite, spread favorite
# 2626 in over_under. This probably reflects that betting data wasn't included before 1979
# (not sure about the 2626 vs 2616 discrepancy)
# 926 in weather_temp, weather_wind
# 4546 in humidity
# 10233 in weather_detail (this means rain, snow, fog, etc.)

# Drop rows that are missing temperature, wind speed, or either score,
# then print the remaining null count for every column
nfl = nfl.dropna(subset=['weather_temperature', 'weather_wind_mph', 'score_home', 'score_away'])
print(nfl.isnull().sum())

In [None]:
# Check out some basic summary statistics
nfl.describe()

In [None]:
# Preview the first rows again now that rows with missing weather/score values are dropped
nfl.head()

In [None]:
# Parse schedule_date into datetimes and over_under_line into numbers.
# Over/under entries that cannot be parsed become NaN (errors='coerce').
nfl = nfl.assign(
    schedule_date=pd.to_datetime(nfl['schedule_date']),
    over_under_line=pd.to_numeric(nfl["over_under_line"], errors='coerce'),
)

In [None]:
# Box plot comparing home and away scores (rows with missing scores are already dropped)

fig = plt.figure(figsize=(10, 10))

# One axes spanning the whole figure
ax = fig.add_axes([0, 0, 1, 1])

# patch_artist=True draws each box as a filled patch so it can be given a face colour
score_columns = [nfl['score_home'], nfl['score_away']]
box_plot = ax.boxplot(score_columns, patch_artist=True)

# Colour the boxes in plotting order: home first, then away
colors = ['lightgreen', 'lightblue']
for box, shade in zip(box_plot['boxes'], colors):
    box.set_facecolor(shade)

ax.set_xticklabels(['Home Scores', 'Away Scores'])

plt.title("A comparison of home and away NFL scores from seasons 1966-2019")



In [None]:
# Compare the mean home and away scores and test whether the difference is significant

ALPHA = 0.05  # significance level used to interpret the t-test below

home_mean = round(np.mean(nfl["score_home"]),2)
away_mean = round(np.mean(nfl["score_away"]),2)
diff = round(home_mean - away_mean,2)

print(f'Home mean score = {home_mean}; Away mean score = {away_mean}')
print(f'The difference in means is:  {diff}')

# Run the test before stating a conclusion so the printed answer reflects the
# actual p-value rather than being hard-coded.
# NOTE(review): ttest_ind treats the two samples as independent, but every home score
# is paired with the away score of the same game; a paired test (stats.ttest_rel)
# may be more appropriate -- confirm.
ttest_result = stats.ttest_ind(nfl['score_home'], nfl['score_away'])

print("Is this a significant difference?")
if ttest_result.pvalue < ALPHA:
    print("Yes the mean diff is significant")
else:
    print("No the mean diff is not significant")

# Last expression of the cell: display the full test result
ttest_result

In [None]:
#make a scatter plot for home and away scores as a function of temperature and wind speed.
# use function for making scatter plots
#Define function for making  scatter plots

def make_scatter_plot(df, x_col, y_col):
    """Scatter-plot one DataFrame column against another with a fitted regression line.

    Draws df[y_col] against df[x_col] on a new 20x10 figure, overlays the
    least-squares regression line in red, prints r squared (rounded to two
    decimals) and saves the finished figure as "<x_col>_<y_col>.png" relative
    to the current working directory.

    Parameters
    ----------
    df : DataFrame that holds both columns.
    x_col : name of the column plotted on the x axis.
    y_col : name of the column plotted on the y axis.

    Returns
    -------
    The matplotlib.pyplot module, so the caller can keep adjusting the current figure.
    """
    x = df[x_col]
    y = df[y_col]

    plt.figure(figsize=(20,10))
    plt.scatter(x, y)
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.title(f"{y_col} as a function of {x_col}")
    plt.ylim(min(y)-10,max(y)+10)

    # Fit once and reuse the result for both the line and r squared
    reg = linregress(x, y)
    reg_line = reg.slope * x + reg.intercept
    plt.plot(x, reg_line, color="red")

    # Save only after the regression line is drawn so the PNG matches the displayed figure
    plt.savefig(f"{x_col}_{y_col}.png")

    r_squared = round(reg.rvalue**2, 2)
    print(f"r squared = {r_squared}")

    return plt



In [None]:
# Home score as a function of temperature
make_scatter_plot(nfl,'weather_temperature','score_home' )

In [None]:
# Away score as a function of temperature.
# Keep the axis label and title that make_scatter_plot sets instead of overriding them
# with placeholder text.
make_scatter_plot(nfl,'weather_temperature','score_away' )
plt.show()

In [None]:
# Home score as a function of wind speed
make_scatter_plot(nfl,'weather_wind_mph','score_home' )

In [None]:
# Away score as a function of wind speed
make_scatter_plot(nfl,'weather_wind_mph','score_away' )

In [None]:
# Keep only games from the 1979 season onwards and drop the humidity column.
# The result is built as an independent frame (not a slice of nfl that is mutated
# afterwards), so adding columns to it later does not raise SettingWithCopyWarning.
nfl_1979 = (
    nfl.loc[nfl['schedule_season'] >= 1979]
    .drop(columns=['weather_humidity'])
    .copy()
)

In [None]:
# Count the null values in each column of the 1979-onwards frame
nfl_1979.isnull().sum()

In [None]:
# Column data types of the 1979-onwards frame
nfl_1979.dtypes

In [None]:
# Combined points scored in each game (home score + away score)
nfl_1979['Total'] = nfl_1979['score_home'] + nfl_1979['score_away']

In [None]:
# Preview the frame with the new Total column
nfl_1979.head()

In [None]:
# Re-check the data types now that Total has been added
nfl_1979.dtypes

In [None]:
# Classify each game by whether the combined score finished over, under, or exactly on
# (push) the over/under line. Columns are addressed by name and compared as whole
# columns instead of looping over rows by position.
total = nfl_1979['Total']
line = nfl_1979['over_under_line']

# Start from an all-missing column: a game whose over/under line (or total) is NaN
# matches none of the comparisons below and stays NaN instead of being counted as a push.
over_under = pd.Series(np.nan, index=nfl_1979.index, dtype=object)
over_under[total > line] = 'over'
over_under[total < line] = 'under'
over_under[total == line] = 'push'
nfl_1979['OverUnder'] = over_under

nfl_1979.head()

In [None]:
# Group the games by their over/under outcome
gby_OU = nfl_1979.groupby('OverUnder')

In [None]:
# Number of games in each outcome group (counted through the team_home column)
over_under_count = gby_OU['team_home'].count()
over_under_count

In [None]:
# Pie chart of how often games finished over, exactly on (push), or under the line.
# Labels and colours are derived from the index of over_under_count, so every slice is
# labelled with its own category whichever categories are present and in whatever order.
outcome_colors = {'over': '#008fd5', 'push': '#e5ae37', 'under': '#fc4f30'}
sizes = over_under_count
labels = [str(outcome).capitalize() for outcome in sizes.index]
colors = [outcome_colors.get(outcome, 'lightgrey') for outcome in sizes.index]

fig1, ax1 = plt.subplots()
ax1.pie(sizes, labels=labels, autopct='%1.1f%%',
        shadow=True, startangle=360, wedgeprops={'edgecolor':'black'}, colors=colors,
        pctdistance = 0.8, labeldistance = 1.2)
ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
ax1.set_title('Scores of NFL games relative to Over/Under line', pad =30)
plt.tight_layout()
plt.show()


In [None]:
# Count of over / under / push outcomes for each home team
ax = sns.countplot(x="team_home", hue = 'OverUnder', data=nfl_1979)

In [None]:
# List the column names (team names are changed to team ids next)
nfl_1979.columns

In [None]:
# Replace every full team name with its short team id. Franchises that changed city
# or name share one id; the mapping is written id -> names and inverted before use.
TEAM_NAMES_BY_ID = {
    'ARI': ['Arizona Cardinals', 'Phoenix Cardinals', 'St. Louis Cardinals'],
    'ATL': ['Atlanta Falcons'],
    'BAL': ['Baltimore Ravens'],
    'BUF': ['Buffalo Bills'],
    'CAR': ['Carolina Panthers'],
    'CHI': ['Chicago Bears'],
    'CIN': ['Cincinnati Bengals'],
    'CLE': ['Cleveland Browns'],
    'DAL': ['Dallas Cowboys'],
    'DEN': ['Denver Broncos'],
    'DET': ['Detroit Lions'],
    'GB': ['Green Bay Packers'],
    'HOU': ['Houston Texans'],
    'IND': ['Baltimore Colts', 'Indianapolis Colts'],
    'JAX': ['Jacksonville Jaguars'],
    'KC': ['Kansas City Chiefs'],
    'LAC': ['Los Angeles Chargers', 'San Diego Chargers'],
    'LAR': ['Los Angeles Rams', 'St. Louis Rams'],
    'MIA': ['Miami Dolphins'],
    'MIN': ['Minnesota Vikings'],
    'NE': ['New England Patriots', 'Boston Patriots'],
    'NO': ['New Orleans Saints'],
    'NYG': ['New York Giants'],
    'NYJ': ['New York Jets'],
    'OAK': ['Los Angeles Raiders', 'Oakland Raiders'],
    'PHI': ['Philadelphia Eagles'],
    'PIT': ['Pittsburgh Steelers'],
    'SEA': ['Seattle Seahawks'],
    'SF': ['San Francisco 49ers'],
    'TB': ['Tampa Bay Buccaneers'],
    'TEN': ['Houston Oilers', 'Tennessee Oilers', 'Tennessee Titans'],
    'WAS': ['Washington Football Team', 'Washington Redskins'],
}
team_id_by_name = {
    team_name: team_id
    for team_id, team_names in TEAM_NAMES_BY_ID.items()
    for team_name in team_names
}

# Apply the mapping to the whole frame, then renumber the rows from 0 and discard the
# old index labels.
nfl_1979 = nfl_1979.replace(team_id_by_name).reset_index(drop=True)

nfl_1979.head()

In [None]:
# Inspect the favorite-team id column
nfl_1979['team_favorite_id']

In [None]:
# Performance against the spread, measured from the favorite's side:
#   Spread Diff = favorite's points + spread - opponent's points
# NOTE(review): this assumes spread_favorite is stored as a negative handicap
# (e.g. -3.0 when the favorite is favored by 3) -- confirm against the data.
# Column names follow the spreadspoke_scores.csv layout that the positional
# indexes 4-9 previously referred to.
#
# Cover is 'yes' when Spread Diff > 0, 'no' when it is < 0 and 'push' when it is 0.

home_is_fav = nfl_1979['team_home'] == nfl_1979['team_favorite_id']
away_is_fav = ~home_is_fav & (nfl_1979['team_away'] == nfl_1979['team_favorite_id'])
home_margin = nfl_1979['score_home'] - nfl_1979['score_away']

# Favorite's winning margin. It is NaN when team_favorite_id matches neither team, so
# such a game never inherits the value computed for a different game.
fav_margin = home_margin.where(home_is_fav, (-home_margin).where(away_is_fav))

spread_diff = fav_margin + nfl_1979['spread_favorite']

# A game with an unknown Spread Diff matches none of the comparisons and stays NaN
# instead of being recorded as a push.
cover = pd.Series(np.nan, index=nfl_1979.index, dtype=object)
cover[spread_diff > 0] = 'yes'
cover[spread_diff < 0] = 'no'
cover[spread_diff == 0] = 'push'

# Add 'Spread Diff' before 'Cover' so the column order stays the same as before
nfl_1979['Spread Diff'] = spread_diff
nfl_1979['Cover'] = cover
    

In [None]:
# Drop the stadium_neutral and weather_detail columns.
# errors='ignore' keeps the cell re-runnable: running it a second time is a no-op
# instead of raising KeyError for the columns that are already gone.
nfl_1979 = nfl_1979.drop(columns=['stadium_neutral', 'weather_detail'], errors='ignore')

In [None]:
# Trying to filter out covered from not covered etc.
# NOTE: this head() call is not the last expression of the cell, so it is not displayed.
nfl_1979.head()

# Pull out the home-team and cover-outcome columns
home_team = nfl_1979['team_home']
cover = nfl_1979['Cover']





In [None]:
# Overall number of games per cover outcome (printed, because a bare expression that is
# not the last line of a cell is not displayed)
print(nfl_1979['Cover'].value_counts())

# Cover outcomes for each home team.
# nfl_year_cover_home is not defined anywhere in the visible notebook, so the plot is
# drawn from nfl_1979, the frame that holds the 'Cover' column.
# NOTE(review): the original name suggests a filtered subset may have been intended -- confirm.
ax = sns.countplot(x="team_home", hue = 'Cover', data=nfl_1979)


#nfl_1979.head()

In [None]:
   
    # Tally how often each team covered, failed to cover, or pushed when it was the
    # betting favorite -- once for its home games and once for its away games.
    def _favorite_cover_record(games, team_col, column_labels):
        """Count 'yes' / 'no' / 'push' Cover outcomes per team for games it was favored in.

        Only rows where games[team_col] equals games['team_favorite_id'] are counted.
        Rows are ordered by each team's first appearance; a team whose favored games
        all lack a Cover value still gets a row of zeros. The three count columns
        (covered, not covered, pushed) are named with column_labels.
        """
        favored = games[games[team_col] == games['team_favorite_id']]
        tally = pd.crosstab(favored[team_col], favored['Cover'])
        tally = tally.reindex(index=favored[team_col].unique(),
                              columns=['yes', 'no', 'push'],
                              fill_value=0)
        tally.columns = column_labels
        return tally.rename_axis(index=None)

    home_records_df = _favorite_cover_record(
        nfl_1979, 'team_home', ['Home covered', 'Home not covered', 'Home pushed']
    ).sort_index()

    away_records_df = _favorite_cover_record(
        nfl_1979, 'team_away', ['Away covered', 'Away not covered', 'Away pushed']
    )

    print(away_records_df.head())

    # Left join on the team id index: every team with a home-favorite record is kept and
    # its away-favorite counts are attached (NaN when it has none).
    cover_records_joined = home_records_df.join(away_records_df)

    cover_records_joined.head()
            
                
           
    
    


In [None]:

def plot_cover_bars(records, side, title, filename, width=0.3):
    """Draw, save and show a grouped bar chart of cover outcomes per favored team.

    Parameters
    ----------
    records : DataFrame indexed by team id with the columns "<side> covered",
        "<side> not covered" and "<side> pushed".
    side : 'Home' or 'Away'; selects which three columns are plotted.
    title : figure title.
    filename : file the figure is saved to (relative to the working directory).
    width : width of a single bar.
    """
    columns = [f'{side} covered', f'{side} not covered', f'{side} pushed']
    legend_labels = ['covered', 'not covered', 'pushed']
    bar_colors = ['r', 'b', 'g']

    x_axis = np.arange(len(records.index))

    plt.figure(figsize=(20,8))

    # Three left-aligned bars per team, starting at x - width, x and x + width
    for position, (column, label, color) in enumerate(zip(columns, legend_labels, bar_colors)):
        plt.bar(x_axis + (position - 1) * width, records[column], width=width,
                color=color, alpha=0.5, align="edge", label=label)

    # Each group spans x - width .. x + 2 * width, so its centre is x + width / 2
    plt.xticks(x_axis + width / 2, records.index, rotation="vertical")

    # Leave a small margin around the first and last group, and size the y axis from
    # the tallest bar of all three series (NaN values are skipped) so nothing is clipped
    plt.xlim(-width - 0.2, len(x_axis) - 1 + 2 * width + 0.2)
    plt.ylim(0, records[columns].max().max() + 10)

    plt.title(title, fontdict = {'fontsize' : 25})
    plt.xlabel("Favored Team", fontdict = {'fontsize' : 18})
    plt.ylabel("Games",  fontdict = {'fontsize' : 18})

    plt.legend(loc = 'upper left')
    plt.savefig(filename)
    plt.show()


# Favorites playing at home
plot_cover_bars(cover_records_joined, 'Home',
                "NFL home team performance relative to the spread", 'home_cover_bar.png')

# Favorites playing away
plot_cover_bars(cover_records_joined, 'Away',
                "NFL away team performance relative to the spread", 'away_cover_bar.png')


In [None]:
# Show the current working directory (figures saved with relative file names end up here)
os.getcwd()