In [1]:
# Dependencies and Setup
import pandas as pd


import matplotlib.pyplot as plt
import scipy.stats as st
pd.set_option("display.precision", 2)
import numpy as np
from scipy import stats
import os
import datetime as dt
import random
from scipy.stats import linregress
import scipy.stats as stats


In [2]:
# Work from the project root so the relative data path below resolves.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run as-is on a machine with the same directory layout.
project_root = "C://Users//ksaville//Desktop/NFL-Betting-Analysis/"
os.chdir(project_root)

# Load the raw games/betting/weather table
nfl = pd.read_csv("raw_data/spreadspoke_scores.csv")

# Introduction:  

The data for this analysis was obtained from https://www.kaggle.com/tobycrabtree/nfl-scores-and-betting-data.  Below is the description of this data from Kaggle.

National Football League (NFL) game results since 1966 with betting odds information since 1979. Dataset was created from a variety of sources including games and scores from a variety of public websites such as ESPN, NFL.com, and Pro Football Reference. Weather information is from NOAA data with NFLweather.com a good cross reference. Betting data was used from http://www.repole.com/sun4cast/data.html for 1978-2013 seasons. Pro-football-reference.com data was then cross referenced for betting lines and odds as well as weather data. From 2013 on betting data reflects lines available at sportsline.com.

Our goal is to investigate the performance of teams relative to the betting spread and over/under line as well as to investigate the potential effects of weather on overall scoring.

# Exploring the Data

In [3]:
nfl.shape

#shows there are 17 columns and 12934 rows of data

(12934, 17)

In [4]:
nfl.head()

#Shows that the first games were from 1966.  (note: Betting info is only available starting in 1979)

Unnamed: 0,schedule_date,schedule_season,schedule_week,schedule_playoff,team_home,score_home,score_away,team_away,team_favorite_id,spread_favorite,over_under_line,stadium,stadium_neutral,weather_temperature,weather_wind_mph,weather_humidity,weather_detail
0,9/2/1966,1966,1,False,Miami Dolphins,14.0,23.0,Oakland Raiders,,,,Orange Bowl,False,83.0,6.0,71,
1,9/3/1966,1966,1,False,Houston Oilers,45.0,7.0,Denver Broncos,,,,Rice Stadium,False,81.0,7.0,70,
2,9/4/1966,1966,1,False,San Diego Chargers,27.0,7.0,Buffalo Bills,,,,Balboa Stadium,False,70.0,7.0,82,
3,9/9/1966,1966,2,False,Miami Dolphins,14.0,19.0,New York Jets,,,,Orange Bowl,False,82.0,11.0,78,
4,9/10/1966,1966,1,False,Green Bay Packers,24.0,3.0,Baltimore Colts,,,,Lambeau Field,False,64.0,8.0,62,


In [5]:
nfl.columns
# shows the names of all the columns

Index(['schedule_date', 'schedule_season', 'schedule_week', 'schedule_playoff',
       'team_home', 'score_home', 'score_away', 'team_away',
       'team_favorite_id', 'spread_favorite', 'over_under_line', 'stadium',
       'stadium_neutral', 'weather_temperature', 'weather_wind_mph',
       'weather_humidity', 'weather_detail'],
      dtype='object')

In [6]:
#Check data types
nfl.dtypes
# Note - schedule_date, schedule_week, over_under_line, and weather_humidity are object type.
# schedule_date may need to be converted to a date format, and over_under_line to a
# numeric type, in subsequent analyses

schedule_date           object
schedule_season          int64
schedule_week           object
schedule_playoff          bool
team_home               object
score_home             float64
score_away             float64
team_away               object
team_favorite_id        object
spread_favorite        float64
over_under_line         object
stadium                 object
stadium_neutral           bool
weather_temperature    float64
weather_wind_mph       float64
weather_humidity        object
weather_detail          object
dtype: object

In [7]:
#check to see if there are null values in the data
nfl.isnull().sum()
# yes there are:  137 in score_home and score_away,
#2616 in team_favorite_id, spread_favorite
#2626 in over_under_line. This probably reflects that betting data wasn't included before 1979
# not sure about the 2626 vs 2616 discrepancy
# 926 in weather_temperature, weather_wind_mph
#4546 in weather_humidity
#10223 in weather_detail (this means rain, snow, fog, etc.)

schedule_date              0
schedule_season            0
schedule_week              0
schedule_playoff           0
team_home                  0
score_home               137
score_away               137
team_away                  0
team_favorite_id        2616
spread_favorite         2616
over_under_line         2626
stadium                    0
stadium_neutral            0
weather_temperature      926
weather_wind_mph         926
weather_humidity        4546
weather_detail         10223
dtype: int64

In [8]:
#Check out some basic statistics
nfl.describe()

Unnamed: 0,schedule_season,score_home,score_away,spread_favorite,weather_temperature,weather_wind_mph
count,12934.0,12797.0,12797.0,10318.0,12008.0,12008.0
mean,1994.94,22.39,19.65,-5.38,58.7,7.8
std,15.56,10.55,10.16,3.43,15.53,5.64
min,1966.0,0.0,0.0,-26.5,-6.0,0.0
25%,1982.0,14.0,13.0,-7.0,48.0,3.0
50%,1996.0,22.0,20.0,-4.5,61.0,8.0
75%,2008.0,29.0,27.0,-3.0,72.0,12.0
max,2020.0,72.0,62.0,0.0,97.0,40.0


In [9]:
nfl.head()

Unnamed: 0,schedule_date,schedule_season,schedule_week,schedule_playoff,team_home,score_home,score_away,team_away,team_favorite_id,spread_favorite,over_under_line,stadium,stadium_neutral,weather_temperature,weather_wind_mph,weather_humidity,weather_detail
0,9/2/1966,1966,1,False,Miami Dolphins,14.0,23.0,Oakland Raiders,,,,Orange Bowl,False,83.0,6.0,71,
1,9/3/1966,1966,1,False,Houston Oilers,45.0,7.0,Denver Broncos,,,,Rice Stadium,False,81.0,7.0,70,
2,9/4/1966,1966,1,False,San Diego Chargers,27.0,7.0,Buffalo Bills,,,,Balboa Stadium,False,70.0,7.0,82,
3,9/9/1966,1966,2,False,Miami Dolphins,14.0,19.0,New York Jets,,,,Orange Bowl,False,82.0,11.0,78,
4,9/10/1966,1966,1,False,Green Bay Packers,24.0,3.0,Baltimore Colts,,,,Lambeau Field,False,64.0,8.0,62,


In [10]:
#convert schedule_date to date format
nfl['schedule_date'] = pd.to_datetime(nfl['schedule_date'])
# convert over_under_line to float; entries that cannot be parsed as numbers become NaN
# (the original cell repeated the schedule_date conversion here, leaving
# over_under_line as object dtype)
nfl['over_under_line'] = pd.to_numeric(nfl['over_under_line'], errors='coerce')

nfl.dtypes
nfl.head()

Unnamed: 0,schedule_date,schedule_season,schedule_week,schedule_playoff,team_home,score_home,score_away,team_away,team_favorite_id,spread_favorite,over_under_line,stadium,stadium_neutral,weather_temperature,weather_wind_mph,weather_humidity,weather_detail
0,1966-09-02,1966,1,False,Miami Dolphins,14.0,23.0,Oakland Raiders,,,,Orange Bowl,False,83.0,6.0,71,
1,1966-09-03,1966,1,False,Houston Oilers,45.0,7.0,Denver Broncos,,,,Rice Stadium,False,81.0,7.0,70,
2,1966-09-04,1966,1,False,San Diego Chargers,27.0,7.0,Buffalo Bills,,,,Balboa Stadium,False,70.0,7.0,82,
3,1966-09-09,1966,2,False,Miami Dolphins,14.0,19.0,New York Jets,,,,Orange Bowl,False,82.0,11.0,78,
4,1966-09-10,1966,1,False,Green Bay Packers,24.0,3.0,Baltimore Colts,,,,Lambeau Field,False,64.0,8.0,62,


In [19]:
# Compare home and away scoring: mean of each, their difference, and a
# two-sample t-test between the two score columns

# Keep only games where both scores are present (137 null scores were identified above)
scores_df = nfl[['score_home', 'score_away']].dropna(subset=['score_home', 'score_away'])
print(scores_df.isnull().sum())

# Round the means first, then difference the rounded values
home_mean = round(scores_df["score_home"].mean(), 2)
away_mean = round(scores_df["score_away"].mean(), 2)
diff = round(home_mean - away_mean, 2)

print(f'Home mean score = {home_mean}; Away mean score = {away_mean}')
print(f'The difference in means is:  {diff}')

# Last expression of the cell, so the test result is shown as the cell output
stats.ttest_ind(scores_df['score_home'], scores_df['score_away'])

score_home    0
score_away    0
dtype: int64
Home mean score = 22.39; Away mean score = 19.65
The difference in means is:  2.74


Ttest_indResult(statistic=21.152853088505378, pvalue=1.8109497028349484e-98)

In [None]:
#Boxplot of home vs. away scores, using scores_df (nulls already dropped)

scores = [scores_df[col] for col in ('score_home', 'score_away')]
colors = ['lightcoral', 'lightblue']

fig = plt.figure(figsize=(10, 7))

# A single axes that spans the whole figure area
ax = fig.add_axes([0, 0, 1, 1])

# patch_artist=True draws the boxes as patches so they can be given a fill color
bp = ax.boxplot(scores, patch_artist=True)

# Pair each box with its fill color: first box = home, second box = away
for box, fill in zip(bp['boxes'], colors):
    box.set_facecolor(fill)

ax.set_xticklabels(['Home Scores', 'Away Scores'])

plt.title("A comparison of home and away NFL scores from seasons 1966-2019")



In [None]:
#make a scatter plot for scores as a function of temp.  Would like to see home, away, and total
# use function for making scatter plots
#Define function for making  scatter plots

def make_scatter_plot(df, x_col, y_col):
    """Scatter-plot df[y_col] against df[x_col] with a least-squares regression line.

    Axis labels and the title are built from the two column names. The r squared
    of the fit is printed, and the finished figure (points plus regression line)
    is saved as "{x_col}_{y_col}.png" in the current working directory.

    Parameters
    ----------
    df : DataFrame containing both columns. Callers in this notebook drop
        rows with nulls in these columns before calling.
    x_col : str
        Name of the column plotted on the x axis.
    y_col : str
        Name of the column plotted on the y axis.

    Returns
    -------
    The matplotlib.pyplot module, so the caller can continue to work with the
    current figure.
    """
    x = df[x_col]
    y = df[y_col]

    plt.scatter(x, y)
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.title(f"{y_col} as a function of {x_col}")
    # Pad the y range by 10 units above and below the data
    plt.ylim(min(y)-10, max(y)+10)

    # Fit once and reuse the result for both the line and r squared
    reg = linregress(x, y)
    reg_line = reg.slope*x + reg.intercept
    plt.plot(x, reg_line, color="red")

    # Save only after the regression line has been drawn; saving earlier
    # wrote a PNG that was missing the line
    plt.savefig(f"{x_col}_{y_col}.png")

    r_squared = round(reg.rvalue**2, 2)
    print(f"r squared = {r_squared}")

    return plt



In [None]:
#prep data for scatter plots: temperature, wind speed, and home/away scores
weather_score_cols = ['weather_temperature', 'weather_wind_mph', 'score_home', 'score_away']
scores_weather = nfl[weather_score_cols]
# null counts per column before dropping incomplete rows
print(scores_weather.isnull().sum())
# keep only rows where all four columns are present
scores_weather = scores_weather.dropna(subset=weather_score_cols)
# null counts after dropping
print(scores_weather.isnull().sum())





In [None]:
# Away score as a function of game temperature (prints r squared, saves a PNG)
make_scatter_plot(scores_weather,'weather_temperature','score_away' )

In [None]:
# Home score as a function of wind speed (prints r squared, saves a PNG)
make_scatter_plot(scores_weather,'weather_wind_mph','score_home' )

In [None]:
# Away score as a function of wind speed (prints r squared, saves a PNG)
make_scatter_plot(scores_weather,'weather_wind_mph','score_away' )

In [None]:
nfl.columns

In [None]:
#restrict the data frame to games from the 1979 season onward (1979 itself included)
# .copy() makes nfl_1979 an independent frame rather than a slice of nfl
nfl_1979 = nfl[nfl['schedule_season'] >= 1979].copy()

In [None]:
# Count missing over/under lines among the games from 1979 onward.
# The original line called .isnull() on a list literal (AttributeError) and
# assigned the result to nfl_1979, which would have overwritten the frame.
nfl_1979['over_under_line'].isnull().sum()

In [None]:
# Drop games that lack a home score, an away score, or an over/under line,
# then show the remaining null counts per column
nfl_1979 = nfl_1979.dropna(subset=['score_home', 'score_away', 'over_under_line'])
nfl_1979.isnull().sum()

In [None]:
nfl_1979['over_under_line'].head()

In [None]:
# Convert over_under_line to a numeric dtype; values that cannot be parsed become NaN.
# assign() returns a new frame instead of writing a column into a frame that was
# derived by slicing, which avoids pandas' SettingWithCopyWarning.
nfl_1979 = nfl_1979.assign(
    over_under_line=pd.to_numeric(nfl_1979["over_under_line"], errors='coerce')
)

In [None]:
nfl_1979.dtypes

In [None]:
nfl.columns

In [None]:
# Group the nfl_1979 games by the team_favorite_id column
nfl_1979_gby_teams = nfl_1979.groupby('team_favorite_id')
# Note: head() on a GroupBy returns the first 5 rows of every group, not 5 rows overall
nfl_1979_gby_teams.head()

Things to do:

- Combine names of teams that have changed cities
- Calculate win percentages of all teams and make a table of the output
- Calculate the percentage of games in which the spread was beaten, overall and per team
- Calculate over/under percentages (% over, % under), overall and per team





