Import libraries and upload data from nflverse github

In [1]:
#Import Pandas
import pandas as pd

#Import requests
import requests

#Import beautifulsoup
from bs4 import BeautifulSoup



All data used in this notebook is from Pro-football-reference.com

In [2]:
# List of URLs you want to scrape
urls = [
    'https://www.pro-football-reference.com/years/2024/games.htm',
    'https://www.pro-football-reference.com/years/2023/games.htm',
    'https://www.pro-football-reference.com/years/2022/games.htm',
    'https://www.pro-football-reference.com/years/2021/games.htm',
    'https://www.pro-football-reference.com/years/2020/games.htm',
    'https://www.pro-football-reference.com/years/2019/games.htm',
    'https://www.pro-football-reference.com/years/2018/games.htm',
    'https://www.pro-football-reference.com/years/2017/games.htm'
]

# Function to scrape data from a single URL and return a DataFrame
def scrape_games_data(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the main games table
    table = soup.find('table', id='games')

    # Initialize a list to store data
    data = []

    # Extract data from table rows
    for row in table.find_all('tr')[1:]:  # Skip header row
        cols = row.find_all(['th', 'td'])
        if len(cols) > 0:
            week = cols[0].text.strip()
            day = cols[1].text.strip()
            date = cols[2].text.strip()
            time = cols[3].text.strip()
            winner = cols[4].text.strip()
            awaygame = cols[5].text.strip()
            loser = cols[6].text.strip()
            pts_w = cols[8].text.strip()
            pts_l = cols[9].text.strip()
            yds_w = cols[10].text.strip()
            to_w = cols[11].text.strip()
            yds_l = cols[12].text.strip()
            to_l = cols[13].text.strip()

            data.append([week, day, date, time, winner, awaygame, loser, pts_w, pts_l, yds_w, to_w, yds_l, to_l])

    # Create DataFrame
    games_df = pd.DataFrame(data, columns=['Week', 'Day', 'Date', 'Time', 'Winner', 'Awaygame', 'Loser', 
                                           'PtsW', 'PtsL', 'YdsW', 'TOW', 'YdsL', 'TOL'])
    return games_df

# List to store DataFrames
all_games_data = []

# Loop over the URLs and scrape data
for url in urls:
    df = scrape_games_data(url)
    all_games_data.append(df)

# Optionally, concatenate all the DataFrames into one large DataFrame
combined_games_data = pd.concat(all_games_data, ignore_index=True)

# Print the first few rows of the combined DataFrame
print(combined_games_data.head())
print(combined_games_data.info())
print(combined_games_data.shape)

  Week  Day        Date    Time               Winner Awaygame  \
0    1  Thu  2024-09-05  8:20PM   Kansas City Chiefs            
1    1  Fri  2024-09-06  8:15PM  Philadelphia Eagles            
2    1  Sun  2024-09-08  1:00PM  Pittsburgh Steelers        @   
3    1  Sun  2024-09-08  1:00PM        Buffalo Bills            
4    1  Sun  2024-09-08  1:00PM   New Orleans Saints            

               Loser PtsW PtsL YdsW TOW YdsL TOL  
0   Baltimore Ravens   27   20  353   1  452   1  
1  Green Bay Packers   34   29  410   3  414   1  
2    Atlanta Falcons   18   10  270   0  226   3  
3  Arizona Cardinals   34   28  352   1  270   1  
4  Carolina Panthers   47   10  379   1  193   3  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2349 entries, 0 to 2348
Data columns (total 13 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Week      2349 non-null   object
 1   Day       2349 non-null   object
 2   Date      2349 non-null   object
 3   T

In [3]:
#Data describe
print(combined_games_data.describe())

        Week   Day  Date    Time      Winner Awaygame      Loser  PtsW  PtsL  \
count   2349  2349  2349    2349        2349     2349       2349  2349  2349   
unique    24     9   478      34          37        3         37    56    45   
top     Week   Sun  Date  1:00PM  Winner/tie           Loser/tie    27    17   
freq     132  1821   132    1099         132     1347        132   182   222   

        YdsW   TOW  YdsL   TOL  
count   2349  2349  2349  2349  
unique   377     9   404    10  
top     YdsW     0  YdsL     1  
freq     132   920   132   687  


In [4]:
#Check awaygame column
print(combined_games_data["Awaygame"].unique())

['' '@' 'N']


In [5]:
#Check if ties are indicated correctly with known tie games, will use Detroit Lions vs Pittsburgh Steelers in week 10 of 2021 season
combined_games_data[combined_games_data["Date"].isin(["2021-11-14"])]

Unnamed: 0,Week,Day,Date,Time,Winner,Awaygame,Loser,PtsW,PtsL,YdsW,TOW,YdsL,TOL
1054,10,Sun,2021-11-14,1:00PM,Dallas Cowboys,,Atlanta Falcons,43,3,431,1,214,3
1055,10,Sun,2021-11-14,1:00PM,Buffalo Bills,@,New York Jets,45,17,489,2,366,5
1056,10,Sun,2021-11-14,1:00PM,Pittsburgh Steelers,,Detroit Lions,16,16,387,3,306,0
1057,10,Sun,2021-11-14,1:00PM,Tennessee Titans,,New Orleans Saints,23,21,264,0,373,1
1058,10,Sun,2021-11-14,1:00PM,Washington Football Team,,Tampa Bay Buccaneers,29,19,320,1,273,2
1059,10,Sun,2021-11-14,1:00PM,New England Patriots,,Cleveland Browns,45,7,452,0,217,1
1060,10,Sun,2021-11-14,1:00PM,Indianapolis Colts,,Jacksonville Jaguars,23,17,295,0,331,1
1061,10,Sun,2021-11-14,4:05PM,Minnesota Vikings,@,Los Angeles Chargers,27,20,381,1,253,1
1062,10,Sun,2021-11-14,4:05PM,Carolina Panthers,@,Arizona Cardinals,34,10,341,2,169,2
1063,10,Sun,2021-11-14,4:25PM,Philadelphia Eagles,@,Denver Broncos,30,13,386,1,308,1


In [6]:
#Adding tie column
combined_games_data['Tie'] = (combined_games_data['PtsW'] == combined_games_data['PtsL']).astype(int)

In [7]:
#Look at tie games
combined_games_data[combined_games_data["Tie"]==1]

Unnamed: 0,Week,Day,Date,Time,Winner,Awaygame,Loser,PtsW,PtsL,YdsW,TOW,YdsL,TOL,Tie
16,Week,Day,Date,Time,Winner/tie,,Loser/tie,Pts,Pts,YdsW,TOW,YdsL,TOL,1
33,Week,Day,Date,Time,Winner/tie,,Loser/tie,Pts,Pts,YdsW,TOW,YdsL,TOL,1
50,Week,Day,Date,Time,Winner/tie,,Loser/tie,Pts,Pts,YdsW,TOW,YdsL,TOL,1
67,Week,Day,Date,Time,Winner/tie,,Loser/tie,Pts,Pts,YdsW,TOW,YdsL,TOL,1
82,Week,Day,Date,Time,Winner/tie,,Loser/tie,Pts,Pts,YdsW,TOW,YdsL,TOL,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2269,Week,Day,Date,Time,Winner/tie,,Loser/tie,Pts,Pts,YdsW,TOW,YdsL,TOL,1
2286,Week,Day,Date,Time,Winner/tie,,Loser/tie,Pts,Pts,YdsW,TOW,YdsL,TOL,1
2303,Week,Day,Date,Time,Winner/tie,,Loser/tie,Pts,Pts,YdsW,TOW,YdsL,TOL,1
2320,Week,Day,Date,Time,Winner/tie,,Loser/tie,Pts,Pts,YdsW,TOW,YdsL,TOL,1


In [8]:
#Look for non-dates in date column
print(combined_games_data["Date"].unique())

['2024-09-05' '2024-09-06' '2024-09-08' '2024-09-09' 'Date' '2024-09-12'
 '2024-09-15' '2024-09-16' '2024-09-19' '2024-09-22' '2024-09-23'
 '2024-09-26' '2024-09-29' '2024-09-30' '2024-10-03' '2024-10-06'
 '2024-10-07' '2024-10-10' '2024-10-13' '2024-10-14' '2024-10-17'
 '2024-10-20' '2024-10-21' '2024-10-24' '2024-10-27' '2024-10-28'
 '2024-10-31' '2024-11-03' '2024-11-04' '2024-11-07' '2024-11-10'
 '2024-11-11' '2024-11-14' '2024-11-17' '2024-11-18' '2024-11-21'
 '2024-11-24' '2024-11-25' '2024-11-28' '2024-11-29' '2024-12-01'
 '2024-12-02' '2024-12-05' '2024-12-08' '2024-12-09' '2024-12-12'
 '2024-12-15' '2024-12-16' '2024-12-19' '2024-12-21' '2024-12-22'
 '2024-12-23' '2024-12-25' '2024-12-26' '2024-12-28' '2024-12-29'
 '2024-12-30' '2025-01-04' '2025-01-05' 'Playoffs' '2025-01-11'
 '2025-01-12' '2025-01-13' '2025-01-18' '2025-01-19' '2025-01-26'
 '2025-02-09' '2023-09-07' '2023-09-10' '2023-09-11' '2023-09-14'
 '2023-09-17' '2023-09-18' '2023-09-21' '2023-09-24' '2023-09-25'
 '202

In [9]:
#Look at rows containing non dates in date column
searchvalues = ["Date", "Playoffs"]
print(combined_games_data[combined_games_data["Date"].isin(searchvalues)])

      Week  Day      Date  Time      Winner Awaygame      Loser PtsW PtsL  \
16    Week  Day      Date  Time  Winner/tie           Loser/tie  Pts  Pts   
33    Week  Day      Date  Time  Winner/tie           Loser/tie  Pts  Pts   
50    Week  Day      Date  Time  Winner/tie           Loser/tie  Pts  Pts   
67    Week  Day      Date  Time  Winner/tie           Loser/tie  Pts  Pts   
82    Week  Day      Date  Time  Winner/tie           Loser/tie  Pts  Pts   
...    ...  ...       ...   ...         ...      ...        ...  ...  ...   
2269  Week  Day      Date  Time  Winner/tie           Loser/tie  Pts  Pts   
2286  Week  Day      Date  Time  Winner/tie           Loser/tie  Pts  Pts   
2303  Week  Day      Date  Time  Winner/tie           Loser/tie  Pts  Pts   
2320  Week  Day      Date  Time  Winner/tie           Loser/tie  Pts  Pts   
2337             Playoffs                                                   

      YdsW  TOW  YdsL  TOL  Tie  
16    YdsW  TOW  YdsL  TOL    1  
33    Y

In [10]:
# Convert 'Date' column to datetime, with coercion of errors (invalid dates become NaT)
combined_games_data["Date"] = pd.to_datetime(combined_games_data["Date"], errors='coerce')

# Drop rows with NaT (invalid dates) if needed
combined_games_data = combined_games_data.dropna(subset=["Date"])

# Set 'Date' column as index
combined_games_data.set_index("Date", inplace=True)

# Print the DataFrame
print(combined_games_data)

                 Week  Day    Time                Winner Awaygame  \
Date                                                                
2024-09-05          1  Thu  8:20PM    Kansas City Chiefs            
2024-09-06          1  Fri  8:15PM   Philadelphia Eagles            
2024-09-08          1  Sun  1:00PM   Pittsburgh Steelers        @   
2024-09-08          1  Sun  1:00PM         Buffalo Bills            
2024-09-08          1  Sun  1:00PM    New Orleans Saints            
...               ...  ...     ...                   ...      ...   
2018-01-14   Division  Sun  1:05PM  Jacksonville Jaguars        @   
2018-01-14   Division  Sun  4:40PM     Minnesota Vikings            
2018-01-21  ConfChamp  Sun  3:05PM  New England Patriots            
2018-01-21  ConfChamp  Sun  6:40PM   Philadelphia Eagles            
2018-02-04  SuperBowl  Sun  6:30PM   Philadelphia Eagles        N   

                           Loser PtsW PtsL YdsW TOW YdsL TOL  Tie  
Date                              

In [11]:
#Look at tie games again
combined_games_data[combined_games_data["Tie"]==1]

Unnamed: 0_level_0,Week,Day,Time,Winner,Awaygame,Loser,PtsW,PtsL,YdsW,TOW,YdsL,TOL,Tie
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2022-09-11,1,Sun,1:00PM,Houston Texans,,Indianapolis Colts,20,20,299,1,517,2,1
2022-12-04,13,Sun,1:00PM,Washington Commanders,@,New York Giants,20,20,411,1,316,1,1
2021-11-14,10,Sun,1:00PM,Pittsburgh Steelers,,Detroit Lions,16,16,387,3,306,0,1
2020-09-27,3,Sun,1:00PM,Philadelphia Eagles,,Cincinnati Bengals,23,23,378,2,304,0,1
2019-09-08,1,Sun,4:25PM,Detroit Lions,@,Arizona Cardinals,27,27,477,2,387,1,1
2018-09-09,1,Sun,1:00PM,Pittsburgh Steelers,@,Cleveland Browns,21,21,472,6,327,1,1
2018-09-16,2,Sun,1:00PM,Minnesota Vikings,@,Green Bay Packers,29,29,480,1,351,0,1


Superbowl data will be available after this weekend, should solve the tie value there

In [12]:
#Check for missing values
print(combined_games_data.isnull().sum().sort_values(ascending=False))

Week        0
Day         0
Time        0
Winner      0
Awaygame    0
Loser       0
PtsW        0
PtsL        0
YdsW        0
TOW         0
YdsL        0
TOL         0
Tie         0
dtype: int64


In [13]:
combined_games_data.dtypes

Week        object
Day         object
Time        object
Winner      object
Awaygame    object
Loser       object
PtsW        object
PtsL        object
YdsW        object
TOW         object
YdsL        object
TOL         object
Tie          int64
dtype: object

In [14]:
#converting numeric data
combined_games_data['PtsW'] = pd.to_numeric(combined_games_data['PtsW'], errors='coerce')
combined_games_data['PtsL'] = pd.to_numeric(combined_games_data['PtsL'], errors='coerce')
combined_games_data['YdsW'] = pd.to_numeric(combined_games_data['YdsW'], errors='coerce')
combined_games_data['YdsL'] = pd.to_numeric(combined_games_data['YdsL'], errors='coerce')
combined_games_data['TOW'] = pd.to_numeric(combined_games_data['TOW'], errors='coerce')
combined_games_data['TOL'] = pd.to_numeric(combined_games_data['TOL'], errors='coerce')

In [15]:
#Rename df
NFL = combined_games_data

In [16]:
#Feature engineering for differentials (yards, turnovers, score)
#Points differential
NFL["PointDiff"] = NFL["PtsW"] - NFL ["PtsL"]

#Yard differential
NFL["YardDiff"] = NFL["YdsW"] - NFL["YdsL"]

#Turnover differential
NFL["TurnoverDiff"] = NFL["TOW"] - NFL["TOL"]

In [17]:
print(NFL.head())

           Week  Day    Time               Winner Awaygame              Loser  \
Date                                                                            
2024-09-05    1  Thu  8:20PM   Kansas City Chiefs            Baltimore Ravens   
2024-09-06    1  Fri  8:15PM  Philadelphia Eagles           Green Bay Packers   
2024-09-08    1  Sun  1:00PM  Pittsburgh Steelers        @    Atlanta Falcons   
2024-09-08    1  Sun  1:00PM        Buffalo Bills           Arizona Cardinals   
2024-09-08    1  Sun  1:00PM   New Orleans Saints           Carolina Panthers   

            PtsW  PtsL  YdsW  TOW  YdsL  TOL  Tie  PointDiff  YardDiff  \
Date                                                                     
2024-09-05    27    20   353    1   452    1    0          7       -99   
2024-09-06    34    29   410    3   414    1    0          5        -4   
2024-09-08    18    10   270    0   226    3    0          8        44   
2024-09-08    34    28   352    1   270    1    0          6  

In [18]:
# Create the 'Home' and 'Away' columns based on the 'Awaygame' column
NFL['Home'] = NFL['Awaygame'].apply(lambda x: True if x == '' else False)
NFL['Away'] = NFL['Awaygame'].apply(lambda x: True if x == '@' else False)

# For neutral games (N), we can set both as False (or leave them as NaN if you prefer)
NFL.loc[NFL['Awaygame'] == 'N', ['Home', 'Away']] = False, False

# Check the new columns
NFL[['Awaygame', 'Home', 'Away']].head()

Unnamed: 0_level_0,Awaygame,Home,Away
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2024-09-05,,True,False
2024-09-06,,True,False
2024-09-08,@,False,True
2024-09-08,,True,False
2024-09-08,,True,False


In [19]:
#Check N values to make sure Home and Away columns are false
NFL[NFL["Awaygame"]=='N']

Unnamed: 0_level_0,Week,Day,Time,Winner,Awaygame,Loser,PtsW,PtsL,YdsW,TOW,YdsL,TOL,Tie,PointDiff,YardDiff,TurnoverDiff,Home,Away
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2024-02-11,SuperBowl,Sun,6:30PM,Kansas City Chiefs,N,San Francisco 49ers,25,22,455,2,382,2,0,3,73,0,False,False
2023-02-12,SuperBowl,Sun,6:30PM,Kansas City Chiefs,N,Philadelphia Eagles,38,35,340,0,417,1,0,3,-77,-1,False,False
2022-02-13,SuperBowl,Sun,6:30PM,Los Angeles Rams,N,Cincinnati Bengals,23,20,313,2,305,0,0,3,8,2,False,False
2021-02-07,SuperBowl,Sun,6:30PM,Tampa Bay Buccaneers,N,Kansas City Chiefs,31,9,340,0,350,2,0,22,-10,-2,False,False
2020-02-02,SuperBowl,Sun,6:30PM,Kansas City Chiefs,N,San Francisco 49ers,31,20,397,2,351,2,0,11,46,0,False,False
2019-02-03,SuperBowl,Sun,6:30PM,New England Patriots,N,Los Angeles Rams,13,3,407,1,260,1,0,10,147,0,False,False
2018-02-04,SuperBowl,Sun,6:30PM,Philadelphia Eagles,N,New England Patriots,41,33,538,1,613,1,0,8,-75,0,False,False


In [20]:
#Looking at dataframe
NFL

Unnamed: 0_level_0,Week,Day,Time,Winner,Awaygame,Loser,PtsW,PtsL,YdsW,TOW,YdsL,TOL,Tie,PointDiff,YardDiff,TurnoverDiff,Home,Away
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2024-09-05,1,Thu,8:20PM,Kansas City Chiefs,,Baltimore Ravens,27,20,353,1,452,1,0,7,-99,0,True,False
2024-09-06,1,Fri,8:15PM,Philadelphia Eagles,,Green Bay Packers,34,29,410,3,414,1,0,5,-4,2,True,False
2024-09-08,1,Sun,1:00PM,Pittsburgh Steelers,@,Atlanta Falcons,18,10,270,0,226,3,0,8,44,-3,False,True
2024-09-08,1,Sun,1:00PM,Buffalo Bills,,Arizona Cardinals,34,28,352,1,270,1,0,6,82,0,True,False
2024-09-08,1,Sun,1:00PM,New Orleans Saints,,Carolina Panthers,47,10,379,1,193,3,0,37,186,-2,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-01-14,Division,Sun,1:05PM,Jacksonville Jaguars,@,Pittsburgh Steelers,45,42,378,0,545,2,0,3,-167,-2,False,True
2018-01-14,Division,Sun,4:40PM,Minnesota Vikings,,New Orleans Saints,29,24,403,1,358,2,0,5,45,-1,True,False
2018-01-21,ConfChamp,Sun,3:05PM,New England Patriots,,Jacksonville Jaguars,24,20,344,1,374,0,0,4,-30,1,True,False
2018-01-21,ConfChamp,Sun,6:40PM,Philadelphia Eagles,,Minnesota Vikings,38,7,456,0,333,3,0,31,123,-3,True,False


In [21]:
NFL

Unnamed: 0_level_0,Week,Day,Time,Winner,Awaygame,Loser,PtsW,PtsL,YdsW,TOW,YdsL,TOL,Tie,PointDiff,YardDiff,TurnoverDiff,Home,Away
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2024-09-05,1,Thu,8:20PM,Kansas City Chiefs,,Baltimore Ravens,27,20,353,1,452,1,0,7,-99,0,True,False
2024-09-06,1,Fri,8:15PM,Philadelphia Eagles,,Green Bay Packers,34,29,410,3,414,1,0,5,-4,2,True,False
2024-09-08,1,Sun,1:00PM,Pittsburgh Steelers,@,Atlanta Falcons,18,10,270,0,226,3,0,8,44,-3,False,True
2024-09-08,1,Sun,1:00PM,Buffalo Bills,,Arizona Cardinals,34,28,352,1,270,1,0,6,82,0,True,False
2024-09-08,1,Sun,1:00PM,New Orleans Saints,,Carolina Panthers,47,10,379,1,193,3,0,37,186,-2,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-01-14,Division,Sun,1:05PM,Jacksonville Jaguars,@,Pittsburgh Steelers,45,42,378,0,545,2,0,3,-167,-2,False,True
2018-01-14,Division,Sun,4:40PM,Minnesota Vikings,,New Orleans Saints,29,24,403,1,358,2,0,5,45,-1,True,False
2018-01-21,ConfChamp,Sun,3:05PM,New England Patriots,,Jacksonville Jaguars,24,20,344,1,374,0,0,4,-30,1,True,False
2018-01-21,ConfChamp,Sun,6:40PM,Philadelphia Eagles,,Minnesota Vikings,38,7,456,0,333,3,0,31,123,-3,True,False


In [22]:
#Combine duplicate teams into one
# Define the mapping to combine the Washington team names and the Raiders team names
team_name_mapping = {
    'Washington Redskins': 'Washington Commanders',
    'Washington Football Team': 'Washington Commanders',
    'Washington Commanders': 'Washington Commanders',
    'Oakland Raiders': 'Las Vegas Raiders',
    'Las Vegas Raiders': 'Las Vegas Raiders'
}

# Replace the team names in both 'Winner' and 'Loser' columns
NFL['Winner'] = NFL['Winner'].replace(team_name_mapping)
NFL['Loser'] = NFL['Loser'].replace(team_name_mapping)

# Check that the replacement was successful
NFL[['Winner', 'Loser']].drop_duplicates().head()

Unnamed: 0_level_0,Winner,Loser
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2024-09-05,Kansas City Chiefs,Baltimore Ravens
2024-09-06,Philadelphia Eagles,Green Bay Packers
2024-09-08,Pittsburgh Steelers,Atlanta Falcons
2024-09-08,Buffalo Bills,Arizona Cardinals
2024-09-08,New Orleans Saints,Carolina Panthers


In [23]:
#Check for duplicate teams
print(NFL["Winner"].unique())

['Kansas City Chiefs' 'Philadelphia Eagles' 'Pittsburgh Steelers'
 'Buffalo Bills' 'New Orleans Saints' 'Chicago Bears'
 'New England Patriots' 'Houston Texans' 'Miami Dolphins'
 'Minnesota Vikings' 'Seattle Seahawks' 'Los Angeles Chargers'
 'Dallas Cowboys' 'Tampa Bay Buccaneers' 'Detroit Lions'
 'San Francisco 49ers' 'Cleveland Browns' 'Green Bay Packers'
 'Washington Commanders' 'New York Jets' 'Las Vegas Raiders'
 'Arizona Cardinals' 'Atlanta Falcons' 'Indianapolis Colts'
 'New York Giants' 'Denver Broncos' 'Carolina Panthers' 'Baltimore Ravens'
 'Los Angeles Rams' 'Cincinnati Bengals' 'Tennessee Titans'
 'Jacksonville Jaguars']


Team Dataframe

In [24]:
# Make sure NFL DataFrame is sorted by Date (ascending or descending)
NFL.index = pd.to_datetime(NFL.index)  # Convert the index (Date) to datetime, if it's not already

# Sort the DataFrame by the Date index (ascending or descending)
NFL = NFL.sort_index(ascending=True)

In [25]:
# Initialize an empty DataFrame to hold the stats
team_stats = pd.DataFrame(columns=[
    'Total_Games', 'Win_Rate', 'Avg_PtsScored', 'Avg_PtsAllowed', 'Avg_Yards', 'Avg_YardsAllowed', 'Avg_TurnoverDiff', 
    'Avg_PointDiff', 'Avg_YardDiff',
    'Home_PtsScored', 'Away_PtsScored', 'Home_Yards', 'Away_Yards', 'Home_Turnover', 'Away_Turnover',
    'Last5_Win_Rate', 'Last5_Avg_PtsScored', 'Last5_Avg_PtsAllowed', 'Last5_Avg_TurnoverDiff', 'Last5_Avg_PointDiff'
])

# Iterate through each team
for team in NFL['Winner'].unique():
    # Total games for the team (count all games where the team was either winner or loser)
    total_games = len(NFL[(NFL['Winner'] == team) | (NFL['Loser'] == team)])  # Count both wins and losses

    # Calculate the win rate based on the number of wins and total games
    win_rate = len(NFL[(NFL['Winner'] == team) & (NFL['Tie'] == False)]) / total_games  # Wins divided by total games played

    # Stats where the team was the winner
    win_games = NFL[(NFL['Winner'] == team) & (NFL['Tie'] == False)]
    sum_pts_scored_win = win_games['PtsW'].sum()  # Sum points scored in wins
    sum_pts_allowed_win = win_games['PtsL'].sum()  # Sum points allowed in losses
    sum_yards_win = win_games['YdsW'].sum()  # Sum yards gained in wins
    sum_yards_allowed_win = win_games['YdsL'].sum()  # Sum yards allowed in losses
    sum_turnover_diff_win = win_games['TurnoverDiff'].sum()  # Sum turnover diff in wins
    sum_point_diff_win = win_games['PointDiff'].sum()  # Sum point diff in wins
    sum_yard_diff_win = win_games['YardDiff'].sum()  # Sum yard diff in wins

    # Stats where the team was the loser
    loss_games = NFL[(NFL['Loser'] == team) & (NFL['Tie'] == False)]
    sum_pts_scored_loss = loss_games['PtsL'].sum()  # Sum points scored in losses
    sum_pts_allowed_loss = loss_games['PtsW'].sum()  # Sum points allowed in losses
    sum_yards_loss = loss_games['YdsL'].sum()  # Sum yards scored in losses
    sum_yards_allowed_loss = loss_games['YdsW'].sum()  # Sum yards allowed in losses
    sum_turnover_diff_loss = (-loss_games['TurnoverDiff'].sum())  # Sum turnover diff in losses
    sum_point_diff_loss = (-loss_games['PointDiff'].sum())  # Sum point diff in losses
    sum_yard_diff_loss = (-loss_games['YardDiff'].sum())  # Sum yard diff in losses

    #Games where team was Home and Away
    home_games_team = NFL[((NFL['Home'] == True) & (NFL['Winner'] == team)) | ((NFL['Home'] == False) & (NFL['Loser'] == team))]
    away_games_team = NFL[((NFL['Away'] == True) & (NFL['Winner'] == team)) | ((NFL['Away'] == False) & (NFL['Loser'] == team))]

    # Home and Away stats for the team
    home_pts_scored = (
    pd.concat([
        home_games_team[home_games_team['Winner'] == team]['PtsW'],
        home_games_team[home_games_team['Loser'] == team]['PtsL']
    ]).mean() if len(home_games_team) > 0 else 0)
    away_pts_scored = (
    pd.concat([
        away_games_team[away_games_team['Winner'] == team]['PtsW'],
        away_games_team[away_games_team['Loser'] == team]['PtsL']
    ]).mean() if len(away_games_team) > 0 else 0)
    home_yards = (
    pd.concat([
        home_games_team[home_games_team['Winner'] == team]['YdsW'],
        home_games_team[home_games_team['Loser'] == team]['YdsL']
    ]).mean() if len(home_games_team) > 0 else 0)
    away_yards = (
    pd.concat([
        away_games_team[away_games_team['Winner'] == team]['YdsW'],
        away_games_team[away_games_team['Loser'] == team]['YdsL']
    ]).mean() if len(home_games_team) > 0 else 0)
    home_turnover =  (
    pd.concat([
        home_games_team[home_games_team['Winner'] == team]['TOW'],
        home_games_team[home_games_team['Loser'] == team]['TOL']
    ]).mean() if len(home_games_team) > 0 else 0)
    away_turnover =  (
    pd.concat([
        away_games_team[away_games_team['Winner'] == team]['TOW'],
        away_games_team[away_games_team['Loser'] == team]['TOL']
    ]).mean() if len(home_games_team) > 0 else 0)

    # Last 5 games stats
    last5_games = NFL[(NFL['Winner'] == team) | (NFL['Loser'] == team)].tail(5)
    last5_win_rate = len(last5_games[last5_games['Winner'] == team]) / len(last5_games)
    last5_pts_scored = last5_games['PtsW'].mean() if len(last5_games) > 0 else 0
    last5_pts_allowed = last5_games['PtsL'].mean() if len(last5_games) > 0 else 0
    last5_turnover_diff = last5_games['TurnoverDiff'].mean() if len(last5_games) > 0 else 0
    last5_point_diff = last5_games['PointDiff'].mean() if len(last5_games) > 0 else 0


    # Add all stats to the new dataframe
    team_stats.loc[team] = [
        total_games, win_rate, 
        (sum_pts_scored_win + sum_pts_scored_loss) / total_games,  # Average points scored across wins and losses
        (sum_pts_allowed_win + sum_pts_allowed_loss) / total_games,  # Average points allowed across wins and losses
        (sum_yards_win + sum_yards_loss) / total_games,  # Average yards across wins and losses
        (sum_yards_allowed_win + sum_yards_allowed_loss) / total_games,  # Average yards allowed across wins and losses
        (sum_turnover_diff_win + sum_turnover_diff_loss) / total_games,  # Average turnover diff across wins and losses
        (sum_point_diff_win + sum_point_diff_loss) / total_games,  # Average point diff across wins and losses
        (sum_yard_diff_win + sum_yard_diff_loss) / total_games,  # Average yard diff across wins and losses
        
        home_pts_scored, away_pts_scored, home_yards, away_yards, home_turnover, away_turnover,
        
        last5_win_rate, last5_pts_scored, last5_pts_allowed, last5_turnover_diff, last5_point_diff  # Last 5 games stats
    ]

# Show the new team stats dataframe
team_stats.head()



Unnamed: 0,Total_Games,Win_Rate,Avg_PtsScored,Avg_PtsAllowed,Avg_Yards,Avg_YardsAllowed,Avg_TurnoverDiff,Avg_PointDiff,Avg_YardDiff,Home_PtsScored,Away_PtsScored,Home_Yards,Away_Yards,Home_Turnover,Away_Turnover,Last5_Win_Rate,Last5_Avg_PtsScored,Last5_Avg_PtsAllowed,Last5_Avg_TurnoverDiff,Last5_Avg_PointDiff
Kansas City Chiefs,154.0,0.75974,27.746753,21.474026,384.61039,348.62987,-0.246753,6.272727,35.980519,26.756098,28.485714,372.731707,397.485714,0.963415,1.285714,0.6,32.4,15.0,-0.6,17.4
Dallas Cowboys,138.0,0.565217,25.478261,22.384058,367.492754,338.391304,-0.166667,3.094203,29.101449,28.434783,22.521739,386.73913,348.246377,1.333333,1.217391,0.4,29.4,16.8,-1.8,12.6
Green Bay Packers,140.0,0.592857,24.357143,21.907143,351.5,334.235714,-0.257143,2.45,17.264286,24.842857,24.285714,361.3,346.714286,0.942857,1.171429,0.4,27.4,14.0,-1.4,13.4
Carolina Panthers,133.0,0.353383,20.172932,25.112782,319.458647,345.233083,0.323308,-4.93985,-25.774436,20.80303,19.552239,324.272727,314.716418,1.378788,1.507463,0.4,36.0,22.4,-1.4,13.6
Los Angeles Rams,145.0,0.606897,24.882759,21.82069,358.986207,335.813793,-0.117241,3.062069,23.172414,25.0,24.493151,360.027778,357.232877,1.208333,1.260274,0.6,23.4,14.8,-1.6,8.6


In [26]:
print(team_stats.describe())
print(team_stats.info())
print(team_stats.shape)

       Total_Games   Win_Rate  Avg_PtsScored  Avg_PtsAllowed   Avg_Yards  \
count     32.00000  32.000000      32.000000       32.000000   32.000000   
mean     138.06250   0.495327      22.684470       22.783459  341.914555   
std        4.88538   0.108773       2.462026        1.582042   21.292506   
min      132.00000   0.305970      17.893939       19.120567  295.272727   
25%      134.00000   0.402239      20.728383       21.835381  327.795700   
50%      137.00000   0.490739      22.576006       22.613322  341.016165   
75%      140.00000   0.585438      24.809231       24.140435  357.382946   
max      154.00000   0.759740      27.746753       25.451128  384.610390   

       Avg_YardsAllowed  Avg_TurnoverDiff  Avg_PointDiff  Avg_YardDiff  \
count         32.000000         32.000000      32.000000     32.000000   
mean         342.600354          0.004267      -0.098990     -0.685799   
std           13.147554          0.235491       3.620611     27.981771   
min          316.23

In [27]:
team_stats

Unnamed: 0,Total_Games,Win_Rate,Avg_PtsScored,Avg_PtsAllowed,Avg_Yards,Avg_YardsAllowed,Avg_TurnoverDiff,Avg_PointDiff,Avg_YardDiff,Home_PtsScored,Away_PtsScored,Home_Yards,Away_Yards,Home_Turnover,Away_Turnover,Last5_Win_Rate,Last5_Avg_PtsScored,Last5_Avg_PtsAllowed,Last5_Avg_TurnoverDiff,Last5_Avg_PointDiff
Kansas City Chiefs,154.0,0.75974,27.746753,21.474026,384.61039,348.62987,-0.246753,6.272727,35.980519,26.756098,28.485714,372.731707,397.485714,0.963415,1.285714,0.6,32.4,15.0,-0.6,17.4
Dallas Cowboys,138.0,0.565217,25.478261,22.384058,367.492754,338.391304,-0.166667,3.094203,29.101449,28.434783,22.521739,386.73913,348.246377,1.333333,1.217391,0.4,29.4,16.8,-1.8,12.6
Green Bay Packers,140.0,0.592857,24.357143,21.907143,351.5,334.235714,-0.257143,2.45,17.264286,24.842857,24.285714,361.3,346.714286,0.942857,1.171429,0.4,27.4,14.0,-1.4,13.4
Carolina Panthers,133.0,0.353383,20.172932,25.112782,319.458647,345.233083,0.323308,-4.93985,-25.774436,20.80303,19.552239,324.272727,314.716418,1.378788,1.507463,0.4,36.0,22.4,-1.4,13.6
Los Angeles Rams,145.0,0.606897,24.882759,21.82069,358.986207,335.813793,-0.117241,3.062069,23.172414,25.0,24.493151,360.027778,357.232877,1.208333,1.260274,0.6,23.4,14.8,-1.6,8.6
Philadelphia Eagles,147.0,0.632653,25.462585,21.401361,358.945578,328.55102,-0.088435,4.061224,30.394558,26.328947,24.774648,362.618421,358.633803,1.092105,1.43662,1.0,33.0,18.0,-2.6,15.0
Atlanta Falcons,134.0,0.432836,22.149254,24.029851,352.141791,355.850746,0.201493,-1.880597,-3.708955,23.590909,20.75,364.045455,340.588235,1.363636,1.294118,0.4,33.0,19.8,-1.6,13.2
Jacksonville Jaguars,137.0,0.364964,20.058394,23.445255,330.934307,354.379562,0.335766,-3.386861,-23.445255,20.132353,19.985507,333.852941,328.057971,1.426471,1.623188,0.4,21.4,16.2,-0.8,5.2
Detroit Lions,136.0,0.470588,24.477941,24.933824,355.257353,366.080882,0.014706,-0.455882,-10.823529,27.101449,22.41791,367.028986,354.820896,1.202899,1.253731,0.6,39.6,26.6,-1.6,13.0
Pittsburgh Steelers,137.0,0.583942,21.664234,20.839416,327.226277,324.635036,-0.29927,0.824818,2.591241,24.029412,19.869565,341.411765,325.695652,1.264706,1.304348,0.0,27.4,14.2,-0.2,13.2


In [28]:
# Rounding the columns based on the suggested decimal places for 'team_stats' DataFrame

team_stats['Win_Rate'] = team_stats['Win_Rate'].round(2)           # Round to 2 decimal places
team_stats['Avg_PtsScored'] = team_stats['Avg_PtsScored'].round(2) # Round to 2 decimal places
team_stats['Avg_PtsAllowed'] = team_stats['Avg_PtsAllowed'].round(2) # Round to 2 decimal places
team_stats['Avg_Yards'] = team_stats['Avg_Yards'].round(2)         # Round to 2 decimal places
team_stats['Avg_YardsAllowed'] = team_stats['Avg_YardsAllowed'].round(2) # Round to 2 decimal places
team_stats['Avg_TurnoverDiff'] = team_stats['Avg_TurnoverDiff'].round(2) # Round to 2 decimal places
team_stats['Avg_PointDiff'] = team_stats['Avg_PointDiff'].round(2) # Round to 2 decimal places
team_stats['Avg_YardDiff'] = team_stats['Avg_YardDiff'].round(2)   # Round to 2 decimal places
team_stats['Home_PtsScored'] = team_stats['Home_PtsScored'].round(2) # Round to 2 decimal places
team_stats['Away_PtsScored'] = team_stats['Away_PtsScored'].round(2) # Round to 2 decimal places
team_stats['Home_Yards'] = team_stats['Home_Yards'].round(2)       # Round to 2 decimal places
team_stats['Away_Yards'] = team_stats['Away_Yards'].round(2)       # Round to 2 decimal places
team_stats['Home_Turnover'] = team_stats['Home_Turnover'].round(2) # Round to 2 decimal places
team_stats['Away_Turnover'] = team_stats['Away_Turnover'].round(2) # Round to 2 decimal places

team_stats['Last5_Win_Rate'] = team_stats['Last5_Win_Rate'].round(2)
team_stats['Last5_Avg_PtsScored'] = team_stats['Last5_Avg_PtsScored'].round(2)
team_stats['Last5_Avg_PtsAllowed'] = team_stats['Last5_Avg_PtsAllowed'].round(2)
team_stats['Last5_Avg_TurnoverDiff'] = team_stats['Last5_Avg_TurnoverDiff'].round(2)
team_stats['Last5_Avg_PointDiff'] = team_stats['Last5_Avg_PointDiff'].round(2)


In [29]:
print(team_stats)

                       Total_Games  Win_Rate  Avg_PtsScored  Avg_PtsAllowed  \
Kansas City Chiefs           154.0      0.76          27.75           21.47   
Dallas Cowboys               138.0      0.57          25.48           22.38   
Green Bay Packers            140.0      0.59          24.36           21.91   
Carolina Panthers            133.0      0.35          20.17           25.11   
Los Angeles Rams             145.0      0.61          24.88           21.82   
Philadelphia Eagles          147.0      0.63          25.46           21.40   
Atlanta Falcons              134.0      0.43          22.15           24.03   
Jacksonville Jaguars         137.0      0.36          20.06           23.45   
Detroit Lions                136.0      0.47          24.48           24.93   
Pittsburgh Steelers          137.0      0.58          21.66           20.84   
Baltimore Ravens             141.0      0.64          26.15           19.12   
Buffalo Bills                145.0      0.64        

In [30]:
#Save Data as CSV
team_stats.to_csv('Team_Stats.csv', index=True)