# Imports and Definitions

In [5]:
import pandas as pd
import requests
from sqlalchemy import create_engine, MetaData, Table, Column, Integer, String, Float, Boolean, ForeignKey, DateTime, Time, BigInteger, Text, text, UniqueConstraint, ForeignKeyConstraint
from sqlalchemy.types import Integer
import random
from bs4 import BeautifulSoup
from datetime import datetime, UTC

import sys
sys.path.insert(1, '../../')
from keys import aiven_pwd       # import passwords from local file (not pushed to github)

# Shared SQLAlchemy engine for the `nfl` MySQL database (PyMySQL driver).
# The password is read from the local `keys` module imported above, so it
# never appears in this file.
sql_engine = create_engine(f"mysql+pymysql://avnadmin:{aiven_pwd}@mysql-nfl-mhoffmann-nfl.b.aivencloud.com:10448/nfl", pool_size=20, max_overflow=50)

In [74]:
# Single MetaData registry: every Table below is attached to it so that
# metadata.create_all() can create all of them in one call.
metadata = MetaData()

# Table for the player data (index: player_id -> primary key).
# References teams, positions, playerstatuses and colleges by foreign key.
players_table = Table(
    'players', metadata,
    Column('player_id', Integer, primary_key=True),  # Set index as primary key
    Column('team_id', Integer, ForeignKey('teams.team_id')),
    Column('firstName', String(100)),
    Column('lastName', String(100)),
    Column('weight', Float),
    Column('height', Float),
    Column('age', Integer, nullable=True),
    Column('link', String(255)),
    Column('country', String(100), nullable=True),
    Column('picture', String(255), nullable=True),
    Column('jersey', Integer, nullable=True),
    Column('position_id', Integer, ForeignKey('positions.position_id')),
    Column('experience', Integer),
    Column('active', Boolean),
    Column('status_id', Integer, ForeignKey('playerstatuses.status_id')),
    Column('college_id', Integer, ForeignKey('colleges.college_id'))
)

# Table for the status data (index: status_id -> primary key).
# Lookup table referenced by players.status_id.
playerstatuses_table = Table(
    'playerstatuses', metadata,
    Column('status_id', Integer, primary_key=True),  # Set index as primary key
    Column('name', String(100))
)

# Table for the team info data (index: team_id -> primary key).
# Referenced by players.team_id and by games.home_team_id / games.away_team_id.
teams_table = Table(
    'teams', metadata,
    Column('team_id', Integer, primary_key=True),  # Set index as primary key
    Column('abbreviation', String(10)),
    Column('name', String(255)),
    Column('location', String(255)),
    Column('color', String(50)),
    Column('logo', String(255)),
    Column('link', String(255))
)

# Table for the college data (index: college_id -> primary key).
# Lookup table referenced by players.college_id; only `name` is expected for
# every college, the remaining attributes are explicitly nullable.
colleges_table = Table(
    'colleges', metadata,
    Column('college_id', Integer, primary_key=True),  # Set index as primary key
    Column('name', String(255)),
    Column('abbreviation', String(10), nullable=True),
    Column('logo', String(255), nullable=True),
    Column('mascot', String(255), nullable=True)
)

# Table for the position data (index: position_id -> primary key).
# Lookup table referenced by players.position_id. `parent` holds another
# position's id but is a plain integer column, not a declared foreign key.
positions_table = Table(
    'positions', metadata,
    Column('position_id', Integer, primary_key=True),  # Set index as primary key
    Column('name', String(100)),
    Column('abbreviation', String(10)),
    Column('parent', Integer, nullable=True),  # Nullable in case parent position is not specified
)

# Table for the game data (game_id -> primary key).
# Both participating teams reference teams.team_id.
# NOTE(review): the standing_* columns look like each team's overall / home /
# road win-loss record at the time of the game -- confirm against the loader.
games_table = Table(
    'games', metadata,
    Column('game_id', Integer, primary_key=True),
    Column('date', DateTime(timezone=True)),
    Column('name', String(255)),
    Column('season', Integer),
    Column('game_type', String(100)),
    Column('week', Integer),
    Column('home_team_id', Integer, ForeignKey('teams.team_id')),
    Column('home_team_score', Integer),
    Column('away_team_id', Integer, ForeignKey('teams.team_id')),
    Column('away_team_score', Integer),
    Column('standing_home_overall_win', Integer),
    Column('standing_home_Home_win', Integer),
    Column('standing_home_Road_win', Integer),
    Column('standing_home_overall_loss', Integer),
    Column('standing_home_Home_loss', Integer),
    Column('standing_home_Road_loss', Integer),
    Column('standing_away_overall_win', Integer),
    Column('standing_away_Home_win', Integer),
    Column('standing_away_Road_win', Integer),
    Column('standing_away_overall_loss', Integer),
    Column('standing_away_Home_loss', Integer),
    Column('standing_away_Road_loss', Integer),
    Column('link', String(255)),
    Column('game_status', String(100)),
)

# Table for the play type data (playtype_id -> primary key).
# Lookup table referenced by plays.playtype_id.
playtypes_table = Table(
    'playtypes', metadata,
    Column('playtype_id', Integer, primary_key=True),  
    Column('text', String(255)),  
    Column('abbreviation', String(10), nullable=True)
)

# Table for the play-by-play data (play_id -> primary key, BigInteger).
# Each play belongs to a game and has a play type. The unique constraint on
# (game_id, sequenceNumber) is the key the probabilities table references
# with its composite foreign key.
plays_table = Table(
    'plays', metadata,
    Column('play_id', BigInteger, primary_key=True),  
    Column('game_id', Integer, ForeignKey('games.game_id')),  
    Column('sequenceNumber', Integer),
    Column('homeScore', Integer),
    Column('awayScore', Integer),
    Column('quarter', Integer),
    Column('clock', Time),
    Column('offenseAtHome', Boolean),
    Column('down', Integer),
    Column('distance', Integer),
    Column('yardsToEndzone', Integer),
    Column('possessionChange', Boolean),
    Column('next_down', Integer),
    Column('next_distance', Integer),
    Column('next_yardsToEndzone', Integer),
    Column('playtype_id', Integer, ForeignKey('playtypes.playtype_id')),
    Column('description', Text),
    UniqueConstraint('game_id', 'sequenceNumber', name='uq_plays_game_sequence')
)

# Table for the win probabilities (proba_id -> primary key, BigInteger).
# A row is tied to a play through the composite key (game_id, sequenceNumber)
# rather than through plays.play_id; all value columns are mandatory.
probabilities_table = Table(
    'probabilities', metadata,
    Column('proba_id', BigInteger, primary_key=True),  # Unique identifier
    Column('game_id', Integer, nullable=False),  # Game identifier
    Column('sequenceNumber', Integer, nullable=False),  # Sequence number
    Column('homeWinPercentage', Float, nullable=False),  # Probability of home win
    Column('awayWinPercentage', Float, nullable=False),  # Probability of away win
    Column('tiePercentage', Float, nullable=False),  # Probability of tie
    ForeignKeyConstraint(
        ['game_id', 'sequenceNumber'],  # Composite FK in probabilities
        ['plays.game_id', 'plays.sequenceNumber'],  # Composite key in plays
        name='fk_probabilities_plays'
    )
)

# Table for the news articles (news_id -> primary key).
# Stand-alone table without foreign keys; every column is mandatory.
news_table = Table(
    'news', metadata,
    Column('news_id', Integer, primary_key=True),  # Index column as primary key
    Column('headline', String(255), nullable=False),
    Column('description', String(1000), nullable=False),
    Column('published', DateTime(timezone=True), nullable=False),
    Column('story', Text, nullable=False)
)

# Names of the per-player statistics stored in player_stats, in column order.
# Every entry yields two adjacent columns: '<name>' (Float) followed by
# '<name>_rank' (Integer).
_PLAYER_STAT_NAMES = (
    # general / defense
    'fumblesForced', 'fumblesRecovered', 'fumblesRecoveredYards', 'fumblesTouchdowns',
    'gamesPlayed', 'defensiveFumblesTouchdowns', 'assistTackles', 'avgInterceptionYards',
    'avgSackYards', 'avgStuffYards', 'blockedFieldGoalTouchdowns', 'blockedPuntTouchdowns',
    'hurries', 'kicksBlocked', 'longInterception', 'miscTouchdowns',
    'passesBattedDown', 'passesDefended', 'QBHits', 'twoPtReturns',
    'sacks', 'sackYards', 'safeties', 'soloTackles',
    'stuffs', 'stuffYards', 'tacklesForLoss', 'tacklesYardsLost',
    'teamGamesPlayed', 'totalTackles', 'yardsAllowed', 'pointsAllowed',
    'onePtSafetiesMade', 'missedFieldGoalReturnTd', 'blockedPuntEzRecTd',
    'interceptions', 'interceptionTouchdowns', 'interceptionYards',
    # scoring
    'defensivePoints', 'fieldGoals', 'kickExtraPoints', 'miscPoints',
    'passingTouchdowns', 'receivingTouchdowns', 'returnTouchdowns', 'rushingTouchdowns',
    'totalPoints', 'totalPointsPerGame', 'totalTouchdowns', 'totalTwoPointConvs',
    'twoPointPassConvs', 'twoPointRecConvs', 'twoPointRushConvs',
    'fumbles', 'fumblesLost', 'offensiveTwoPtReturns', 'offensiveFumblesTouchdowns',
    # passing
    'avgGain', 'completionPct', 'completions', 'ESPNQBRating',
    'interceptionPct', 'longPassing', 'netPassingYards', 'netPassingYardsPerGame',
    'netTotalYards', 'netYardsPerGame', 'passingAttempts', 'passingBigPlays',
    'passingFirstDowns', 'passingFumbles', 'passingFumblesLost', 'passingTouchdownPct',
    'passingYards', 'passingYardsAfterCatch', 'passingYardsAtCatch', 'passingYardsPerGame',
    'QBRating', 'sackYardsLost', 'netPassingAttempts', 'totalOffensivePlays',
    'totalYards', 'totalYardsFromScrimmage', 'twoPtPass', 'twoPtPassAttempts',
    'yardsFromScrimmagePerGame', 'yardsPerCompletion', 'yardsPerGame', 'yardsPerPassAttempt',
    'netYardsPerPassAttempt', 'quarterbackRating',
    # rushing
    'ESPNRBRating', 'longRushing', 'rushingAttempts', 'rushingBigPlays',
    'rushingFirstDowns', 'rushingFumbles', 'rushingFumblesLost', 'rushingYards',
    'rushingYardsPerGame', 'stuffYardsLost', 'twoPtRush', 'twoPtRushAttempts',
    'yardsPerRushAttempt',
    # receiving
    'ESPNWRRating', 'longReception', 'receivingBigPlays', 'receivingFirstDowns',
    'receivingFumbles', 'receivingFumblesLost', 'receivingTargets', 'receivingYards',
    'receivingYardsAfterCatch', 'receivingYardsAtCatch', 'receivingYardsPerGame', 'receptions',
    'twoPtReception', 'twoPtReceptionAttempts', 'yardsPerReception',
    'QBR', 'adjQBR',
    # kicking
    'avgKickoffReturnYards', 'avgKickoffYards', 'extraPointAttempts', 'extraPointPct',
    'extraPointsBlocked', 'extraPointsBlockedPct', 'extraPointsMade',
    'fairCatches', 'fairCatchPct',
    'fieldGoalAttempts', 'fieldGoalAttempts1_19', 'fieldGoalAttempts20_29',
    'fieldGoalAttempts30_39', 'fieldGoalAttempts40_49', 'fieldGoalAttempts50_59',
    'fieldGoalAttempts60_99', 'fieldGoalAttempts50', 'fieldGoalAttemptYards',
    'fieldGoalPct', 'fieldGoalsBlocked', 'fieldGoalsBlockedPct',
    'fieldGoalsMade', 'fieldGoalsMade1_19', 'fieldGoalsMade20_29',
    'fieldGoalsMade30_39', 'fieldGoalsMade40_49', 'fieldGoalsMade50_59',
    'fieldGoalsMade60_99', 'fieldGoalsMade50', 'fieldGoalsMadeYards',
    'fieldGoalsMissedYards',
    'kickoffReturns', 'kickoffReturnTouchdowns', 'kickoffReturnYards',
    'kickoffs', 'kickoffYards',
    'longFieldGoalAttempt', 'longFieldGoalMade', 'longKickoff',
    'totalKickingPoints', 'touchbackPct', 'touchbacks',
    # punting
    'avgPuntReturnYards', 'grossAvgPuntYards', 'longPunt', 'netAvgPuntYards',
    'puntReturns', 'puntReturnYards', 'punts', 'puntsBlocked',
    'puntsBlockedPct', 'puntsInside10', 'puntsInside10Pct', 'puntsInside20',
    'puntsInside20Pct', 'puntYards',
    # returns
    'defFumbleReturns', 'defFumbleReturnYards', 'fumbleRecoveries', 'fumbleRecoveryYards',
    'kickReturnFairCatches', 'kickReturnFairCatchPct', 'kickReturnFumbles', 'kickReturnFumblesLost',
    'kickReturns', 'kickReturnTouchdowns', 'kickReturnYards', 'longKickReturn',
    'longPuntReturn', 'miscFumbleReturns', 'miscFumbleReturnYards',
    'oppFumbleRecoveries', 'oppFumbleRecoveryYards',
    'oppSpecialTeamFumbleReturns', 'oppSpecialTeamFumbleReturnYards',
    'puntReturnFairCatches', 'puntReturnFairCatchPct', 'puntReturnFumbles', 'puntReturnFumblesLost',
    'puntReturnsStartedInsideThe10', 'puntReturnsStartedInsideThe20', 'puntReturnTouchdowns',
    'specialTeamFumbleReturns', 'specialTeamFumbleReturnYards',
    'yardsPerKickReturn', 'yardsPerPuntReturn', 'yardsPerReturn',
)

# Table for the per-season player statistics (player_stats_id -> surrogate,
# auto-incremented primary key). The fixed bookkeeping columns come first,
# followed by one value/rank column pair per entry of _PLAYER_STAT_NAMES.
player_stats_table = Table(
    'player_stats', metadata,
    Column('player_stats_id', Integer, primary_key=True, autoincrement=True),  # Auto-increment primary key
    Column('player_id', Integer, ForeignKey('players.player_id'), nullable=False),  # Foreign key to players table
    Column('season', Integer, nullable=False),
    Column('season_type', Integer, nullable=False),
    Column('games_played', Integer),
    Column('data_retrieved', DateTime, nullable=False),
    *(
        stat_column
        for stat_name in _PLAYER_STAT_NAMES
        for stat_column in (
            Column(stat_name, Float),
            Column(f"{stat_name}_rank", Integer),
        )
    ),
)


# Create every table registered on `metadata` that does not exist yet.
metadata.create_all(sql_engine)

In [7]:
def get_existing_ids(sql_engine, table, id_column):
    """Return the set of values currently stored in ``table.id_column``.

    Args:
        sql_engine: SQLAlchemy engine used to run the query.
        table: Name of the table to read from.
        id_column: Name of the column whose values are collected.

    Returns:
        A set with one entry per distinct value; empty if the table has no rows.

    Note:
        ``table`` and ``id_column`` are interpolated into the SQL text, so they
        must be trusted identifiers (never user input).
    """
    query = text(f"SELECT {id_column} FROM {table}")
    # The context manager returns the connection to the pool even when the
    # query fails; a bare `sql_engine.connect()` would leave it checked out.
    with sql_engine.connect() as connection:
        rows = connection.execute(query).fetchall()
    return {row[0] for row in rows}

def append_new_rows(dataframe, table, sql_engine, id_column, useIndex=True):
    """Append the rows of ``dataframe`` that are not yet stored in ``table``.

    Rows are considered "already stored" when their id is among the values of
    ``table.id_column`` in the database.

    Args:
        dataframe: Rows to insert.
        table: Name of the target SQL table.
        sql_engine: SQLAlchemy engine used for reading and writing.
        id_column: Name of the id column in the SQL table.
        useIndex: If True, the DataFrame index holds the ids and is written as
            ``id_column``. If False, the index is not written; ids are taken
            from the ``id_column`` column when the DataFrame has one.

    Note:
        With ``useIndex=False`` and no ``id_column`` column in the DataFrame
        (e.g. an auto-increment key generated by the database) there is nothing
        to compare, so all rows are appended.
    """
    existing_ids = get_existing_ids(sql_engine, table, id_column)

    if existing_ids:
        if useIndex:
            candidate_ids = dataframe.index
        elif id_column in dataframe.columns:
            # The index is only positional here, so compare the real id column.
            candidate_ids = dataframe[id_column]
        else:
            candidate_ids = None

        if candidate_ids is not None:
            dataframe = dataframe[~candidate_ids.isin(existing_ids)]

    if useIndex:
        dataframe.to_sql(table, con=sql_engine, if_exists='append', index=True, index_label=id_column)
    else:
        dataframe.to_sql(table, con=sql_engine, if_exists='append', index=False)


# Teams

In [6]:
def get_nfl_teams(team_ids=range(-2, 35), timeout=30):
    """Download basic team information from the ESPN site API.

    One request is made per team id; fields missing from a response are
    stored as None.

    Args:
        team_ids: Iterable of ESPN team ids to query (default: -2 .. 34).
        timeout: Seconds to wait for each HTTP response before requests raises
            a timeout error instead of blocking indefinitely.

    Returns:
        DataFrame indexed by ``team_id`` (Int64, sorted ascending) with the
        columns abbreviation, name, location, color, logo and link.
    """
    columns = ['team_id', 'abbreviation', 'name', 'location', 'color', 'logo', 'link']

    teams = []
    for team_id in team_ids:
        url = f"https://site.api.espn.com/apis/site/v2/sports/football/nfl/teams/{team_id}"
        response = requests.get(url, timeout=timeout)
        team_info = response.json().get('team') or {}

        # `or [{}]` covers both a missing key and an empty list, which a plain
        # `.get(key, [{}])[0]` would turn into an IndexError.
        first_logo = (team_info.get('logos') or [{}])[0]
        first_link = (team_info.get('links') or [{}])[0]

        teams.append({
            'team_id': team_id,
            'abbreviation': team_info.get('abbreviation'),
            'name': team_info.get('displayName'),
            'location': team_info.get('location'),
            'color': team_info.get('color'),
            'logo': first_logo.get('href'),
            'link': first_link.get('href'),
        })

    # Passing the column names keeps the frame well-formed for empty input.
    teams_df = pd.DataFrame(teams, columns=columns)
    teams_df['team_id'] = teams_df['team_id'].astype('Int64')
    teams_df.sort_values(by='team_id', inplace=True)
    teams_df.set_index('team_id', inplace=True)
    return teams_df

teams_df = get_nfl_teams()
try:
    append_new_rows(teams_df, 'teams', sql_engine, 'team_id')
except Exception:
    # Retry once after a failed insert. Catching Exception (not a bare
    # `except:`) lets KeyboardInterrupt / SystemExit stop the cell instead of
    # silently starting the insert a second time.
    append_new_rows(teams_df, 'teams', sql_engine, 'team_id')

KeyboardInterrupt: 

# Players
(+ Status and Colleges)

In [15]:
def _parse_player(player_data, team_id):
    """Flatten one ESPN athlete document into a row for the players table."""
    college_ref = player_data.get('college', {}).get('$ref')
    # `or [{}]` also covers an empty list, which would otherwise raise IndexError.
    first_link = (player_data.get('links') or [{}])[0]
    return {
        'player_id': player_data.get('id'),
        'team_id': team_id,
        'firstName': player_data.get('firstName'),
        'lastName': player_data.get('lastName'),
        'weight': player_data.get('weight'),
        'height': player_data.get('height'),
        'age': player_data.get('age'),
        'link': first_link.get('href'),
        'country': player_data.get('birthPlace', {}).get('country'),
        'picture': player_data.get('headshot', {}).get('href'),
        'jersey': player_data.get('jersey'),
        'position_id': player_data.get('position', {}).get('id'),
        'experience': player_data.get('experience', {}).get('years'),
        'active': player_data.get('active'),
        'status_id': player_data.get('status', {}).get('id'),
        # The college is only given as a reference URL; its id is the last
        # path segment (query string stripped).
        'college_id': college_ref.split('/')[-1].split('?')[0] if college_ref else None,
    }


def get_nfl_players(team_ids, season=2024, timeout=30):
    """Download the rosters of the given teams from the ESPN core API.

    For every team the athlete list of ``season`` is requested, then each
    athlete document is fetched individually.

    Args:
        team_ids: Iterable of ESPN team ids.
        season: Season whose rosters are requested.
        timeout: Seconds to wait for each HTTP response.

    Returns:
        Tuple ``(players_df, status_df, colleges)``:
        players_df is indexed by ``player_id`` (Int64, sorted), status_df maps
        ``status_id`` (Int64 index, sorted) to the status name, and colleges is
        the set of college ids (as strings) referenced by the players.
    """
    player_columns = [
        'player_id', 'team_id', 'firstName', 'lastName', 'weight', 'height',
        'age', 'link', 'country', 'picture', 'jersey', 'position_id',
        'experience', 'active', 'status_id', 'college_id',
    ]
    players = []
    status_names = {}
    colleges = set()

    for team_id in team_ids:
        url = f"https://sports.core.api.espn.com/v2/sports/football/leagues/nfl/seasons/{season}/teams/{team_id}/athletes?limit=200"
        team_roster = requests.get(url, timeout=timeout).json()

        # A response without 'items' (e.g. an id with no roster) is skipped.
        for roster_entry in team_roster.get('items', []):
            player_data = requests.get(roster_entry['$ref'], timeout=timeout).json()
            player = _parse_player(player_data, team_id)

            if player['college_id'] is not None:
                colleges.add(player['college_id'])
            # A missing status must not become a status row with a NULL key.
            if player['status_id'] is not None:
                status_names[player['status_id']] = player_data.get('status', {}).get('name')

            players.append(player)

    # Explicit column lists keep both frames well-formed when nothing was found.
    players_df = pd.DataFrame(players, columns=player_columns)
    for id_column in ('player_id', 'status_id', 'college_id'):
        players_df[id_column] = players_df[id_column].astype('Int64')
    players_df.sort_values(by='player_id', inplace=True)
    players_df.set_index('player_id', inplace=True)

    status_df = pd.DataFrame(list(status_names.items()), columns=['status_id', 'name'])
    status_df['status_id'] = status_df['status_id'].astype('Int64')
    status_df.sort_values(by='status_id', inplace=True)
    status_df.set_index('status_id', inplace=True)

    return players_df, status_df, colleges

players_df, status_df, college_ids = get_nfl_players(teams_df.index)

In [16]:
def get_colleges(college_ids, timeout=30):
    """Download college details from the ESPN core API.

    Args:
        college_ids: Iterable of ESPN college ids.
        timeout: Seconds to wait for each HTTP response.

    Returns:
        DataFrame indexed by ``college_id`` (Int64) with the columns name,
        abbreviation, logo and mascot; missing fields are stored as None.
    """
    columns = ['college_id', 'name', 'abbreviation', 'logo', 'mascot']

    colleges = []
    for college_id in college_ids:
        url = f"http://sports.core.api.espn.com/v2/colleges/{college_id}?lang=en&region=us"
        college_data = requests.get(url, timeout=timeout).json()
        # `or [{}]` also covers an empty 'logos' list (IndexError otherwise).
        first_logo = (college_data.get('logos') or [{}])[0]
        colleges.append({
            'college_id': college_id,
            'name': college_data.get('name'),
            'abbreviation': college_data.get('abbrev'),
            'logo': first_logo.get('href'),
            'mascot': college_data.get('mascot'),
        })

    # Passing the column names keeps the frame well-formed for empty input.
    colleges_df = pd.DataFrame(colleges, columns=columns)
    colleges_df['college_id'] = colleges_df['college_id'].astype('Int64')
    colleges_df.set_index('college_id', inplace=True)
    return colleges_df

colleges_df = get_colleges(college_ids)



In [None]:
def get_positions(position_ids, timeout=30):
    """Download position details from the ESPN core API.

    Args:
        position_ids: Iterable of ESPN position ids; None/NaN entries
            (players without a position) are ignored.
        timeout: Seconds to wait for each HTTP response.

    Returns:
        DataFrame indexed by ``position_id`` (Int64) with the columns name,
        abbreviation and parent (Int64 id of the parent position, or NA).
    """
    columns = ['position_id', 'name', 'abbreviation', 'parent']

    positions = []
    for position_id in position_ids:
        if pd.isna(position_id):
            continue
        url = f"http://sports.core.api.espn.com/v2/sports/football/leagues/nfl/positions/{position_id}?lang=en&region=us"
        position_data = requests.get(url, timeout=timeout).json()
        # The parent is only given as a reference URL; its id is the last
        # path segment (query string stripped).
        parent_ref = position_data.get('parent', {}).get('$ref')
        positions.append({
            'position_id': position_id,
            'name': position_data.get('name'),
            'abbreviation': position_data.get('abbreviation'),
            'parent': parent_ref.split('/')[-1].split('?')[0] if parent_ref else None,
        })

    # Passing the column names keeps the frame well-formed for empty input.
    positions_df = pd.DataFrame(positions, columns=columns)
    positions_df['position_id'] = positions_df['position_id'].astype('Int64')
    positions_df['parent'] = positions_df['parent'].astype('Int64')
    positions_df.set_index('position_id', inplace=True)
    # De-duplicate on the id itself. drop_duplicates() would compare only the
    # attribute columns and could discard distinct positions that happen to
    # share name, abbreviation and parent.
    positions_df = positions_df[~positions_df.index.duplicated(keep='first')]
    return positions_df

# Distinct position ids used by the players; players without a position are skipped.
position_ids = set(players_df['position_id'].dropna())
positions_df = get_positions(position_ids)



In [28]:
# Insert order matters: the players table has foreign keys to positions,
# colleges and playerstatuses, so those lookup rows are written first.
append_new_rows(positions_df, 'positions', sql_engine, 'position_id')
append_new_rows(colleges_df, 'colleges', sql_engine, 'college_id')
append_new_rows(status_df, 'playerstatuses', sql_engine, 'status_id')
append_new_rows(players_df, 'players', sql_engine, 'player_id')

# Player Stats

In [97]:
def get_player_stats(season, season_type):
    """Collect per-game statistics for every player stored in the database.

    For each player id in the ``players`` table the season event log is
    queried to count the events flagged as played. Players with at least one
    such game get their statistics for ``season``/``season_type`` downloaded,
    and every stat value is divided by that game count.

    Args:
        season: Season year used in the ESPN API URLs.
        season_type: Season type id used in the ESPN statistics URL.

    Returns:
        DataFrame with one row per player that has statistics. Besides
        ``player_id``, ``data_retrieved``, ``season``, ``season_type`` and
        ``games_played`` it holds one ``<stat>`` and one ``<stat>_rank``
        column per statistic reported by the API.
    """
    retrieved_at = datetime.now(UTC)
    rows = []
    player_ids = list(get_existing_ids(sql_engine, 'players', 'player_id'))
    total_players = len(player_ids)

    for done, player_id in enumerate(player_ids, start=1):
        # Progress report every 100 players.
        if done % 100 == 0:
            print(f"{done}/{total_players} done")

        eventlog_url = f"http://sports.core.api.espn.com/v2/sports/football/leagues/nfl/seasons/{season}/athletes/{player_id}/eventlog?lang=en&region=us"
        event_log = requests.get(eventlog_url).json()
        events = event_log.get('events', {}).get('items', [{}])
        games_played = sum(1 for event in events if event.get('played', None) == True)
        if games_played == 0:
            # No played games this season: nothing to average.
            continue

        stats_url = f"https://sports.core.api.espn.com/v2/sports/football/leagues/nfl/seasons/{season}/types/{season_type}/athletes/{player_id}/statistics/0?lang=en&region=us"
        stats_payload = requests.get(stats_url).json()
        categories = stats_payload.get('splits', {}).get('categories', [])
        if len(categories) == 0:
            continue

        row = {
            'player_id': player_id,
            'data_retrieved': retrieved_at,
            'season': season,
            'season_type': season_type,
            'games_played': games_played,
        }
        for category in categories:
            for stat in category['stats']:
                stat_name = stat['name']
                # Values are stored per game; ranks are stored unchanged.
                row[stat_name] = stat['value'] / games_played
                row[stat_name + "_rank"] = stat.get('rank', None)
        rows.append(row)

    return pd.DataFrame(rows)

In [98]:
# Download the statistics for season 2024, season type 2.
player_stats_df = get_player_stats(2024, 2)

# Store every rank column with pandas' nullable integer dtype.
rank_columns = [name for name in player_stats_df.columns if "_rank" in name]
for column in rank_columns:
    player_stats_df[column] = player_stats_df[column].astype('Int64')

append_new_rows(player_stats_df, 'player_stats', sql_engine, 'player_stats_id', useIndex=False)

100/2527 done
200/2527 done
300/2527 done
400/2527 done
500/2527 done
600/2527 done
700/2527 done
800/2527 done
900/2527 done
1000/2527 done
1100/2527 done
1200/2527 done
1300/2527 done
1400/2527 done
1500/2527 done
1600/2527 done
1700/2527 done
1800/2527 done
1900/2527 done
2000/2527 done
2100/2527 done
2200/2527 done
2300/2527 done
2400/2527 done
2500/2527 done


In [73]:
# Print SQLAlchemy Column(...) lines for the player_stats table definition:
# rank columns as Integer, every other column as Float.
for column in player_stats_df.columns:
    column_type = "Integer" if "_rank" in column else "Float"
    print(f"Column('{column}', {column_type}),")

# Games

In [11]:
def _split_record_summary(summary):
    """Split a record summary such as "10-7" or "10-6-1" into (wins, losses)."""
    if isinstance(summary, str):
        parts = summary.split('-')
        wins = parts[0] if parts[0] != '' else None
        losses = parts[1] if len(parts) > 1 else None
        return wins, losses
    if isinstance(summary, (list, tuple)) and summary:
        # Not expected from the scoreboard API; keep first/last element.
        return summary[0], summary[-1]
    return None, None


def _competitor_fields(competitor, side, include_abbreviation=False):
    """Return the ``<side>_team_*`` and ``standing_<side>_*`` entries of one competitor."""
    team = competitor.get('team', {})
    fields = {f'{side}_team_id': team.get('id', None)}
    if include_abbreviation:
        fields[f'{side}_team_abr'] = team.get('abbreviation', None)
    fields[f'{side}_team_score'] = competitor.get('score', None)

    # At most the first three records are stored. Slicing (instead of indexing
    # 0..2) also accepts competitors that report fewer than three records.
    for record in (competitor.get('records') or [])[:3]:
        standing = record.get('name', '')
        if standing != '':
            wins, losses = _split_record_summary(record.get('summary', None))
            fields[f'standing_{side}_{standing}_win'] = wins
            fields[f'standing_{side}_{standing}_loss'] = losses
    return fields


def load_game_data(events, asDataFrame=False, checkExistence=False):
    """Flatten ESPN scoreboard events into one record per game.

    Args:
        events: List of event dicts as found under ``events`` in the ESPN
            scoreboard response.
        asDataFrame: If True, return a DataFrame indexed by ``game_id``;
            otherwise return a list of dicts.
        checkExistence: If True, skip events whose id is already present in
            the ``games`` table. The table is only queried in that case.

    Returns:
        List of game dicts, or a DataFrame with ids/scores cast to ``Int64``
        and ``date`` parsed to datetime. With ``asDataFrame=True`` and no
        games, an empty DataFrame is returned.
    """
    existing_ids = set()
    if checkExistence:
        existing_ids = set(get_existing_ids(sql_engine, 'games', 'game_id'))

    new_games = []
    for game_data in events:
        game_id = game_data.get('id', None)
        if checkExistence and game_id is not None and int(game_id) in existing_ids:
            continue

        season = game_data.get('season', {})
        competitions = game_data.get('competitions') or [{}]
        competitors = list(competitions[0].get('competitors') or [])
        competitors += [{}] * (2 - len(competitors))  # pad to two entries
        first, second = competitors[0], competitors[1]

        first_is_home = first.get('homeAway', None) == "home"
        home, away = (first, second) if first_is_home else (second, first)

        game = {
            'game_id': game_id,
            'date': game_data.get('date', None),
            'name': game_data.get('name', None),
            'season': season.get('year', None),
            'game_type': season.get('slug', None),
            'week': game_data.get('week', {}).get('number', None),
        }
        # NOTE(review): team abbreviations are only recorded when the first
        # listed competitor is NOT the home team. This asymmetry is kept so the
        # set of produced columns stays unchanged - confirm whether the games
        # table is meant to carry home_team_abr / away_team_abr at all.
        game.update(_competitor_fields(home, 'home', include_abbreviation=not first_is_home))
        game.update(_competitor_fields(away, 'away', include_abbreviation=not first_is_home))

        links = game_data.get('links') or [{}]
        game['link'] = links[0].get('href', None)
        game['game_status'] = game_data.get('status', {}).get('type', {}).get('id', None)
        new_games.append(game)

    if not asDataFrame:
        return new_games

    if not new_games:
        # pd.DataFrame([]) has no columns, so the casts below would raise.
        return pd.DataFrame()

    games_df = pd.DataFrame(new_games)
    for column in ('game_id', 'home_team_id', 'home_team_score', 'away_team_id', 'away_team_score'):
        games_df[column] = games_df[column].astype('Int64')
    games_df['date'] = pd.to_datetime(games_df['date'])
    games_df.set_index('game_id', inplace=True)
    return games_df

In [None]:
# Rebinds sql_engine with the same connection URL and pool settings as the
# engine created at the top of the notebook.
# NOTE(review): presumably repeated so this cell can be run on its own.
sql_engine = create_engine(f"mysql+pymysql://avnadmin:{aiven_pwd}@mysql-nfl-mhoffmann-nfl.b.aivencloud.com:10448/nfl", pool_size=20, max_overflow=50)

def get_games(years=None):
    """Download scoreboard data for whole seasons and append new games to the database.

    For every year the ESPN scoreboard is requested week by week for season
    type 2 (weeks 1-18) and season type 3 (weeks 1-5); presumably regular
    season and postseason. All games found are handed to ``append_new_rows``
    for the ``games`` table.

    Args:
        years: Iterable of season years to download. Defaults to ``[2024]``,
            which was the previously hard-coded value.

    Returns:
        None.
    """
    if years is None:
        years = [2024]

    weeks = {2: list(range(1, 19)),
             3: [1, 2, 3, 4, 5]}

    new_games = []
    for year in years:
        for seasontype in [2, 3]:
            for week in weeks[seasontype]:
                url = f"https://site.api.espn.com/apis/site/v2/sports/football/nfl/scoreboard?dates={year}&seasontype={seasontype}&week={week}"
                games_data = requests.get(url).json()
                # A response without 'events' simply contributes no games.
                new_games.extend(load_game_data(games_data.get('events', [])))

    if not new_games:
        return

    games_df = pd.DataFrame(new_games)
    for column in ('game_id', 'home_team_id', 'home_team_score', 'away_team_id', 'away_team_score'):
        games_df[column] = games_df[column].astype('Int64')
    games_df['date'] = pd.to_datetime(games_df['date'])
    games_df.set_index('game_id', inplace=True)

    append_new_rows(games_df, 'games', sql_engine, 'game_id')

# Download the scoreboard data and append the games to the database.
get_games()



# Plays

In [12]:
def _parse_play(play_data, game_id, home_team_id):
    """Flatten one ESPN play dict into the columns stored for a play."""
    start = play_data.get('start', {})
    end = play_data.get('end', {})

    offense_team_id = start.get('team', {}).get('id', None)
    if offense_team_id is None:
        offense_at_home = None
    else:
        offense_at_home = bool(int(offense_team_id) == home_team_id)

    next_team_id = end.get('team', {}).get('id', None)
    if next_team_id is None:
        possession_change = None
    else:
        # Possession changed when the team at the end of the play differs
        # from the team that started it.
        possession_change = next_team_id != offense_team_id

    return {
        'play_id': play_data.get('id', None),
        'game_id': game_id,
        'sequenceNumber': play_data.get('sequenceNumber', None),
        'homeScore': play_data.get('homeScore', None),
        'awayScore': play_data.get('awayScore', None),
        'quarter': play_data.get('period', {}).get('number', None),
        'clock': play_data.get('clock', {}).get('displayValue', None),
        'offenseAtHome': offense_at_home,
        'down': start.get('down', None),
        'distance': start.get('distance', None),
        'yardsToEndzone': start.get('yardsToEndzone', None),
        'possessionChange': possession_change,
        'next_down': end.get('down', None),
        'next_distance': end.get('distance', None),
        'next_yardsToEndzone': end.get('yardsToEndzone', None),
        'playtype_id': play_data.get('type', {}).get('id', None),
        'description': play_data.get('text', None),
    }


def get_plays(game_ids):
    """Download the play-by-play data of the given games.

    Plays whose id is already stored in the ``plays`` table for that game are
    skipped. Games that cannot be downloaded or parsed are reported with
    ``print`` and skipped, so one bad game does not stop the batch.

    Args:
        game_ids: Iterable of game ids present in the ``games`` table.

    Returns:
        ``(plays_df, playtypes_df)``: plays indexed by ``play_id`` and the
        distinct play types indexed by ``playtype_id``. If no new play was
        found, ``([], [])`` is returned instead.
    """
    plays = []
    playtypes = []

    for game_id in game_ids:
        # int() turns numpy integers into plain ints the DB driver can bind.
        params = {"game_id": int(game_id)}
        with sql_engine.connect() as connection:
            known_play_ids = {
                row[0] for row in connection.execute(
                    text("SELECT play_id FROM plays WHERE game_id = :game_id"), params
                ).fetchall()
            }
            game_row = connection.execute(
                text("SELECT home_team_id FROM games WHERE game_id = :game_id"), params
            ).first()

        if game_row is None:
            # Without the games row the home team is unknown.
            print("No games row for game_id", game_id)
            continue
        home_team_id = game_row[0]

        url = f"https://cdn.espn.com/core/nfl/playbyplay?xhr=1&gameId={game_id}"
        try:
            game_response = requests.get(url)
        except requests.exceptions.RequestException:
            print("No response from Server for game_id", game_id)
            continue

        if game_response.status_code != 200:
            print("No 200 response for game_id", game_id)
            continue

        try:
            game_data = game_response.json()
            drives_data = game_data.get('gamepackageJSON', {}).get('drives', {}).get('previous', [])
            for drive_data in drives_data:
                for play_data in drive_data.get('plays', []):
                    play_id = play_data.get('id', None)
                    if play_id is None or int(play_id) in known_play_ids:
                        continue
                    plays.append(_parse_play(play_data, game_id, home_team_id))
                    playtypes.append(play_data.get('type', {}))
        except Exception as e:
            # Deliberate best effort: report the malformed game and continue.
            print("JSON error for game_id", game_id, e)

    if not plays:
        return [], []

    playtypes_df = pd.DataFrame(playtypes)
    playtypes_df.drop_duplicates(inplace=True)
    playtypes_df['id'] = playtypes_df['id'].astype('Int64')
    playtypes_df.sort_values(by='id', inplace=True)
    playtypes_df.rename(columns={'id': 'playtype_id'}, inplace=True)
    playtypes_df.set_index('playtype_id', inplace=True)

    plays_df = pd.DataFrame(plays)
    plays_df['play_id'] = plays_df['play_id'].astype('Int64')
    # The display clock is "MM:SS"; prefix hours to get "HH:MM:SS".
    plays_df['clock'] = '00:' + plays_df['clock']
    plays_df['playtype_id'] = plays_df['playtype_id'].astype('Int64')
    plays_df['sequenceNumber'] = plays_df['sequenceNumber'].astype('Int64')
    plays_df.set_index('play_id', inplace=True)

    return plays_df, playtypes_df

In [None]:


# Backfill plays for every game that has no play stored yet, in random
# batches of 100 games.
# Games requested once in this run are remembered: a game without available
# play data stays "missing" in the database and would otherwise keep the loop
# running forever.
attempted_games = set()

while True:
    with sql_engine.connect() as connection:
        ids_in_games = [i[0] for i in connection.execute(text("SELECT DISTINCT game_id FROM games")).fetchall()]  # WHERE season in (2023, 2024)
    ids_in_plays = list(get_existing_ids(sql_engine, 'plays', 'game_id'))

    missing_games = list(set(ids_in_games) - set(ids_in_plays) - attempted_games)
    if len(missing_games) == 0:
        break

    print(len(missing_games), "games still missing", end=' ')
    random.shuffle(missing_games)

    batch = missing_games[0:100]
    attempted_games.update(batch)
    plays_df, playtypes_df = get_plays(batch)

    if len(plays_df) > 0:
        plays_df.drop_duplicates(subset=['game_id', 'sequenceNumber'], keep='last', inplace=True)
        # Play types first: presumably plays reference them - verify schema.
        append_new_rows(playtypes_df, 'playtypes', sql_engine, 'playtype_id')
        append_new_rows(plays_df, 'plays', sql_engine, 'play_id')
        print('- Succesfully appended some new rows')
    else:
        print("No new data")

# Probabilities

In [13]:
def append_new_probabilities(dataframe, table, sql_engine, id_column):
    """Append rows of ``dataframe`` whose index is not yet stored in ``table``.

    Rows are inserted one at a time on purpose: a row rejected by the
    database does not prevent the remaining rows from being stored. Rejected
    rows are counted and reported once at the end.

    Args:
        dataframe: DataFrame indexed by the id column. An empty DataFrame or
            the empty list returned by ``get_probabilities`` is accepted and
            results in no work.
        table: Name of the target SQL table.
        sql_engine: SQLAlchemy engine used for reading and writing.
        id_column: Name of the id column the index is written to.

    Returns:
        None.
    """
    if len(dataframe) == 0:
        return

    existing_ids = get_existing_ids(sql_engine, table, id_column)
    if existing_ids:
        new_rows = dataframe[~dataframe.index.isin(existing_ids)]
    else:
        new_rows = dataframe

    failed = 0
    last_error = None
    # Positional one-row slices keep the column dtypes of the original frame.
    for position in range(len(new_rows)):
        try:
            new_rows.iloc[[position]].to_sql(
                table, con=sql_engine, if_exists='append', index=True, index_label=id_column
            )
        except Exception as error:
            failed += 1
            last_error = error

    if failed:
        print(f"{failed} of {len(new_rows)} rows could not be inserted into {table} "
              f"(last error: {type(last_error).__name__})")

In [14]:
def get_probabilities(game_ids):
    """Download the win probabilities of the given games.

    Entries whose ``proba_id`` is already stored in the ``probabilities``
    table are skipped. ``proba_id`` is the game id followed by the entry's
    sequence number, read as one integer.

    Args:
        game_ids: Iterable of game ids.

    Returns:
        DataFrame indexed by ``proba_id`` with the columns ``game_id``,
        ``sequenceNumber``, ``homeWinPercentage``, ``awayWinPercentage`` and
        ``tiePercentage``; an empty list if nothing new was found.
    """
    percentages = []
    for game_id in game_ids:
        with sql_engine.connect() as connection:
            known_ids = {
                row[0] for row in connection.execute(
                    text("SELECT proba_id FROM probabilities WHERE game_id = :game_id"),
                    {"game_id": int(game_id)},  # plain int for the DB driver
                ).fetchall()
            }

        try:
            page = 1
            url = f"https://sports.core.api.espn.com/v2/sports/football/leagues/nfl/events/{game_id}/competitions/{game_id}/probabilities?limit=3000&page={page}"
            first_page = requests.get(url).json()
            pages = first_page.get('pageCount', 0)
            for page in range(1, pages + 1):
                if page == 1:
                    # Already downloaded to learn the page count.
                    data = first_page
                else:
                    url = f"https://sports.core.api.espn.com/v2/sports/football/leagues/nfl/events/{game_id}/competitions/{game_id}/probabilities?limit=3000&page={page}"
                    data = requests.get(url).json()
                for item in data['items']:
                    sequence_number = item.get('sequenceNumber', None)
                    if sequence_number is None:
                        # No id can be built for this entry; skip only it.
                        continue
                    proba_id = int(str(game_id) + str(sequence_number))
                    if proba_id in known_ids:
                        continue
                    percentages.append({
                        'proba_id': proba_id,
                        'game_id': game_id,
                        'sequenceNumber': sequence_number,
                        'homeWinPercentage': item.get('homeWinPercentage', None),
                        'awayWinPercentage': item.get('awayWinPercentage', None),
                        'tiePercentage': item.get('tiePercentage', None),
                    })
        except Exception as e:
            # Best effort: report the failing game and go on with the next.
            print(game_id, e)

    if not percentages:
        return []

    percentages_df = pd.DataFrame(percentages)
    percentages_df['sequenceNumber'] = percentages_df['sequenceNumber'].astype('Int64')
    percentages_df.set_index('proba_id', inplace=True)
    return percentages_df

In [None]:


# Backfill win probabilities for every game that has plays but no stored
# probabilities, in random batches of 200 games.
# Games requested once in this run are remembered: a game for which no
# probabilities can be stored stays "missing" in the database and would
# otherwise keep the loop running forever.
attempted_games = set()

while True:
    with sql_engine.connect() as connection:
        games_with_plays = [i[0] for i in connection.execute(text("SELECT DISTINCT p.game_id FROM plays p")).fetchall()]
        # LEFT JOIN games g ON p.game_id=g.game_id WHERE g.season IN (2023, 2024)
        games_in_proba = [i[0] for i in connection.execute(text("SELECT DISTINCT game_id FROM probabilities")).fetchall()]

    missing_games = list(set(games_with_plays) - set(games_in_proba) - attempted_games)
    if len(missing_games) == 0:
        break

    print(len(missing_games), "games still missing", end=' ')
    random.shuffle(missing_games)

    batch = missing_games[0:200]
    attempted_games.update(batch)
    percentages_df = get_probabilities(batch)

    # get_probabilities returns an empty list when it found nothing new.
    if len(percentages_df) > 0:
        append_new_probabilities(percentages_df, 'probabilities', sql_engine, 'proba_id')
        print('Appended some new probabilities.')
    else:
        print('No new probabilities.')

# News

In [None]:
# Rebinds sql_engine with the same connection URL and pool settings as the
# engine created at the top of the notebook.
# NOTE(review): presumably repeated so this cell can be run on its own.
sql_engine = create_engine(f"mysql+pymysql://avnadmin:{aiven_pwd}@mysql-nfl-mhoffmann-nfl.b.aivencloud.com:10448/nfl", pool_size=20, max_overflow=50)

def _article_api_links(articles):
    """Return the 'news' and 'self' API hrefs of the articles ('' where missing)."""
    links = set()
    for article in articles:
        api_links = article.get('links', {}).get('api', {})
        links.add(api_links.get('news', {}).get('href', ''))
        links.add(api_links.get('self', {}).get('href', ''))
    return links


def get_news(url1, url2, url3, team_ids):
    """Collect news headlines that are not yet stored in the ``news`` table.

    Article API links are gathered from the league feed (``url1``) and from
    one feed per team (``url3`` followed by the team id); ``url2`` is added as
    a further source. Only links containing ``'sports/news'`` are requested,
    and every headline found there with an unknown id is collected.

    Args:
        url1: URL of the league news feed.
        url2: URL of an additional news source, requested like an article link.
        url3: URL prefix of the per-team news feed; the team id is appended.
        team_ids: Iterable of team ids.

    Returns:
        DataFrame indexed by ``news_id`` with the columns ``headline``,
        ``description``, ``published`` and ``story`` (HTML stripped), or
        ``None`` when no new headline was found.
    """
    existing_news = get_existing_ids(sql_engine, "news", "news_id")

    article_links = _article_api_links(requests.get(url1).json().get('articles', []))
    article_links.add(url2)
    for team_id in team_ids:
        team_feed = requests.get(url3 + str(team_id)).json()
        article_links |= _article_api_links(team_feed.get('articles', []))

    cleaned_links = [link for link in article_links if 'sports/news' in link]

    print(f"Retrieving news from {len(cleaned_links)} places.")
    news = []
    for article_link in cleaned_links:
        article_data = requests.get(article_link).json()
        for headline_i in article_data.get('headlines', []):
            headline_id = headline_i.get('id', None)
            if headline_id is None or headline_id in existing_news:
                continue

            story = headline_i.get('story', None)
            if story is None:
                # BeautifulSoup cannot parse None; keep the story empty.
                story_plain = None
            else:
                story_soup = BeautifulSoup(story, 'html.parser')
                story_plain = story_soup.get_text(separator=' ', strip=True)

            news.append({
                'news_id': headline_id,
                'headline': headline_i.get('headline', None),
                'description': headline_i.get('description', None),
                'published': headline_i.get('published', None),
                'story': story_plain,
            })

    if not news:
        print("No new news yet.")
        return None

    news_df = pd.DataFrame(news)
    news_df['news_id'] = news_df['news_id'].astype('Int64')
    news_df.set_index('news_id', inplace=True)
    news_df['published'] = pd.to_datetime(news_df['published'])
    # The same headline can be reachable through several links.
    news_df = news_df.loc[~news_df.index.duplicated()]
    return news_df

news_df = get_news("https://site.api.espn.com/apis/site/v2/sports/football/nfl/news?limit=150", "https://now.core.api.espn.com/v1/sports/news?limit=1000&sport=football", "https://site.api.espn.com/apis/site/v2/sports/football/nfl/news?team=", range(1,35))

# get_news returns None when it found no new headline; nothing to store then.
if news_df is not None:
    append_new_rows(news_df, 'news', sql_engine, 'news_id')


Retrieving news from 67 places.


# Update running games

In [9]:
# Rebinds sql_engine with the same connection URL and pool settings as the
# engine created at the top of the notebook.
# NOTE(review): presumably repeated so this cell can be run on its own.
sql_engine = create_engine(f"mysql+pymysql://avnadmin:{aiven_pwd}@mysql-nfl-mhoffmann-nfl.b.aivencloud.com:10448/nfl", pool_size=20, max_overflow=50)

def get_current_week():
    """Read week, season and game type from the first event of the ESPN scoreboard.

    Returns:
        Tuple ``(week, season, game_type)``: the week number, the season year
        and the season slug of the first listed event.

    Raises:
        IndexError: If the scoreboard lists no events.
    """
    scoreboard = requests.get("https://cdn.espn.com/core/nfl/scoreboard?xhr=1&limit=50").json()
    first_event = scoreboard.get('content', {}).get('sbData', {}).get('events', [])[0]
    week = first_event['week']['number']
    season_info = first_event['season']
    return week, season_info['year'], season_info['slug']


def _store_plays_and_probabilities(game_ids):
    """Download and store new plays and win probabilities for ``game_ids``."""
    plays_df, _ = get_plays(game_ids)
    if len(plays_df) > 0:
        append_new_rows(plays_df, 'plays', sql_engine, 'play_id')
    percentages_df = get_probabilities(game_ids)
    if len(percentages_df) > 0:
        append_new_probabilities(percentages_df, 'probabilities', sql_engine, 'proba_id')


def update_week(week, season, game_type):
    """Synchronise the games of one week with the current ESPN scoreboard.

    If the database has no games for ``week``/``season``/``game_type``, all
    scoreboard games are appended and their plays and probabilities are
    loaded. Otherwise every stored game whose scoreboard status is greater
    than the stored status gets its status updated and its plays and
    probabilities loaded; games whose scoreboard status is ``'2'`` are
    refreshed on every call.

    Args:
        week: Week number as stored in ``games.week``.
        season: Season year as stored in ``games.season``.
        game_type: Season slug as stored in ``games.game_type``.

    Returns:
        None.
    """
    # One query for ids and statuses keeps each status paired with its game.
    # Values are bound as strings, matching the quoted literals used before.
    with sql_engine.connect() as connection:
        stored_games = connection.execute(
            text("SELECT game_id, game_status FROM games "
                 "WHERE week = :week AND season = :season AND game_type = :game_type"),
            {"week": str(week), "season": str(season), "game_type": str(game_type)},
        ).fetchall()

    # The scoreboard endpoint always returns the current week, whatever week
    # was passed in.
    response = requests.get("https://cdn.espn.com/core/nfl/scoreboard?xhr=1&limit=50")
    data = response.json()
    games_df = load_game_data(data['content']['sbData']['events'], asDataFrame=True)

    if len(stored_games) == 0:
        if len(games_df) > 0:
            append_new_rows(games_df, 'games', sql_engine, 'game_id')
        _store_plays_and_probabilities(list(games_df.index))
        return

    if len(games_df) == 0:
        return

    for game_id, status in stored_games:
        if game_id not in games_df.index:
            # Stored game is not on the current scoreboard; nothing to compare.
            continue

        new_status = games_df.loc[game_id, 'game_status']
        # NOTE(review): this compares the scoreboard status with the stored
        # one as-is; both must have the same type (the '2' check below
        # suggests strings) - confirm against the games table definition.
        if new_status > status:
            print(f"status of game {game_id} changed from {status} to {new_status}.")
            with sql_engine.connect() as sql_connection:
                sql_connection.execute(
                    text("UPDATE games SET game_status = :status WHERE game_id = :game_id"),
                    {"status": new_status, "game_id": int(game_id)},
                )
                sql_connection.commit()
            _store_plays_and_probabilities([game_id])
        elif new_status == '2':
            _store_plays_and_probabilities([game_id])

#update_week(*get_current_week())

In [None]:
#update_week(*get_current_week())


status of game 401671620 changed from 1 to 2.
status of game 401671627 changed from 1 to 2.
status of game 401671649 changed from 1 to 2.
status of game 401671703 changed from 1 to 2.
status of game 401671705 changed from 1 to 2.
status of game 401671726 changed from 1 to 2.
status of game 401671743 changed from 1 to 2.
status of game 401671806 changed from 2 to 3.


In [35]:
def update_running_games(game_ids):
    """Store new plays and, if there were any, new probabilities for the games.

    Prints the number of new plays and, when probabilities are requested,
    the number of new probability rows.

    Args:
        game_ids: List of game ids to refresh.
    """
    plays_df, _ = get_plays(game_ids)
    new_play_count = len(plays_df)
    print(new_play_count)
    if new_play_count <= 0:
        # Probabilities are only refreshed together with new plays.
        return

    append_new_rows(plays_df, 'plays', sql_engine, 'play_id')

    percentages_df = get_probabilities(game_ids)
    new_probability_count = len(percentages_df)
    print(new_probability_count)
    if new_probability_count > 0:
        append_new_probabilities(percentages_df, 'probabilities', sql_engine, 'proba_id')

In [41]:
#update_running_games([401671651])

0


# TESTING