# Imports and Definitions

In [116]:
import pandas as pd
import requests
from sqlalchemy import create_engine, MetaData, Table, Column, Integer, String, Float, Boolean, ForeignKey, DateTime, Time, BigInteger, Text, text, UniqueConstraint, ForeignKeyConstraint
from sqlalchemy.types import Integer
import random
from bs4 import BeautifulSoup

import sys
sys.path.insert(1, '../')
from keys import mysql_password       # import passwords from local file (not pushed to github)

# Bootstrap step: connect to the server without selecting a schema so that the
# `nfl` database can be created when it does not exist yet. The connection is
# closed and the temporary engine disposed afterwards so no pooled connection
# is left dangling.
bootstrap_engine = create_engine(f"mysql+pymysql://root:{mysql_password}@localhost:3306")
with bootstrap_engine.connect() as bootstrap_connection:
    bootstrap_connection.execute(text("CREATE DATABASE IF NOT EXISTS nfl;"))
bootstrap_engine.dispose()

# Engine used by the rest of the notebook; it is bound to the `nfl` schema.
sql_engine = create_engine(f"mysql+pymysql://root:{mysql_password}@localhost:3306/nfl", pool_size=20, max_overflow=50)

In [117]:
# Schema of the `nfl` database. Every table is registered on one shared
# MetaData object and emitted by the single `create_all` call at the end of
# this cell. Foreign keys are given as 'table.column' strings, so the order in
# which the Table objects are declared here does not matter.
metadata = MetaData()

# Table for the player data (index: player_id -> primary key)
# References teams, positions, playerstatuses and colleges.
players_table = Table(
    'players', metadata,
    Column('player_id', Integer, primary_key=True),  # Set index as primary key
    Column('team_id', Integer, ForeignKey('teams.team_id')),
    Column('firstName', String(100)),
    Column('lastName', String(100)),
    Column('weight', Float),
    Column('height', Float),
    Column('age', Integer, nullable=True),
    Column('link', String(255)),
    Column('country', String(100), nullable=True),
    Column('picture', String(255), nullable=True),
    Column('jersey', Integer, nullable=True),
    Column('position_id', Integer, ForeignKey('positions.position_id')),
    Column('experience', Integer),
    Column('active', Boolean),
    Column('status_id', Integer, ForeignKey('playerstatuses.status_id')),
    Column('college_id', Integer, ForeignKey('colleges.college_id'))
)

# Table for the status data (index: status_id -> primary key)
playerstatuses_table = Table(
    'playerstatuses', metadata,
    Column('status_id', Integer, primary_key=True),  # Set index as primary key
    Column('name', String(100))
)

# Table for the team info data (index: team_id -> primary key)
teams_table = Table(
    'teams', metadata,
    Column('team_id', Integer, primary_key=True),  # Set index as primary key
    Column('abbreviation', String(10)),
    Column('name', String(255)),
    Column('location', String(255)),
    Column('color', String(50)),
    Column('logo', String(255)),
    Column('link', String(255))
)

# Table for the college data (index: college_id -> primary key)
colleges_table = Table(
    'colleges', metadata,
    Column('college_id', Integer, primary_key=True),  # Set index as primary key
    Column('name', String(255)),
    Column('abbreviation', String(10), nullable=True),
    Column('logo', String(255), nullable=True),
    Column('mascot', String(255), nullable=True)
)

# Table for the position data (index: position_id -> primary key)
positions_table = Table(
    'positions', metadata,
    Column('position_id', Integer, primary_key=True),  # Set index as primary key
    Column('name', String(100)),
    Column('abbreviation', String(10)),
    Column('parent', Integer, nullable=True),  # Nullable in case parent position is not specified
)

# Table for the game data (index: game_id -> primary key)
# Both home_team_id and away_team_id reference teams. The standing_* columns
# hold one win and one loss counter per side (home/away team) and per record
# scope (overall/Home/Road).
games_table = Table(
    'games', metadata,
    Column('game_id', Integer, primary_key=True),
    Column('date', DateTime(timezone=True)),
    Column('name', String(255)),
    Column('season', Integer),
    Column('game_type', String(100)),
    Column('week', Integer),
    Column('home_team_id', Integer, ForeignKey('teams.team_id')),
    Column('home_team_score', Integer),
    Column('away_team_id', Integer, ForeignKey('teams.team_id')),
    Column('away_team_score', Integer),
    Column('standing_home_overall_win', Integer),
    Column('standing_home_Home_win', Integer),
    Column('standing_home_Road_win', Integer),
    Column('standing_home_overall_loss', Integer),
    Column('standing_home_Home_loss', Integer),
    Column('standing_home_Road_loss', Integer),
    Column('standing_away_overall_win', Integer),
    Column('standing_away_Home_win', Integer),
    Column('standing_away_Road_win', Integer),
    Column('standing_away_overall_loss', Integer),
    Column('standing_away_Home_loss', Integer),
    Column('standing_away_Road_loss', Integer),
    Column('link', String(255)),
    Column('game_status', String(100)),
)

# Table for the play type lookup data (index: playtype_id -> primary key)
playtypes_table = Table(
    'playtypes', metadata,
    Column('playtype_id', Integer, primary_key=True),  
    Column('text', String(255)),  
    Column('abbreviation', String(10), nullable=True)
)

# Table for the play-by-play data (index: play_id -> primary key)
# (game_id, sequenceNumber) is declared unique so that the probabilities table
# can point at it with a composite foreign key.
plays_table = Table(
    'plays', metadata,
    Column('play_id', BigInteger, primary_key=True),  
    Column('game_id', Integer, ForeignKey('games.game_id')),  
    Column('sequenceNumber', Integer),
    Column('homeScore', Integer),
    Column('awayScore', Integer),
    Column('quarter', Integer),
    Column('clock', Time),
    Column('offenseAtHome', Boolean),
    Column('down', Integer),
    Column('distance', Integer),
    Column('yardsToEndzone', Integer),
    Column('possessionChange', Boolean),
    Column('next_down', Integer),
    Column('next_distance', Integer),
    Column('next_yardsToEndzone', Integer),
    Column('playtype_id', Integer, ForeignKey('playtypes.playtype_id')),
    Column('description', Text),
    UniqueConstraint('game_id', 'sequenceNumber', name='uq_plays_game_sequence')
)

# Table for the win probability data (index: proba_id -> primary key)
# A row can only be inserted when a play with the same
# (game_id, sequenceNumber) pair already exists in `plays`.
probabilities_table = Table(
    'probabilities', metadata,
    Column('proba_id', BigInteger, primary_key=True),  # Unique identifier
    Column('game_id', Integer, nullable=False),  # Game identifier
    Column('sequenceNumber', Integer, nullable=False),  # Sequence number
    Column('homeWinPercentage', Float, nullable=False),  # Probability of home win
    Column('awayWinPercentage', Float, nullable=False),  # Probability of away win
    Column('tiePercentage', Float, nullable=False),  # Probability of tie
    ForeignKeyConstraint(
        ['game_id', 'sequenceNumber'],  # Composite FK in probabilities
        ['plays.game_id', 'plays.sequenceNumber'],  # Composite key in plays
        name='fk_probabilities_plays'
    )
)

# Table for the news data (index: news_id -> primary key)
# All content columns are NOT NULL.
news_table = Table(
    'news', metadata,
    Column('news_id', Integer, primary_key=True),  # Index column as primary key
    Column('headline', String(255), nullable=False),
    Column('description', String(1000), nullable=False),
    Column('published', DateTime(timezone=True), nullable=False),
    Column('story', Text, nullable=False)
)

# Emit CREATE TABLE statements for the tables registered above.
metadata.create_all(sql_engine)

In [21]:
def get_existing_ids(sql_engine, table, id_column):
    """Return the set of values currently stored in one column of a table.

    Args:
        sql_engine: SQLAlchemy engine used to open the connection.
        table: Name of the table to read from.
        id_column: Name of the column whose values are collected.

    Returns:
        A set with every value of ``id_column`` (empty when the table has no rows).
    """
    # NOTE: identifiers cannot be sent as bound parameters, so `table` and
    # `id_column` are interpolated into the statement. Only pass trusted,
    # hard-coded names here.
    statement = text(f"SELECT {id_column} FROM {table}")
    # The context manager returns the connection to the pool; the previous
    # version opened a new connection on every call and never closed it.
    with sql_engine.connect() as connection:
        rows = connection.execute(statement).fetchall()
    return {row[0] for row in rows}

def append_new_rows(dataframe, table, sql_engine, id_column):
    """Append the rows of ``dataframe`` whose index is not yet stored in ``table``.

    Args:
        dataframe: DataFrame indexed by the table's id; ``None`` or an empty
            frame is accepted and results in no insert.
        table: Name of the target SQL table.
        sql_engine: SQLAlchemy engine used for reading and writing.
        id_column: Name of the id column; the DataFrame index is written under
            this label.
    """
    # Some fetchers return None / an empty container when nothing was found.
    if dataframe is None or len(dataframe) == 0:
        return

    existing_ids = get_existing_ids(sql_engine, table, id_column)
    if existing_ids:
        new_rows = dataframe[~dataframe.index.isin(existing_ids)]
    else:
        new_rows = dataframe

    if len(new_rows) == 0:
        return
    new_rows.to_sql(table, con=sql_engine, if_exists='append', index=True, index_label=id_column)

# Teams

In [14]:
def get_nfl_teams():
    """Download the ESPN team record for every team id in ``range(-2, 35)``.

    Returns:
        DataFrame indexed by ``team_id`` (sorted ascending) with the columns
        abbreviation, name, location, color, logo and link. Ids for which the
        response carries no ``team`` object produce a row of ``None`` values.
    """
    base_url = "https://site.api.espn.com/apis/site/v2/sports/football/nfl/teams"

    teams = []
    # NOTE(review): the range deliberately starts below 1 - presumably ESPN
    # uses non-positive ids for placeholder teams; confirm before narrowing it.
    for team_id in range(-2, 35):
        response = requests.get(f"{base_url}/{team_id}")
        team_info = response.json().get('team') or {}
        # `or [{}]` also covers a present-but-empty list, which would
        # otherwise raise IndexError on `[0]`.
        first_logo = (team_info.get('logos') or [{}])[0]
        first_link = (team_info.get('links') or [{}])[0]
        teams.append({
            'team_id': team_id,
            'abbreviation': team_info.get('abbreviation'),
            'name': team_info.get('displayName'),
            'location': team_info.get('location'),
            'color': team_info.get('color'),
            'logo': first_logo.get('href'),
            'link': first_link.get('href'),
        })

    teams_df = pd.DataFrame(teams)
    teams_df['team_id'] = teams_df['team_id'].astype('Int64')
    teams_df.sort_values(by='team_id', inplace=True)
    teams_df.set_index('team_id', inplace=True)
    return teams_df

teams_df = get_nfl_teams()
try:
    append_new_rows(teams_df, 'teams', sql_engine, 'team_id')
except Exception as first_error:
    # Retry exactly once. `Exception` (instead of a bare except) lets
    # KeyboardInterrupt / SystemExit through, and the first error is reported
    # rather than discarded. A second failure propagates to the caller.
    print("Appending teams failed, retrying once:", first_error)
    append_new_rows(teams_df, 'teams', sql_engine, 'team_id')

# Players
(+ Status and Colleges)

In [15]:
def get_nfl_players(team_ids, season=2024):
    """Download the roster of every given team and flatten it into tables.

    Args:
        team_ids: Iterable of ESPN team ids.
        season: Season whose rosters are requested (defaults to 2024, the
            value that used to be hard-coded in the URL).

    Returns:
        A tuple ``(players_df, status_df, colleges)``:
        players_df is indexed by ``player_id``; status_df is indexed by
        ``status_id`` with a ``name`` column; colleges is the set of college
        ids (strings taken from the ``$ref`` URL) referenced by the players.
    """
    players = []
    status_data = {}
    colleges = set()

    for team_id in team_ids:
        url = f"https://sports.core.api.espn.com/v2/sports/football/leagues/nfl/seasons/{season}/teams/{team_id}/athletes?limit=200"
        # The roster is requested once; the earlier version issued the same
        # request a second time and threw the result away.
        team_roster = requests.get(url).json()
        # A response without 'items' is treated as an empty roster.
        for roster_entry in team_roster.get('items', []):
            player_data = requests.get(roster_entry['$ref']).json()

            # The college id is the last path segment of the '$ref' URL,
            # without its query string.
            college_ref = player_data.get('college', {}).get('$ref')
            college_id = college_ref.split('/')[-1].split('?')[0] if college_ref else None
            if college_id is not None:
                colleges.add(college_id)

            status = player_data.get('status', {})
            status_id = status.get('id', None)
            # A missing status id must not become a row of the status table:
            # it would end up as a NULL primary key.
            if status_id is not None:
                status_data[status_id] = status.get('name', None)

            players.append({
                'player_id': player_data.get('id', None),
                'team_id': team_id,
                'firstName': player_data.get('firstName', None),
                'lastName': player_data.get('lastName', None),
                'weight': player_data.get('weight', None),
                'height': player_data.get('height', None),
                'age': player_data.get('age', None),
                'link': (player_data.get('links') or [{}])[0].get('href', None),
                'country': player_data.get('birthPlace', {}).get('country', None),
                'picture': player_data.get('headshot', {}).get('href', None),
                'jersey': player_data.get('jersey', None),
                'position_id': player_data.get('position', {}).get('id', None),
                'experience': player_data.get('experience', {}).get('years', None),
                'active': player_data.get('active', None),
                'status_id': status_id,
                'college_id': college_id,
            })

    players_df = pd.DataFrame(players)
    players_df['player_id'] = players_df['player_id'].astype('Int64')
    players_df['status_id'] = players_df['status_id'].astype('Int64')
    players_df['college_id'] = players_df['college_id'].astype('Int64')
    players_df.sort_values(by='player_id', inplace=True)
    players_df.set_index('player_id', inplace=True)

    status_df = pd.DataFrame(list(status_data.items()), columns=['status_id', 'name'])
    status_df['status_id'] = status_df['status_id'].astype('Int64')
    status_df.sort_values(by='status_id', inplace=True)
    status_df.set_index('status_id', inplace=True)

    return players_df, status_df, colleges

# Fetch the rosters of every team id in teams_df; also yields the status
# lookup table and the set of college ids referenced by the players.
players_df, status_df, college_ids = get_nfl_players(teams_df.index)

In [16]:
def get_colleges(college_ids):
    """Download name, abbreviation, logo and mascot for every given college id.

    Args:
        college_ids: Iterable of ESPN college ids.

    Returns:
        DataFrame indexed by ``college_id`` with the columns name,
        abbreviation, logo and mascot. An empty input yields an empty frame
        with the same columns.
    """
    column_names = ['college_id', 'name', 'abbreviation', 'logo', 'mascot']

    colleges = []
    for college_id in college_ids:
        url = f"http://sports.core.api.espn.com/v2/colleges/{college_id}?lang=en&region=us"
        college_data = requests.get(url).json()
        # `or [{}]` also covers a present-but-empty 'logos' list.
        first_logo = (college_data.get('logos') or [{}])[0]
        colleges.append({
            'college_id': college_id,
            'name': college_data.get('name', None),
            'abbreviation': college_data.get('abbrev', None),
            'logo': first_logo.get('href', None),
            'mascot': college_data.get('mascot', None),
        })

    # Passing `columns` keeps the frame well-formed when nothing was fetched.
    colleges_df = pd.DataFrame(colleges, columns=column_names)
    colleges_df['college_id'] = colleges_df['college_id'].astype('Int64')
    colleges_df.set_index('college_id', inplace=True)
    return colleges_df

# Resolve the college ids collected while reading the rosters.
colleges_df = get_colleges(college_ids)



In [None]:
def get_positions(position_ids):
    """Download name, abbreviation and parent for every given position id.

    Args:
        position_ids: Iterable of ESPN position ids; null entries are skipped.

    Returns:
        DataFrame indexed by ``position_id`` with the columns name,
        abbreviation and parent (id of the parent position, or NA).
    """
    column_names = ['position_id', 'name', 'abbreviation', 'parent']

    positions = []
    for position_id in position_ids:
        # Players without a position carry a null id; there is nothing to
        # look up for it and it must not become a NULL primary key.
        if pd.isna(position_id):
            continue
        url = f"http://sports.core.api.espn.com/v2/sports/football/leagues/nfl/positions/{position_id}?lang=en&region=us"
        position_data = requests.get(url).json()
        # The parent id is the last path segment of the '$ref' URL, without
        # its query string.
        parent_ref = position_data.get('parent', {}).get('$ref')
        parent_id = parent_ref.split('/')[-1].split('?')[0] if parent_ref else None
        positions.append({
            'position_id': position_id,
            'name': position_data.get('name', None),
            'abbreviation': position_data.get('abbreviation', None),
            'parent': parent_id,
        })

    positions_df = pd.DataFrame(positions, columns=column_names)
    positions_df['position_id'] = positions_df['position_id'].astype('Int64')
    positions_df['parent'] = positions_df['parent'].astype('Int64')
    positions_df.set_index('position_id', inplace=True)
    # De-duplicate on the id itself. Comparing only the remaining columns
    # would merge two different positions that share name/abbreviation/parent.
    positions_df = positions_df[~positions_df.index.duplicated(keep='first')]
    return positions_df

# Distinct position ids used by the players. Null ids (players without a
# position) are dropped so that no lookup is attempted for them.
position_ids = set(players_df['position_id'].dropna())
positions_df = get_positions(position_ids)



In [28]:
# Order matters: `players` has foreign keys to positions, colleges and
# playerstatuses, so those lookup tables are filled before the players.
append_new_rows(positions_df, 'positions', sql_engine, 'position_id')
append_new_rows(colleges_df, 'colleges', sql_engine, 'college_id')
append_new_rows(status_df, 'playerstatuses', sql_engine, 'status_id')
append_new_rows(players_df, 'players', sql_engine, 'player_id')

# Games

In [59]:
def _competitor_columns(competitor, side):
    """Flatten one scoreboard competitor into the ``games`` columns of ``side`` ('home' or 'away')."""
    columns = {
        f'{side}_team_id': competitor.get('team', {}).get('id', None),
        f'{side}_team_score': competitor.get('score', None),
    }
    # Only the first three records are read, as before.
    for record in (competitor.get('records') or [])[:3]:
        scope = record.get('name', '')
        if not scope:
            continue
        # NOTE(review): summaries are assumed to look like "10-7" or "10-6-1"
        # (wins-losses[-ties]) - confirm against the API. Splitting on '-'
        # keeps multi-digit counts intact; indexing the string would store
        # only its first and last character.
        parts = str(record.get('summary') or '').split('-')
        counts = [int(part) if part.isdigit() else None for part in parts[:2]] + [None, None]
        columns[f'standing_{side}_{scope}_win'] = counts[0]
        columns[f'standing_{side}_{scope}_loss'] = counts[1]
    return columns


def get_games(years):
    """Download the scoreboard of every season in ``years`` that is not stored yet.

    A season counts as stored as soon as the ``games`` table holds at least
    one row for it, so a partially loaded season is not completed by this
    function.

    Args:
        years: Iterable of season years.

    Returns:
        DataFrame indexed by ``game_id`` whose columns match the ``games``
        table. It is empty (but has all columns) when nothing new was found.
    """
    standing_columns = [
        f'standing_{side}_{scope}_{outcome}'
        for side in ('home', 'away')
        for outcome in ('win', 'loss')
        for scope in ('overall', 'Home', 'Road')
    ]
    game_columns = [
        'game_id', 'date', 'name', 'season', 'game_type', 'week',
        'home_team_id', 'home_team_score', 'away_team_id', 'away_team_score',
        *standing_columns,
        'link', 'game_status',
    ]

    # Weeks requested per ESPN season type (2 and 3).
    weeks = {2: list(range(1, 19)),
             3: [1, 2, 3, 4, 5]}

    with sql_engine.connect() as connection:
        season_rows = connection.execute(text("SELECT DISTINCT season FROM games")).fetchall()
    years_in_db = {row[0] for row in season_rows}

    games = []
    for year in sorted(set(years) - years_in_db):
        for seasontype, season_weeks in weeks.items():
            for week in season_weeks:
                url = f"https://site.api.espn.com/apis/site/v2/sports/football/nfl/scoreboard?dates={year}&seasontype={seasontype}&week={week}"
                games_data = requests.get(url).json()

                for game_data in games_data.get('events', []):
                    competition = (game_data.get('competitions') or [{}])[0]
                    competitors = list(competition.get('competitors') or [])
                    # Pad so that both sides can always be addressed.
                    competitors += [{}] * (2 - len(competitors))

                    # Same rule as before: the first competitor is the home
                    # team when it says so, otherwise the second one is.
                    if competitors[0].get('homeAway', None) == "home":
                        home, away = competitors[0], competitors[1]
                    else:
                        home, away = competitors[1], competitors[0]

                    game = {
                        'game_id': game_data.get('id', None),
                        'date': game_data.get('date', None),
                        'name': game_data.get('name', None),
                        'season': game_data.get('season', {}).get('year', None),
                        'game_type': game_data.get('season', {}).get('slug', None),
                        'week': game_data.get('week', {}).get('number', None),
                        'link': (game_data.get('links') or [{}])[0].get('href', None),
                        'game_status': game_data.get('status', {}).get('type', {}).get('id', None),
                    }
                    game.update(_competitor_columns(home, 'home'))
                    game.update(_competitor_columns(away, 'away'))
                    games.append(game)

    # Restricting the frame to `game_columns` drops keys that have no column
    # in the `games` table (for example an unexpected record name) and keeps
    # the frame well-formed when no game was collected.
    games_df = pd.DataFrame(games, columns=game_columns)
    integer_columns = ['game_id', 'home_team_id', 'home_team_score',
                       'away_team_id', 'away_team_score', *standing_columns]
    for column in integer_columns:
        games_df[column] = games_df[column].astype('Int64')
    games_df['date'] = pd.to_datetime(games_df['date'])
    games_df.set_index('game_id', inplace=True)

    return games_df

# Seasons 2024 down to 2009, newest first.
years = list(range(2024, 2008, -1))
games_df = get_games(years)

# Store only the games whose id is not in the table yet.
append_new_rows(games_df, 'games', sql_engine, 'game_id')

Year: 2009
Year: 2010
Year: 2011
Year: 2012
Year: 2013
Year: 2014
Year: 2015
Year: 2016
Year: 2017
Year: 2018
Year: 2019
Year: 2020
Year: 2021


# Plays

In [None]:
def _parse_play(play_data, game_id, home_team_id):
    """Flatten one play of the play-by-play feed into a ``plays`` row dict."""
    start = play_data.get('start', {})
    end = play_data.get('end', {})
    offense_team_id = start.get('team', {}).get('id', None)
    next_team_id = end.get('team', {}).get('id', None)

    if offense_team_id is None or home_team_id is None:
        offense_at_home = None
    else:
        offense_at_home = int(offense_team_id) == int(home_team_id)

    # Possession changed when the team at the end of the play differs from
    # the team at its start; unknown when either side is missing. (Both
    # branches used to store False, so a change was never recorded.)
    if offense_team_id is None or next_team_id is None:
        possession_change = None
    else:
        possession_change = str(next_team_id) != str(offense_team_id)

    return {
        'play_id': play_data.get('id', None),
        'game_id': game_id,
        'sequenceNumber': play_data.get('sequenceNumber', None),
        'homeScore': play_data.get('homeScore', None),
        'awayScore': play_data.get('awayScore', None),
        'quarter': play_data.get('period', {}).get('number', None),
        'clock': play_data.get('clock', {}).get('displayValue', None),
        'offenseAtHome': offense_at_home,
        'down': start.get('down', None),
        'distance': start.get('distance', None),
        'yardsToEndzone': start.get('yardsToEndzone', None),
        'possessionChange': possession_change,
        'next_down': end.get('down', None),
        'next_distance': end.get('distance', None),
        'next_yardsToEndzone': end.get('yardsToEndzone', None),
        'playtype_id': play_data.get('type', {}).get('id', None),
        'description': play_data.get('text', None),
    }


def get_plays(game_ids):
    """Download the play-by-play data of the given games.

    Games that cannot be fetched or parsed are reported on stdout and skipped
    as a whole, so a game never contributes only part of its plays.

    Args:
        game_ids: Iterable of game ids that exist in the ``games`` table.

    Returns:
        ``(plays_df, playtypes_df)`` - plays indexed by ``play_id`` and the
        play types indexed by ``playtype_id`` - or ``([], [])`` when no play
        was collected.
    """
    plays = []
    playtypes = {}  # playtype_id -> row; keyed by id so each id occurs once

    for game_id in game_ids:
        url = f"https://cdn.espn.com/core/nfl/playbyplay?xhr=1&gameId={game_id}"

        try:
            game_response = requests.get(url)
        except requests.RequestException as e:
            print("No response from Server for game_id", game_id, e)
            continue

        if game_response.status_code != 200:
            print("No 200 response for game_id", game_id)
            continue

        try:
            game_data = game_response.json()
            drives_data = game_data.get('gamepackageJSON', {}).get('drives', {}).get('previous', [])

            # Only the home team id is needed; the connection is released
            # right after the lookup.
            with sql_engine.connect() as connection:
                home_team_id = connection.execute(
                    text("SELECT home_team_id FROM games WHERE game_id = :game_id"),
                    {"game_id": int(game_id)},
                ).scalar()

            game_plays = []
            game_playtypes = {}
            for drive_data in drives_data:
                for play_data in drive_data.get('plays', []):
                    game_plays.append(_parse_play(play_data, game_id, home_team_id))
                    play_type = play_data.get('type', {})
                    type_id = play_type.get('id', None)
                    if type_id is not None:
                        # Keep only the fields the `playtypes` table has.
                        game_playtypes.setdefault(type_id, {
                            'playtype_id': type_id,
                            'text': play_type.get('text', None),
                            'abbreviation': play_type.get('abbreviation', None),
                        })
        except Exception as e:
            print("JSON error for game_id", game_id, e)
            continue

        plays.extend(game_plays)
        for type_id, type_row in game_playtypes.items():
            playtypes.setdefault(type_id, type_row)

    if not plays:
        return [], []

    playtypes_df = pd.DataFrame(list(playtypes.values()), columns=['playtype_id', 'text', 'abbreviation'])
    playtypes_df['playtype_id'] = playtypes_df['playtype_id'].astype('Int64')
    playtypes_df.sort_values(by='playtype_id', inplace=True)
    playtypes_df.set_index('playtype_id', inplace=True)

    plays_df = pd.DataFrame(plays)
    plays_df['play_id'] = plays_df['play_id'].astype('Int64')
    # The clock arrives as "MM:SS"; prefix the hours for the TIME column.
    plays_df['clock'] = '00:' + plays_df['clock']
    plays_df['playtype_id'] = plays_df['playtype_id'].astype('Int64')
    plays_df['sequenceNumber'] = plays_df['sequenceNumber'].astype('Int64')
    plays_df.set_index('play_id', inplace=True)

    return plays_df, playtypes_df

# Games already handed to get_plays in this run. A game that yields no plays
# (bad response, empty feed) stays "missing" in the database forever; without
# this set the loop below would request the same games again and again and
# never terminate. Re-run the cell to retry such games.
attempted_play_games = set()


def _games_missing_plays():
    """Return the 2023/2024 game ids without plays that were not attempted yet."""
    with sql_engine.connect() as connection:
        game_rows = connection.execute(text("SELECT DISTINCT game_id FROM games WHERE season in (2023, 2024)")).fetchall()
    ids_in_games = {row[0] for row in game_rows}
    ids_in_plays = get_existing_ids(sql_engine, 'plays', 'game_id')
    return sorted(ids_in_games - ids_in_plays - attempted_play_games)


missing_games = _games_missing_plays()

while len(missing_games) > 0:

    print(len(missing_games), "games still missing", end=' ')

    # Work in batches of at most 100 games.
    batch = missing_games[0:100]
    attempted_play_games.update(batch)
    plays_df, playtypes_df = get_plays(batch)

    if len(plays_df) > 0:
        # (game_id, sequenceNumber) is unique in `plays` and play_id is its
        # primary key; keep the last occurrence of either duplicate.
        plays_df.drop_duplicates(subset=['game_id', 'sequenceNumber'], keep='last', inplace=True)
        plays_df = plays_df[~plays_df.index.duplicated(keep='last')]
        # Play types first: `plays` references them.
        append_new_rows(playtypes_df, 'playtypes', sql_engine, 'playtype_id')
        append_new_rows(plays_df, 'plays', sql_engine, 'play_id')
        print('- Succesfully appended some new rows')
    else:
        print("No new data")

    missing_games = _games_missing_plays()

208 games still missing - Succesfully appended some new rows
177 games still missing - Succesfully appended some new rows
165 games still missing - Succesfully appended some new rows


# Probabilities

In [114]:
def append_new_probabilities(dataframe, table, sql_engine, id_column):
    """Insert the not-yet-stored rows of ``dataframe`` into ``table`` one by one.

    Rows are written individually on purpose: a row the database rejects is
    skipped without losing the remaining rows.

    Args:
        dataframe: DataFrame indexed by the table's id; ``None`` or an empty
            frame results in no insert.
        table: Name of the target SQL table.
        sql_engine: SQLAlchemy engine used for reading and writing.
        id_column: Name of the id column; the DataFrame index is written under
            this label.
    """
    from sqlalchemy.exc import SQLAlchemyError

    if dataframe is None or len(dataframe) == 0:
        return

    existing_ids = get_existing_ids(sql_engine, table, id_column)
    if existing_ids:
        new_rows = dataframe[~dataframe.index.isin(existing_ids)]
    else:
        new_rows = dataframe

    skipped = 0
    for position in range(len(new_rows)):
        # A one-row slice keeps the column dtypes of the frame.
        single_row = new_rows.iloc[position:position + 1]
        try:
            single_row.to_sql(table, con=sql_engine, if_exists='append', index=True, index_label=id_column)
        except SQLAlchemyError:
            # Best effort: count the rejected row instead of aborting.
            skipped += 1

    if skipped:
        print(f"Skipped {skipped} of {len(new_rows)} rows that could not be inserted into {table}.")
In [115]:
def _probability_rows(page_data, game_id):
    """Turn the items of one probabilities page into ``probabilities`` row dicts."""
    rows = []
    for item in page_data.get('items', []):
        sequence_number = item.get('sequenceNumber', None)
        # Without a sequence number neither the id nor the link to the play
        # can be built, so the item is skipped.
        if sequence_number is None:
            continue
        rows.append({
            # The id is the game id followed by the sequence number.
            'proba_id': int(str(game_id) + str(sequence_number)),
            'game_id': game_id,
            'sequenceNumber': sequence_number,
            'homeWinPercentage': item.get('homeWinPercentage', None),
            'awayWinPercentage': item.get('awayWinPercentage', None),
            'tiePercentage': item.get('tiePercentage', None),
        })
    return rows


def get_probabilities(game_ids):
    """Download the win probabilities of the given games.

    A game whose download or parsing fails is reported on stdout and skipped
    as a whole.

    Args:
        game_ids: Iterable of ESPN game ids.

    Returns:
        DataFrame indexed by ``proba_id`` with the columns game_id,
        sequenceNumber, homeWinPercentage, awayWinPercentage and
        tiePercentage. It is empty (but has all columns) when nothing was
        collected.
    """
    column_names = ['proba_id', 'game_id', 'sequenceNumber',
                    'homeWinPercentage', 'awayWinPercentage', 'tiePercentage']

    percentages = []
    for game_id in game_ids:
        base_url = f"https://sports.core.api.espn.com/v2/sports/football/leagues/nfl/events/{game_id}/competitions/{game_id}/probabilities?limit=3000"
        try:
            first_page = requests.get(base_url).json()
            page_count = first_page.get('pageCount', 0)
            # NOTE(review): the request without `page` is taken to be page 1,
            # so its items are used directly instead of downloading page 1 a
            # second time - confirm against the API's `pageIndex`.
            game_rows = _probability_rows(first_page, game_id) if page_count else []
            for page in range(2, page_count + 1):
                page_data = requests.get(f"{base_url}&page={page}").json()
                game_rows.extend(_probability_rows(page_data, game_id))
        except Exception as e:
            print(game_id, e)
            continue
        percentages.extend(game_rows)

    # Passing `columns` keeps the frame well-formed when nothing was fetched.
    percentages_df = pd.DataFrame(percentages, columns=column_names)
    percentages_df['sequenceNumber'] = percentages_df['sequenceNumber'].astype('Int64')
    percentages_df.set_index('proba_id', inplace=True)
    return percentages_df

# Games already handed to get_probabilities in this run. A game for which no
# probability row can be stored stays "missing" in the database; without this
# set the loop below would request it again and again and never terminate.
# Re-run the cell to retry such games.
attempted_proba_games = set()


def _games_missing_probabilities():
    """Return the 2023/2024 game ids with plays but no probabilities that were not attempted yet."""
    with sql_engine.connect() as connection:
        # `game_id` exists in both joined tables, so it has to be qualified.
        play_rows = connection.execute(text("SELECT DISTINCT p.game_id FROM plays p LEFT JOIN games g ON p.game_id=g.game_id WHERE g.season IN (2023, 2024) ")).fetchall()
        proba_rows = connection.execute(text("SELECT DISTINCT game_id FROM probabilities")).fetchall()
    games_with_plays = {row[0] for row in play_rows}
    games_in_proba = {row[0] for row in proba_rows}
    return sorted(games_with_plays - games_in_proba - attempted_proba_games)


missing_games = _games_missing_probabilities()

while len(missing_games) > 0:
    print(len(missing_games), "games still missing", end=' ')

    # Work in batches of at most 100 games.
    batch = missing_games[0:100]
    attempted_proba_games.update(batch)

    percentages_df = get_probabilities(batch)
    append_new_probabilities(percentages_df, 'probabilities', sql_engine, 'proba_id')
    print('Appended some new probabilities.')

    missing_games = _games_missing_probabilities()

Appended some new probabilities.
Appended some new probabilities.
Appended some new probabilities.
Appended some new probabilities.
Appended some new probabilities.


KeyboardInterrupt: 

# News

In [105]:
def _article_links(feed_url):
    """Return the API 'news' and 'self' hrefs of every article listed by ``feed_url``."""
    links = set()
    for article in requests.get(feed_url).json().get('articles', []):
        api_links = article.get('links', {}).get('api', {})
        for link_kind in ('news', 'self'):
            links.add(api_links.get(link_kind, {}).get('href', ''))
    return links


def get_news(url1, url2, url3, team_ids):
    """Collect the news headlines that are not stored in the ``news`` table yet.

    Args:
        url1: League-wide news feed; the links of its articles are followed.
        url2: Additional news endpoint that is queried directly.
        url3: Team news feed prefix; each team id is appended to it.
        team_ids: Iterable of team ids used with ``url3``.

    Returns:
        DataFrame indexed by ``news_id`` with the columns headline,
        description, published and story (HTML reduced to plain text), or
        ``None`` when there is nothing new.
    """
    existing_news = get_existing_ids(sql_engine, "news", "news_id")

    article_links = _article_links(url1)
    article_links.add(url2)
    for team_id in team_ids:
        article_links |= _article_links(url3 + str(team_id))

    # Keep only links of the news API; this also drops missing/empty hrefs.
    cleaned_links = [link for link in article_links if link and 'sports/news' in link]

    print(f"Retrieving news from {len(cleaned_links)} places.")
    news = []
    for article_link in cleaned_links:
        article_data = requests.get(article_link).json()
        for headline in article_data.get('headlines', []):
            # Compare ids as integers so that an id delivered as a string
            # still matches the integer ids read from the database.
            try:
                news_id = int(headline.get('id', None))
            except (TypeError, ValueError):
                continue
            if news_id in existing_news:
                continue
            # A headline without a story would make BeautifulSoup raise;
            # treat it as an empty story instead.
            story_html = headline.get('story', None) or ''
            story_plain = BeautifulSoup(story_html, 'html.parser').get_text(separator=' ', strip=True)
            news.append({
                'news_id': news_id,
                'headline': headline.get('headline', None),
                'description': headline.get('description', None),
                'published': headline.get('published', None),
                'story': story_plain,
            })

    if not news:
        print("No new news yet.")
        return None

    news_df = pd.DataFrame(news)
    news_df['news_id'] = news_df['news_id'].astype('Int64')
    news_df.set_index('news_id', inplace=True)
    news_df['published'] = pd.to_datetime(news_df['published'])
    # The same headline can be reachable through several links.
    news_df = news_df.loc[~news_df.index.duplicated()]
    return news_df

news_df = get_news("https://site.api.espn.com/apis/site/v2/sports/football/nfl/news?limit=150", "https://now.core.api.espn.com/v1/sports/news?limit=1000&sport=football", "https://site.api.espn.com/apis/site/v2/sports/football/nfl/news?team=", range(1,35))

# get_news returns None when there is nothing new to store.
if news_df is not None:
    append_new_rows(news_df, 'news', sql_engine, 'news_id')


Retrieving news from 73 places.
