In [6]:
def get_data(endpoint, params):
    """Send a GET request to the football API and decode the answer.

    Args:
        endpoint: Path appended to the base URL (for example ``'teams'``).
        params: Query parameters forwarded to ``requests.get``.

    Returns:
        A ``(data, remaining)`` tuple. ``data`` is the decoded JSON body when
        the server answers with HTTP 200 and ``None`` for any other status.
        ``remaining`` is the raw value of the
        ``x-ratelimit-requests-remaining`` response header, or ``None`` when
        the header is absent.
    """
    URL = "https://v3.football.api-sports.io/"
    # NOTE(review): the API key is committed in source; consider loading it
    # from the environment or a secret store instead.
    headers = {
        'x-rapidapi-host': "v3.football.api-sports.io",
        'x-rapidapi-key': "fb2140228973d644db847895c454c22b"
    }

    response = requests.get(
        URL + endpoint,
        headers=headers,
        params=params,
        # without a timeout a stalled connection would block the script forever
        timeout=30,
    )

    # Both names are bound before branching so the return statement below is
    # valid on the error path as well.
    remaining = response.headers.get("x-ratelimit-requests-remaining")
    data = None

    if response.status_code == 200:
        data = response.json()
        print(f"requests before reaching limit {remaining}")
    else:
        print(f"Error {response.status_code}, {response.text}")

    return data, remaining

In [8]:
def encode_data(data_dict, parent_key = '', sep= '_'):
    """Flatten a nested dictionary into a single-level dictionary.

    Nested dictionaries contribute their keys joined to the parent key with
    ``sep``. A non-empty list made only of dictionaries is expanded element by
    element, with the element position used as an extra key part. Every other
    value (scalars, empty lists, lists with non-dict items) is stored as is.

    Args:
        data_dict: Dictionary to flatten.
        parent_key: Prefix for all produced keys; empty means no prefix.
        sep: Separator placed between key parts.

    Returns:
        A new flat dictionary.
    """
    flat = {}
    for name, value in data_dict.items():
        path = name if not parent_key else f'{parent_key}{sep}{name}'

        if isinstance(value, dict):
            flat.update(encode_data(value, path, sep=sep))
            continue

        if not isinstance(value, list):
            flat[path] = value
            continue

        if not value:
            # empty lists are kept under the plain key
            flat[path] = []
            continue

        if all(isinstance(item, dict) for item in value):
            for position, item in enumerate(value):
                flat.update(encode_data(item, f'{path}{sep}{position}', sep=sep))
        else:
            flat[path] = value

    return flat

In [25]:
def encode_fix_stats(data, fixture_id):
    """Turn one statistics entry into a flat row dictionary.

    The row starts with ``fixture_id``. The value under the ``'team'`` key
    contributes its ``'id'`` as ``team_id``. Every other value is iterated as
    a sequence of items with ``'type'`` and ``'value'``; the type is
    lower-cased, spaces become underscores and ``%`` becomes ``percentage``
    to form the column name.

    Args:
        data: Mapping with an optional ``'team'`` entry and statistic lists.
        fixture_id: Identifier stored in the ``fixture_id`` column.

    Returns:
        A dictionary mapping column names to values.
    """
    row = {'fixture_id': fixture_id}
    for field, content in data.items():
        if field == 'team':
            row[field + '_id'] = content['id']
            continue
        for stat in content:
            column = stat['type'].lower().replace(' ', '_').replace('%', 'percentage')
            row[column] = stat['value']
    return row

In [10]:
def data_to_sql(table_name, df, db_params, conflict_columns):
    """Insert every row of a DataFrame into a PostgreSQL table.

    Errors are reported on stdout and not re-raised, so the caller keeps
    running even when the insert fails.

    Args:
        table_name: Name of the target table.
        df: DataFrame whose column names are used as the SQL column list and
            whose rows are the inserted values.
        db_params: Keyword arguments passed to ``psycopg2.connect``.
        conflict_columns: Column names for ``ON CONFLICT (...) DO NOTHING``;
            an empty sequence produces a plain ``INSERT``.

    Returns:
        None.
    """
    conn = None
    cur = None
    # Bound up front so the error handler can always inspect it, even when
    # the failure happens before any row was looked at.
    last_row = None
    try:
        # Establish the connection
        conn = psycopg2.connect(**db_params)
        cur = conn.cursor()

        # NOTE(review): table and column names are interpolated into the SQL
        # text; they must come from trusted code, never from user input.
        columns = ','.join(df.columns)
        placeholders = ','.join(['%s'] * len(df.columns))
        insert_query = f"""
                INSERT INTO {table_name} ({columns})
                VALUES ({placeholders})
            """
        if conflict_columns:
            insert_query += f"""
                ON CONFLICT ({','.join(conflict_columns)}) DO NOTHING
            """

        if len(df) > 0:
            last_row = df.iloc[-1]
        cur.executemany(insert_query, df.values.tolist())

        # Commit first, report afterwards: the message is only true once the
        # transaction has been committed.
        conn.commit()
        print(f'table {table_name} updated')

    except Exception as e:
        print(f"Error: {e}")
        if last_row is not None:
            print(f"Last row loaded before the error occurred: {last_row}")
    finally:
        # Each resource is guarded by its own name, cursor before connection.
        if cur is not None:
            cur.close()
        if conn is not None:
            conn.close()

In [6]:
import time
import boto3
import pandas as pd
import io
import psycopg2
import numpy as np
import requests
import json
import pickle

# Collect teams and per-team season statistics, season by season, and resume
# from the state kept in the bucket (current.csv + teams.json).
s3 = boto3.client('s3')

bucket_name = 'sport-storage'
object_key1 = 'current.csv'
object_key2 = 'european_seasons.csv'
object_key3 = 'teams.json'

response1 = s3.get_object(Bucket=bucket_name, Key=object_key1)
response2 = s3.get_object(Bucket=bucket_name, Key=object_key2)
response3 = s3.get_object(Bucket=bucket_name, Key=object_key3)

current = pd.read_csv(io.BytesIO(response1['Body'].read()))
european_seasons = pd.read_csv(io.BytesIO(response2['Body'].read()))

# parameters of the request the previous run stopped at
league = current['league_id'][0]
year = current['year'][0]
to_find = current['type'][0]

# position of the current season inside european_seasons
index = np.where((european_seasons['league_id'] == league) & (european_seasons['year'] == year))[0][0]

remaining = 100
total_team_stats_data = []
total_teams_data = []
done = False

# If the previous run stopped while collecting team statistics, load the teams
# that are still pending. The list is written with json.dumps at the end of
# this script, so it has to be read back as JSON as well.
if to_find == 'teams/statistics':
    teams_list = json.loads(response3['Body'].read())
else:
    teams_list = []

# keep requesting while the request limit is not reached and work is left
while not done and remaining > 0:

    endpoint = to_find

    # A request is only possible for 'teams' or while a team is still pending;
    # the pending list can be empty (resumed run, or a season without teams).
    if endpoint == 'teams' or teams_list:
        params = {'league': league,
                  'season': year
                 }
        if endpoint != 'teams':
            params['team'] = teams_list[0]

        data, remaining_req = get_data(endpoint, params)
        if not data:
            # no payload came back: stop here so that everything collected
            # so far is still stored below
            break

        if endpoint == 'teams':
            total_teams_data.extend(encode_data(team) for team in data['response'])
            # next step: statistics for every team of that season
            to_find = 'teams/statistics'
            teams_list = [row['team']['id'] for row in data['response']]
        else:
            total_team_stats_data.append(encode_data(data['response']))
            # this team is done
            teams_list.pop(0)

        # a missing rate-limit header is treated as "no requests left"
        remaining = int(remaining_req) if remaining_req is not None else 0
        print(remaining)
        # sleep cause there can be done only 10 requests per minute
        time.sleep(7)

    # all teams of the current season are processed: move to the next season,
    # or finish when this was the last one
    if to_find == 'teams/statistics' and not teams_list:
        if index < len(european_seasons) - 1:
            index += 1
            # read single cells so the values keep their column dtype
            league = european_seasons.loc[index, 'league_id']
            year = european_seasons.loc[index, 'year']
            to_find = 'teams'
        else:
            done = True

# saving to sql database
# db parameters
# NOTE(review): credentials are committed in source; consider moving them out.
db_params = {
    'host': 'localhost',
    'database': 'preds',
    'user': 'postgres',
    'password': 'pass',
    'port': '5432'
}


if len(total_team_stats_data) > 0:
    team_stats_df = pd.DataFrame(total_team_stats_data)
    # errors='ignore': the flattened keys depend on the payload (encode_data
    # keeps a bare key such as 'lineups' only for an empty list), so a listed
    # column may be absent.
    team_stats_df = team_stats_df.drop(
        columns=[
            'league_name',
            'league_country',
            'league_logo',
            'league_flag',
            'team_name',
            'team_logo',
            'lineups',
        ],
        errors='ignore',
    )
    team_stats_df = team_stats_df.fillna(0)
    conflict_col = []
    team_stats_df.columns = team_stats_df.columns.str.replace('-', '_')
    data_to_sql('team_stats', team_stats_df, db_params, conflict_col)

if len(total_teams_data) > 0:
    teams_df = pd.DataFrame(total_teams_data)
    teams_df = teams_df[['team_id', 'team_name', 'team_country', 'team_logo', 'team_national', 'venue_capacity', 'venue_surface']].rename(columns = {'team_national': 'national'})
    conflict_col = ['team_id']
    data_to_sql('teams', teams_df, db_params, conflict_col)

if not done:
    # store the pending teams and the current position for the next run
    json_data = json.dumps(teams_list)
    s3.put_object(Bucket=bucket_name, Key=object_key3, Body=json_data)

    data = {'league_id': [league], 'year': [year], 'type': [to_find]}
    current = pd.DataFrame(data)

    csv_buffer = io.StringIO()
    current.to_csv(csv_buffer, index=False)
    s3.put_object(Bucket=bucket_name, Key=object_key1, Body=csv_buffer.getvalue().encode())

    print(f'{len(european_seasons)-index-1} seasons left')
else:
    print('Teams and their statistics were collected for every season played in Europe')

ModuleNotFoundError: No module named 'psycopg2'

In [24]:
def get_last_date(db_params, today):
    """Return ids of fixtures dated before ``today`` whose status is not 'FT'.

    Errors are reported on stdout and not re-raised; in that case the ids
    collected so far (normally none) are returned.

    Args:
        db_params: Keyword arguments passed to ``psycopg2.connect``.
        today: Upper bound (exclusive) for ``fixture_date``.

    Returns:
        A list with the ``fixture_id`` of every matching row.
    """
    # The date is bound as a query parameter; a bare ``today`` inside the SQL
    # text would be read by the database as an identifier, not as this value.
    get_data_query = '''
    SELECT fixture_id
    FROM fixtures 
    WHERE fixture_status_short != 'FT' and fixture_date < %s
    '''
    res = []
    conn = None
    cur = None
    try:
        # Establish the connection
        conn = psycopg2.connect(**db_params)
        cur = conn.cursor()

        cur.execute(get_data_query, (today,))

        res = [row[0] for row in cur.fetchall()]
    except Exception as e:
        print(f"Error: {e}")
    finally:
        # Each resource is guarded by its own name, cursor before connection.
        if cur is not None:
            cur.close()
        if conn is not None:
            conn.close()
    # Returned after the finally block so that cleanup errors are not
    # silently discarded by a return inside it.
    return res

In [None]:
import pandas as pd
from datetime import date

# Refresh unfinished fixtures and collect their statistics; fixtures that
# could not be processed in this run are written back to the bucket.
remaining = 100
fixtures_data = []
fixture_stats_data = []
s3 = boto3.client('s3')

bucket_name = 'sport-storage'
matches_key = 'total_fixs.json'

matches_response = s3.get_object(Bucket=bucket_name, Key=matches_key)

# the object is written with json.dumps at the end of this script, so it is
# read back as JSON
total_fixs = json.loads(matches_response['Body'].read())

if len(total_fixs) == 0:
    today = date.today()
    total_fixs = get_last_date(db_params, today)

# separate copy: this list is consumed in batches while total_fixs stays the
# queue for the statistics requests
fixtures_to_find = list(total_fixs)

while remaining > 0 and fixtures_to_find:

    # fixtures are requested in batches of at most 20 ids joined with '-'
    batch = fixtures_to_find[:20]
    fixtures_to_find = fixtures_to_find[20:]
    params = {'ids': '-'.join(str(fixture_id) for fixture_id in batch)}

    df, remaining_req = get_data('fixtures', params)
    # the header value is text; a missing header is treated as "no requests left"
    remaining = int(remaining_req) if remaining_req is not None else 0
    if df and len(df['response']) > 0:
        fixtures_data.extend(encode_data(row) for row in df['response'])
    # the API allows only about 10 requests per minute
    time.sleep(7)

while remaining > 0 and total_fixs:

    fixture_id = total_fixs[0]
    params = {'fixture': fixture_id}
    df, remaining_req = get_data('fixtures/statistics', params)
    remaining = int(remaining_req) if remaining_req is not None else 0
    if not df:
        # nothing came back: keep the fixture queued for the next run
        break
    # presumably one response entry per team, each with 'team' and a list of
    # statistics, which is the shape encode_fix_stats reads -- TODO confirm
    fixture_stats_data.extend(
        encode_fix_stats(team_stats, fixture_id) for team_stats in df['response']
    )
    total_fixs.pop(0)
    # the API allows only about 10 requests per minute
    time.sleep(7)

if len(total_fixs) == 0:
    print('data is actual')
else:
    print(f'{len(total_fixs)} fixtures still waiting for statistics')



if len(fixtures_data) > 0:
    fixtures_df = pd.DataFrame(fixtures_data)
    fixtures_df = fixtures_df.drop(columns=[
        'league_name',
        'league_country',
        'league_logo',
        'league_flag',
        'fixture_venue_city',
        'fixture_venue_name',
        'teams_away_logo',
        'teams_away_name',
        'teams_home_logo',
        'teams_home_name'])
    conflict_col = ['fixture_id']
    data_to_sql('fixtures', fixtures_df, db_params, conflict_col)

if len(fixture_stats_data) > 0:
    fixture_stats_df = pd.DataFrame(fixture_stats_data)
    conflict_col = ['fixture_id', 'team_id']
    data_to_sql('fixture_statistics', fixture_stats_df, db_params, conflict_col)

# store the fixtures that still need statistics under their own key
json_data = json.dumps(total_fixs)
s3.put_object(Bucket=bucket_name, Key=matches_key, Body=json_data)

## update team stats and fixs stats 
## get fixs to update from list total_fix

In [None]:
''' 
1. get all historical data,
2. then find last results date and if season is current get missing results, team stats,

3. start collecting data about upcoming matches in next 7 days,
4. create predictions model,
5. create table for predictions and actual results,
6. after those 7 days get the real results and save them in the fixtures and preds tables,
7. decide what's better: add results to the team stats table or collect the latest team stats data (??),
8. use preds and real results for models' training,
9. create flask app to display preds,
10. get upcoming fixtures odds and compare them to preds,
11. filter odds worth betting,
12. display those bets in the flask app,
13. display statistics for those bets,
'''
