In [None]:
def get_data(endpoint, params):
    """Fetch JSON from the API and report the remaining request quota.

    Sends a GET request to ``URL + endpoint`` using the module-level
    ``URL`` and ``headers`` together with the given query parameters.

    Args:
        endpoint: Path appended to the module-level ``URL``.
        params: Query parameters forwarded to ``requests.get``.

    Returns:
        A ``(data, remaining)`` tuple. On HTTP 200 ``data`` is the decoded
        JSON body and ``remaining`` is the raw value of the
        ``x-ratelimit-requests-remaining`` response header (``None`` when
        the header is absent). On any other status both items are ``None``.
    """
    response = requests.get(
        URL + endpoint,
        headers=headers,
        params=params,
    )

    # Bind both results up front so the failure path returns a well-defined
    # value instead of raising UnboundLocalError at the return statement.
    data = None
    remaining = None

    if response.status_code == 200:
        remaining = response.headers.get("x-ratelimit-requests-remaining")
        data = response.json()
        print(f"requests before reaching limit {remaining}")
    else:
        print(f"Error {response.status_code}, {response.text}")

    return data, remaining

In [None]:
def encode_data(data_dict):
    """Flatten a nested dictionary into a single-level dictionary.

    Nested dictionaries, and lists made up entirely of dictionaries, are
    merged into the result recursively; the key that held them is dropped.
    Every other value (scalars, empty lists, lists containing any
    non-dictionary item) is stored under its own key. When the same key
    occurs more than once, the value seen last wins.

    Args:
        data_dict: Dictionary to flatten.

    Returns:
        A new flat dictionary.
    """
    flat = {}
    for name, value in data_dict.items():
        # Nested mapping: pull its (flattened) entries up one level.
        if isinstance(value, dict):
            flat.update(encode_data(value))
            continue

        if isinstance(value, list):
            # An empty list is stored as a fresh empty list.
            if not value:
                flat[name] = []
                continue
            # A list of mappings is merged entry by entry, in order.
            if all(isinstance(item, dict) for item in value):
                for item in value:
                    flat.update(encode_data(item))
                continue

        # Scalars and lists with non-dictionary items are kept untouched.
        flat[name] = value

    return flat

In [None]:
def data_to_sql(table_name, df, db_params):
    """Insert every row of a DataFrame into a PostgreSQL table.

    Args:
        table_name: Name of the target table. It is interpolated into the
            SQL text as-is, so it must come from a trusted source.
        df: pandas DataFrame; its column names are used as the column list
            of the INSERT and each row becomes one set of values.
        db_params: Keyword arguments forwarded to ``psycopg2.connect``.

    Any exception raised while connecting or inserting is printed rather
    than propagated; nothing is committed in that case.
    """
    # Bound before the try block so the cleanup in ``finally`` is safe even
    # when connecting fails.
    conn = None
    cur = None

    try:
        # Establish the connection
        conn = psycopg2.connect(**db_params)
        cur = conn.cursor()

        # NOTE(review): the original computed change_column_name(col) for
        # every column but never used the result. If the table was created
        # with renamed columns, the column list below should use those
        # names instead - confirm against the table definition.
        insert_query = """
            INSERT INTO {} ({})
            VALUES ({})
        """.format(table_name, ','.join(df.columns), ','.join(['%s'] * len(df.columns)))

        # Insert all rows through the cursor opened above.
        cur.executemany(insert_query, df.values.tolist())

        # Commit the changes, and only then report success.
        conn.commit()
        print(f'table {table_name} updated')

    except Exception as e:
        print(f"Error: {e}")

    finally:
        # Close the cursor and connection if they were opened
        if cur is not None:
            cur.close()
        if conn is not None:
            conn.close()

In [None]:
import io
import pickle
import time

import boto3
import numpy as np
import pandas as pd
import psycopg2


# Resumable collection of teams and team statistics for every European season.
# Progress (league, year, endpoint, teams still to fetch) is read from and
# written back to the S3 bucket so a later run continues where this one stopped.

# get current and european seasons files from bucket
s3 = boto3.client('s3')

bucket_name = 'sport-storage'
object_key1 = 'current.csv'
object_key2 = 'european_seasons.csv'
object_key3 = 'teams.csv'

# db parameters (defined up front so they exist even if the loop never runs)
db_params = {
    'host': 'postgres',
    'database': 'preds',
    'user': 'postgres',
    'password': 'pass'
}

response1 = s3.get_object(Bucket=bucket_name, Key=object_key1)
response2 = s3.get_object(Bucket=bucket_name, Key=object_key2)
response3 = s3.get_object(Bucket=bucket_name, Key=object_key3)

current = pd.read_csv(io.BytesIO(response1['Body'].read()))
european_seasons = pd.read_csv(io.BytesIO(response2['Body'].read()))

# insert parameters to be found
league = current['league_id'][0]
year = current['year'][0]
to_find = current['type'][0]

# get current index (position of the saved league/year in european_seasons)
index = np.where(
    (european_seasons['league_id'] == league) & (european_seasons['year'] == year)
)[0][0]

remaining = 100
total_team_stats_data = []
total_teams_data = []
done = False

# if script ended on collecting team stats get rest of teams
if to_find == 'teams/statistics':
    # NOTE(review): pickle.loads executes whatever the object contains; this
    # is only safe as long as the bucket content is trusted.
    teams_list = pickle.loads(response3['Body'].read())
else:
    teams_list = []

# if limit of requests wasnt reached and there is still something to collect enter loop
while not done and remaining > 0:

    # stats were collected for every team of the current season:
    # move to the next season, or finish when this was the last one
    if to_find == 'teams/statistics' and not teams_list:
        if index < len(european_seasons) - 1:
            index += 1
            # index is positional (it comes from np.where), hence iloc
            league = european_seasons['league_id'].iloc[index]
            year = european_seasons['year'].iloc[index]
            to_find = 'teams'
        else:
            done = True
        continue

    # get data from API
    if to_find == 'teams':
        params = {'league': league,
                  'season': year
                 }
    else:
        params = {'league': league,
                  'season': year,
                  'team': teams_list[0]
                 }

    endpoint = to_find
    data, remaining_req = get_data(endpoint, params)

    # the request produced no data: stop here and save progress for the next run
    if data is None:
        break

    # preprocess data
    if endpoint == 'teams':
        # one flattened row per team in the response
        total_teams_data.extend(encode_data(team) for team in data['response'])
        # now find stats for every team in that season
        teams_list = [row['team']['id'] for row in data['response']]
        to_find = 'teams/statistics'
    else:
        total_team_stats_data.append(encode_data(data['response']))
        # drop the team whose stats were just collected
        teams_list.pop(0)

    # the quota comes from a response header, so it has to be converted before
    # it can be compared with 0; without a header assume one request was used
    if remaining_req is not None:
        remaining = int(remaining_req)
    else:
        remaining -= 1

    # sleep cause there can be done only 10 requests per minute
    time.sleep(7)


# saving to sql database
# NOTE(review): the table names are taken from the bare names `team_stats` and
# `teams` used by the original calls - confirm they match the database schema.
if total_team_stats_data:
    data_to_sql('team_stats', pd.DataFrame(total_team_stats_data), db_params)

if total_teams_data:
    data_to_sql('teams', pd.DataFrame(total_teams_data), db_params)

if not done:
    # save progress: teams still to collect and the current league/year/endpoint
    s3.put_object(Bucket=bucket_name, Key=object_key3, Body=pickle.dumps(teams_list))

    data = {'league_id': [league], 'year': [year], 'type': [to_find]}
    current = pd.DataFrame(data)
    # put_object needs bytes, so the frame is serialised to CSV first
    s3.put_object(
        Bucket=bucket_name,
        Key=object_key1,
        Body=current.to_csv(index=False).encode('utf-8'),
    )
    print(f'{len(european_seasons)-index-1} seasons left')
else:
    print('Teams and their statistics were collected for every season played in Europe')