# Player Data: Fetch, Transfer, and Upload to S3

In [None]:
import requests
import os
import configparser
import boto3
import json
import time
from datetime import datetime, timedelta
import pandas as pd
from pandas import json_normalize

In [2]:
# Parse the local settings file; later cells read the AWS and FOOTBALLAPI sections from it.
config_path = 'dl.cfg'
config = configparser.ConfigParser()
config.read(config_path)

['dl.cfg']

In [3]:
# Copy the AWS credentials from the [AWS] config section into the process environment.
aws_section = config['AWS']
for env_name in ('AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'):
    os.environ[env_name] = aws_section[env_name]

In [4]:
# RapidAPI key and host, read from the [FOOTBALLAPI] config section;
# call_api() sends them as request headers.
football_api_section = config['FOOTBALLAPI']
X_RapidAPI_Key = football_api_section['X_RapidAPI_Key']
X_RapidAPI_Host = football_api_section['X_RapidAPI_Host']

In [5]:
# Bucket that every S3 call in this notebook reads from and writes to.
bucket_name = 'capstonefootballbucket'
# Low-level S3 client shared by the cells below.
s3_client = boto3.client('s3')

In [19]:
# Season to process and the API resource to query; both are also used to build S3 keys.
apiEndpoint = "players"
target_year = '2021'
# Full endpoint URL for the selected resource.
url = "https://api-football-v1.p.rapidapi.com/v3/" + apiEndpoint

In [20]:
def check_file_exists(s3_client, file_key):
    """
    Check whether an object with the given key already exists in the bucket.

    Parameters:
        s3_client: boto3 S3 client used to issue the HEAD request.
        file_key: object key to look up in the module-level ``bucket_name``.

    Returns:
        True if the HEAD request succeeds, False if S3 reports the key as
        not found.

    Raises:
        botocore.exceptions.ClientError: for any other failure (for example
        access denied), so that a permission or service problem is not
        mistaken for "file does not exist".
    """
    from botocore.exceptions import ClientError
    try:
        s3_client.head_object(Bucket=bucket_name, Key=file_key)
    except ClientError as e:
        # A HEAD response carries no body, so a missing key surfaces as the
        # bare HTTP status "404" rather than "NoSuchKey"; accept both forms.
        error_code = e.response.get('Error', {}).get('Code')
        if error_code in ('404', 'NoSuchKey', 'NotFound'):
            return False
        raise
    return True

In [21]:
def is_valid_content(file_content):
    """
    Return True when the text, ignoring surrounding whitespace, starts with
    '[' or when the text contains no newline character at all.
    """
    looks_like_array = file_content.strip().startswith('[')
    is_single_line = '\n' not in file_content
    return looks_like_array or is_single_line

In [22]:
def call_api(X_RapidAPI_Key, X_RapidAPI_Host, url, parameters:dict=None, count=0, timeout=30):
    """
    Send one GET request to the API and bump the call counter.

    Parameters:
        X_RapidAPI_Key: value sent in the "X-RapidAPI-Key" header.
        X_RapidAPI_Host: value sent in the "X-RapidAPI-Host" header.
        url: endpoint URL to request.
        parameters: optional query-string parameters; None means no parameters.
        count: number of API calls made so far.
        timeout: seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        A ``(response, count)`` tuple: the ``requests`` response object and
        the counter incremented by one.

    Raises:
        requests.exceptions.RequestException: if the request cannot be
        completed (connection error, timeout, ...).
    """
    headers = {
        "X-RapidAPI-Key": X_RapidAPI_Key,
        "X-RapidAPI-Host": X_RapidAPI_Host,
    }
    # None instead of a shared mutable {} default; requests treats both the same.
    data = requests.get(url, headers=headers, params=parameters, timeout=timeout)
    return data, count + 1

In [23]:
def players_data(X_RapidAPI_Key, X_RapidAPI_Host, leagueId, year, page, players_list:list, count=0):
    """
    Fetch every page of players for one league and season.

    Starting at ``page``, calls call_api() once per page and appends each
    entry of the payload's 'response' list to ``players_list`` (in place),
    following the payload's 'paging' counters until the last page is reached.

    Parameters:
        X_RapidAPI_Key, X_RapidAPI_Host: credentials forwarded to call_api().
        leagueId: value sent as the "league" query parameter.
        year: value sent as the "season" query parameter.
        page: first page to request.
        players_list: list that receives the player entries.
        count: number of API calls made so far.

    Returns:
        A ``(json_text, count)`` tuple: ``players_list`` serialised with
        ``json.dumps(..., default=str)`` and the updated call counter.

    Raises:
        RuntimeError: if a page comes back with a non-200 status or without a
        usable 'response' entry. Failing loudly keeps the caller from
        treating a failed or partial download as a complete (or empty) result.
    """
    while True:
        parameters = {"league": leagueId, "season": year, "page": page}
        players, count = call_api(X_RapidAPI_Key, X_RapidAPI_Host, url, parameters, count)
        if players.status_code != 200:
            raise RuntimeError(f"Status code: {players.status_code}")

        # Parse the body once instead of on every access.
        payload = players.json()
        if not isinstance(payload, dict) or payload.get('response') is None:
            raise RuntimeError("response is missing or None in the API payload")

        players_list.extend(payload['response'])

        paging = payload['paging']
        current_page = paging['current']
        if current_page >= paging['total']:
            break
        page = current_page + 1

    return json.dumps(players_list, default=str), count

In [24]:
def FlattenJSONFile(file_content):
    """
    Flatten player JSON so that each 'statistics' entry becomes its own line.

    Parameters:
        file_content: JSON text holding a list of player objects. Each
            object's 'statistics' value is expected to be a list of dicts.

    Returns:
        JSON Lines text (one record per line). Nested keys are joined with
        '_' and statistics fields carry a 'statistics_' prefix. A player
        whose 'statistics' is missing or empty is kept as a single row
        without statistics values instead of raising.
    """
    data = json.loads(file_content)
    frames = []
    for item in data:
        # Flatten the nested objects; a list value such as 'statistics' stays in one cell.
        player_df = json_normalize(item, sep='_')
        if 'statistics' in player_df.columns:
            player_df = player_df.drop(columns=['statistics'])

        stats = item.get('statistics') if isinstance(item, dict) else None
        if not isinstance(stats, list) or not stats:
            # Nothing to expand: keep the player-level fields as one row.
            frames.append(player_df)
            continue

        stats_df = json_normalize(stats, sep='_').add_prefix('statistics_')
        # Repeat the player row so it lines up with each statistics row.
        repeated_df = pd.concat([player_df] * len(stats_df), ignore_index=True)
        frames.append(pd.concat([repeated_df, stats_df], axis=1))

    # Concatenate once at the end; growing a DataFrame inside the loop is quadratic.
    flattened_data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    return flattened_data.to_json(orient='records', lines=True)

## List files in S3 using paginator

In [25]:
# Walk the marker files under the backup prefix. For each league: skip it if the
# target players file already exists, otherwise download the players, flatten
# them and upload the result. The marker file is deleted once it has been handled.
paginator = s3_client.get_paginator("list_objects_v2")
source_prefix = f'backup/for{apiEndpoint}/league_teams/{target_year}/'
# Delimiter='/' restricts the listing to this level, excluding files in subfolders.
response = paginator.paginate(Bucket=bucket_name, Prefix=source_prefix, Delimiter='/', PaginationConfig={"PageSize": 300})

prefix_len = len(source_prefix)
# Stop once this many API calls have been made in this run
# (presumably the API plan's request quota -- TODO confirm).
max_api_calls = 1000
count = 0
for page in response:
    # "Contents" is absent from a page that lists no objects.
    files = page.get("Contents") or []
    for file in files:
        key = file['Key']
        if key.endswith('/'):
            continue
        if target_year not in key:
            continue
        print(f"file ={key}")
        # File names look like "<league>_<year>teams.json"; the league id is
        # everything before the first underscore.
        filename = key[prefix_len:]
        league, separator, _rest = filename.partition("_")
        if not separator:
            # No underscore: the league id cannot be derived, so leave the file alone.
            continue
        check_file_key = f"league_{apiEndpoint}/{target_year}/{league}{apiEndpoint}.json"
        if check_file_exists(s3_client, check_file_key):
            # Target already exists: the marker file is no longer needed.
            s3_client.delete_object(Bucket=bucket_name, Key=key)
            continue
        players_list = []
        # Extract the players data and load it to S3.
        data_string, count = players_data(X_RapidAPI_Key, X_RapidAPI_Host, league, target_year, '1', players_list, count)
        if data_string == "[]" or data_string == "":
            # No player data for this league: drop the marker file.
            s3_client.delete_object(Bucket=bucket_name, Key=key)
            continue
        # Normalize the JSON into one line per statistics entry.
        json_lines = FlattenJSONFile(data_string)
        s3_client.put_object(
            Bucket=bucket_name,
            Key=check_file_key,
            Body=json_lines,
        )
        print(f"target_key={check_file_key}")
        # Target file created: drop the marker file.
        s3_client.delete_object(Bucket=bucket_name, Key=key)
        print(f"count={count}")
        if count >= max_api_calls:
            break
    if count >= max_api_calls:
        print("#"*10)
        print(f"count={count}")
        break
        

file =backup/forplayers/league_teams/2021/100_2021teams.json
target_key=league_players/2021/100players.json
count=16
file =backup/forplayers/league_teams/2021/101_2021teams.json
file =backup/forplayers/league_teams/2021/102_2021teams.json
file =backup/forplayers/league_teams/2021/103_2021teams.json
file =backup/forplayers/league_teams/2021/104_2021teams.json
file =backup/forplayers/league_teams/2021/105_2021teams.json
file =backup/forplayers/league_teams/2021/106_2021teams.json
file =backup/forplayers/league_teams/2021/107_2021teams.json
file =backup/forplayers/league_teams/2021/108_2021teams.json
file =backup/forplayers/league_teams/2021/109_2021teams.json
file =backup/forplayers/league_teams/2021/10_2021teams.json
file =backup/forplayers/league_teams/2021/110_2021teams.json
file =backup/forplayers/league_teams/2021/111_2021teams.json
file =backup/forplayers/league_teams/2021/112_2021teams.json
file =backup/forplayers/league_teams/2021/113_2021teams.json
file =backup/forplayers/league

file =backup/forplayers/league_teams/2021/213_2021teams.json
file =backup/forplayers/league_teams/2021/214_2021teams.json
file =backup/forplayers/league_teams/2021/215_2021teams.json
file =backup/forplayers/league_teams/2021/216_2021teams.json
file =backup/forplayers/league_teams/2021/217_2021teams.json
file =backup/forplayers/league_teams/2021/218_2021teams.json
file =backup/forplayers/league_teams/2021/219_2021teams.json
target_key=league_players/2021/219players.json
count=609
file =backup/forplayers/league_teams/2021/220_2021teams.json
file =backup/forplayers/league_teams/2021/221_2021teams.json
file =backup/forplayers/league_teams/2021/222_2021teams.json
file =backup/forplayers/league_teams/2021/223_2021teams.json
file =backup/forplayers/league_teams/2021/224_2021teams.json
file =backup/forplayers/league_teams/2021/225_2021teams.json
file =backup/forplayers/league_teams/2021/226_2021teams.json
file =backup/forplayers/league_teams/2021/227_2021teams.json
file =backup/forplayers/leag

file =backup/forplayers/league_teams/2021/330_2021teams.json
file =backup/forplayers/league_teams/2021/331_2021teams.json
file =backup/forplayers/league_teams/2021/332_2021teams.json
file =backup/forplayers/league_teams/2021/333_2021teams.json
file =backup/forplayers/league_teams/2021/334_2021teams.json
file =backup/forplayers/league_teams/2021/335_2021teams.json
file =backup/forplayers/league_teams/2021/336_2021teams.json
file =backup/forplayers/league_teams/2021/337_2021teams.json
file =backup/forplayers/league_teams/2021/338_2021teams.json
file =backup/forplayers/league_teams/2021/339_2021teams.json
file =backup/forplayers/league_teams/2021/340_2021teams.json
target_key=league_players/2021/340players.json
count=987
file =backup/forplayers/league_teams/2021/341_2021teams.json
file =backup/forplayers/league_teams/2021/342_2021teams.json
file =backup/forplayers/league_teams/2021/343_2021teams.json
file =backup/forplayers/league_teams/2021/344_2021teams.json
target_key=league_players/20