# Fixture statistics: fetch from the API, flatten, and upload to S3

In [1]:
import requests
import os
import configparser
import boto3
import json
import time
from datetime import datetime, timedelta
import pandas as pd
from pandas import json_normalize

In [2]:
# Parse the local dl.cfg file; read() returns the list of files it
# successfully parsed (a missing file is skipped silently, not an error).
config = configparser.ConfigParser()
config.read(filenames='dl.cfg')

['dl.cfg']

In [3]:
# Copy the AWS credentials from the [AWS] config section into the process
# environment (presumably so boto3 picks them up when the client is built).
for _aws_var in ('AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'):
    os.environ[_aws_var] = config['AWS'][_aws_var]

In [4]:
# RapidAPI credentials come from the [FOOTBALLAPI] section of the config.
_football_api_cfg = config['FOOTBALLAPI']
X_RapidAPI_Key = _football_api_cfg['X_RapidAPI_Key']
X_RapidAPI_Host = _football_api_cfg['X_RapidAPI_Host']

In [5]:
# Headers passed along with each request to the football API.
headers = {}
headers["X-RapidAPI-Key"] = X_RapidAPI_Key
headers["X-RapidAPI-Host"] = X_RapidAPI_Host

In [6]:
# Bucket used for both the backup fixture lists and the flattened output.
bucket_name = 'capstonefootballbucket'
# Low-level S3 client shared by every cell below.
s3_client = boto3.client(service_name='s3')

In [7]:
# Season to process, and the fixtures sub-endpoint to query for each fixture.
apiEndpoint = "statistics"
target_year = '2022'
url = "https://api-football-v1.p.rapidapi.com/v3/fixtures/{}".format(apiEndpoint)

In [8]:
def check_file_exists(s3_client, file_key):
    """
    Report whether an object with the given key already exists in the bucket.

    Sends a HEAD request for ``file_key`` against the module-level
    ``bucket_name``.

    Args:
        s3_client: boto3 S3 client used to issue the request.
        file_key: Key of the object to look for.

    Returns:
        True if ``head_object`` succeeds, False if S3 reports the key as
        missing.

    Raises:
        botocore.exceptions.ClientError: For any failure other than
            "not found" (e.g. access denied, throttling). These are
            re-raised so they are not mistaken for "file absent", which
            would trigger a needless re-download and overwrite.
    """
    from botocore.exceptions import ClientError
    try:
        s3_client.head_object(Bucket=bucket_name, Key=file_key)
    except ClientError as e:
        # A HEAD response carries no error body, so a missing key surfaces
        # as the bare HTTP status "404" rather than "NoSuchKey".
        error_code = str(e.response.get('Error', {}).get('Code', ''))
        if error_code in ('404', 'NoSuchKey', 'NotFound'):
            return False
        raise
    return True

In [9]:
def is_valid_content(file_content):
    """
    Return True when the content has no newline character, or when its
    whitespace-stripped form begins with '['.

    NOTE(review): the earlier comment described the second condition as
    "contains a newline", which is the opposite of what the code tests;
    confirm which one is intended.
    """
    if '\n' not in file_content:
        return True
    stripped = file_content.strip()
    return stripped.startswith('[')

In [10]:
def FlattenJSONFile(file_content, fixtureId):
    """
    Flatten a fixture-statistics JSON array into JSON Lines text.

    Each element of the decoded array becomes one output line. Nested
    objects are flattened with ``json_normalize`` (keys joined by '_'), a
    ``fixture_id`` column is added, and every entry of the element's
    ``statistics`` list is pivoted into its own column named after the
    entry's ``type`` (spaces replaced by '_') holding the entry's value.

    Args:
        file_content: JSON text that decodes to a list of objects.
        fixtureId: Fixture identifier; converted with ``int()``.

    Returns:
        A string with one JSON record per line
        (``DataFrame.to_json(orient='records', lines=True)``).

    Raises:
        ValueError: If ``file_content`` is not valid JSON or ``fixtureId``
            is not an integer literal.
    """
    data = json.loads(file_content)
    item_frames = []
    for item in data:
        # Flatten the nested JSON objects; list values stay in one cell.
        item_df = json_normalize(item, sep='_')
        # The payload itself does not carry the fixture id, so attach it.
        item_df = item_df.assign(fixture_id=int(fixtureId))
        if 'statistics' in item_df.columns:
            raw_stats = item_df['statistics'].values[0]
            if not isinstance(raw_stats, list):
                # null / missing statistics: treat as "no statistics".
                raw_stats = []
            stats_entries = [pd.Series(stats) for stats in raw_stats if stats]
            item_df = item_df.drop(columns=['statistics']).reset_index(drop=True)
            # pd.concat raises on an empty list, so only pivot when at
            # least one statistic entry is present.
            if stats_entries:
                # One row per entry, then pivot so each type is a column.
                statistics_df = pd.concat(stats_entries, axis=1).T
                statistics_df = statistics_df.set_index('type').T
                statistics_df.columns = statistics_df.columns.str.replace(' ', '_')
                # Align on the item's index so the column-wise concat
                # puts both parts on the same row.
                statistics_df.index = item_df.index
                item_df = pd.concat([item_df, statistics_df], axis=1)
        else:
            print("statistics column does not exist in the DataFrame.")
        item_frames.append(item_df)

    # Concatenate once at the end instead of growing a frame per item.
    if item_frames:
        flattened_data = pd.concat(item_frames, ignore_index=True)
    else:
        flattened_data = pd.DataFrame()

    json_lines = flattened_data.to_json(orient='records', lines=True)
    return json_lines

## List files in S3 using paginator

In [11]:
# Walk the backup fixture-list files for the target year. For every fixture
# id found in a file, fetch its statistics from the API (unless the output
# object already exists), flatten them and store them in S3. A backup file
# is removed only after all of its fixtures were handled without error.
paginator = s3_client.get_paginator("list_objects_v2")

# Delimiter='/' restricts the listing to this folder level, so files in
# subfolders are excluded.
response = paginator.paginate(
    Bucket=bucket_name,
    Prefix=f'backup/for{apiEndpoint}/league_fixtures/{target_year}/',
    Delimiter='/',
    PaginationConfig={"PageSize": 300},
)
# Stop starting new files once this many API requests have been made
# (presumably an API quota -- confirm against the RapidAPI plan).
max_api_calls = 1000
count = 0
for page in response:
    # "Contents" is absent when a page lists no objects.
    files = page.get("Contents") or []
    for file in files:
        key = file['Key']
        if key.endswith('/'):
            continue
        if target_year not in key:
            continue
        print(f"file ={key}")
        s3_clientobj = s3_client.get_object(Bucket=bucket_name, Key=key)
        s3_clientdata = s3_clientobj['Body'].read().decode('utf-8')
        # Becomes False as soon as one fixture could not be fetched, so the
        # backup file is kept and the fixture can be retried on a later run.
        all_fixtures_done = True
        for query in json.loads(s3_clientdata):
            fixtureId = f"{query['fixture']['id']}"
            check_file_key = f"fixture_{apiEndpoint}/{target_year}/{fixtureId}{apiEndpoint}.json"
            if check_file_exists(s3_client, check_file_key):
                continue

            querystring = {"fixture": fixtureId}
            count = count + 1
            try:
                # A timeout keeps one stalled connection from hanging the run.
                data = requests.request(
                    "GET", url, headers=headers, params=querystring, timeout=30
                )
            except requests.exceptions.RequestException as exc:
                print(f"Request failed for fixture {fixtureId}: {exc}")
                all_fixtures_done = False
                continue
            if data.status_code != 200:
                print(f"Status code: {data.status_code}")
                all_fixtures_done = False
                continue
            try:
                # Parse the body once instead of on every access.
                payload = data.json()
            except ValueError:
                print(f"response body is not JSON, fixture={fixtureId}")
                all_fixtures_done = False
                continue
            if 'response' not in payload:
                print(f"response not in data json, year={target_year}")
                all_fixtures_done = False
                continue
            if payload['response'] is None:
                print("response is None")
                all_fixtures_done = False
                continue

            data_string = json.dumps(payload['response'], default=str)
            if data_string == "[]" or data_string == "":
                # The API returned no statistics for this fixture.
                continue
            # Normalize the JSON into one record per line before upload.
            json_lines = FlattenJSONFile(data_string, fixtureId)
            s3_client.put_object(
                Bucket=bucket_name,
                Key=check_file_key,
                Body=json_lines,
            )
            print(f"target_key={check_file_key}")

        if all_fixtures_done:
            # Every fixture in this backup file is handled; remove the file.
            s3_client.delete_object(Bucket=bucket_name, Key=key)
            print(f"Delete file key={key}")
        else:
            print(f"Keep file key={key} (some fixtures failed, retry later)")
        if count >= max_api_calls:
            print(f"count={count}")
            break
    print("#" * 100)
    print(f"count={count}")
    if count >= max_api_calls:
        break
    

file =backup/forstatistics/league_fixtures/2022/19_2022fixtures.json
Delete file key=backup/forstatistics/league_fixtures/2022/19_2022fixtures.json
file =backup/forstatistics/league_fixtures/2022/1_2022fixtures.json
target_key=fixture_statistics/2022/855734statistics.json
target_key=fixture_statistics/2022/855735statistics.json
target_key=fixture_statistics/2022/855736statistics.json
target_key=fixture_statistics/2022/855737statistics.json
target_key=fixture_statistics/2022/855738statistics.json
target_key=fixture_statistics/2022/855739statistics.json
target_key=fixture_statistics/2022/855740statistics.json
target_key=fixture_statistics/2022/855741statistics.json
target_key=fixture_statistics/2022/855742statistics.json
target_key=fixture_statistics/2022/855743statistics.json
target_key=fixture_statistics/2022/855744statistics.json
target_key=fixture_statistics/2022/855745statistics.json
target_key=fixture_statistics/2022/855746statistics.json
target_key=fixture_statistics/2022/855747st

target_key=fixture_statistics/2022/884406statistics.json
target_key=fixture_statistics/2022/884407statistics.json
target_key=fixture_statistics/2022/884408statistics.json
target_key=fixture_statistics/2022/884409statistics.json
target_key=fixture_statistics/2022/884410statistics.json
target_key=fixture_statistics/2022/884411statistics.json
target_key=fixture_statistics/2022/884412statistics.json
target_key=fixture_statistics/2022/884413statistics.json
target_key=fixture_statistics/2022/884414statistics.json
target_key=fixture_statistics/2022/884415statistics.json
target_key=fixture_statistics/2022/884416statistics.json
target_key=fixture_statistics/2022/884417statistics.json
target_key=fixture_statistics/2022/884418statistics.json
target_key=fixture_statistics/2022/884419statistics.json
target_key=fixture_statistics/2022/884420statistics.json
target_key=fixture_statistics/2022/884421statistics.json
target_key=fixture_statistics/2022/884422statistics.json
target_key=fixture_statistics/2

target_key=fixture_statistics/2022/884551statistics.json
target_key=fixture_statistics/2022/884552statistics.json
target_key=fixture_statistics/2022/884553statistics.json
target_key=fixture_statistics/2022/884554statistics.json
target_key=fixture_statistics/2022/884556statistics.json
target_key=fixture_statistics/2022/884557statistics.json
target_key=fixture_statistics/2022/884558statistics.json
target_key=fixture_statistics/2022/884559statistics.json
target_key=fixture_statistics/2022/884561statistics.json
target_key=fixture_statistics/2022/884562statistics.json
target_key=fixture_statistics/2022/884564statistics.json
target_key=fixture_statistics/2022/884565statistics.json
target_key=fixture_statistics/2022/884566statistics.json
target_key=fixture_statistics/2022/884567statistics.json
target_key=fixture_statistics/2022/884569statistics.json
target_key=fixture_statistics/2022/884570statistics.json
target_key=fixture_statistics/2022/884571statistics.json
target_key=fixture_statistics/2