# Lineup data: fetch, transform, and upload to S3

In [1]:
import requests
import os
import configparser
import boto3
import json
import time
from datetime import datetime, timedelta
import pandas as pd
from pandas import json_normalize

In [2]:
# Parse the local 'dl.cfg' file; later cells read its AWS and FOOTBALLAPI sections.
config = configparser.ConfigParser()
config.read(filenames='dl.cfg')

['dl.cfg']

In [3]:
# Export the AWS credentials from the config file as environment variables.
os.environ.update(
    AWS_ACCESS_KEY_ID=config['AWS']['AWS_ACCESS_KEY_ID'],
    AWS_SECRET_ACCESS_KEY=config['AWS']['AWS_SECRET_ACCESS_KEY'],
)

In [4]:
# RapidAPI key and host, read from the FOOTBALLAPI section of the config file.
X_RapidAPI_Key, X_RapidAPI_Host = (
    config['FOOTBALLAPI']['X_RapidAPI_Key'],
    config['FOOTBALLAPI']['X_RapidAPI_Host'],
)

In [5]:
# Headers sent with each football API request: the RapidAPI key and host.
headers = dict([
    ("X-RapidAPI-Key", X_RapidAPI_Key),
    ("X-RapidAPI-Host", X_RapidAPI_Host),
])

In [6]:
# Bucket that the notebook both reads fixture backups from and writes lineup files to.
bucket_name = 'capstonefootballbucket'
# S3 client used for every list/get/put/delete call below.
s3_client = boto3.client('s3')

In [7]:
# Season to process (kept as a string because it is embedded in S3 keys).
target_year = '2023'
# Fixtures sub-endpoint to call; also used to build the output key names.
apiEndpoint = "lineups"
url = "https://api-football-v1.p.rapidapi.com/v3/fixtures/" + apiEndpoint

In [8]:
def check_file_exists(s3_client, file_key):
    """
    Report whether an object with the given key already exists in the bucket.

    Args:
        s3_client: boto3 S3 client used to issue the HEAD request.
        file_key: Object key to look up in the module-level ``bucket_name``.

    Returns:
        True when ``head_object`` succeeds, False when S3 reports the key
        as not found.

    Raises:
        botocore.exceptions.ClientError: For any failure other than
            "not found" (for example access denied). Such errors used to be
            reported as "file missing", which hid the real problem from the
            caller.
    """
    from botocore.exceptions import ClientError
    try:
        s3_client.head_object(Bucket=bucket_name, Key=file_key)
    except ClientError as e:
        # A HEAD response has no body, so a missing key surfaces as the bare
        # status code "404" rather than "NoSuchKey"; accept both spellings.
        if e.response['Error']['Code'] in ('404', 'NoSuchKey', 'NotFound'):
            return False
        raise
    return True

In [9]:
def is_valid_content(file_content):
    """
    Return True when the stripped content starts with '[' or when the
    content contains no newline character; otherwise return False.
    """
    if file_content.strip().startswith('['):
        return True
    return '\n' not in file_content

In [10]:
def _lineup_players(team_df, column, role):
    """Return one 'lineup_'-prefixed row per player listed in *column*, or None when there are none."""
    if column not in team_df.columns:
        return None
    players = team_df[column].values[0]
    # A missing or empty player list yields no rows; the caller then keeps
    # the single team-level row instead of failing.
    if not isinstance(players, list) or not players:
        return None
    players_df = json_normalize(players, sep='_').assign(type=role)
    players_df.columns = ['lineup_' + col for col in players_df.columns]
    return players_df


def FlattenJSONFile(file_content, fixtureId, season=None):
    """
    Flatten a lineups JSON array into JSON Lines, one record per player.

    Each array item is normalized with ``json_normalize``. When the item has
    a ``startXI`` list, its players (tagged ``lineup_type='starter'``) and the
    players of ``substitutes`` (tagged ``'substitute'``) become individual
    rows, each carrying a copy of the item's other columns. Items without a
    ``startXI`` column are kept as a single row and a notice is printed.

    Args:
        file_content: JSON text holding an array of objects.
        fixtureId: Fixture id; stored as the integer ``fixture_id`` column.
        season: Value for the ``season`` column. Defaults to the module-level
            ``target_year`` so existing two-argument calls behave as before.

    Returns:
        str: The flattened records as newline-delimited JSON.
    """
    if season is None:
        season = target_year
    season = int(season)
    fixture_id = int(fixtureId)

    # Collect the per-item frames and concatenate once at the end; growing a
    # DataFrame inside the loop copies all earlier rows on every iteration.
    frames = []
    for item in json.loads(file_content):
        team_df = json_normalize(item, sep='_').assign(fixture_id=fixture_id)
        if 'startXI' in team_df.columns:
            player_frames = [
                frame
                for frame in (
                    _lineup_players(team_df, 'startXI', 'starter'),
                    _lineup_players(team_df, 'substitutes', 'substitute'),
                )
                if frame is not None
            ]
            team_df = team_df.drop(columns=['startXI', 'substitutes'], errors='ignore')
            if player_frames:
                players_df = pd.concat(player_frames, ignore_index=True)
                # Repeat the team-level row once per player so the two frames
                # line up row by row for the column-wise concat.
                team_df = pd.concat([team_df] * len(players_df), ignore_index=True)
                team_df = pd.concat([team_df, players_df], axis=1)
        else:
            print("startXI column does not exist in the DataFrame.")
        # Older output files carry the season column last, so it is added here.
        team_df = team_df.assign(season=season)
        frames.append(team_df)

    if frames:
        flattened_data = pd.concat(frames, ignore_index=True)
    else:
        flattened_data = pd.DataFrame()
    return flattened_data.to_json(orient='records', lines=True)

## List files in S3 using paginator

In [13]:
paginator = s3_client.get_paginator("list_objects_v2")

# Delimiter='/' limits the listing to this prefix level, so files in subfolders are excluded.
response = paginator.paginate(
    Bucket=bucket_name,
    Prefix=f'backup/forlinups/league_fixtures/{target_year}/',
    Delimiter='/',
    PaginationConfig={"PageSize": 300},
)
# Stop once this many API requests have been attempted (checked after each fixtures file).
max_requests = 1000
count = 0
for page in response:
    # "Contents" is absent from a page that lists no objects.
    files = page.get("Contents") or []
    for file in files:
        key = file['Key']
        if key.endswith('/'):
            continue
        if target_year not in key:
            continue
        print(f"file ={key}")
        s3_clientobj = s3_client.get_object(Bucket=bucket_name, Key=key)
        s3_clientdata = s3_clientobj['Body'].read().decode('utf-8')
        # Becomes False when any fixture of this file could not be fetched, so
        # the file is kept and the missing fixtures are retried on a later run.
        file_done = True
        for query in json.loads(s3_clientdata):
            fixtureId = f"{query['fixture']['id']}"
            check_file_key = f"fixture_{apiEndpoint}/{target_year}/{fixtureId}{apiEndpoint}.json"
            # Skip fixtures whose lineup file was already uploaded.
            if check_file_exists(s3_client, check_file_key):
                continue

            querystring = {"fixture": fixtureId}
            count = count + 1
            try:
                # A timeout keeps one stalled connection from hanging the whole run.
                data = requests.request("GET", url, headers=headers, params=querystring, timeout=30)
            except requests.RequestException as exc:
                print(f"Request failed for fixture {fixtureId}: {exc}")
                file_done = False
                continue
            if data.status_code != 200:
                print(f"Status code: {data.status_code}")
                file_done = False
                continue
            payload = data.json()
            if 'response' not in payload:
                print(f"response not in data json, year={target_year}")
                file_done = False
                continue
            if payload['response'] is None:
                print("response is None")
                file_done = False
                continue
            data_string = json.dumps(payload['response'], default=str)
            # An empty response means there is nothing to store for this fixture.
            if data_string == "[]" or data_string == "":
                continue
            # Normalize the nested JSON into JSON Lines before uploading.
            json_lines = FlattenJSONFile(data_string, fixtureId)
            s3_client.put_object(
                Bucket=bucket_name,
                Key=check_file_key,
                Body=json_lines,
            )
            print(f"target_key={check_file_key}")
        if file_done:
            # Every fixture of this file is handled; remove the processed source file.
            s3_client.delete_object(Bucket=bucket_name, Key=key)
            print(f"Delete file key={key}")
        else:
            print(f"Keep file key={key} (some fixtures failed)")
        if count >= max_requests:
            print(f"count={count}")
            break
    print("#" * 10)
    print(f"count={count}")
    if count >= max_requests:
        break
    

file =backup/forlinups/league_fixtures/2023/240_2023fixtures.json
target_key=fixture_lineups/2023/989528lineups.json
target_key=fixture_lineups/2023/989529lineups.json
target_key=fixture_lineups/2023/989530lineups.json
target_key=fixture_lineups/2023/989531lineups.json
target_key=fixture_lineups/2023/989532lineups.json
target_key=fixture_lineups/2023/989533lineups.json
target_key=fixture_lineups/2023/989534lineups.json
target_key=fixture_lineups/2023/989535lineups.json
target_key=fixture_lineups/2023/989536lineups.json
target_key=fixture_lineups/2023/989537lineups.json
target_key=fixture_lineups/2023/989538lineups.json
target_key=fixture_lineups/2023/989539lineups.json
target_key=fixture_lineups/2023/989540lineups.json
target_key=fixture_lineups/2023/989541lineups.json
target_key=fixture_lineups/2023/989542lineups.json
target_key=fixture_lineups/2023/989543lineups.json
target_key=fixture_lineups/2023/989544lineups.json
target_key=fixture_lineups/2023/989545lineups.json
target_key=fixtu

target_key=fixture_lineups/2023/1010871lineups.json
target_key=fixture_lineups/2023/1010872lineups.json
target_key=fixture_lineups/2023/1010873lineups.json
target_key=fixture_lineups/2023/1019974lineups.json
target_key=fixture_lineups/2023/1019975lineups.json
target_key=fixture_lineups/2023/1019976lineups.json
target_key=fixture_lineups/2023/1019977lineups.json
target_key=fixture_lineups/2023/1019978lineups.json
target_key=fixture_lineups/2023/1019979lineups.json
target_key=fixture_lineups/2023/1019980lineups.json
target_key=fixture_lineups/2023/1019981lineups.json
Delete file key=backup/forlinups/league_fixtures/2023/241_2023fixtures.json
file =backup/forlinups/league_fixtures/2023/242_2023fixtures.json
target_key=fixture_lineups/2023/996219lineups.json
target_key=fixture_lineups/2023/996220lineups.json
target_key=fixture_lineups/2023/996221lineups.json
target_key=fixture_lineups/2023/996222lineups.json
target_key=fixture_lineups/2023/996223lineups.json
target_key=fixture_lineups/2023

target_key=fixture_lineups/2023/996423lineups.json
target_key=fixture_lineups/2023/996424lineups.json
target_key=fixture_lineups/2023/996425lineups.json
Delete file key=backup/forlinups/league_fixtures/2023/244_2023fixtures.json
file =backup/forlinups/league_fixtures/2023/245_2023fixtures.json
target_key=fixture_lineups/2023/998948lineups.json
target_key=fixture_lineups/2023/998949lineups.json
target_key=fixture_lineups/2023/998950lineups.json
target_key=fixture_lineups/2023/998951lineups.json
target_key=fixture_lineups/2023/998952lineups.json
target_key=fixture_lineups/2023/998953lineups.json
target_key=fixture_lineups/2023/998954lineups.json
target_key=fixture_lineups/2023/998955lineups.json
target_key=fixture_lineups/2023/998956lineups.json
target_key=fixture_lineups/2023/998957lineups.json
target_key=fixture_lineups/2023/998958lineups.json
target_key=fixture_lineups/2023/998959lineups.json
target_key=fixture_lineups/2023/998960lineups.json
target_key=fixture_lineups/2023/998961line