# Looping through "year" variable within the API and extracting data for every year

In [36]:
# Dependencies
import json
import requests
import os
import pandas as pd
from pandas import json_normalize
from pprint import pprint

In [37]:
api_key = os.getenv('gd_api_key')

In [38]:
# Define List of Years to query

years = ['2017', '2018', '2019', '2020', '2021', '2022', '2023']

In [39]:
# Define the variables for API URL

file_format = "json"

tour = "pga"

event_id = "all"

url = "https://feeds.datagolf.com/historical-raw-data/rounds?"

In [42]:
# Create a list for all data and loop through each year, query the API, and compile the data into one list

all_data = {}

for year in years:
    query_url = f"{url}tour={tour}&event_id={event_id}&year={year}&file_format={file_format}&key={api_key}"
    response = requests.get(query_url)
    
    if response.status_code == 200:
        data = response.json()
        for event_id, event_data in data.items():
            unique_event_id = f"{year}_{event_id}"
            all_data[unique_event_id] = event_data
        print(f"Data for year {year} retrieved successfully. Total records so far: {len(all_data)}")
    else:
        print(f"Failed to retrieve data for year {year}. Status code: {response.status_code}")

Data for year 2017 retrieved successfully. Total records so far: 46
Data for year 2018 retrieved successfully. Total records so far: 55
Failed to retrieve data for year 2019. Status code: 400
Failed to retrieve data for year 2020. Status code: 400
Failed to retrieve data for year 2021. Status code: 500
Failed to retrieve data for year 2022. Status code: 400
Failed to retrieve data for year 2023. Status code: 400


In [41]:
# Inspect the data structure

print(json.dumps(all_data, indent=4, sort_keys=True))

{}


In [21]:
# Inspect the data structure and data types for each key

for data in all_data:
    for key in data.keys():
        print(f"Key: {key}, Data Type: {type(data[key])}")

Key: event_completed, Data Type: <class 'str'>
Key: event_id, Data Type: <class 'str'>
Key: event_name, Data Type: <class 'str'>
Key: scores, Data Type: <class 'list'>
Key: season, Data Type: <class 'int'>
Key: sg_categories, Data Type: <class 'str'>
Key: tour, Data Type: <class 'str'>
Key: traditional_stats, Data Type: <class 'str'>
Key: year, Data Type: <class 'int'>
Key: event_completed, Data Type: <class 'str'>
Key: event_id, Data Type: <class 'str'>
Key: event_name, Data Type: <class 'str'>
Key: scores, Data Type: <class 'list'>
Key: season, Data Type: <class 'int'>
Key: sg_categories, Data Type: <class 'str'>
Key: tour, Data Type: <class 'str'>
Key: traditional_stats, Data Type: <class 'str'>
Key: year, Data Type: <class 'int'>
Key: event_completed, Data Type: <class 'str'>
Key: event_id, Data Type: <class 'str'>
Key: event_name, Data Type: <class 'str'>
Key: scores, Data Type: <class 'list'>
Key: season, Data Type: <class 'int'>
Key: sg_categories, Data Type: <class 'str'>
Key: 

In [22]:
# Extracting scores, flattening the data, and creating the DataFrame
scores_list = []

for event in all_data.values():
    event_completed = event["event_completed"]
    event_id = event["event_id"]
    event_name = event["event_name"]
    season = event["season"]
    sg_categrories = event["sg_categories"]
    traditional_stats = event["traditional_stats"]
    year = event["year"]
    for score in event["scores"]:
        for round_num in range(1, 5):
            round_key = f"round_{round_num}"
            if round_key in score:
                round_data = score[round_key]
                combined_score = {
                    "event_completed": event_completed,
                    "event_id": event_id,
                    "event_name": event_name,
                    "season": season,
                    "sg_categories": sg_categrories,
                    "traditional_stats": traditional_stats,
                    "year": year,
                    "dg_id": score["dg_id"],
                    "fin_text": score["fin_text"],
                    "player_name": score["player_name"],
                    "round": round_num,
                    **round_data
                }
                scores_list.append(combined_score)

# Create the anaylsis DataFrame
scores_df = pd.DataFrame(scores_list)

# Display the DataFrame
scores_df.head()

AttributeError: 'list' object has no attribute 'values'

### Saving the DataFrame to a .csv file to ensure all data was captured correctly

In [9]:
# Create a new folder in in the parent directory to store the CSV file

os.makedirs("../data_files", exist_ok=True)

# Print the file path to the new folder
print(os.path.abspath("../data_files"))

c:\Users\Jon\OneDrive\githubRepo\pga_analysis_project\data_files


In [10]:
# Save the DataFrame to a CSV file in the data_files folder

scores_df.to_csv("../data_files/pga_tour_data.csv", index=False)

# Print the file path to the new CSV file
print(os.path.abspath("../data_files/pga_tour_data.csv"))

c:\Users\Jon\OneDrive\githubRepo\pga_analysis_project\data_files\pga_tour_data.csv
