In [0]:
%pip install python-dotenv
# Installs python-dotenv into the notebook environment; it provides the `dotenv`
# module that a later cell imports `load_dotenv` from.
# TODO(review): the version is unpinned - consider pinning a tested release for reproducible runs.

In [0]:
# Restart the Python process so the package installed by the preceding %pip cell
# can be imported. NOTE(review): per Databricks docs this clears Python state,
# so it has to run before any variables are defined - confirm for your runtime.
dbutils.library.restartPython()

In [0]:
from dotenv import load_dotenv
import os
import requests
import json

def fetch_matches(
    season=2025,
    competition='PD',
    output_path='/Volumes/workspace/default/kickoff_volume/matches.json',
    timeout=30,
):
    """Download finished matches from football-data.org and save them as JSON.

    Environment variables are loaded from ``../.env`` and the value of
    ``FOOTBALLDATA_TOKEN`` is sent as the ``X-Auth-Token`` header. In every
    match the nested ``homeTeam`` / ``awayTeam`` objects are replaced by flat
    ``homeTeam_id`` / ``awayTeam_id`` keys holding only the team ``id``; the
    rest of the response is written to ``output_path`` unchanged.

    Args:
        season: Value sent as the ``season`` query parameter.
        competition: Competition code inserted into the request URL.
        output_path: File the (modified) JSON response is written to.
        timeout: Seconds ``requests`` waits for the server before giving up,
            so a stalled connection cannot hang the notebook forever.

    Returns:
        None. The result is the file written to ``output_path``.

    Raises:
        Exception: Any error raised while requesting, transforming or writing
            the data (e.g. ``requests.HTTPError`` from ``raise_for_status``,
            ``KeyError`` for a response missing ``matches``/``homeTeam``/
            ``awayTeam``/``id``, ``OSError`` from the file write) is reported
            on stdout and then re-raised unchanged.
    """
    load_dotenv('../.env')

    # NOTE(review): os.getenv returns None when the variable is missing;
    # presumably the request then goes out unauthenticated - verify.
    headers = {'X-Auth-Token': os.getenv('FOOTBALLDATA_TOKEN')}
    # Keep every filter in `params` instead of splitting them between the URL
    # and the params dict.
    params = {'status': 'FINISHED', 'season': season}
    api_url = f'https://api.football-data.org/v4/competitions/{competition}/matches'

    try:
        resp = requests.get(api_url, headers=headers, params=params, timeout=timeout)
        resp.raise_for_status()

        matches_json = resp.json()
        for match in matches_json['matches']:
            # Swap each nested team object for its numeric id only.
            match['homeTeam_id'] = match.pop('homeTeam')['id']
            match['awayTeam_id'] = match.pop('awayTeam')['id']

        with open(output_path, 'w', encoding='utf-8') as f:
            # No `indent`: the whole document stays on a single line, which is
            # what the Spark JSON read in this notebook (no multiLine option)
            # relies on.
            json.dump(matches_json, f)
    except Exception as exc:
        # `Exception` (not a bare except) so Ctrl-C / SystemExit are not
        # reported as API failures; the original error is always re-raised.
        print(f'An error ocurred during the FootballDataORG petition: {exc!r}')
        raise

# Run the download right away so matches.json exists before the Spark read in a later cell.
fetch_matches()

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, LongType, ArrayType

# Fields of a `competition` object; reused at the document root and inside each match.
# StructType.add() leaves every field nullable by default.
competition_schema = (
    StructType()
    .add("code", StringType())
    .add("emblem", StringType())
    .add("id", LongType())
    .add("name", StringType())
    .add("type", StringType())
)

# Fields of the root-level `filters` object: a season number and a list of status strings.
filters_schema = (
    StructType()
    .add("season", LongType())
    .add("status", ArrayType(StringType()))
)

# Fields of the `area` object attached to each match (all nullable).
area_schema = (
    StructType()
    .add("code", StringType())
    .add("flag", StringType())
    .add("id", LongType())
    .add("name", StringType())
)

# Fields of one entry in a match's `referees` array (all nullable).
referee_schema = (
    StructType()
    .add("id", LongType())
    .add("name", StringType())
    .add("nationality", StringType())
    .add("type", StringType())
)

# away/home pair of integers shared by the `fullTime` and `halfTime` score fields.
score_substruct = (
    StructType()
    .add("away", LongType())
    .add("home", LongType())
)

# Fields of a match's `score` object; both period scores reuse score_substruct.
score_schema = (
    StructType()
    .add("duration", StringType())
    .add("fullTime", score_substruct)
    .add("halfTime", score_substruct)
    .add("winner", StringType())
)

# Fields of the `season` object nested in each match; dates are kept as plain strings.
match_season_schema = (
    StructType()
    .add("currentMatchday", LongType())
    .add("endDate", StringType())
    .add("id", LongType())
    .add("startDate", StringType())
    .add("winner", StringType())
)

# One element of the `matches` array. `homeTeam_id` / `awayTeam_id` are the flat
# keys that fetch_matches() writes in place of the nested team objects.
# Timestamps (`lastUpdated`, `utcDate`) are kept as plain strings.
match_schema = (
    StructType()
    .add("area", area_schema)
    .add("awayTeam_id", LongType())
    .add("competition", competition_schema)
    .add("group", StringType())
    .add("homeTeam_id", LongType())
    .add("id", LongType())
    .add("lastUpdated", StringType())
    .add("matchday", LongType())
    .add("odds", StructType().add("msg", StringType()))
    .add("referees", ArrayType(referee_schema))
    .add("score", score_schema)
    .add("season", match_season_schema)
    .add("stage", StringType())
    .add("status", StringType())
    .add("utcDate", StringType())
)

# Fields of the root-level `resultSet` object (all nullable).
result_set_schema = (
    StructType()
    .add("count", LongType())
    .add("first", StringType())
    .add("last", StringType())
    .add("played", LongType())
)

# Top-level layout of matches.json: competition info, the applied filters,
# the list of matches and a result-set summary.
root_schema = (
    StructType()
    .add("competition", competition_schema)
    .add("filters", filters_schema)
    .add("matches", ArrayType(match_schema))
    .add("resultSet", result_set_schema)
)

In [0]:
# Load the raw API dump with the explicit root_schema rather than letting Spark infer one.
df_bronze = spark.read.json(
    '/Volumes/workspace/default/kickoff_volume/matches.json',
    schema=root_schema,
)

# Quick sanity check: show the applied schema and a small preview of the data.
df_bronze.printSchema()
display(df_bronze.limit(10))

In [0]:
# Save the bronze DataFrame as table `raw_matches`, replacing any previous contents.
# `overwriteSchema` presumably lets the overwrite replace the table schema too
# (Delta option - assumes the default table format is Delta; confirm).
(
    df_bronze.write
    .mode('overwrite')
    .option('overwriteSchema', 'true')
    .saveAsTable('raw_matches')
)