In [None]:
import json

import pandas as pd
from pytz import timezone

In [None]:
# Load the raw match table and normalise every column name to lower case.
raw_path = "../data/raw/raw_data.parquet"
df = pd.read_parquet(raw_path)
df.columns = list(map(str.lower, df.columns))

In [None]:
# Convert the UK kick-off date/time to Japan Standard Time (JST).
uk_tz = timezone("Europe/London")
jp_tz = timezone("Asia/Tokyo")

# Combined "dd/mm/YYYY HH:MM" string, as the raw columns provide it.
df["date_time"] = df["date"] + " " + df["time"]

# Naive (timezone-unaware) UK wall-clock timestamps.
naive_uk = pd.to_datetime(df["date_time"], format="%d/%m/%Y %H:%M")

# Vectorised localisation/conversion instead of a per-row Python callback:
# faster, and missing values (NaT) pass through instead of reaching pytz.
# DST edge cases are resolved explicitly rather than raising:
#   * ambiguous autumn-changeover times use the non-DST reading,
#   * nonexistent spring-changeover times are moved forward by one hour.
df["jst"] = (
    naive_uk.dt.tz_localize(
        uk_tz,
        ambiguous=pd.Series(False, index=naive_uk.index).to_numpy(),
        nonexistent=pd.Timedelta(hours=1),
    )
    .dt.tz_convert(jp_tz)
)

# Overwrite date/time with their JST representation (ISO date, 24h time).
df["date"] = df["jst"].dt.strftime("%Y-%m-%d")

df["time"] = df["jst"].dt.strftime("%H:%M")

# Assign each distinct team name (home or away) a string code based on its
# position in the alphabetically sorted list of names.
teams = pd.unique(df[["home", "away"]].to_numpy().ravel("K"))
teams.sort()

encode_team = dict(zip(teams, map(str, range(len(teams)))))
# Sentinel entry: the literal name "None" maps to code "-1".
encode_team["None"] = "-1"

# Reverse lookup: code -> team name.
decode_team = {code: name for name, code in encode_team.items()}

# Replace the team names in both columns with their codes.
for side in ("home", "away"):
    df[side] = df[side].map(encode_team)

# Signed goal margin (home goals minus away goals) and its magnitude.
signed_margin = df["hg"].sub(df["ag"])
df["goals_diff"] = signed_margin
df["goals_abs_diff"] = signed_margin.abs()

In [None]:
# Keep only the columns needed downstream, in this order.
col = [
    # match identification
    "season",
    "date",
    "time",
    "home",
    "away",
    # score and result
    "hg",
    "ag",
    "res",
    # avgch / avgcd / avgca columns from the raw data
    "avgch",
    "avgcd",
    "avgca",
    # derived goal margins
    "goals_diff",
    "goals_abs_diff",
]
df = df.loc[:, col].copy()

In [None]:
# Write both team-code lookup tables as JSON, then the cleansed match table.
for json_path, mapping in (
    ("../data/mapping/encoder.json", encode_team),
    ("../data/mapping/decoder.json", decode_team),
):
    with open(json_path, "w") as handle:
        json.dump(mapping, handle)

cleansed_path = "../data/cleansed/cleansed_data.parquet"
df.to_parquet(cleansed_path)

#### Fixture Information

In [None]:
# Fixture view: the cleansed table without the goal counts and goal margins.
col = [
    "season",
    "date",
    "time",
    "home",
    "away",
    "res",
    "avgch",
    "avgcd",
    "avgca",
]
fixtures = df.loc[:, col].copy()

In [None]:
# Save the fixture table next to the other cleansed outputs.
fixtures_path = "../data/cleansed/fixtures.parquet"
fixtures.to_parquet(fixtures_path)

#### Individual Team Results

In [None]:
# Build one row per (match, side): each match appears once from the away
# team's point of view (stadium = 0) and once from the home team's
# (stadium = 1).
outcome = df["res"].str.lower()  # lower-cased result code, shared by both views

dfs = []
for stadium, side in ((0, "away"), (1, "home")):
    initial = side[0]  # "a" or "h": matches both the goals column prefix and the result code

    # This side's team/goals columns become the generic "team"/"goals".
    view = df.rename(columns={side: "team", f"{initial}g": "goals"}).copy()

    # Goal margin signed from this side's perspective: positive only when
    # the result code says this side won.
    sign = (outcome == initial).map({True: 1, False: -1})
    view["net_goals"] = view["goals_abs_diff"] * sign

    # 3 points for a win, 1 for a draw ("d"), 0 for anything else.
    view["points"] = (
        outcome.map({initial: 3, "d": 1})
        .fillna(0)
        .astype(int)
    )

    view["stadium"] = stadium

    dfs.append(view)

plays = pd.concat(dfs, ignore_index=True)

In [None]:
# Keep the per-team result columns and order the rows by date.
col = [
    "season",
    "date",
    "team",
    "goals",
    "net_goals",
    "points",
    "stadium",
]
plays = plays.loc[:, col].sort_values(by="date", ignore_index=True)

In [None]:
# Save the per-team results table next to the other cleansed outputs.
plays_path = "../data/cleansed/plays.parquet"
plays.to_parquet(plays_path)