## Data Pre-Processing
---

In [1]:
import numpy as np
import pandas as pd

# Show every column when a DataFrame is rendered (no column truncation).
pd.options.display.max_columns = None

### Data Import
---

In [2]:
# Load the raw match table from the Parquet file in the sibling data folder.
j1 = pd.read_parquet(path="../data/j1_league.parquet")

### Data Cleansing
---

In [3]:
# Keep only the calendar day of each match (time-of-day reset to 00:00).
j1["date"] = pd.to_datetime(j1["date"]).dt.normalize()

# Store both score columns as integers.
j1[["goals_home", "goals_away"]] = j1[["goals_home", "goals_away"]].astype(int)

# `handicap` is parsed as text: one or more comma-separated numbers per row,
# where the literal "nan" stands for a missing entry and is counted as 0.
# The row value becomes the mean of its entries.
#
# The column is cast to `str` first so that this cell can be executed more than
# once: after the first run the column is float, and calling `.str` on a float
# column raises AttributeError. The cast also turns a genuine float NaN into
# the text "nan", which then follows the same "missing -> 0" path.
# `regex=False` pins the replacement to a literal match regardless of the
# pandas version's default.
j1["handicap"] = (
    j1["handicap"]
    .astype(str)
    .str.replace("nan", "0", regex=False)
    .str.split(",")
    .apply(lambda parts: np.mean([float(part) for part in parts]))
)

### Data Wrangling
---

In [4]:
# Season label: the calendar year of the match date.
j1["season"] = j1["date"].dt.year

# Home points: 3 when the home side scored more, 0 when it scored fewer,
# 1 when both sides scored the same.
home_won = j1["goals_home"] > j1["goals_away"]
away_won = j1["goals_home"] < j1["goals_away"]
j1["points_home"] = np.where(home_won, 3, np.where(away_won, 0, 1))

# Away points mirror the home points (3 <-> 0, 1 stays 1).
away_points_for_home_points = {3: 0, 0: 3, 1: 1}
j1["points_away"] = j1["points_home"].map(away_points_for_home_points)

# Signed goal margin seen from each side: positive for the side that scored
# more, negative for the other, zero when level.
goal_margin = j1["goals_home"] - j1["goals_away"]
j1["net_goals_home"] = goal_margin
j1["net_goals_away"] = -goal_margin

# `results` is 1 when the home goals plus the handicap exceed the away goals,
# or when they are exactly level and the handicap is positive; otherwise 0.
adjusted_home_goals = j1["goals_home"] + j1["handicap"]
ahead_after_handicap = adjusted_home_goals > j1["goals_away"]
level_with_positive_handicap = (
    (adjusted_home_goals == j1["goals_away"]) & (j1["handicap"] > 0)
)
j1["results"] = (ahead_after_handicap | level_with_positive_handicap).astype("int64")

### Normalisation
---

In [5]:
# One row per match: who played whom, the handicap, and the handicap outcome flag.
fixture_columns = ["date", "home", "away", "handicap", "results"]
fixtures = j1.loc[:, fixture_columns].copy()
fixtures.head()

Unnamed: 0,date,home,away,handicap,results
0,2015-03-07,Gamba Osaka,FC Tokyo,-0.5,0
1,2015-03-07,Nagoya Grampus,Matsumoto Y FC,-0.75,0
2,2015-03-07,Sagan Tosu,Albirex Niigata,-0.25,1
3,2015-03-07,Sanfrecce Hiroshima,Ventforet Kofu,-0.5,1
4,2015-03-07,Vegalta Sendai,Montedio Yamagata,-0.25,1


In [6]:
# Home-side view of every match: strip the "_home" suffix, call the club
# column "team", and flag the rows with stadium = 1.
home = (
    j1[["season", "date", "home", "goals_home", "points_home", "net_goals_home"]]
    .rename(
        columns={
            "home": "team",
            "goals_home": "goals",
            "points_home": "points",
            "net_goals_home": "net_goals",
        }
    )
    .assign(stadium=1)
)

In [7]:
# Away-side view of every match: strip the "_away" suffix, call the club
# column "team", and flag the rows with stadium = 0.
away = (
    j1[["season", "date", "away", "goals_away", "points_away", "net_goals_away"]]
    .rename(
        columns={
            "away": "team",
            "goals_away": "goals",
            "points_away": "points",
            "net_goals_away": "net_goals",
        }
    )
    .assign(stadium=0)
)

In [8]:
# Stack the home and away views into one table with one row per team per
# match, ordered by match date with a fresh 0..n-1 index.
plays = (
    pd.concat([home, away], ignore_index=True)
    .sort_values("date", ignore_index=True)
)
plays.head()

Unnamed: 0,season,date,team,goals,points,net_goals,stadium
0,2015,2015-03-07,Gamba Osaka,2,1,0,1
1,2015,2015-03-07,Kashiwa Reysol,1,3,1,0
2,2015,2015-03-07,Kawasaki Frontale,3,3,2,0
3,2015,2015-03-07,Montedio Yamagata,0,0,-2,0
4,2015,2015-03-07,FC Tokyo,2,1,0,0


### Data Export
---

In [9]:
# Write both derived tables as Parquet files into the sibling data folder.
export_targets = {
    "../data/fixtures.parquet": fixtures,
    "../data/plays.parquet": plays,
}
for target_path, frame in export_targets.items():
    frame.to_parquet(target_path)