In [1]:
from os import listdir

from pandas import DataFrame, concat, read_csv

games = []
filenames = listdir("./data/raw")
current_df = None
for file in sorted(filenames, reverse=True):
    if ".csv" in file:
        current_df = read_csv("./data/raw/" + file)[
            10:
        ]  # each file has 10 repeating rows
        games.append(current_df[::-1])
        games_df = concat(games, ignore_index=True)
        games_df = games_df.rename(columns={"winner": "player", "loser": "opponent"})


In [2]:
# Let each row with empty date have the same one from above
from numpy import NaN

games_df = games_df.replace({"\xa0": NaN})
games_df = games_df.fillna(method="backfill")

games_df

Unnamed: 0,date,player,opponent
0,2021.01.01,원성진,최 정
1,2021.01.01,신진서,한승주
2,2021.01.01,나 현,조한승
3,2021.01.01,이영구,이태현
4,2021.01.02,신민준,박건호
...,...,...,...
99532,2000.12.28,마샤오춘,고바야시 사토루
99533,2000.12.28,최명훈,김석흥
99534,2000.12.29,이희성,박승철
99535,2000.12.29,왕레이,저우허양


In [3]:
from tqdm import tqdm

"""
    Compute every player's cumulative streak
    
    PS = Player Streak
    OS = Opponenet Streak
"""

games_df['PS'], games_df['OS'] = 0, 0

for player in tqdm(games_df.player.unique()):
    filtered_df = games_df.loc[
        (games_df.player == player) | (games_df.opponent == player)
    ]
    filtered_df = filtered_df.assign(player_result=lambda row: row.player == player)
    filtered_df = filtered_df.assign(
        player_start_of_streak=filtered_df.player_result.ne(
            filtered_df["player_result"].shift()
        )
    )
    filtered_df = filtered_df.assign(
        player_streak_id=filtered_df["player_start_of_streak"].cumsum()
    )
    filtered_df = filtered_df.assign(
        PS=filtered_df.groupby("player_streak_id").cumcount() + 1
    )

    filtered_df = filtered_df.assign(opponent_result=lambda row: row.opponent == player)
    filtered_df = filtered_df.assign(
        opponent_start_of_streak=filtered_df.opponent_result.ne(
            filtered_df["opponent_result"].shift()
        )
    )
    filtered_df = filtered_df.assign(
        opponent_streak_id=filtered_df["opponent_start_of_streak"].cumsum()
    )
    filtered_df = filtered_df.assign(
        OS=filtered_df.groupby("opponent_streak_id").cumcount() + 1
    )
    filtered_df = filtered_df[['OS', 'PS']].shift(periods=1, fill_value=0)
    
    games_df.update(filtered_df)

games_df

100%|███████████████████████████████████████| 1418/1418 [00:32<00:00, 43.50it/s]


Unnamed: 0,date,player,opponent,PS,OS
0,2021.01.01,원성진,최 정,0.0,0.0
1,2021.01.01,신진서,한승주,0.0,0.0
2,2021.01.01,나 현,조한승,0.0,0.0
3,2021.01.01,이영구,이태현,0.0,0.0
4,2021.01.02,신민준,박건호,0.0,0.0
...,...,...,...,...,...
99532,2000.12.28,마샤오춘,고바야시 사토루,3.0,3.0
99533,2000.12.28,최명훈,김석흥,3.0,3.0
99534,2000.12.29,이희성,박승철,1.0,1.0
99535,2000.12.29,왕레이,저우허양,2.0,2.0


In [4]:
from pandas import concat
from tqdm import tqdm

"""
    Compute every player's cumulative win rates and last 5 game status
    
    PWR = Player Win Rate
    OWR = Opponent Win Rate
    PL5G = Player Last 5 Game
    OL5G = Opponent Last 5 Game
"""

games_df["PWR"], games_df["OWR"] = NaN, NaN
games_df["PL5G"], games_df["OL5G"] = NaN, NaN

for player in tqdm(games_df.player.unique()):
    filtered_df = games_df.loc[
        (games_df.player == player) | (games_df.opponent == player),
        ["player", "opponent"],
    ]
    filtered_df.insert(
        filtered_df.shape[1], "num_games", range(1, len(filtered_df) + 1)
    )
    filtered_df = filtered_df.assign(player_win=lambda row: row.player == player)
    filtered_df = filtered_df.assign(opponent_lose=lambda row: row.opponent == player)

    filtered_df.loc[filtered_df["player"] == player, "PNW"] = filtered_df[
        "player_win"
    ].cumsum()
    filtered_df["PNL"] = filtered_df["num_games"] - filtered_df["PNW"]
    filtered_df.loc[filtered_df["opponent"] == player, "ONL"] = filtered_df[
        "opponent_lose"
    ].cumsum()
    filtered_df["ONW"] = filtered_df["num_games"] - filtered_df["ONL"]

    filtered_df["PWR"] = filtered_df["PNW"] / filtered_df["num_games"]
    filtered_df["OWR"] = filtered_df["ONW"] / filtered_df["num_games"]

    filtered_df.loc[filtered_df["player"] == player, "PL5G"] = (
        filtered_df["player_win"].rolling(5, min_periods=1).sum()
        - filtered_df["opponent_lose"].rolling(5, min_periods=1).sum()
    )
    filtered_df.loc[filtered_df["opponent"] == player, "OL5G"] = (
        filtered_df["player_win"].rolling(5, min_periods=1).sum()
        - filtered_df["opponent_lose"].rolling(5, min_periods=1).sum()
    )

    filtered_df = filtered_df.loc[:, ["PWR", "OWR", "PL5G", "OL5G"]]
    filtered_df = filtered_df.shift(periods=1, fill_value=NaN)
    games_df.update(filtered_df)

games_df[(games_df["player"] == "이창석") | (games_df["opponent"] == "이창석")]

100%|███████████████████████████████████████| 1418/1418 [00:38<00:00, 36.75it/s]


Unnamed: 0,date,player,opponent,PS,OS,PWR,OWR,PL5G,OL5G
5,2021.01.02,이창석,강동윤,0.0,0.0,,,,
57,2021.01.07,이창석,나 현,1.0,1.0,1.000000,,1.0,
81,2021.01.10,이창석,설현준,1.0,1.0,1.000000,0.000000,2.0,-1.0
180,2021.01.14,이창석,오유진,1.0,1.0,0.333333,,-1.0,
295,2021.01.17,신진서,이창석,4.0,4.0,1.000000,,4.0,
...,...,...,...,...,...,...,...,...,...
41061,2015.12.10,이창석,류재형,4.0,4.0,0.500000,,3.0,
41082,2015.12.11,강승민,이창석,2.0,2.0,0.564669,,-1.0,
41136,2015.12.17,이창석,박종훈,1.0,1.0,,0.577844,,-1.0
41160,2015.12.18,이창석,김나현,6.0,6.0,0.624060,0.314815,3.0,-5.0


In [5]:
from collections import Counter

"""
    Compute every player's cumulative win rates agaisnt opponent
    
    AWR = Against Win Rate
"""


def computeAWR(x):
    n = x["num_games"]
    for i in range(0, x["num_games"] + 1):
        if (n - i) - i == x["AS"]:
            return (n - i) / n if x["reverted"] == 1 else i / n


games_df["AWR"] = NaN
against_df = games_df.copy(deep=True)
against_df["match"] = (
    against_df[["player", "opponent"]].apply(sorted, axis=1).astype(str)
)

against_df = against_df.groupby("match")

for name, group in tqdm(against_df):
    group["tolist"] = [
        str(val) for val in group[["player", "opponent"]].values.tolist()
    ]
    group["reverted"] = group.apply(lambda x: 1 if x["tolist"] == name else -1, axis=1)
    group["num_games"] = range(1, len(group) + 1)
    group["AS"] = group["reverted"].rolling(5, min_periods=1).sum()
    group["AWR"] = group.apply(computeAWR, axis=1)

    group = group["AWR"]
    
    group = group.shift(periods=1, fill_value=NaN)
    games_df.update(group)

games_df

100%|████████████████████████████████████| 53643/53643 [08:42<00:00, 102.64it/s]


Unnamed: 0,date,player,opponent,PS,OS,PWR,OWR,PL5G,OL5G,AWR
0,2021.01.01,원성진,최 정,0.0,0.0,,,,,
1,2021.01.01,신진서,한승주,0.0,0.0,,,,,
2,2021.01.01,나 현,조한승,0.0,0.0,,,,,
3,2021.01.01,이영구,이태현,0.0,0.0,,,,,
4,2021.01.02,신민준,박건호,0.0,0.0,,,,,
...,...,...,...,...,...,...,...,...,...,...
99532,2000.12.28,마샤오춘,고바야시 사토루,3.0,3.0,0.458333,0.351852,-1.0,-1.0,
99533,2000.12.28,최명훈,김석흥,3.0,3.0,0.286689,,1.0,,1.0
99534,2000.12.29,이희성,박승철,1.0,1.0,,0.543860,,1.0,0.4
99535,2000.12.29,왕레이,저우허양,2.0,2.0,0.620482,,3.0,,1.0


In [6]:
from pandas import DataFrame, read_csv
from tqdm import tqdm

"""
    Reverse necessary columns and double the dataset. The reversed column will have a label of 0
"""

games_df["label"] = 1

new_games_df = games_df.copy(deep=True)
new_games_df = new_games_df.rename(
    columns={
        "player": "opponent",
        "opponent": "player",
        "PS": "OS",
        "OS": "PS",
        "PWR": "OWR",
        "OWR": "PWR",
        "PL5G": "OL5G",
        "OL5G": "PL5G",
    }
)
new_games_df["AWR"] = new_games_df.apply(lambda x: 1 - x["AWR"], axis=1)
new_games_df["label"] = 0


games_df = games_df.append(new_games_df, ignore_index=True)

games_df

Unnamed: 0,date,player,opponent,PS,OS,PWR,OWR,PL5G,OL5G,AWR,label
0,2021.01.01,원성진,최 정,0.0,0.0,,,,,,1
1,2021.01.01,신진서,한승주,0.0,0.0,,,,,,1
2,2021.01.01,나 현,조한승,0.0,0.0,,,,,,1
3,2021.01.01,이영구,이태현,0.0,0.0,,,,,,1
4,2021.01.02,신민준,박건호,0.0,0.0,,,,,,1
...,...,...,...,...,...,...,...,...,...,...,...
199069,2000.12.28,고바야시 사토루,마샤오춘,3.0,3.0,0.351852,0.458333,-1.0,-1.0,,0
199070,2000.12.28,김석흥,최명훈,3.0,3.0,,0.286689,,1.0,0.0,0
199071,2000.12.29,박승철,이희성,1.0,1.0,0.543860,,1.0,,0.6,0
199072,2000.12.29,저우허양,왕레이,2.0,2.0,,0.620482,,3.0,0.0,0


In [7]:
games_df.to_csv("./data/12-16-processed.csv")