In [1]:
from os import listdir

from pandas import DataFrame, concat, read_csv

games = []
filenames = listdir("./data/raw")
current_df = None
for file in sorted(filenames):
    if ".csv" in file:
        current_df = read_csv("./data/raw/" + file)[
            10:
        ]  # each file has 10 repeating rows
        games.append(current_df[::-1])
        games_df = concat(games, ignore_index=True)
        games_df = games_df.rename(columns={"winner": "player", "loser": "opponent"})


In [2]:
# Let each row with empty date have the same one from above
from numpy import NaN

games_df = games_df.replace({"\xa0": NaN})
games_df = games_df.fillna(method="backfill")

games_df

Unnamed: 0,date,player,opponent
0,2000.01.03,정수현,유창혁
1,2000.01.04,루이나이웨이(국내),이창호
2,2000.01.04,목진석,김주호
3,2000.01.04,최철한,김종준
4,2000.01.04,조훈현,이상훈(大)
...,...,...,...
99532,2021.12.10,강지훈,백현우
99533,2021.12.11,변상일,쉬자위안
99534,2021.12.11,안성준,김정현
99535,2021.12.11,신진서,쉬자양


In [3]:
from tqdm import tqdm

"""
    Compute every player's cumulative streak
    
    PS = Player Streak
    OS = Opponenet Streak
"""

games_df['PS'], games_df['OS'] = 0, 0

for player in tqdm(games_df.player.unique()):
    filtered_df = games_df.loc[
        (games_df.player == player) | (games_df.opponent == player)
    ]
    filtered_df = filtered_df.assign(player_result=lambda row: row.player == player)
    filtered_df = filtered_df.assign(
        player_start_of_streak=filtered_df.player_result.ne(
            filtered_df["player_result"].shift()
        )
    )
    filtered_df = filtered_df.assign(
        player_streak_id=filtered_df["player_start_of_streak"].cumsum()
    )
    filtered_df = filtered_df.assign(
        PS=filtered_df.groupby("player_streak_id").cumcount() + 1
    )

    filtered_df = filtered_df.assign(opponent_result=lambda row: row.opponent == player)
    filtered_df = filtered_df.assign(
        opponent_start_of_streak=filtered_df.opponent_result.ne(
            filtered_df["opponent_result"].shift()
        )
    )
    filtered_df = filtered_df.assign(
        opponent_streak_id=filtered_df["opponent_start_of_streak"].cumsum()
    )
    filtered_df = filtered_df.assign(
        OS=filtered_df.groupby("opponent_streak_id").cumcount() + 1
    )
    filtered_df = filtered_df[['OS', 'PS']].shift(periods=1, fill_value=0)
    
    games_df.update(filtered_df)

games_df

100%|███████████████████████████████████████| 1418/1418 [00:30<00:00, 46.19it/s]


Unnamed: 0,date,player,opponent,PS,OS
0,2000.01.03,정수현,유창혁,0.0,0.0
1,2000.01.04,루이나이웨이(국내),이창호,0.0,0.0
2,2000.01.04,목진석,김주호,0.0,0.0
3,2000.01.04,최철한,김종준,0.0,0.0
4,2000.01.04,조훈현,이상훈(大),0.0,0.0
...,...,...,...,...,...
99532,2021.12.10,강지훈,백현우,5.0,5.0
99533,2021.12.11,변상일,쉬자위안,2.0,2.0
99534,2021.12.11,안성준,김정현,4.0,4.0
99535,2021.12.11,신진서,쉬자양,3.0,3.0


In [4]:
from pandas import concat
from tqdm import tqdm

"""
    Compute every player's cumulative win rates and last 5 game status
    
    PWR = Player Win Rate
    OWR = Opponent Win Rate
    PL5G = Player Last 5 Game
    OL5G = Opponent Last 5 Game
"""

games_df["PWR"], games_df["OWR"] = NaN, NaN
games_df["PL5G"], games_df["OL5G"] = NaN, NaN
games_df["PNG"], games_df["ONG"] = NaN, NaN

for player in tqdm(games_df.player.unique()):
    filtered_df = games_df.loc[
        (games_df.player == player) | (games_df.opponent == player),
        ["player", "opponent"],
    ]
    filtered_df.insert(
        filtered_df.shape[1], "num_games", range(1, len(filtered_df) + 1)
    )
    filtered_df = filtered_df.assign(player_win=lambda row: row.player == player)
    filtered_df = filtered_df.assign(opponent_lose=lambda row: row.opponent == player)

    filtered_df.loc[filtered_df["player"] == player, "PNW"] = filtered_df[
        "player_win"
    ].cumsum()
    filtered_df["PNL"] = filtered_df["num_games"] - filtered_df["PNW"]
    filtered_df.loc[filtered_df["opponent"] == player, "ONL"] = filtered_df[
        "opponent_lose"
    ].cumsum()
    filtered_df["ONW"] = filtered_df["num_games"] - filtered_df["ONL"]

    filtered_df["PWR"] = filtered_df["PNW"] / filtered_df["num_games"]
    filtered_df["OWR"] = filtered_df["ONW"] / filtered_df["num_games"]
    
    filtered_df.loc[filtered_df["player"] == player, "PNG"] = filtered_df["num_games"]
    filtered_df.loc[filtered_df["opponent"] == player, "ONG"] = filtered_df["num_games"]
    games_df.update(filtered_df[['PNG', 'ONG']])
    
    filtered_df.loc[filtered_df["player"] == player, "PL5G"] = (
        filtered_df["player_win"].rolling(5, min_periods=1).sum()
        - filtered_df["opponent_lose"].rolling(5, min_periods=1).sum()
    )
    filtered_df.loc[filtered_df["opponent"] == player, "OL5G"] = (
        filtered_df["player_win"].rolling(5, min_periods=1).sum()
        - filtered_df["opponent_lose"].rolling(5, min_periods=1).sum()
    )

    filtered_df = filtered_df.loc[:, ["PWR", "OWR", "PL5G", "OL5G"]]
    filtered_df = filtered_df.shift(periods=1, fill_value=NaN)
    games_df.update(filtered_df)

games_df[(games_df["player"] == "이창석") | (games_df["opponent"] == "이창석")]

100%|███████████████████████████████████████| 1418/1418 [00:53<00:00, 26.74it/s]


Unnamed: 0,date,player,opponent,PS,OS,PWR,OWR,PL5G,OL5G,PNG,ONG
58493,2015.03.02,이창석,조경호,0.0,0.0,,0.512953,,-1.0,1.0,194.0
58545,2015.03.02,이창석,김영도,3.0,3.0,1.000000,0.000000,1.0,-3.0,2.0,4.0
58624,2015.03.03,최명훈,이창석,2.0,2.0,1.000000,,2.0,,678.0,3.0
58818,2015.03.13,이창석,이영주,1.0,1.0,0.394737,0.666667,1.0,1.0,4.0,115.0
58878,2015.03.16,박진솔,이창석,1.0,1.0,0.750000,,2.0,,433.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...
99211,2021.11.29,홍무진,이창석,3.0,3.0,0.631579,,3.0,,191.0,397.0
99248,2021.12.02,이창석,강승민,1.0,1.0,,0.624685,,1.0,398.0,533.0
99307,2021.12.06,이창석,김상인,1.0,1.0,0.625628,0.352273,3.0,1.0,399.0,177.0
99393,2021.12.07,이창석,김승준,2.0,2.0,0.626566,,3.0,,400.0,792.0


In [5]:
from collections import Counter

"""
    Compute every player's cumulative win rates agaisnt opponent
    
    AWR = Against Win Rate
"""


def computeAWR(x):
    n = x["num_games"]
    for i in range(0, x["num_games"] + 1):
        if (n - i) - i == x["AS"]:
            return (n - i) / n if x["reverted"] == 1 else i / n


games_df["AWR"] = NaN
against_df = games_df.copy(deep=True)
against_df["match"] = (
    against_df[["player", "opponent"]].apply(sorted, axis=1).astype(str)
)

against_df = against_df.groupby("match")

for name, group in tqdm(against_df):
    group["tolist"] = [
        str(val) for val in group[["player", "opponent"]].values.tolist()
    ]
    group["reverted"] = group.apply(lambda x: 1 if x["tolist"] == name else -1, axis=1)
    group["num_games"] = range(1, len(group) + 1)
    group["AS"] = group["reverted"].rolling(5, min_periods=1).sum()
    group["AWR"] = group.apply(computeAWR, axis=1)

    group = group["AWR"]
    
    group = group.shift(periods=1, fill_value=NaN)
    games_df.update(group)

games_df

100%|█████████████████████████████████████| 53643/53643 [10:52<00:00, 82.27it/s]


Unnamed: 0,date,player,opponent,PS,OS,PWR,OWR,PL5G,OL5G,PNG,ONG,AWR
0,2000.01.03,정수현,유창혁,0.0,0.0,,,,,1.0,1.0,
1,2000.01.04,루이나이웨이(국내),이창호,0.0,0.0,,,,,1.0,1.0,
2,2000.01.04,목진석,김주호,0.0,0.0,,,,,1.0,1.0,
3,2000.01.04,최철한,김종준,0.0,0.0,,,,,1.0,1.0,
4,2000.01.04,조훈현,이상훈(大),0.0,0.0,,,,,1.0,1.0,
...,...,...,...,...,...,...,...,...,...,...,...,...
99532,2021.12.10,강지훈,백현우,5.0,5.0,0.635870,,5.0,,155.0,185.0,0.500000
99533,2021.12.11,변상일,쉬자위안,2.0,2.0,0.406250,,1.0,,774.0,65.0,1.000000
99534,2021.12.11,안성준,김정현,4.0,4.0,0.615094,,3.0,,598.0,531.0,
99535,2021.12.11,신진서,쉬자양,3.0,3.0,0.682028,,3.0,,746.0,218.0,0.714286


In [6]:
from pandas import DataFrame, read_csv
from tqdm import tqdm

"""
    Reverse necessary columns and double the dataset. The reversed column will have a label of 0
"""

games_df["label"] = 1

new_games_df = games_df.copy(deep=True)
new_games_df = new_games_df.rename(
    columns={
        "player": "opponent",
        "opponent": "player",
        "PS": "OS",
        "OS": "PS",
        "PWR": "OWR",
        "OWR": "PWR",
        "PL5G": "OL5G",
        "OL5G": "PL5G",
    }
)
new_games_df["AWR"] = new_games_df.apply(lambda x: 1 - x["AWR"], axis=1)
new_games_df["label"] = 0


games_df = games_df.append(new_games_df, ignore_index=True)

games_df

Unnamed: 0,date,player,opponent,PS,OS,PWR,OWR,PL5G,OL5G,PNG,ONG,AWR,label
0,2000.01.03,정수현,유창혁,0.0,0.0,,,,,1.0,1.0,,1
1,2000.01.04,루이나이웨이(국내),이창호,0.0,0.0,,,,,1.0,1.0,,1
2,2000.01.04,목진석,김주호,0.0,0.0,,,,,1.0,1.0,,1
3,2000.01.04,최철한,김종준,0.0,0.0,,,,,1.0,1.0,,1
4,2000.01.04,조훈현,이상훈(大),0.0,0.0,,,,,1.0,1.0,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
199069,2021.12.10,백현우,강지훈,5.0,5.0,,0.635870,,5.0,155.0,185.0,0.500000,0
199070,2021.12.11,쉬자위안,변상일,2.0,2.0,,0.406250,,1.0,774.0,65.0,0.000000,0
199071,2021.12.11,김정현,안성준,4.0,4.0,,0.615094,,3.0,598.0,531.0,,0
199072,2021.12.11,쉬자양,신진서,3.0,3.0,,0.682028,,3.0,746.0,218.0,0.285714,0


In [7]:
games_df.to_csv("./data/12-16-processed.csv")