In [1]:
from pandas import read_csv, DataFrame, concat
from os import listdir

games = []
filenames = listdir("./data")
for file in filenames:
    games.append(read_csv("./data/" + file))
games_df = concat(games,ignore_index=True)

games_df = games_df.rename(columns={"winner":"player","loser":"opponent"})

print(games_df.head())

         date player opponent
0  2021.09.24    이창석      박건호
1                원성진      최 정
2                강동윤      김승재
3                최철한      안성준
4  2021.09.23    김수진      박지영


In [2]:
latest = games_df.iloc[0]["date"]
for i in range(1,len(games_df)):
    if games_df.iloc[i]['date'] == '\xa0':
        games_df.iloc[i]['date'] = latest  
    else:
        latest = games_df.iloc[i-1]['date']

print(games_df.head())

         date player opponent
0  2021.09.24    이창석      박건호
1  2021.09.24    원성진      최 정
2  2021.09.24    강동윤      김승재
3  2021.09.24    최철한      안성준
4  2021.09.23    김수진      박지영


In [3]:
from tqdm import tqdm

games_df["player_result"] = None
games_df['player_start_of_streak'] = None
games_df['player_streak_id'] = None
games_df['PS'] = None

filtered_df = DataFrame()
for player in tqdm(games_df.player.unique()):
    filtered_df = games_df.loc[(games_df.player == player) | (games_df.opponent == player)]
    filtered_df = filtered_df.assign(player_result=lambda row: row.player == player)
    filtered_df = filtered_df.assign(player_start_of_streak = filtered_df.player_result.ne(filtered_df['player_result'].shift()))
    filtered_df = filtered_df.assign(player_streak_id=filtered_df['player_start_of_streak'].cumsum())
    filtered_df = filtered_df.assign(PS=filtered_df.groupby('player_streak_id').cumcount()+1)

    for index,row in filtered_df.iterrows():
        if row.player_result:
            games_df.loc[index,'PS'] = row.PS
        else:
            games_df.loc[index,'PS'] = row.PS * -1
    

100%|███████████████████████████████████████| 1322/1322 [00:40<00:00, 32.97it/s]


In [4]:
games_df["opponent_result"] = None
games_df['opponent_start_of_streak'] = None
games_df['opponent_streak_id'] = None
games_df['OS'] = None

filtered_df = DataFrame()
for player in tqdm(games_df.opponent.unique()):
    filtered_df = games_df.loc[(games_df.player == player) | (games_df.opponent == player)]
    filtered_df = filtered_df.assign(opponent_result=lambda row: row.opponent == player)
    filtered_df = filtered_df.assign(opponent_start_of_streak = filtered_df.opponent_result.ne(filtered_df['opponent_result'].shift()))
    filtered_df = filtered_df.assign(opponent_streak_id=filtered_df['opponent_start_of_streak'].cumsum())
    filtered_df = filtered_df.assign(OS=filtered_df.groupby('opponent_streak_id').cumcount()+1)
    
    for index,row in filtered_df.iterrows():
        if row.opponent_result:
            games_df.loc[index,'OS'] = row.OS
        else:
            games_df.loc[index,'OS'] = row.OS * -1


100%|███████████████████████████████████████| 1736/1736 [00:44<00:00, 39.13it/s]


In [5]:
# Add new column of win - lose of last 5 games
for i, row1 in tqdm(games_df.iterrows(), total=games_df.shape[0]):
    filtered_df = games_df.iloc[i + 1 :].loc[
        (games_df["player"] == row1["player"]) | (games_df["opponent"] == row1["player"])
    ][:5]
    value = 0
    for _, row2 in filtered_df.iterrows():
        value = value + 1 if row1["player"] == row2["player"] else value - 1
    games_df.loc[i, "PL5G"] = value

# Add new column of win - lose of last 10 games
for i, row1 in tqdm(games_df.iterrows(), total=games_df.shape[0]):
    filtered_df = games_df.iloc[i + 1 :].loc[
        (games_df["player"] == row1["opponent"]) | (games_df["opponent"] == row1["opponent"])
    ][:5]
    value = 0
    for _, row2 in filtered_df.iterrows():
        value = value + 1 if row1["opponent"] == row2["player"] else value - 1
    games_df.loc[i, "OL5G"] = value

100%|█████████████████████████████████████| 78093/78093 [13:51<00:00, 93.94it/s]
100%|█████████████████████████████████████| 78093/78093 [14:12<00:00, 91.64it/s]


In [6]:
# Add new column of against
for i, row1 in tqdm(games_df.iterrows(), total=games_df.shape[0]):
    filtered_df1 = games_df.iloc[i + 1 :].loc[
        ((row1["player"] == games_df["player"]) & (row1["opponent"] == games_df["opponent"]))
    ]
    filtered_df2 = games_df.iloc[i + 1 :].loc[
        ((row1["player"] == games_df["opponent"]) & (row1["opponent"] == games_df["player"]))
    ]

    games_df.loc[i, "PAW"] = filtered_df1.shape[0]
    games_df.loc[i, "OAW"] = filtered_df2.shape[0]

100%|█████████████████████████████████████| 78093/78093 [26:58<00:00, 48.25it/s]


In [7]:
# Add new column of winner win percentage
for i, row1 in tqdm(games_df.iterrows(), total=games_df.shape[0]):
    filtered_df1 = games_df.iloc[i + 1 :].loc[(row1["player"] == games_df["player"])]
    filtered_df2 = games_df.iloc[i + 1 :].loc[(row1["player"] == games_df["opponent"])]

    games_df.loc[i, "PNW"] = filtered_df1.shape[0]
    games_df.loc[i, "PNL"] = filtered_df2.shape[1]

# Add new column of loser win percentage
for i, row1 in tqdm(games_df.iterrows(), total=games_df.shape[0]):
    filtered_df1 = games_df.iloc[i + 1 :].loc[(row1["opponent"] == games_df["player"])]
    filtered_df2 = games_df.iloc[i + 1 :].loc[(row1["opponent"] == games_df["opponent"])]

    games_df.loc[i, "ONW"] = filtered_df1.shape[0]
    games_df.loc[i, "ONL"] = filtered_df2.shape[0]
    
    games_df["PWR"] = games_df["PNW"] / (games_df["PNW"] + games_df["PNL"])
    games_df["OWR"] = games_df["ONW"] / (games_df["ONW"] + games_df["ONL"])
    games_df["AWR"] = games_df["PAW"] / (games_df["PAW"] + games_df["OAW"])
    
print(games_df.head())

100%|█████████████████████████████████████| 78093/78093 [17:18<00:00, 75.23it/s]
100%|█████████████████████████████████████| 78093/78093 [20:10<00:00, 64.51it/s]

         date player opponent player_result player_start_of_streak  \
0  2021.09.24    이창석      박건호          None                   None   
1  2021.09.24    원성진      최 정          None                   None   
2  2021.09.24    강동윤      김승재          None                   None   
3  2021.09.24    최철한      안성준          None                   None   
4  2021.09.23    김수진      박지영          None                   None   

  player_streak_id  PS opponent_result opponent_start_of_streak  \
0             None  -1            None                     None   
1             None  -1            None                     None   
2             None  -1            None                     None   
3             None  -1            None                     None   
4             None  -1            None                     None   

  opponent_streak_id  ... OL5G   PAW  OAW    PNW   PNL    ONW    ONL  \
0               None  ... -1.0  16.0  1.0  251.0  15.0  188.0  159.0   
1               None  ... -3.0  




In [10]:
games_df.to_csv("./data/processed_games.csv")