In [37]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import make_column_transformer

from sklearn.linear_model import LinearRegression
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from tqdm import tqdm

In [38]:
# Load the input tables (CSV files) from the local data/ directory.
games = pd.read_csv("data/games.csv")
turns = pd.read_csv("data/turns.csv")
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")
sample_submission = pd.read_csv("data/sample_submission.csv")

# Casual-only players in the test set

In [39]:
# Test rows whose nickname never occurs in train, and the games those rows belong to.
test_only_rows = test[~test['nickname'].isin(train['nickname'])]
test_only_games = games[games['game_id'].isin(test_only_rows['game_id'])]

# Join back onto test (pandas merges on the shared columns) and keep three columns.
players_only_test = test_only_games[['game_id', 'first', 'rating_mode']].merge(test)[
    ['game_id', 'rating_mode', 'nickname']]

# Number of games per (nickname, rating_mode) pair.
amount_of_game_modes_per_user = (
    players_only_test
    .groupby(['nickname', 'rating_mode'])
    .agg(amount_of_games=('game_id', 'count'))
    .reset_index()
)

# Keep nicknames whose every rating_mode is 'CASUAL', most active first.
casual_players = (
    amount_of_game_modes_per_user
    .groupby('nickname')
    .filter(lambda modes: all(modes['rating_mode'] == 'CASUAL'))
    .sort_values('amount_of_games', ascending=False)['nickname']
)
casual_players

81        Eloquent
451            rcg
46        Churrrro
146      Kevin_818
178         Menonr
          ...     
94     HadeefHafiy
380     groveringu
293      Tayming05
295     Thejana123
489           umar
Name: nickname, Length: 76, dtype: object

In [89]:
# game_id of every test row whose nickname is one of the casual-only players.
test[test['nickname'].isin(casual_players)]['game_id']

10          28
28          62
32          65
34          68
54          93
         ...  
44657    72700
44666    72706
44669    72709
44693    72741
44721    72762
Name: game_id, Length: 2707, dtype: int64

In [40]:
# Share of test rows that belong to casual-only players. The numerator is
# computed from the data instead of being copied from an earlier cell's output,
# so the ratio stays correct if the data or the casual-player filter changes.
len(test[test['nickname'].isin(casual_players)]) / len(test)

0.06052407995349461

# Feature Engineering

In [41]:
# Preview the raw per-turn table.
turns

Unnamed: 0,game_id,turn_number,nickname,rack,location,move,points,score,turn_type
0,1,1,BetterBot,DDEGITT,8G,DIG,10,10,Play
1,1,2,stevy,AEHOPUX,7H,HAP,18,18,Play
2,1,3,BetterBot,DEELTTU,6I,LUTE,16,26,Play
3,1,4,stevy,EMORSUX,5K,UM,16,34,Play
4,1,5,BetterBot,ACDEITU,L5,..DICATE,28,54,Play
...,...,...,...,...,...,...,...,...,...
2005493,72773,22,adola,ABINRRU,15N,IN,18,376,Play
2005494,72773,23,HastyBot,EGHIIP,H12,.HIG,24,331,Play
2005495,72773,24,adola,ABRRU,15E,BRA.,7,383,Play
2005496,72773,25,HastyBot,EIP,7A,PIE,11,342,Play


In [198]:
import re

# Board coordinates grouped by bonus type (the keys name the multiplier).
# NOTE(review): presumably the premium squares of a 15x15 word-game board — confirm.
# Entries mix letter-first ('B10') and digit-first ('6B', '8A', '15A') spellings;
# the loop further down appends the opposite spelling of every entry, so a
# turn's `location` matches in either order.
locations = {'letter_x3': ['6B', 'B10', 'F2', 'F6', 'F10', 'F14', 'J2', 'J6', 'J10', 'J14', 'N6', 'N10'],
             'letter_x2': ['A4', 'A12', 'C7', 'C9', 'D1', 'D8', 'D15', 'G3', 'G7', 'G9', 'G13', 'H4', 'H12', 'I3', 'I7', 'I9', 'I13', 'L1', 'L8', 'L15', 'M7', 'M9', 'O4', 'O12'],
             'word_x3': ['A1', 'H1', 'O1', '8A', 'O8', '15A', 'H15', 'O15'],
             'word_x2': ['B2', 'C3', 'D4', 'E5', 'K5', 'L4', 'M3', 'N2', 'B14', 'C13', 'D12', 'E11', 'K11', 'L12', 'M13', 'N14']}


def swap_elements(lst):
    """Return a new list with the digit part and letter part of each string exchanged.

    For every string, all digit characters and all alphabetic characters are
    collected (any other character is dropped). A string that starts with a
    letter becomes ``digits + letters``; any other string becomes
    ``letters + digits``. So 'B10' -> '10B' and '6B' -> 'B6'.

    The input list is not modified. An empty string raises ``IndexError``
    because its first character is inspected.
    """
    def _flip(coord):
        digits = ''.join(ch for ch in coord if ch.isdigit())
        alphas = ''.join(ch for ch in coord if ch.isalpha())
        # The leading character decides which part goes first in the result.
        return digits + alphas if coord[0].isalpha() else alphas + digits

    return [_flip(coord) for coord in lst]

# Append the swapped spelling of every coordinate so that either order matches.
for squares in locations.values():
    squares.extend(swap_elements(squares))

def categorize_location(loc):
    """Return 1 if `loc` is in any list of the module-level `locations` dict, else 0.

    The result is a plain membership flag; it does not say which bonus type matched.
    """
    listed = any(loc in squares for squares in locations.values())
    return 1 if listed else 0

# Flag each turn: 1 when its `location` is one of the listed squares, 0 otherwise.
turns['loc_category'] = turns['location'].apply(categorize_location)

In [204]:
# Quick check on game 1. `.count()` gives the number of non-null `loc_category`
# rows per player (i.e. their turns), not how many of those rows are flagged.
# NOTE(review): if the number of flagged turns was intended, `.sum()` is the call — confirm.
turns[turns['game_id'] == 1].groupby('nickname').loc_category.count()

nickname
BetterBot    13
stevy        14
Name: loc_category, dtype: int64

In [205]:
# Per-turn length features (added to `turns` in place).
turns['move_len'] = turns['move'].str.len()
turns['rack_len'] = turns['rack'].str.len()
turns['rack_usage'] = turns['move_len'] / turns['rack_len']

# Stack train and test so both receive the same features.
train_test_merged = pd.concat([train, test], axis=0).sort_values('game_id')

# Named aggregations computed per (game_id, nickname); dict order = output column order.
turn_stats = {
    'mean_points': ('points', 'mean'),
    'max_points': ('points', 'max'),
    'min_points': ('points', 'min'),
    'num_moves': ('move', 'count'),
    'mean_move_len': ('move_len', 'mean'),
    'max_move_len': ('move_len', 'max'),
    'min_move_len': ('move_len', 'min'),
    'mean_rack_len': ('rack_len', 'mean'),
    'max_rack_len': ('rack_len', 'max'),
    'min_rack_len': ('rack_len', 'min'),
    'mean_rack_usage': ('rack_usage', 'mean'),
    'max_rack_usage': ('rack_usage', 'max'),
    'min_rack_usage': ('rack_usage', 'min'),
    'num_special_loc': ('loc_category', 'sum'),
}
turns_groupby = turns.groupby(['game_id', 'nickname']).agg(**turn_stats).reset_index()

# Both merges join on whatever columns the two frames share.
player_turns = pd.merge(train_test_merged, turns_groupby)
full_df = pd.merge(player_turns, games)

# Parse the timestamp once, then derive the score rate and calendar parts.
created = pd.to_datetime(full_df["created_at"])
full_df["created_at"] = created
full_df['score_per_move'] = full_df['score'] / full_df['num_moves']
full_df["created_at_month"] = created.dt.month
full_df["created_at_day"] = created.dt.day
full_df["created_at_hour"] = created.dt.hour
full_df["created_at_day_of_week"] = created.dt.dayofweek


def is_first_winner(first, winner):
    """Return 1 when `winner` matches the code expected for the starting player, else 0.

    If `first` is one of the three bot nicknames the expected `winner` value is 0;
    for any other `first` it is 1. Every other combination yields 0.
    NOTE(review): the meaning of the 0/1 `winner` codes is not visible here — confirm.
    """
    bot_started = first in ('BetterBot', 'STEEBot', 'HastyBot')
    expected_winner = 0 if bot_started else 1
    return int(winner == expected_winner)

# NOTE(review): `le` is not used anywhere in the visible cells — confirm before removing.
le = LabelEncoder()

# Row-wise flag derived from the `first` and `winner` columns.
full_df['is_first_winner'] = full_df.apply(
    lambda row: is_first_winner(row['first'], row['winner']), axis=1)

# One-hot encode the game settings, then drop the columns that are not model inputs.
one_hot_columns = ['initial_time_seconds', 'time_control_name', 'game_end_reason', 'lexicon',
                   'increment_seconds', 'rating_mode', 'max_overtime_minutes']
dropped_columns = ['nickname', 'created_at', 'first']
new_full_df = pd.get_dummies(full_df, columns=one_hot_columns).drop(columns=dropped_columns)

In [206]:
# Inspect the merged feature frame (before one-hot encoding).
full_df

Unnamed: 0,game_id,nickname,score,rating,mean_points,max_points,min_points,num_moves,mean_move_len,max_move_len,...,increment_seconds,rating_mode,max_overtime_minutes,game_duration_seconds,score_per_move,created_at_month,created_at_day,created_at_hour,created_at_day_of_week,is_first_winner
0,1,BetterBot,335,1637.0,25.769231,68,8,13,4.615385,8.0,...,0,CASUAL,1,674.844274,25.769231,8,26,3,4,0
1,1,stevy,429,1500.0,30.642857,98,2,14,3.857143,7.0,...,0,CASUAL,1,674.844274,30.642857,8,26,3,4,0
2,2,BetterBot,401,2000.0,33.416667,85,8,12,5.250000,8.0,...,0,RATED,1,364.214418,33.416667,8,10,19,2,1
3,2,Super,488,,34.857143,94,0,14,4.357143,8.0,...,0,RATED,1,364.214418,34.857143,8,10,19,2,1
4,3,BetterBot,318,2071.0,24.461538,76,0,13,4.384615,8.0,...,0,RATED,5,492.268262,24.461538,9,4,8,6,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145541,72771,HastyBot,393,1614.0,26.200000,93,0,15,3.933333,7.0,...,0,CASUAL,10,236.133634,26.200000,9,19,3,0,1
145542,72772,BetterBot,442,1674.0,31.571429,81,9,14,4.571429,8.0,...,0,RATED,1,681.370077,31.571429,8,31,15,2,0
145543,72772,Gtowngrad,388,1364.0,24.250000,67,4,16,3.562500,8.0,...,0,RATED,1,681.370077,24.250000,8,31,15,2,0
145544,72773,HastyBot,346,2302.0,24.714286,39,4,14,4.214286,6.0,...,0,RATED,1,719.720262,24.714286,8,27,9,5,0


In [207]:
# Rows with a rating are used for fitting; rows without one are to be predicted.
has_rating = new_full_df['rating'].notna()
rated_rows = new_full_df[has_rating]
unrated_rows = new_full_df[~has_rating]

X_temp = rated_rows.drop('game_id', axis=1)

# Keep the ids of the unrated rows for the submission file.
test_ids = unrated_rows['game_id']
test_test = unrated_rows.drop('game_id', axis=1)

X = X_temp.drop('rating', axis=1)
y = X_temp['rating']
X_test = test_test.drop('rating', axis=1)
y_test = test_test['rating']  # all missing by construction

In [212]:
def evaluate_models_with_kfold(X, y, X_test, n_splits=5, random_state=42):
    """Cross-validate four regressors and return the best one's averaged test predictions.

    Each model is re-fitted on every training fold, scored by RMSE on the
    validation fold, and used to predict `X_test`. Per-fold RMSEs are averaged
    per model, and per-fold test predictions are averaged element-wise.

    Parameters
    ----------
    X, y : pandas objects indexed positionally with ``.iloc``
        Training features and target.
    X_test : array-like
        Rows to predict; must have the same feature columns as `X`.
    n_splits : int, default 5
        Number of shuffled K-fold splits.
    random_state : int or None, default 42
        Seed for the fold shuffling and for the stochastic models, so repeated
        runs give the same scores and predictions.

    Returns
    -------
    model_rmse : dict[str, float]
        Mean validation RMSE per model name.
    best_model_predictions : numpy.ndarray
        Fold-averaged `X_test` predictions of the model with the lowest mean RMSE.
    best_model : estimator
        The best model's instance. Each instance is re-fitted on every fold, so
        this object is in the state left by the last fold only.
    """
    models = {
        'LinearRegression': LinearRegression(),
        'LGBMRegressor': LGBMRegressor(verbose=-1, random_state=random_state),
        'CatBoostRegressor': CatBoostRegressor(verbose=0, iterations=100, random_seed=random_state),
        'RandomForestRegressor': RandomForestRegressor(random_state=random_state)
    }

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    model_rmse = {name: 0 for name in models}
    model_predictions = {name: [] for name in models}

    # `total` lets tqdm show real progress; kf.split() is a generator with no length.
    for train_index, val_index in tqdm(kf.split(X), total=n_splits):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        for name, model in models.items():
            model.fit(X_train, y_train)
            val_preds = model.predict(X_val)
            # Take the square root explicitly: the `squared=False` argument of
            # mean_squared_error is deprecated and removed in newer scikit-learn.
            rmse = float(np.sqrt(mean_squared_error(y_val, val_preds)))
            model_rmse[name] += rmse / n_splits
            model_predictions[name].append(model.predict(X_test))

    for name in models:
        model_predictions[name] = np.mean(model_predictions[name], axis=0)

    best_name = min(model_rmse, key=model_rmse.get)
    return model_rmse, model_predictions[best_name], models[best_name]

In [None]:
# Cross-validate every candidate model; keep the per-model RMSE, the best model's
# fold-averaged test predictions, and the best model instance.
model_rmse, model_predictions, best_model = evaluate_models_with_kfold(X, y, X_test, n_splits=5)

2it [07:10, 214.21s/it]

In [None]:
# Mean validation RMSE per model across the folds.
model_rmse

In [94]:
# Mean validation RMSE per model across the folds (lower is better).
model_rmse

{'LinearRegression': 129.16682920190783,
 'LGBMRegressor': 107.01309632852934,
 'CatBoostRegressor': 105.3919063240607,
 'RandomForestRegressor': 105.03277334445264}

In [102]:
# game_id of each test row played by a casual-only nickname.
is_casual_row = test['nickname'].isin(casual_players)
casual_game_ids = test.loc[is_casual_row, 'game_id']
casual_game_ids

10          28
28          62
32          65
34          68
54          93
         ...  
44657    72700
44666    72706
44669    72709
44693    72741
44721    72762
Name: game_id, Length: 2707, dtype: int64

In [113]:
# Pair every unrated game_id with its predicted rating and write the submission file.
submission = pd.DataFrame({"game_id": test_ids, "rating": model_predictions})
# Disabled experiment, kept for reference: force 1500 for casual-only players' games.
#submission.loc[submission['game_id'].isin(casual_game_ids), 'rating'] = 1500
submission.to_csv("PlayerRatingSubmission.csv", index=False)
submission

Unnamed: 0,game_id,rating
3,2,2002.932
12,7,1964.112
20,11,1629.064
26,14,1984.470
53,27,2069.576
...,...,...
145518,72760,2030.818
145521,72761,1689.808
145523,72762,1708.794
145534,72768,2051.436


In [117]:
# Rating mode and lexicon of the games played by the casual-only test players.
games[games['game_id'].isin(casual_game_ids)][['game_id', 'rating_mode', 'lexicon']]

Unnamed: 0,game_id,rating_mode,lexicon
27,28,CASUAL,CSW21
61,62,CASUAL,CSW21
64,65,CASUAL,CSW21
67,68,CASUAL,CSW21
92,93,CASUAL,CSW21
...,...,...,...
72699,72700,CASUAL,NWL20
72705,72706,CASUAL,CSW21
72708,72709,CASUAL,CSW21
72740,72741,CASUAL,CSW21


In [157]:
# Train rows whose nickname never occurs in test, and the games those rows belong to.
train_only_rows = train[~train['nickname'].isin(test['nickname'])]
train_only_games = games[games['game_id'].isin(train_only_rows['game_id'])]

# Join back onto train (pandas merges on the shared columns) and keep five columns.
players_only_train = train_only_games[['game_id', 'first', 'rating_mode', 'lexicon']].merge(train)[
    ['game_id', 'rating_mode', 'nickname', 'rating', 'lexicon']]

# Number of games per (nickname, rating_mode) pair.
amount_of_game_modes_per_user_train = (
    players_only_train
    .groupby(['nickname', 'rating_mode'])
    .agg(amount_of_games=('game_id', 'count'))
    .reset_index()
)

# Nicknames whose every rating_mode is 'CASUAL', most active first.
casual_players_train = (
    amount_of_game_modes_per_user_train
    .groupby('nickname')
    .filter(lambda modes: all(modes['rating_mode'] == 'CASUAL'))
    .sort_values('amount_of_games', ascending=False)['nickname']
)
players_only_train[players_only_train['nickname'].isin(casual_players_train)]

Unnamed: 0,game_id,rating_mode,nickname,rating,lexicon
1,1,CASUAL,stevy,1500,NWL20
6,5,CASUAL,stevy,1500,NWL20
16,12,CASUAL,BB-8,1500,ECWL
18,13,CASUAL,Trayz,2017,CSW21
25,17,CASUAL,BB-8,1500,ECWL
...,...,...,...,...,...
100745,72707,CASUAL,proton2020,1979,CSW21
100757,72716,CASUAL,BB-8,1500,ECWL
100767,72724,CASUAL,Sopejohn,1845,CSW21
100794,72756,CASUAL,Basonomia28,1719,CSW21


In [161]:
# Same train-only selection as for the casual analysis, this time grouped by lexicon.
csw_source_rows = train[~train['nickname'].isin(test['nickname'])]
csw_source_games = games[games['game_id'].isin(csw_source_rows['game_id'])]

players_only_csw = csw_source_games[['game_id', 'first', 'rating_mode', 'lexicon']].merge(train)[
    ['game_id', 'rating_mode', 'nickname', 'rating', 'lexicon']]

# Number of games per (nickname, lexicon) pair.
amount_of_csw_per_user_train = (
    players_only_csw
    .groupby(['nickname', 'lexicon'])
    .agg(amount_of_games=('game_id', 'count'))
    .reset_index()
)

# Nicknames whose every lexicon is 'CSW21', most active first.
csw_players_train = (
    amount_of_csw_per_user_train
    .groupby('nickname')
    .filter(lambda lexicons: all(lexicons['lexicon'] == 'CSW21'))
    .sort_values('amount_of_games', ascending=False)['nickname']
)
csw_players_train

194    Goldenlamb
226        HivinD
486     Rexington
359       Matt_86
276     Kathleen3
          ...    
769    fariha2022
770        fawzah
785       guanhui
243     Issacbruh
0      0188889876
Name: nickname, Length: 790, dtype: object

In [174]:
# Nicknames that are both casual-only and CSW21-only in train; show their first five rows by nickname.
csw_casual = list(set(casual_players_train) & set(csw_players_train))
train[train['nickname'].isin(csw_casual)].sort_values('nickname').head(5)

Unnamed: 0,game_id,nickname,score,rating
27935,20098,171_selene,304,1937
95013,68561,171_selene,458,1937
83219,60100,171_selene,274,1922
30211,21760,171_selene,355,1937
90029,64972,171_selene,251,1922


In [173]:
# Full game records for the five 171_selene games shown in the previous output
# (the ids are hard-coded from that output).
games[games['game_id'].isin([20098, 68561, 60100, 21760, 64972])]

Unnamed: 0,game_id,first,time_control_name,game_end_reason,winner,created_at,lexicon,initial_time_seconds,increment_seconds,rating_mode,max_overtime_minutes,game_duration_seconds
20097,20098,171_selene,regular,TIME,0,2022-08-22 01:03:00,CSW21,1080,0,CASUAL,1,1142.65992
21759,21760,BetterBot,regular,STANDARD,0,2022-08-22 01:22:14,CSW21,1080,0,CASUAL,1,1077.023195
60099,60100,BetterBot,regular,STANDARD,0,2022-08-21 09:51:14,CSW21,1080,0,CASUAL,1,633.480192
64971,64972,171_selene,regular,STANDARD,0,2022-08-17 22:42:43,CSW21,1080,0,CASUAL,1,867.213536
68560,68561,171_selene,regular,STANDARD,1,2022-08-22 01:40:30,CSW21,1080,0,CASUAL,1,638.003902
