# Imports

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRanker
from sklearn.metrics import mean_absolute_error as mae
import matplotlib.pyplot as plt
import seaborn as sns
from preprocess import preprocess, process, league_train_test_split, league_fusion, league_split, league_cross_val

# Data

In [10]:
data = pd.read_csv('jobfair_train.csv')

In [11]:
data

Unnamed: 0,season,club_id,league_id,dynamic_payment_segment,cohort_season,avg_age_top_11_players,avg_stars_top_11_players,avg_stars_top_14_players,avg_training_factor_top_11_players,days_active_last_28_days,...,registration_country,registration_platform_specific,league_match_won_count_last_28_days,training_count_last_28_days,global_competition_level,tokens_spent_last_28_days,tokens_stash,rests_stash,morale_boosters_stash,league_rank
0,173,6042825,2912348,2) Minnow,134,25,5.255151,4.483009,1.778801,25,...,Portugal,Flash FB Canvas,24,435,11.0,598,245,186,818,2
1,173,9620967,2912140,2) Minnow,28,23,4.962521,4.785648,-0.002687,28,...,Turkey,Android Phone,19,58,7.0,269,0,35,24,2
2,173,6045474,2912933,4) Whale,136,23,11.105782,10.511133,0.628794,28,...,Belgium,Flash FB Canvas,26,116,8.0,23180,369,444,4933,1
3,173,14187773,2910371,0) NonPayer,2,21,4.164848,4.023486,0.551904,12,...,Malaysia,Android Phone,11,22,2.0,41,43,213,126,6
4,173,13694853,2907632,0) NonPayer,5,22,3.031939,2.926029,0.690544,0,...,Italy,iOS Phone,9,0,4.0,0,16,100,147,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55309,173,14278145,2906514,0) NonPayer,2,25,4.160909,3.948086,0.503241,11,...,Switzerland,iOS Phone,13,25,,42,10,25,37,1
55310,173,14309556,2906913,2) Minnow,2,21,4.244145,3.762286,0.664514,28,...,France,iOS Phone,25,167,,214,158,358,256,1
55311,173,14455654,2905985,0) NonPayer,1,21,4.000158,3.753276,0.507620,6,...,France,iOS Tablet,14,17,,73,11,70,57,11
55312,173,13623733,2910437,0) NonPayer,5,24,4.582158,4.461391,0.664767,28,...,Morocco,Android Phone,22,80,4.0,90,65,109,150,5


In [12]:
data = data.sort_values(by='league_id')

In [13]:
data.head()

Unnamed: 0,season,club_id,league_id,dynamic_payment_segment,cohort_season,avg_age_top_11_players,avg_stars_top_11_players,avg_stars_top_14_players,avg_training_factor_top_11_players,days_active_last_28_days,...,registration_country,registration_platform_specific,league_match_won_count_last_28_days,training_count_last_28_days,global_competition_level,tokens_spent_last_28_days,tokens_stash,rests_stash,morale_boosters_stash,league_rank
17008,173,14438136,2904743,0) NonPayer,1,24,3.179297,3.038181,0.721519,2,...,Nigeria,iOS Phone,10,2,,2,53,22,35,10
17232,173,14487784,2904743,0) NonPayer,1,24,3.04543,2.914638,0.593801,2,...,Mexico,iOS Phone,4,1,,2,43,42,24,14
17219,173,14478979,2904743,0) NonPayer,1,23,3.920945,3.658,0.552545,12,...,Turkey,Android Phone,9,24,,91,3,16,7,5
17205,173,14437898,2904743,0) NonPayer,1,21,3.936582,3.678486,0.530063,3,...,Brazil,Android Phone,11,24,3.0,65,21,17,59,4
17185,173,14496777,2904743,0) NonPayer,1,23,3.140812,3.002486,0.608834,2,...,Armenia,Android Phone,0,3,,9,47,42,22,12


# Data split

In [69]:
X = data.drop('league_rank', axis=1)
y = data['league_rank']

In [70]:
X_train, X_test, y_train, y_test = league_train_test_split(X, y)

In [71]:
X_train = preprocess(X_train, y_train)
X_test = process(X_test)

In [72]:
group_data = X_train.groupby('league_id').size().values

In [73]:
X_train.drop(['league_id', 'season', 'club_id'], inplace=True, axis=1)
X_test.drop(['league_id', 'season', 'club_id'], inplace=True, axis=1)

# Model 

In [74]:
ranker = XGBRanker(objective='rank:ndcg', device='cuda')

In [75]:
ranker.fit(X_train, y_train, group=group_data)
ranker.set_params(device='cpu')

In [76]:
y_train_pred = ranker.predict(X_train)

In [77]:
for i in range(0, y_train_pred.shape[0], 14):
    y_train_pred[i:i+14] = np.argsort(y_train_pred[i:i+14])

In [78]:
mae(y_train, y_train_pred)

4.52971234747368

In [83]:
y_test_pred = pd.DataFrame()
predicted_scores = ranker.predict(X_test)
y_test_pred['predicted_scores'] = predicted_scores
y_test_pred['league_id'] = data['league_id'].iloc[X_test.index].values
y_test_pred['predicted_rank'] = y_test_pred.groupby('league_id')['predicted_scores'].rank(ascending=False, method='first')

In [88]:
y_test_pred['predicted_rank']

0        3.0
1       14.0
2        5.0
3        7.0
4        1.0
        ... 
9907     1.0
9908    13.0
9909    12.0
9910    10.0
9911    14.0
Name: predicted_rank, Length: 9912, dtype: float64

In [89]:
my_y_test_pred = ranker.predict(X_test)
for i in range(0, my_y_test_pred.shape[0], 14):
    my_y_test_pred[i:i+14] = np.argsort(my_y_test_pred[i:i+14])

In [90]:
my_y_test_pred

array([ 1.,  5., 13., ...,  6.,  7.,  9.], dtype=float32)

In [91]:
mae(y_test, y_test_pred['predicted_rank'])

6.45137207425343

In [92]:
mae(y_test, my_y_test_pred)

4.5258272800645685

In [94]:
y_test_pred.iloc[:14]

Unnamed: 0,predicted_scores,league_id,predicted_rank
0,0.904927,2904743,3.0
1,-3.309383,2904743,14.0
2,0.478665,2904743,5.0
3,0.152457,2904743,7.0
4,1.457854,2904743,1.0
5,-2.978901,2904743,13.0
6,0.721618,2904743,4.0
7,0.408883,2904743,6.0
8,1.134933,2904743,2.0
9,-0.943451,2904743,10.0


In [99]:
ranker.predict(X_test)[:14]

array([ 0.9049271 , -3.3093832 ,  0.47866464,  0.15245706,  1.4578536 ,
       -2.9789011 ,  0.7216177 ,  0.40888348,  1.1349329 , -0.94345134,
       -0.82596445, -1.2945577 , -0.62868166, -1.3478018 ], dtype=float32)