# Imports

In [244]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor, XGBClassifier
from sklearn.metrics import mean_absolute_error as mae, accuracy_score, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
from preprocess import preprocess, process, league_train_test_split
from sklearn.linear_model import LinearRegression
from lineartree import LinearBoostRegressor, LinearForestRegressor

# Data 

In [245]:
# Load the training table from a CSV file given by a relative path, so it is
# resolved against the notebook's current working directory.
data = pd.read_csv('jobfair_train.csv')

In [246]:
# Show the loaded frame; the rendering below is pandas' truncated preview.
data

Unnamed: 0,season,club_id,league_id,dynamic_payment_segment,cohort_season,avg_age_top_11_players,avg_stars_top_11_players,avg_stars_top_14_players,avg_training_factor_top_11_players,days_active_last_28_days,...,registration_country,registration_platform_specific,league_match_won_count_last_28_days,training_count_last_28_days,global_competition_level,tokens_spent_last_28_days,tokens_stash,rests_stash,morale_boosters_stash,league_rank
0,173,6042825,2912348,2) Minnow,134,25,5.255151,4.483009,1.778801,25,...,Portugal,Flash FB Canvas,24,435,11.0,598,245,186,818,2
1,173,9620967,2912140,2) Minnow,28,23,4.962521,4.785648,-0.002687,28,...,Turkey,Android Phone,19,58,7.0,269,0,35,24,2
2,173,6045474,2912933,4) Whale,136,23,11.105782,10.511133,0.628794,28,...,Belgium,Flash FB Canvas,26,116,8.0,23180,369,444,4933,1
3,173,14187773,2910371,0) NonPayer,2,21,4.164848,4.023486,0.551904,12,...,Malaysia,Android Phone,11,22,2.0,41,43,213,126,6
4,173,13694853,2907632,0) NonPayer,5,22,3.031939,2.926029,0.690544,0,...,Italy,iOS Phone,9,0,4.0,0,16,100,147,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55309,173,14278145,2906514,0) NonPayer,2,25,4.160909,3.948086,0.503241,11,...,Switzerland,iOS Phone,13,25,,42,10,25,37,1
55310,173,14309556,2906913,2) Minnow,2,21,4.244145,3.762286,0.664514,28,...,France,iOS Phone,25,167,,214,158,358,256,1
55311,173,14455654,2905985,0) NonPayer,1,21,4.000158,3.753276,0.507620,6,...,France,iOS Tablet,14,17,,73,11,70,57,11
55312,173,13623733,2910437,0) NonPayer,5,24,4.582158,4.461391,0.664767,28,...,Morocco,Android Phone,22,80,4.0,90,65,109,150,5


In [247]:
# data = data.drop(['season', 'league_id', 'club_id'], axis=1)

# Data split

In [248]:
# The target is the `league_rank` column; every other column stays in the
# feature frame for now.
y = data['league_rank']
X = data.drop(columns=['league_rank'])

In [249]:
# Random 80/20 hold-out split with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Disabled alternative that uses the project's own splitter:
# X_train, X_test, y_train, y_test = league_train_test_split(X, y, test_size=0.2, random_state=42)

# Drop season, league_id and club_id from both feature frames after the split.
X_train, X_test = (
    frame.drop(columns=['season', 'league_id', 'club_id'])
    for frame in (X_train, X_test)
)

In [250]:
# Run the training features through the project's `preprocess` helper; the
# training labels are passed in as well.
# NOTE(review): the result overwrites X_train, so re-running this cell feeds
# already-processed data back into `preprocess` — use Restart & Run All.
X_train = preprocess(X_train, y_train)

In [251]:
# Run the held-out features through the project's `process` helper (no labels
# are passed here).
# NOTE(review): presumably this reuses whatever `preprocess` derived from the
# training split — confirm in preprocess.py. The result overwrites X_test, so
# the cell is not safe to re-run on its own.
X_test = process(X_test)

# Model testing 

In [252]:
# Hyper-parameter grid for the search below. Wider grid, currently disabled:
# params = {'max_depth': [2, 4, 8, 16, None], 'n_estimators': [100, 128, 256, 512, 1024, 2048, None]}
params = {
    'max_depth': [2],
    'n_estimators': [2048, 4096, 8192],
}

### Regression

# Cross-validated grid search over `params`, scored by negative MAE
# (scores are negated errors, so values closer to zero are better).
# NOTE(review): n_jobs=4 runs four fits in parallel while every estimator
# targets 'cuda' — confirm the GPU copes with concurrent fits.
regression_gs = GridSearchCV(
    estimator=XGBRegressor(device='cuda'),
    param_grid=params,
    scoring='neg_mean_absolute_error',
    n_jobs=4,
)
regression_gs.fit(X_train, y_train)

# Best mean CV score with its parameters, then the mean score of every candidate.
print(regression_gs.best_score_, regression_gs.best_params_)
print(regression_gs.cv_results_['mean_test_score'])

# Keep the best estimator found by the search and switch its device to CPU.
regression_model = regression_gs.best_estimator_
regression_model.set_params(device='cpu')

In [253]:
%%time
regression_model = XGBRegressor(max_depth=2, n_estimators=2048, device='cuda')
regression_model.fit(X_train, y_train)
regression_model.set_params(device='cpu')

CPU times: total: 6.27 s
Wall time: 4.38 s


In [254]:
# Mean absolute error of the fitted regressor on the training split and on the
# held-out split.
print("train:", mae(y_true=y_train, y_pred=regression_model.predict(X_train)))
print("test:", mae(y_true=y_test, y_pred=regression_model.predict(X_test)))

train: 2.1746134586945294
test: 2.409337422296218


In [255]:
# Rank the features by the fitted model's importance score, highest first.
# Each pair is (importance, column name), so equal importances fall back to
# the column name. `feature_imp` ends up as [importances, names].
feature_imp = list(zip(*sorted(
    zip(regression_model.feature_importances_, X_train.columns),
    reverse=True,
)))

# Large square canvas so every feature label stays readable.
sns.set(rc={'figure.figsize': (20, 20)})
ax = sns.barplot(x=list(feature_imp[0]), y=list(feature_imp[1]))
# Label the figure so it can be read on its own; the trailing semicolon
# suppresses the repr of the returned text objects.
ax.set(xlabel='Feature importance', ylabel='Feature',
       title='XGBRegressor feature importances');