# Imports

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.metrics import mean_absolute_error as mae, accuracy_score, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
from preprocess import preprocess, process

# Data

In [3]:
# Load the training table. The path is relative, so the CSV must sit in the
# notebook's working directory.
data = pd.read_csv('jobfair_train.csv')

In [4]:
# Bare expression as the cell's last line: the notebook renders the frame
# (the rendered output below is truncated with "..." rows and columns).
data

Unnamed: 0,season,club_id,league_id,dynamic_payment_segment,cohort_season,avg_age_top_11_players,avg_stars_top_11_players,avg_stars_top_14_players,avg_training_factor_top_11_players,days_active_last_28_days,...,registration_country,registration_platform_specific,league_match_won_count_last_28_days,training_count_last_28_days,global_competition_level,tokens_spent_last_28_days,tokens_stash,rests_stash,morale_boosters_stash,league_rank
0,173,6042825,2912348,2) Minnow,134,25,5.255151,4.483009,1.778801,25,...,Portugal,Flash FB Canvas,24,435,11.0,598,245,186,818,2
1,173,9620967,2912140,2) Minnow,28,23,4.962521,4.785648,-0.002687,28,...,Turkey,Android Phone,19,58,7.0,269,0,35,24,2
2,173,6045474,2912933,4) Whale,136,23,11.105782,10.511133,0.628794,28,...,Belgium,Flash FB Canvas,26,116,8.0,23180,369,444,4933,1
3,173,14187773,2910371,0) NonPayer,2,21,4.164848,4.023486,0.551904,12,...,Malaysia,Android Phone,11,22,2.0,41,43,213,126,6
4,173,13694853,2907632,0) NonPayer,5,22,3.031939,2.926029,0.690544,0,...,Italy,iOS Phone,9,0,4.0,0,16,100,147,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55309,173,14278145,2906514,0) NonPayer,2,25,4.160909,3.948086,0.503241,11,...,Switzerland,iOS Phone,13,25,,42,10,25,37,1
55310,173,14309556,2906913,2) Minnow,2,21,4.244145,3.762286,0.664514,28,...,France,iOS Phone,25,167,,214,158,358,256,1
55311,173,14455654,2905985,0) NonPayer,1,21,4.000158,3.753276,0.507620,6,...,France,iOS Tablet,14,17,,73,11,70,57,11
55312,173,13623733,2910437,0) NonPayer,5,24,4.582158,4.461391,0.664767,28,...,Morocco,Android Phone,22,80,4.0,90,65,109,150,5


# Data split

In [6]:
# Features: every column except the target.
X = data.drop(columns='league_rank')
# Target: league rank shifted down by one. The ranks displayed above start at 1,
# so this presumably makes them zero-based — TODO confirm the minimum rank.
y = data['league_rank'].sub(1)

In [7]:
# Hold out 15% of the rows for the final evaluation; the fixed seed keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
)

In [8]:
# One scaler instance shared by the train (fit + transform) and test
# (transform only) cells that follow.
standard_scaler = StandardScaler()

In [9]:
# Fit the scaler on the preprocessed training features and keep the scaled result.
# `preprocess` is imported from the local `preprocess` module; it also receives the
# targets, so it presumably learns target-dependent statistics — TODO confirm.
# NOTE(review): X_train is rebound to the scaled output, so re-running this cell
# without re-running the split cell feeds already-scaled data back into
# `preprocess`. A separate name (e.g. X_train_scaled) would make it idempotent.
X_train = standard_scaler.fit_transform(preprocess(X_train, y_train))

In [10]:
# Scale the held-out features with the already-fitted scaler (transform only;
# the scaler is not refit here).
# This calls `process`, not `preprocess`; presumably it is the inference-time
# counterpart that reuses what `preprocess` learned — TODO confirm in the
# preprocess module.
# NOTE(review): X_test is rebound, so this cell is not safe to re-run on its own.
X_test = standard_scaler.transform(process(X_test))

# Model testing

In [35]:
# Hyper-parameter grid shared by both searches below:
# 8 neighbour counts x 2 weighting schemes = 16 candidates.
params = {
    'n_neighbors': [1, 5, 9, 15, 23, 57, 117, 235],
    'weights': ['uniform', 'distance'],
}

### Regression

In [36]:
# Grid search over the shared `params` grid for a KNN regressor.
# Candidates are ranked by negated mean absolute error (closer to zero is better).
# NOTE(review): `n_jobs=4` is set on both the estimator and the search.
regresssion_gs = GridSearchCV(
    estimator=KNeighborsRegressor(n_jobs=4),
    param_grid=params,
    scoring='neg_mean_absolute_error',
    n_jobs=4,
)

In [37]:
# Run the regression grid search on the scaled training set. No `cv` argument
# was passed when the search was built, so GridSearchCV's default folds apply.
regresssion_gs.fit(X_train, y_train)

In [38]:
# Print the winning hyper-parameters, then display the mean cross-validated score
# of every grid candidate. Scores come from the `neg_mean_absolute_error` scorer,
# so they are negative and values closer to zero are better.
print(regresssion_gs.best_params_)
regresssion_gs.cv_results_['mean_test_score']

{'n_neighbors': 23, 'weights': 'distance'}


array([-3.15856288, -3.15856288, -2.56034538, -2.55854638, -2.4980574 ,
       -2.49387466, -2.47092046, -2.46615802, -2.46591983, -2.45971835,
       -2.48326005, -2.47494578, -2.51510809, -2.50473728, -2.56460764,
       -2.5503151 ])

In [39]:
# Keep the estimator the search exposes as `best_estimator_` for evaluation on
# the held-out set.
reg_model = regresssion_gs.best_estimator_

In [40]:
# Mean absolute error of the selected regressor on the held-out set
# (`mae` is the alias given to sklearn's mean_absolute_error in the imports cell).
mae(y_true=y_test, y_pred=reg_model.predict(X_test))

2.4748666196685205

### Classification

In [41]:
# Grid search over the shared `params` grid for a KNN classifier.
# It uses the same negated-MAE scorer as the regression search, so predicted
# class labels are compared to the targets as numbers.
# NOTE(review): `n_jobs=4` is set on both the estimator and the search.
classification_gs = GridSearchCV(
    estimator=KNeighborsClassifier(n_jobs=4),
    param_grid=params,
    scoring='neg_mean_absolute_error',
    n_jobs=4,
)

In [42]:
# Run the classification grid search on the scaled training set. No `cv`
# argument was passed when the search was built, so GridSearchCV's default
# folds apply.
classification_gs.fit(X_train, y_train)

In [43]:
# Print the winning hyper-parameters with their mean cross-validated score, then
# display the mean score of every grid candidate. Scores come from the
# `neg_mean_absolute_error` scorer: negative, and closer to zero is better.
print(classification_gs.best_params_, classification_gs.best_score_)
classification_gs.cv_results_['mean_test_score']

{'n_neighbors': 57, 'weights': 'distance'} -2.847583305200522


array([-3.16156173, -3.16156173, -3.32216722, -3.029947  , -3.02392766,
       -2.94678397, -2.96643636, -2.90277703, -2.92011198, -2.88605961,
       -2.86576862, -2.84758331, -2.87261809, -2.86228089, -2.92219648,
       -2.90530871])

In [44]:
# Keep the estimator the search exposes as `best_estimator_` for evaluation on
# the held-out set.
clf_model = classification_gs.best_estimator_

In [45]:
# Mean absolute error of the selected classifier's predicted labels on the
# held-out set, directly comparable to the regressor's test MAE.
mae(y_true=y_test, y_pred=clf_model.predict(X_test))

2.8477946493130877

In [46]:
# Exact-match accuracy of the selected classifier on the held-out set.
# This is the same quantity a classifier's `.score(X, y)` reports, written out
# explicitly so the metric is visible.
accuracy_score(y_test, clf_model.predict(X_test))

0.14943359845745963