In [1]:
import pandas as pd
import catboost as cb
from matplotlib import pyplot as plt 
import seaborn as sns
from catboost import CatBoostRegressor
import numpy as np
import yaml
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [2]:
with open('param.yaml') as f: target_feature_dict = yaml.load(f)
with open('extra_ft_dict') as f: extra_ft_dict = yaml.load(f)

  with open('param.yaml') as f: target_feature_dict = yaml.load(f)
  with open('extra_ft_dict') as f: extra_ft_dict = yaml.load(f)


In [56]:
def get_normalized_data(train_data, val_data, test_data, features_arr, target_arr, extra_ft): # Возвращает датасэты с нормированными данными
    ft_arr = features_arr.copy()
    tg_arr = target_arr.copy()
    if 'minutes_x' not in ft_arr:
        ft_arr.append('minutes_x')
    if 'minutes_y' not in tg_arr:
        tg_arr.append('minutes_y')
    new_train = train_data[ft_arr + tg_arr + extra_ft].copy()
    new_val = val_data[ft_arr + tg_arr + extra_ft].copy()
    new_test = test_data[ft_arr + tg_arr + extra_ft].copy()
    
    div_tg_cols = []
    div_ft_cols = []
    for col in ft_arr:
        if col == 'minutes_x':
            continue
        if new_train[col].dtype == np.float64 and 'per90' not in col and 'pct' not in col and 'possession' not in col and '_old' not in col and '_new' not in col:
            div_ft_cols.append(col)
    
    for col in tg_arr:
        if col == 'minutes_y':
            continue
        if new_train[col].dtype == np.float64 and 'per90' not in col and 'pct' not in col and 'possession' not in col and '_old' not in col and '_new' not in col:
            div_tg_cols.append(col)
    
    #нормируем по минутам
    new_train[div_tg_cols + div_ft_cols] = new_train[div_tg_cols + div_ft_cols].apply(lambda x : x * 90)
    new_train[div_ft_cols] = new_train[div_ft_cols].div(new_train.minutes_x, axis=0).copy()
    new_train[div_tg_cols] = new_train[div_tg_cols].div(new_train.minutes_y, axis=0).copy()
    
    new_val[div_tg_cols + div_ft_cols] = new_val[div_tg_cols + div_ft_cols].apply(lambda x : x * 90)
    new_val[div_ft_cols] = new_val[div_ft_cols].div(new_val.minutes_x, axis=0).copy()
    new_val[div_tg_cols] = new_val[div_tg_cols].div(new_val.minutes_y, axis=0).copy()
    
    new_test[div_tg_cols + div_ft_cols] = new_test[div_tg_cols + div_ft_cols].apply(lambda x : x * 90)
    new_test[div_ft_cols] = new_test[div_ft_cols].div(new_test.minutes_x, axis=0).copy()
    new_test[div_tg_cols] = new_test[div_tg_cols].div(new_test.minutes_y, axis=0).copy()
    
    new_train.dropna(inplace=True, axis=0, subset=extra_ft)
    new_val.dropna(inplace=True, axis=0, subset=extra_ft)
    new_test.dropna(inplace=True, axis=0, subset=extra_ft)
    
    #PCA + StantardScaler
    if ('position_x' in ft_arr):
        ft_arr.remove('position_x')
    if ('minutes_x' in ft_arr):
        ft_arr.remove('minutes_x')
    if ('minutes_y' in ft_arr):
        ft_arr.remove('minutes_y')
    SC = StandardScaler()
    new_train[ft_arr] = SC.fit_transform(new_train[ft_arr])
    new_val[ft_arr] = SC.transform(new_val[ft_arr])
    new_test[ft_arr] = SC.transform(new_test[ft_arr])
    
    
    pca = PCA(n_components=4)
    
    x = new_train[extra_ft].copy()
    principalComponents = pca.fit_transform(x)
    principalDf = pd.DataFrame(data = principalComponents, columns = ['principal component 1', 'principal component 2', 'principal component 3', 'principal component 4'])
    new_train = pd.concat([new_train[features_arr + target_arr], principalDf], axis=1)
    
    x = new_val[extra_ft].copy()
    principalComponents = pca.transform(x)
    principalDf = pd.DataFrame(data = principalComponents, columns = ['principal component 1', 'principal component 2', 'principal component 3', 'principal component 4'])
    new_val = pd.concat([new_val[features_arr + target_arr], principalDf], axis=1)
    
    x = new_test[extra_ft].copy()
    principalComponents = pca.transform(x)
    principalDf = pd.DataFrame(data = principalComponents, columns = ['principal component 1', 'principal component 2', 'principal component 3', 'principal component 4'])
    new_test = pd.concat([new_test[features_arr + target_arr], principalDf], axis=1)
    
    
    return new_train, new_val, new_test
    
def get_cat_model(features_arr, target_arr, train_data, val_data, metric = 'RMSE', cat_features_names = [], plt = True):
    #подаем названия столбцов признаков и целевых переменных +
    # + DataFrame с данными. loss_f -- строка-название функции потерь
    df = train_data[features_arr + target_arr].copy()
    df.dropna(subset = target_arr, inplace=True) # удаляем строки, где целевая переменнная = NaN
    train_x = df[features_arr].copy()
    train_y = df[target_arr].copy()  
    
    
    df = val_data[features_arr + target_arr].copy()
    df.dropna(subset = target_arr, inplace=True) # удаляем строки, где целевая переменнная = NaN
    val_x = df[features_arr].copy()
    val_y = df[target_arr].copy()
    
    
    cat_features = [train_x.columns.get_loc(col) for col in cat_features_names]
    SEED = 100
    params = {'loss_function': metric, # objective function
          'eval_metric': metric, # metric
          'verbose': False, # output to stdout info about training process every 200 iterations
          'random_seed': SEED,
          'cat_features' : cat_features, # вектор с номерами столбцов, считая от 0
         }
    
    
    mdl = CatBoostRegressor(**params)
    mdl.fit(train_x, train_y, # data to train on 
          eval_set=(val_x, val_y), # data to validate on
          use_best_model=True, # True if we don't want to save trees created after iteration with the best validation score
          plot=plt # True for visualization of the training process (it is not shown in a published kernel - try executing this code)
         );
    return mdl


def get_test_res(model, features_arr, target_arr, test_data):  #predict. Возвращаем DataFrame с результатом
    
    df = test_data[features_arr + target_arr].copy()
    df.dropna(subset = target_arr, inplace=True) # удаляем строки, где целевая переменнная = NaN
    test_x = df[features_arr]
    
    ans = pd.DataFrame(model.predict(test_x), columns=target_arr)
    return ans

def MAPE(first, second):
    
    error = 0
    for i in first.columns:
        error += min(1.5, abs(float(first[i]) - float(second[i])) / abs(float(second[i])))
    error /= len(first.columns)
    return error

def test_quality(test_data, target_arr, result, loss_func = MAPE):
                                                                                
    test_y = test_data[target_arr].copy()
    test_y.dropna(subset = target_arr, inplace=True) # удаляем строки, где целевая переменнная = NaN
    
    min_error = -1 # минимальное значение ошибки
    max_error = -1
    error = 0
    size = result.shape
    for i in range(0,size[0]):
        first = pd.DataFrame(test_y.iloc[i:(i+1), :])
        second = pd.DataFrame(result.iloc[i:(i+1), :])
        curr_error = loss_func(first, second)
        if min_error == -1 or min_error > curr_error:
            min_error = curr_error
        if max_error == -1 or curr_error > max_error:
            max_error = curr_error
        error += curr_error
    error /= size[0]
    
    return error, min_error, max_error
    
def print_feature_importance(model): # Рисуем график "важности" признаков
    
    feature_importance_df = pd.DataFrame(model.get_feature_importance(prettified=True), columns=['Feature Id', 'Importances'])
    plt.figure(figsize=(12, 6));
    sns.barplot(x="Importances", y="Feature Id", data=feature_importance_df);
    plt.title('CatBoost features importance:');
    

#Шаблон итоговой функции
def build_model(train_dt, val_dt, test_dt, position, ft_arr, target_, extra_ft = [], print_plot=False): 
    #В первых трех аргументах передаем путь до соответствущих табличек
    cat_ft = []
    ft_arr_ = ft_arr.copy()
    if (position == 'all'):
        ft_arr_.append('position_x')
        cat_ft.append('position_x')
    
    #отбираем признаки для PCA
    tmp = []
    for i in extra_ft:
        if i not in ft_arr_:
            tmp.append(i)
    extra_ft_ = tmp
    #грузим таблицы
    train_dt = pd.read_csv(train_dt)
    val_dt = pd.read_csv(val_dt)
    test_dt = pd.read_csv(test_dt)
    
    
    train_, val_, test_ = get_normalized_data(train_dt, val_dt, test_dt, ft_arr_, target_, extra_ft_)
    
    
    
    
    model_cat_MAPE = get_cat_model(ft_arr_, target_, train_, val_, 'MAPE', cat_ft, False)
    
    #значение на test
    res_cat_MAPE_test = get_test_res(model_cat_MAPE, ft_arr_, target_, test_)
    
    #значение на val
    res_cat_MAPE_val = get_test_res(model_cat_MAPE, ft_arr_, target_, val_)
    
    
    res_cat_MAPE_error_test, res_cat_MAPE_min_error_test, res_cat_MAPE_max_error_test = test_quality(test_, target_, res_cat_MAPE_test, MAPE)
    res_cat_MAPE_error_val, res_cat_MAPE_min_error_val, res_cat_MAPE_max_error_val = test_quality(val_, target_, res_cat_MAPE_val, MAPE)
    
    print(f'Target: {target_[0]}')
    print(f'Position: {position}')
    #print(f'MAPE:\naverage error: {res_cat_MAPE_error_val}\nmin error: {res_cat_MAPE_min_error_val}\nmax error: {res_cat_MAPE_max_error_val}')
    print('TEST ERROR')
    print(f'MAPE:\naverage error: {res_cat_MAPE_error_test}\nmin error: {res_cat_MAPE_min_error_test}\nmax error: {res_cat_MAPE_max_error_test}')
    return pd.DataFrame(res_cat_MAPE_test)
    #return res_cat_MAPE_error_test

build_model(path_train, path_val, path_test, position, features_arr, target, extra_features_for_PCA)


In [58]:
build_model('data replaced/Fullbacks_train.csv', 'data replaced/Fullbacks_val.csv', 'data replaced/Fullbacks_test.csv', 'FB', target_feature_dict['passes_pct_y'], ['passes_pct_y'], extra_ft_dict['FB'])

Target: passes_pct_y
Position: FB
TEST ERROR
MAPE:
average error: 0.05787362058274148
min error: 0.00039570902915793833
max error: 0.20250291931799516


Unnamed: 0,passes_pct_y
0,72.486487
1,81.653365
2,74.53302
3,80.00578
4,72.575601
5,77.168424
6,80.835085
7,79.205328
8,80.603264
9,71.698487


In [29]:
path_dict = {}
path_dict['CB'] = 'data replaced/Centerbacks'
path_dict['FB'] = 'data replaced/Fullbacks'
path_dict['MID'] = 'data replaced/Midfielders'
path_dict['ST'] = 'data replaced/Forwards'
path_dict['WIN'] = 'data replaced/Wingers'
path_dict['all'] = 'all'

In [5]:
res = []
path_dict = {}
path_dict['CB'] = 'data replaced/Centerbacks'
path_dict['FB'] = 'data replaced/Fullbacks'
path_dict['MID'] = 'data replaced/Midfielders'
path_dict['ST'] = 'data replaced/Forwards'
path_dict['WIN'] = 'data replaced/Wingers'
path_dict['all'] = 'all'
target = ['aerials_won_y', 'carry_progressive_distance_y',
       'crosses_y', 'dribbles_completed_pct_y', 'gca_per90_y',
       'interceptions_y', 'npxg_per90_y', 'npxg_xa_per90_y',
       'passes_completed_y', 'passes_into_penalty_area_y', 'passes_pct_y',
       'passes_progressive_distance_y', 'pressures_y', 'sca_per90_y',
       'shots_total_per90_y', 'tackles_won_y', 'tackles_y', 'touches_y']
cols =['CB', 'FB', 'MID', 'ST', 'WIN', 'all']
main = []
for i in cols:
    curr_arr = []
    for j in target:
        curr_arr.append(build_model(path_dict[i] + '_train.csv', path_dict[i] + '_val.csv', path_dict[i] + '_test.csv', i, target_feature_dict[j], [j], extra_ft_dict[i]))
    main.append(curr_arr)

Target: aerials_won_y
Position: CB
VAL ERROR
MAPE:
average error: 0.21821876572909293
min error: 0.007783434117769844
max error: 0.5065103292415875

TEST ERROR
MAPE:
average error: 0.2866426308738259
min error: 0.004738647404470129
max error: 0.7167547470989647
Target: carry_progressive_distance_y
Position: CB
VAL ERROR
MAPE:
average error: 0.2359397908067981
min error: 0.01845357907298476
max error: 0.7047422411329068

TEST ERROR
MAPE:
average error: 0.3600503505685473
min error: 0.01876853184269865
max error: 1.4285569290128277
Target: crosses_y
Position: CB
VAL ERROR
MAPE:
average error: 0.6178063910917544
min error: 0.07948706013471467
max error: 1.5

TEST ERROR
MAPE:
average error: 0.8888979801433079
min error: 0.11469217429009752
max error: 1.5
Target: dribbles_completed_pct_y
Position: CB
VAL ERROR
MAPE:
average error: 0.2741257032048215
min error: 0.07761652797234879
max error: 0.5753188767097993

TEST ERROR
MAPE:
average error: 0.33224629120578236
min error: 0.0298212158035006

Target: tackles_won_y
Position: FB
VAL ERROR
MAPE:
average error: 0.3613408401530815
min error: 0.03875752806000724
max error: 0.9233223134264624

TEST ERROR
MAPE:
average error: 0.305407414829219
min error: 0.024061722795660018
max error: 1.3465089080601413
Target: tackles_y
Position: FB
VAL ERROR
MAPE:
average error: 0.2749009337691554
min error: 0.04457988565188956
max error: 0.5178236029367305

TEST ERROR
MAPE:
average error: 0.22473520009224648
min error: 0.0005766315190283721
max error: 0.8896438407526551
Target: touches_y
Position: FB
VAL ERROR
MAPE:
average error: 0.12047229533680201
min error: 0.02160916567771665
max error: 0.2269575481479239

TEST ERROR
MAPE:
average error: 0.09584142505316381
min error: 0.002962841921813299
max error: 0.25613838247583204
Target: aerials_won_y
Position: MID
VAL ERROR
MAPE:
average error: 0.49718503599370256
min error: 0.026589748391889507
max error: 1.5

TEST ERROR
MAPE:
average error: 0.42066357075434935
min error: 0.0017013967925441696
max 

Target: pressures_y
Position: ST
VAL ERROR
MAPE:
average error: 0.2659444053512071
min error: 0.08607226080460528
max error: 0.8632122396149361

TEST ERROR
MAPE:
average error: 0.21757601374068253
min error: 0.00014538593288892894
max error: 0.7540030714169148
Target: sca_per90_y
Position: ST
VAL ERROR
MAPE:
average error: 0.34007866265651593
min error: 0.029287644427958295
max error: 0.7849002927401169

TEST ERROR
MAPE:
average error: 0.38657429897741336
min error: 0.011272050737921379
max error: 1.5
Target: shots_total_per90_y
Position: ST
VAL ERROR
MAPE:
average error: 0.3053457914682535
min error: 0.004508192691307721
max error: 1.2320407409240421

TEST ERROR
MAPE:
average error: 0.40155799692757027
min error: 0.014039077969549046
max error: 1.2956604418433004
Target: tackles_won_y
Position: ST
VAL ERROR
MAPE:
average error: 0.3598972157101643
min error: 0.0024492951392732065
max error: 0.7903012834430744

TEST ERROR
MAPE:
average error: 0.5969766694510752
min error: 0.035237498697

Target: passes_into_penalty_area_y
Position: all
VAL ERROR
MAPE:
average error: 0.4699068451273245
min error: 0.01261817522024699
max error: 1.5

TEST ERROR
MAPE:
average error: 0.4183812339204665
min error: 0.000781316445037566
max error: 1.5
Target: passes_pct_y
Position: all
VAL ERROR
MAPE:
average error: 0.046629553092346246
min error: 0.0017862676150893178
max error: 0.1744311021274779

TEST ERROR
MAPE:
average error: 0.04478064095571246
min error: 3.425402139084702e-05
max error: 0.17800652124051114
Target: passes_progressive_distance_y
Position: all
VAL ERROR
MAPE:
average error: 0.4941535761117611
min error: 0.00683565955426559
max error: 1.5

TEST ERROR
MAPE:
average error: 0.48360519738173813
min error: 0.0009730318355875967
max error: 1.5
Target: pressures_y
Position: all
VAL ERROR
MAPE:
average error: 0.2033795945565795
min error: 0.019563448216463664
max error: 0.6715348489162482

TEST ERROR
MAPE:
average error: 0.20208959829433043
min error: 0.0033461547970917724
max erro

In [33]:
df = pd.DataFrame(main, columns=target)
df.insert(0, 'position', cols)
df

Unnamed: 0,position,aerials_won_y,carry_progressive_distance_y,crosses_y,dribbles_completed_pct_y,gca_per90_y,interceptions_y,npxg_per90_y,npxg_xa_per90_y,passes_completed_y,passes_into_penalty_area_y,passes_pct_y,passes_progressive_distance_y,pressures_y,sca_per90_y,shots_total_per90_y,tackles_won_y,tackles_y,touches_y
0,CB,0.286643,0.36005,0.888898,0.332246,0.921167,0.370674,0.591602,0.619386,0.187875,0.673752,0.042875,0.195762,0.256479,0.471874,0.454752,0.3441,0.405711,0.120489
1,FB,0.371909,0.258395,0.412424,0.186005,0.682552,0.362389,0.755789,0.398239,0.140146,0.374419,0.057874,0.157992,0.179914,0.403373,0.606839,0.305407,0.224735,0.095841
2,MID,0.420664,0.256276,0.590727,0.37282,0.61653,0.356526,0.558424,0.452279,0.150738,0.387,0.042288,0.207806,0.197667,0.309568,0.478503,0.301202,0.277869,0.108447
3,ST,0.580125,0.65856,0.809962,0.350953,0.582494,0.710454,0.302877,0.24418,0.266005,0.538562,0.052767,0.646578,0.217576,0.386574,0.401558,0.596977,0.694108,0.194836
4,WIN,0.56709,0.274877,0.577209,0.235402,0.498028,0.454424,0.33398,0.288451,0.225971,0.348501,0.051743,0.386092,0.187411,0.269256,0.291971,0.422455,0.385303,0.143564
5,all,0.43307,0.410641,0.575711,0.38162,0.607513,0.449387,0.509136,0.377259,0.19169,0.418381,0.044781,0.483605,0.20209,0.32355,0.389506,0.367745,0.352597,0.125242


In [29]:
best = ['passes_completed_y', 'passes_pct_y', 'passes_progressive_distance_y', 'pressures_y', 'touches_y', 'tackles_y']

In [30]:
average_test = []
cols =['CB', 'FB', 'MID', 'ST', 'WIN', 'all']

for i in cols:
    curr_arr = []
    for j in best:
        curr_res = []
        curr_train = pd.read_csv(path_dict[i] + '_train.csv')
        curr_val = pd.read_csv(path_dict[i] + '_val.csv')
        curr_test = pd.read_csv(path_dict[i] + '_test.csv')
        curr_table = pd.concat([curr_train, curr_val])
        curr_avrg = curr_table[j].mean()
        curr_test = pd.DataFrame(curr_test[j], columns=[j])
        curr_test.dropna(subset = [j], inplace=True)
        ans = []
        for k in range(0, curr_test.shape[0]):
            ans.append(curr_avrg)
        ans = pd.DataFrame(ans, columns=[j])
        err, _, _ = test_quality(curr_test, [j], ans)
        curr_arr.append(err)
    average_test.append(curr_arr)

In [31]:
avg_res = pd.DataFrame(average_test, columns = best)
avg_res.insert(0, 'position', cols)
avg_res

Unnamed: 0,position,passes_completed_y,passes_pct_y,passes_progressive_distance_y,pressures_y,touches_y,tackles_y
0,CB,0.466758,0.053922,0.490424,0.425088,0.464507,0.515169
1,FB,0.476777,0.062731,0.525779,0.50434,0.4721,0.511795
2,MID,0.449648,0.052054,0.50642,0.479651,0.434563,0.578156
3,ST,0.522882,0.064966,0.657255,0.570592,0.53979,0.685289
4,WIN,0.597525,0.067716,0.630525,0.571576,0.58287,0.661034
5,all,0.573631,0.078796,0.645234,0.535142,0.533909,0.627728


In [32]:
avg_res.to_csv('avg_res.csv', index=False)

In [34]:
df = df[['position'] + best]
df

Unnamed: 0,position,passes_completed_y,passes_pct_y,passes_progressive_distance_y,pressures_y,touches_y,tackles_y
0,CB,0.187875,0.042875,0.195762,0.256479,0.120489,0.405711
1,FB,0.140146,0.057874,0.157992,0.179914,0.095841,0.224735
2,MID,0.150738,0.042288,0.207806,0.197667,0.108447,0.277869
3,ST,0.266005,0.052767,0.646578,0.217576,0.194836,0.694108
4,WIN,0.225971,0.051743,0.386092,0.187411,0.143564,0.385303
5,all,0.19169,0.044781,0.483605,0.20209,0.125242,0.352597


In [45]:
res_avg = pd.DataFrame()

In [41]:
res_avg['passes_completed_y'] = avg_res['passes_completed_y'].mean()
res_avg['passes_pct_y'] = avg_res['passes_pct_y'].mean()
res_avg['passes_progressive_distance_y'] = avg_res['passes_progressive_distance_y'].mean()
res_avg['pressures_y'] = avg_res['pressures_y'].mean()
res_avg['touches_y'] = avg_res['touches_y'].mean()
res_avg['tackles_y'] = avg_res['tackles_y'].mean()
res_avg

TypeError: 'float' object is not iterable

In [39]:
avg_res['passes_completed_y'].mean()

0.5145369666682922

In [48]:
res_prog = pd.DataFrame()
res_prog.insert(0, 'passes_completed_y', [df['passes_completed_y'].mean()])
res_prog.insert(0, 'passes_pct_y', df['passes_pct_y'].mean())
res_prog.insert(0, 'passes_progressive_distance_y', df['passes_progressive_distance_y'].mean())
res_prog.insert(0, 'pressures_y', df['pressures_y'].mean())
res_prog.insert(0, 'touches_y', df['touches_y'].mean())
res_prog.insert(0, 'tackles_y', df['tackles_y'].mean())
res_prog.insert(0, '', ['model prediction'])
res_prog

Unnamed: 0,Unnamed: 1,tackles_y,touches_y,pressures_y,passes_progressive_distance_y,passes_pct_y,passes_completed_y
0,model prediction,0.390054,0.131403,0.206856,0.346306,0.048721,0.193737


In [50]:
res_avg = pd.DataFrame()
res_avg.insert(0, 'passes_completed_y', [avg_res['passes_completed_y'].mean()])
res_avg.insert(0, 'passes_pct_y', avg_res['passes_pct_y'].mean())
res_avg.insert(0, 'passes_progressive_distance_y', avg_res['passes_progressive_distance_y'].mean())
res_avg.insert(0, 'pressures_y', avg_res['pressures_y'].mean())
res_avg.insert(0, 'touches_y', avg_res['touches_y'].mean())
res_avg.insert(0, 'tackles_y', avg_res['tackles_y'].mean())
res_avg.insert(0, '', ['average prediction'])
res_avg

Unnamed: 0,Unnamed: 1,tackles_y,touches_y,pressures_y,passes_progressive_distance_y,passes_pct_y,passes_completed_y
0,average prediction,0.596528,0.504623,0.514398,0.57594,0.063364,0.514537


Unnamed: 0,tackles_y,touches_y,pressures_y,passes_progressive_distance_y,passes_pct_y,passes_completed_y
0,0.390054,0.131403,0.206856,0.346306,0.048721,0.193737
