In [1]:
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
from sklearn.model_selection import train_test_split
import lightgbm as lgb

In [3]:
import yaml
from pprint import pprint

with open('param.yaml') as f:
    templates = yaml.safe_load(f)

pprint(templates)

{'aerials_won_y': ['aerials_won_x',
                   'tackles_won_x',
                   'touches_x',
                   'pressure_regain_pct_x',
                   'possession_old',
                   'aerials_won_old',
                   'possession_new',
                   'aerials_won_new'],
 'carry_progressive_distance_y': ['carry_progressive_distance_x',
                                  'passes_short_x',
                                  'passes_medium_x',
                                  'passes_long_x',
                                  'dribbles_completed_pct_x',
                                  'possession_old',
                                  'passes_progressive_distance_old',
                                  'carry_progressive_distance_old',
                                  'possession_new',
                                  'passes_progressive_distance_new',
                                  'carry_progressive_distance_new'],
 'crosses_y': ['crosses_x',
          

In [4]:
import numpy as np

def mean_absolute_percentage_error(actual, pred): 
    actual, pred = np.array(actual), np.array(pred)
    for index in range(len(actual)):
        actual[index] = int(actual[index]) + 0.000001
        pred[index] = int(pred[index])

    return np.mean(np.minimum(np.abs(actual - pred) / actual, 1)) * 100

Измененная ф-я Артёма, для проверки работы модели.

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import lightgbm as lgb
import numpy as np
import pandas as pd

def evaluate(position, position_attributes, attributes):  #т.е позиция, выбранные атрибуты для позиции, предсказываемые атрибуты.
    target_columns = attributes[0] + '_y'
    print(target_columns)

    col = templates[target_columns] + [x + '_y' for x in attributes]

    _train = pd.read_csv(position + '_train.csv') #выгрузил Сашины выборки
    _val   = pd.read_csv(position + '_val.csv')
    _test  = pd.read_csv(position + '_test.csv')
    
    _train.dropna(subset=[target_columns])
    _test.dropna(subset=[target_columns])
    _val.dropna(subset=[target_columns])

    train = pd.DataFrame(_train[col]) #оставил только нужные колонки
    val   = pd.DataFrame(_val[col])
    test  = pd.DataFrame(_test[col])

    lgb_train = lgb.Dataset(train.drop(columns=target_columns).values, train[target_columns].values)
    lgb_val   = lgb.Dataset(val.drop(columns=target_columns).values, val[target_columns].values)
    lgb_test  = test

    params = {
        "objective" : "mae",
        "metric" : "mape",
        "max_depth" : 7,
        "num_leaves" : 100,
        "learning_rate" : 0.05,
        "bagging_fraction" : 0.3,
        "feature_fraction" : 0.15,
        "lambda_l1" : 5,
        "lambda_l2" : 5,
        "bagging_seed" : 42,
        "verbosity" : 0,
        "seed": 42
    }
    
    clf = lgb.train(
        params,
        lgb_train,
        valid_sets=[lgb_val],
        num_boost_round=500,
        early_stopping_rounds=10,
        verbose_eval=10
    )
    
    pred1 = clf.predict(train.iloc[:, :-1].values)
    sc1 = mean_absolute_percentage_error(train.iloc[:, -1].values, pred1)
    pred2 = clf.predict(val.iloc[:, :-1].values)
    sc2 = mean_absolute_percentage_error(val.iloc[:, -1].values, pred2)
    pred3 = clf.predict(test.iloc[:, :-1].values)
    sc3 = mean_absolute_percentage_error(test.iloc[:, -1].values, pred3)
    return (sc1, sc2, sc3)

In [6]:
def lgbm(position, position_attributes):  #ф-я подсчитывающая lgbm для каждого параметра
    x = len(position_attributes)
    A = [[0.01 for j in range(2)] for i in range(x)]
    df = pd.DataFrame(A)
    for i in range(x):
      sc = evaluate(position, position_attributes, position_attributes[i:(i + 1)])
      df[0][i] = sc[1]
      df[1][i] = sc[2]
    return(df)

In [7]:
def lgbm_minus_param(position, position_attributes, club_attributes):  #ф-я подсчитывающая lgbm для каждого параметра, при выкидывании одного из признаков
    x = len(position_attributes)
    A = [[0.01 for j in range(x)] for i in range(x)]
    df = pd.DataFrame(A)
    for i in range(x):
      _position_attributes = position_attributes[:i] + position_attributes[(i + 1):]  #выкидываем один из параметров
      for j in range(x):
        if(i != j):
          sc = evaluate(position, _position_attributes, club_attributes, position_attributes[j:(j + 1)])
          df[j][i] = sc[2]
    df = df.T 
    return(df)  

In [8]:
def lgbm_normalize(lgbm_df, lgbm_minus_param_df):  #ф-я подсчитывающая выгодность выкидывания одного из параметров
    l = len(lgbm_df)
    df = lgbm_minus_param_df
    for i in range(l):
      x = lgbm_df[0][i]
      for j in range(l):
        if(i != j):
          df[j][i] =  lgbm_minus_param_df[j][i] - x 
    return(df)

**Распишем для каждого вида позиций**

***Centerbacks***

In [9]:
CB_position = 'Centerbacks'
CB_position_attributes = [
 'aerials_won',
 'pressures',
 'tackles',
 'passes_pct',
 'interceptions']

In [10]:
CB = lgbm(CB_position, CB_position_attributes)
CB

aerials_won_y
Training until validation scores don't improve for 10 rounds.
[10]	valid_0's mape: 0.439082
Early stopping, best iteration is:
[8]	valid_0's mape: 0.438003
pressures_y
Training until validation scores don't improve for 10 rounds.
[10]	valid_0's mape: 0.453663
[20]	valid_0's mape: 0.436638
[30]	valid_0's mape: 0.4217
[40]	valid_0's mape: 0.401521
[50]	valid_0's mape: 0.385113
[60]	valid_0's mape: 0.376624
[70]	valid_0's mape: 0.380013
[80]	valid_0's mape: 0.361996
[90]	valid_0's mape: 0.353621
[100]	valid_0's mape: 0.347745
[110]	valid_0's mape: 0.340004
[120]	valid_0's mape: 0.334728
[130]	valid_0's mape: 0.332935
[140]	valid_0's mape: 0.329861
[150]	valid_0's mape: 0.331272
Early stopping, best iteration is:
[146]	valid_0's mape: 0.328199
tackles_y
Training until validation scores don't improve for 10 rounds.
[10]	valid_0's mape: 0.627674
[20]	valid_0's mape: 0.605545
[30]	valid_0's mape: 0.620876
Early stopping, best iteration is:
[20]	valid_0's mape: 0.605545
passes_pc

Unnamed: 0,0,1
0,42.491927,50.837505
1,32.818614,45.896067
2,48.229277,49.563024
3,4.153507,5.07698
4,51.341886,50.980762


In [24]:
CB.index = CB_position_attributes
CB

Unnamed: 0,0,1
aerials_won,42.491927,50.837505
pressures,32.818614,45.896067
tackles,48.229277,49.563024
passes_pct,4.153507,5.07698
interceptions,51.341886,50.980762


***Fullbacks***

In [11]:
FB_position = 'Fullbacks'
FB_position_attributes = [
 'passes_into_penalty_area',
 'npxg_xa_per90',
 'passes_completed',
 'crosses',
 'carry_progressive_distance']

In [12]:
FB = lgbm(FB_position, FB_position_attributes)
FB

passes_into_penalty_area_y
Training until validation scores don't improve for 10 rounds.
[10]	valid_0's mape: 0.732163
Early stopping, best iteration is:
[3]	valid_0's mape: 0.68451
npxg_xa_per90_y
Training until validation scores don't improve for 10 rounds.
[10]	valid_0's mape: 0.0345819
[20]	valid_0's mape: 0.0333457
[30]	valid_0's mape: 0.0327387
[40]	valid_0's mape: 0.0323752
[50]	valid_0's mape: 0.0317578
[60]	valid_0's mape: 0.0310737
[70]	valid_0's mape: 0.0302245
[80]	valid_0's mape: 0.0302515
Early stopping, best iteration is:
[76]	valid_0's mape: 0.0301319
passes_completed_y
Training until validation scores don't improve for 10 rounds.
[10]	valid_0's mape: 0.51794
Early stopping, best iteration is:
[1]	valid_0's mape: 0.458735
crosses_y
Training until validation scores don't improve for 10 rounds.
[10]	valid_0's mape: 1.01602
Early stopping, best iteration is:
[2]	valid_0's mape: 0.984043
carry_progressive_distance_y
Training until validation scores don't improve for 10 roun

Unnamed: 0,0,1
0,50.011626,54.949463
1,100.0,100.0
2,43.201547,50.325239
3,50.133746,54.453157
4,41.595724,41.938078


In [23]:
FB.index = FB_position_attributes
FB

Unnamed: 0,0,1
passes_into_penalty_area,50.011626,54.949463
npxg_xa_per90,100.0,100.0
passes_completed,43.201547,50.325239
crosses,50.133746,54.453157
carry_progressive_distance,41.595724,41.938078


***Midfielders***

In [13]:
MF_position = 'Midfielders'
MF_position_attributes = [
 'touches',
 'passes_progressive_distance',
 'passes_pct',
 'tackles_won',
 'pressures']

In [14]:
MF = lgbm(MF_position, MF_position_attributes)
MF

touches_y
Training until validation scores don't improve for 10 rounds.
[10]	valid_0's mape: 0.640479
[20]	valid_0's mape: 0.623613
[30]	valid_0's mape: 0.61725
[40]	valid_0's mape: 0.583841
[50]	valid_0's mape: 0.572714
[60]	valid_0's mape: 0.543639
[70]	valid_0's mape: 0.520731
[80]	valid_0's mape: 0.511596
[90]	valid_0's mape: 0.490707
[100]	valid_0's mape: 0.478854
[110]	valid_0's mape: 0.468889
[120]	valid_0's mape: 0.457815
[130]	valid_0's mape: 0.444557
[140]	valid_0's mape: 0.436852
[150]	valid_0's mape: 0.433478
[160]	valid_0's mape: 0.431079
[170]	valid_0's mape: 0.428202
[180]	valid_0's mape: 0.42309
[190]	valid_0's mape: 0.419439
[200]	valid_0's mape: 0.416749
[210]	valid_0's mape: 0.415545
[220]	valid_0's mape: 0.411156
[230]	valid_0's mape: 0.40924
[240]	valid_0's mape: 0.408145
[250]	valid_0's mape: 0.408265
Early stopping, best iteration is:
[243]	valid_0's mape: 0.40735
passes_progressive_distance_y
Training until validation scores don't improve for 10 rounds.
[10]	val

Unnamed: 0,0,1
0,29.441647,48.208538
1,39.391632,47.790873
2,6.461679,4.263765
3,37.707259,56.172282
4,40.081913,43.615923


In [22]:
MF.index = MF_position_attributes
MF

Unnamed: 0,0,1
touches,29.441647,48.208538
passes_progressive_distance,39.391632,47.790873
passes_pct,6.461679,4.263765
tackles_won,37.707259,56.172282
pressures,40.081913,43.615923


***Wingers***

In [15]:
W_position = 'Wingers'
W_position_attributes = [
 'dribbles_completed_pct',
 'passes_pct',
 'npxg_xa_per90',
 'sca_per90',
 'crosses']

In [16]:
W = lgbm(W_position, W_position_attributes)
W

dribbles_completed_pct_y
Training until validation scores don't improve for 10 rounds.
[10]	valid_0's mape: 0.13944
[20]	valid_0's mape: 0.13592
[30]	valid_0's mape: 0.13583
Early stopping, best iteration is:
[22]	valid_0's mape: 0.135325
passes_pct_y
Training until validation scores don't improve for 10 rounds.
[10]	valid_0's mape: 0.0625726
[20]	valid_0's mape: 0.0609918
[30]	valid_0's mape: 0.0583308
[40]	valid_0's mape: 0.0570905
[50]	valid_0's mape: 0.0559499
[60]	valid_0's mape: 0.0546226
[70]	valid_0's mape: 0.0533174
[80]	valid_0's mape: 0.052182
[90]	valid_0's mape: 0.0518023
[100]	valid_0's mape: 0.0513569
[110]	valid_0's mape: 0.050672
[120]	valid_0's mape: 0.0504409
[130]	valid_0's mape: 0.05019
[140]	valid_0's mape: 0.0492952
[150]	valid_0's mape: 0.0485958
[160]	valid_0's mape: 0.0483049
Early stopping, best iteration is:
[156]	valid_0's mape: 0.0482964
npxg_xa_per90_y
Training until validation scores don't improve for 10 rounds.
[10]	valid_0's mape: 0.104575
[20]	valid_0

Unnamed: 0,0,1
0,13.440491,17.428999
1,4.830161,5.413624
2,100.0,100.0
3,33.599996,31.73202
4,60.560368,59.032637


In [21]:
W.index = W_position_attributes
W

Unnamed: 0,0,1
dribbles_completed_pct,13.440491,17.428999
passes_pct,4.830161,5.413624
npxg_xa_per90,100.0,100.0
sca_per90,33.599996,31.73202
crosses,60.560368,59.032637


***Forwards***

In [17]:
F_position = 'Forwards'
F_position_attributes = [
 'sca_per90',
 'npxg_per90',
 'shots_total_per90',
 'aerials_won',
 'gca_per90']

In [18]:
F = lgbm(F_position, F_position_attributes)
F

sca_per90_y
Training until validation scores don't improve for 10 rounds.
[10]	valid_0's mape: 0.421296
Early stopping, best iteration is:
[2]	valid_0's mape: 0.411839
npxg_per90_y
Training until validation scores don't improve for 10 rounds.
[10]	valid_0's mape: 0.149977
[20]	valid_0's mape: 0.148898
[30]	valid_0's mape: 0.144315
[40]	valid_0's mape: 0.143043
[50]	valid_0's mape: 0.139026
[60]	valid_0's mape: 0.136905
[70]	valid_0's mape: 0.13456
[80]	valid_0's mape: 0.133028
[90]	valid_0's mape: 0.132149
[100]	valid_0's mape: 0.130817
[110]	valid_0's mape: 0.129037
[120]	valid_0's mape: 0.127273
[130]	valid_0's mape: 0.125896
[140]	valid_0's mape: 0.124761
[150]	valid_0's mape: 0.124149
[160]	valid_0's mape: 0.123533
[170]	valid_0's mape: 0.123057
[180]	valid_0's mape: 0.12243
[190]	valid_0's mape: 0.12153
[200]	valid_0's mape: 0.120472
[210]	valid_0's mape: 0.12005
Early stopping, best iteration is:
[205]	valid_0's mape: 0.119906
shots_total_per90_y
Training until validation scores 

Unnamed: 0,0,1
0,50.877132,47.093969
1,100.0,100.0
2,42.105237,35.726488
3,68.70995,57.65644
4,100.0,100.0


In [20]:
F.index = F_position_attributes
F

Unnamed: 0,0,1
sca_per90,50.877132,47.093969
npxg_per90,100.0,100.0
shots_total_per90,42.105237,35.726488
aerials_won,68.70995,57.65644
gca_per90,100.0,100.0


In [19]:
All_param = [
 'aerials_won',
 'carry_progressive_distance',
 'crosses',
 'dribbles_completed_pct',
 'gca_per90',
 'interceptions',
 'npxg_per90',
 'npxg_xa_per90',
 'passes_completed',
 'passes_into_penalty_area',
 'passes_pct',
 'passes_progressive_distance',
 'pressures',
 'sca_per90',
 'shots_total_per90',
 'tackles',
 'tackles_won',
 'touches']
All_param

['aerials_won',
 'carry_progressive_distance',
 'crosses',
 'dribbles_completed_pct',
 'gca_per90',
 'interceptions',
 'npxg_per90',
 'npxg_xa_per90',
 'passes_completed',
 'passes_into_penalty_area',
 'passes_pct',
 'passes_progressive_distance',
 'pressures',
 'sca_per90',
 'shots_total_per90',
 'tackles',
 'tackles_won',
 'touches']