In [10]:
import numpy as np
import pandas as pd
import copy
from sklearn.linear_model import RidgeCV
from sklearn.preprocessing import StandardScaler


path_to_folder = "C:/Users/Artyom/transfer_attributes_change-main/data replaced"

def getXY(df, attr, targ): #Оставить только нужные признаки и убрать строки с NaN
    attr_plus_targ = attr.copy()
    attr_plus_targ.append(targ)
    df = df[attr_plus_targ]
    df = df.dropna()
    X = df[attr]
    y = df[targ]
    return (X, y)
    
def build_model(attributes, target, path_to_train_data):
    df_train = pd.read_csv(path_to_train_data)
    
    X_train, y_train = getXY(df_train, attributes, target)
    
    #приводим все признаки к одному масштабу
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    
    #строим массив, из которого будем выбирать гиперпараметр лямбда
    a = np.logspace(-2, 3, 20)
    model = RidgeCV(alphas=a)
    model.fit(X_train, y_train)
    return (model, scaler)

def MAPE(y, y_pred):
    y = pd.DataFrame(y)
    y_pred = pd.DataFrame(y_pred)
    err = 0.
    for i in range(y.shape[1]):
        for j in range(y.shape[0]):
            if y.iloc[j, i] == 0:
                err += min((abs(y.iloc[j, i] - y_pred.iloc[j, i]) / (abs(y.iloc[j, i]) + 0.1)), 1)
            else:
                err += abs(y.iloc[j, i] - y_pred.iloc[j, i]) / abs(y.iloc[j, i])
    err /= (y.shape[1] * y.shape[0])
    return err

def predict_test_val(path_to_train, path_to_val, path_to_test, attr, targ):
    model, scaler = build_model(attr, targ, path_to_train) 
    
    df_val = pd.read_csv(path_to_val)
    df_test = pd.read_csv(path_to_test)
    
    X_val, y_val = getXY(df_val, attr, targ)
    X_test, y_test = getXY(df_test, attr, targ)
    
    X_val = scaler.transform(X_val)
    X_test = scaler.transform(X_test)
    
    y_pred_val = model.predict(X_val)
    y_pred_test = model.predict(X_test)
    return (y_pred_val, y_pred_test, MAPE(y_val, y_pred_val), MAPE(y_test, y_pred_test))

#Функция, используемая пользователем: в position_dct загружается словарь для позиции игрока (создан ниже)
#в target то что хотим предсказывать, в path_to_df - путь до данных 

#def prediction(position_dct, target, path_to_df, path_to_train_data):
#    model = position_dct[target][0]
#    attr = position_dct[target][1]
#    
#    df_train = pd.read_csv(path_to_train_data)
#    df_train = df_train[attr]
#    df_train.dropna(inplace=True)
#    
#    scaler = StandardScaler()
#    scaler.fit(df_train)
    
#    df = pd.read_csv(path_to_df)
#    df = df[attr]
#    df.dropna(inplace=True)
#    scaler.transform(df)
#    
#    return model.predict(df)

In [11]:
import yaml

with open('D:/Users/Загрузки/Telegram Desktop/param.yaml') as f:
    dct = yaml.safe_load(f)
    
features_for_mid = ['sca_per90_y', #Действия, которые привели к удару по воротам (на 90 минут)
                    'gca_per90_y', #Действия, которыне привели к голу (на 90 минут)
                    'npxg_xa_per90_y', #Ожидаемое количество голов (без учета пенальти)
                                        # и ожидаемое количество ассистов на 90 минут
                    'passes_progressive_distance_y', #Общая дистанция, которую прошел мяч 
                                                    #после паса к воротам соперника
                    'passes_into_penalty_area_y',   
                    'passes_completed_y'
                   ]

features_for_fwd = ['gca_per90_y', 
                    'npxg_per90_y', 
                    'sca_per90_y', 
                    'shots_total_per90_y'
                   ]

features_for_win = ['sca_per90_y', #Действия, которые привели к удару по воротам (на 90 минут)
                   'gca_per90_y', #Действия, которыне привели к голу (на 90 минут)
                   'dribbles_completed_pct_y',
                   'crosses_y', #Количество кроссов в штрафную
                   'carry_progressive_distance_y' #Дистанция, пройденная с мячом, к воротам соперника
                    ]

features_for_cb = [ 'aerials_won_y',
                   'tackles_won_y',
                   'interceptions_y'       
                ]

dct_for_mid = {i: dct[i] for i in features_for_mid}

dct_for_fwd = {i: dct[i] for i in features_for_fwd}

dct_for_win = {i: dct[i] for i in features_for_win}

dct_for_cb = {i: dct[i] for i in features_for_cb}


In [12]:
for target in dct_for_mid:
    prediction = predict_test_val(path_to_folder + '/Midfielders_train.csv', 
                     path_to_folder + '/Midfielders_val.csv', 
                     path_to_folder + '/Midfielders_test.csv',
                    dct_for_mid[target], target)
    print("Prediction of", target)
    print("MAPE on val:", prediction[2])
    print("MAPE on test", prediction[3])
    print()

Prediction of sca_per90_y
MAPE on val: 0.5624157271034863
MAPE on test 0.5788776506975956

Prediction of gca_per90_y
MAPE on val: 0.9296784786483012
MAPE on test 0.5679836754616776

Prediction of npxg_xa_per90_y
MAPE on val: 0.6768522438084831
MAPE on test 0.8301617054424966

Prediction of passes_progressive_distance_y
MAPE on val: 0.530772901401812
MAPE on test 0.6570622552138434

Prediction of passes_into_penalty_area_y
MAPE on val: 0.8509289602278969
MAPE on test 1.1398296490876458

Prediction of passes_completed_y
MAPE on val: 0.6312315571787925
MAPE on test 0.6066536289587928



In [13]:
for target in dct_for_fwd:
    prediction = predict_test_val(path_to_folder + '/Forwards_train.csv', 
                     path_to_folder + '/Forwards_val.csv', 
                     path_to_folder + '/Forwards_test.csv',
                    dct_for_fwd[target], target)
    print("Prediction of", target)
    print("MAPE on val:", prediction[2])
    print("MAPE on test", prediction[3])
    print()

Prediction of gca_per90_y
MAPE on val: 0.7345799224833919
MAPE on test 0.6291198888610926

Prediction of npxg_per90_y
MAPE on val: 0.6229919156154499
MAPE on test 0.52418112938907

Prediction of sca_per90_y
MAPE on val: 0.5402946770450798
MAPE on test 0.2693136434800428

Prediction of shots_total_per90_y
MAPE on val: 0.252174029447813
MAPE on test 0.2819872618262192



In [14]:
for target in dct_for_win:
    prediction = predict_test_val(path_to_folder + '/Wingers_train.csv', 
                     path_to_folder + '/Wingers_val.csv', 
                     path_to_folder + '/Wingers_test.csv',
                    dct_for_win[target], target)
    print("Prediction of", target)
    print("MAPE on val:", prediction[2])
    print("MAPE on test", prediction[3])
    print()

Prediction of sca_per90_y
MAPE on val: 0.24453271467904497
MAPE on test 0.27934468806458773

Prediction of gca_per90_y
MAPE on val: 0.3933662560999921
MAPE on test 0.5024780951677699

Prediction of dribbles_completed_pct_y
MAPE on val: 0.1422860478436329
MAPE on test 0.20278125977291128

Prediction of crosses_y
MAPE on val: 1.6351064597787786
MAPE on test 1.7927553382601844

Prediction of carry_progressive_distance_y
MAPE on val: 1.8398115486598012
MAPE on test 1.1682526537601448



In [15]:
for target in dct_for_cb:
    prediction = predict_test_val(path_to_folder + '/Centerbacks_train.csv', 
                     path_to_folder + '/Centerbacks_val.csv', 
                     path_to_folder + '/Centerbacks_test.csv',
                    dct_for_cb[target], target)
    print("Prediction of", target)
    print("MAPE on val:", prediction[2])
    print("MAPE on test", prediction[3])
    print()

Prediction of aerials_won_y
MAPE on val: 0.5739508154360011
MAPE on test 0.8360478301559305

Prediction of tackles_won_y
MAPE on val: 0.6955958113795088
MAPE on test 0.7707653979179186

Prediction of interceptions_y
MAPE on val: 1.5765739785508253
MAPE on test 0.9146974569634914



In [51]:
path_dict = {}
path_dict['CB'] = path_to_folder + '/Centerbacks'
path_dict['FB'] = path_to_folder + '/Fullbacks'
path_dict['MID'] = path_to_folder + '/Midfielders'
path_dict['ST'] = path_to_folder + '/Forwards'
path_dict['WIN'] = path_to_folder + '/Wingers'
path_dict['all'] = path_to_folder + '/all'

cols =['CB', 'FB', 'MID', 'ST', 'WIN', 'all']

target = ['position', 'passes_completed_y', 'dribbles_completed_pct_y', 'passes_pct_y', 'sca_per90_y', 'npxg_per90_y', 'npxg_xa_per90_y',
          'gca_per90_y', 'shots_total_per90_y',
        'aerials_won_y', 'carry_progressive_distance_y',
       'crosses_y', 'interceptions_y',
     'passes_into_penalty_area_y', 
       'passes_progressive_distance_y', 'pressures_y', 'tackles_won_y', 'tackles_y', 'touches_y']

main_arr = []
for i in cols:
    curr_arr = []
    for j in target:
        if j == 'position':
            curr_arr.append(i)
        else:
            train = path_dict[i] + "_train.csv"
            test = path_dict[i] + "_test.csv"
            val = path_dict[i] + "_val.csv"
            tmp = predict_test_val(train, val, test, dct[j], j)
            curr_arr.append(tmp[3])
    main_arr.append(curr_arr)
        
df = pd.DataFrame(main_arr, columns=target)


In [38]:
df.to_csv('nazvanie.csv', index=False)

In [52]:
df

Unnamed: 0,position,passes_completed_y,dribbles_completed_pct_y,passes_pct_y,sca_per90_y,npxg_per90_y,npxg_xa_per90_y,gca_per90_y,shots_total_per90_y,aerials_won_y,carry_progressive_distance_y,crosses_y,interceptions_y,passes_into_penalty_area_y,passes_progressive_distance_y,pressures_y,tackles_won_y,tackles_y,touches_y
0,CB,0.674635,0.348192,0.043989,0.763837,0.683634,0.786734,0.540823,0.447663,0.836048,0.802484,1.275431,0.914697,0.851468,0.705267,0.665349,0.770765,0.759495,0.625054
1,FB,0.841164,0.240818,0.048864,0.393527,0.427138,0.365518,0.552207,0.920708,1.721795,0.924337,2.495274,1.199222,1.402586,0.900359,0.737134,0.966963,0.949574,0.801304
2,MID,0.606654,0.195762,0.036408,0.578878,0.676569,0.830162,0.567984,0.46067,0.852568,0.808628,1.140696,1.063305,1.13983,0.657062,0.717725,1.480318,1.801328,0.602877
3,ST,1.00839,0.251356,0.052385,0.269314,0.524181,0.32704,0.62912,0.281987,2.224868,1.441683,1.028982,0.737147,1.354222,1.005329,1.369487,0.918961,1.03649,0.862953
4,WIN,0.894346,0.202781,0.052115,0.279345,0.610877,0.313484,0.502478,0.333425,1.058182,1.168253,1.792755,1.202903,1.160451,1.196779,0.764224,0.954932,0.961564,0.889842
5,all,0.85916,0.24433,0.047444,0.571269,0.808367,0.737618,0.598318,0.577458,1.34846,1.145813,1.52626,1.085851,1.275553,1.14325,0.822791,1.048604,1.157642,0.785893


In [34]:
df = pd.read_csv("D:/Users/Загрузки/Telegram Desktop/asd")
df

Unnamed: 0,position,aerials_won_y,carry_progressive_distance_y,crosses_y,dribbles_completed_pct_y,gca_per90_y,interceptions_y,npxg_per90_y,npxg_xa_per90_y,passes_completed_y,passes_into_penalty_area_y,passes_pct_y,passes_progressive_distance_y,pressures_y,sca_per90_y,shots_total_per90_y,tackles_won_y,tackles_y,touches_y
0,CB,0.756542,0.632532,0.932949,0.332246,0.921167,1.136507,0.591602,0.619386,0.726817,0.794178,0.042875,0.654031,0.570938,0.471874,0.454752,0.729785,0.851673,0.826937
1,FB,1.076895,0.747221,1.265315,0.186005,0.682552,1.166285,0.755789,0.398239,0.691745,0.923907,0.057874,0.943379,0.729794,0.403373,0.606839,0.840529,0.875322,0.886264
2,MID,1.051158,1.123673,1.106258,0.37282,0.61653,0.657821,0.558424,0.452279,1.049395,0.888408,0.042288,1.014549,0.780358,0.309568,0.478503,0.901803,0.97035,0.848214
3,ST,1.019156,1.354829,1.027746,0.350953,0.582494,1.150204,0.302877,0.24418,0.740991,1.256399,0.052767,0.863469,0.89981,0.386574,0.401558,1.098001,0.955548,0.838725
4,WIN,1.192079,1.092697,1.22426,0.235402,0.498028,1.197848,0.33398,0.288451,1.059953,0.998545,0.051743,1.067765,1.013134,0.269256,0.291971,1.064732,1.162657,0.987596
5,all,1.02783,1.281418,1.141323,0.38162,0.607513,1.248108,0.509136,0.377259,1.087314,0.990724,0.044781,1.272365,0.943336,0.32355,0.389506,1.18179,1.14147,0.997517


In [18]:
dct['aerials_won_y']

['aerials_won_x',
 'tackles_won_x',
 'touches_x',
 'pressure_regain_pct_x',
 'possession_old',
 'aerials_won_old',
 'possession_new',
 'aerials_won_new']

In [53]:
best = ['position', 'dribbles_completed_pct_y', 'dribbles_completed_pct_y', 'passes_pct_y', 'shots_total_per90_y']
df1 = df[best]
df1

Unnamed: 0,position,dribbles_completed_pct_y,dribbles_completed_pct_y.1,passes_pct_y,shots_total_per90_y
0,CB,0.348192,0.348192,0.043989,0.447663
1,FB,0.240818,0.240818,0.048864,0.920708
2,MID,0.195762,0.195762,0.036408,0.46067
3,ST,0.251356,0.251356,0.052385,0.281987
4,WIN,0.202781,0.202781,0.052115,0.333425
5,all,0.24433,0.24433,0.047444,0.577458


In [50]:
df.to_csv('nazvanie2.csv', index=False)