In [12]:
import pandas as pd
import os
import numpy as np
os.environ['KERAS_BACKEND'] = 'torch'
# import keras

In [13]:
def findCorrelation(corr, cutoff=0.9, exact=None):
    """
    This function is the Python implementation of the R function 
    `findCorrelation()`.
    
    Relies on numpy and pandas, so must have them pre-installed.
    
    It searches through a correlation matrix and returns a list of column names 
    to remove to reduce pairwise correlations.
    
    For the documentation of the R function, see 
    https://www.rdocumentation.org/packages/caret/topics/findCorrelation
    and for the source code of `findCorrelation()`, see
    https://github.com/topepo/caret/blob/master/pkg/caret/R/findCorrelation.R
    
    -----------------------------------------------------------------------------

    Parameters:
    -----------
    corr: pandas dataframe.
        A correlation matrix as a pandas dataframe.
    cutoff: float, default: 0.9.
        A numeric value for the pairwise absolute correlation cutoff
    exact: bool, default: None
        A boolean value that determines whether the average correlations be 
        recomputed at each step
    -----------------------------------------------------------------------------
    Returns:
    --------
    list of column names
    -----------------------------------------------------------------------------
    Example:
    --------
    R1 = pd.DataFrame({
        'x1': [1.0, 0.86, 0.56, 0.32, 0.85],
        'x2': [0.86, 1.0, 0.01, 0.74, 0.32],
        'x3': [0.56, 0.01, 1.0, 0.65, 0.91],
        'x4': [0.32, 0.74, 0.65, 1.0, 0.36],
        'x5': [0.85, 0.32, 0.91, 0.36, 1.0]
    }, index=['x1', 'x2', 'x3', 'x4', 'x5'])

    findCorrelation(R1, cutoff=0.6, exact=False)  # ['x4', 'x5', 'x1', 'x3']
    findCorrelation(R1, cutoff=0.6, exact=True)   # ['x1', 'x5', 'x4'] 
    """
    
    def _findCorrelation_fast(corr, avg, cutoff):

        combsAboveCutoff = corr.where(lambda x: (np.tril(x)==0) & (x > cutoff)).stack().index

        rowsToCheck = combsAboveCutoff.get_level_values(0)
        colsToCheck = combsAboveCutoff.get_level_values(1)

        msk = avg[colsToCheck] > avg[rowsToCheck].values
        deletecol = pd.unique(np.r_[colsToCheck[msk], rowsToCheck[~msk]]).tolist()

        return deletecol


    def _findCorrelation_exact(corr, avg, cutoff):

        x = corr.loc[(*[avg.sort_values(ascending=False).index]*2,)]

        if (x.dtypes.values[:, None] == ['int64', 'int32', 'int16', 'int8']).any():
            x = x.astype(float)

        x.values[(*[np.arange(len(x))]*2,)] = np.nan

        deletecol = []
        for ix, i in enumerate(x.columns[:-1]):
            for j in x.columns[ix+1:]:
                if x.loc[i, j] > cutoff:
                    if x[i].mean() > x[j].mean():
                        deletecol.append(i)
                        x.loc[i] = x[i] = np.nan
                    else:
                        deletecol.append(j)
                        x.loc[j] = x[j] = np.nan
        return deletecol

    
    if not np.allclose(corr, corr.T) or any(corr.columns!=corr.index):
        raise ValueError("correlation matrix is not symmetric.")
        
    acorr = corr.abs()
    avg = acorr.mean()
        
    if exact or exact is None and corr.shape[1]<100:
        return _findCorrelation_exact(acorr, avg, cutoff)
    else:
        return _findCorrelation_fast(acorr, avg, cutoff)

In [14]:
# for file in os.listdir("data/whole"):
#     if "csv" in file:
#         print(file)
#         df = pd.read_csv("/mnt/e/Bigdata/Top5LeaguesPlayerData/data/whole/"+file)
#         df = df.fillna(-1)
#         df.to_csv("/mnt/e/Bigdata/Top5LeaguesPlayerData/data/whole/fillna/"+file, index=False)
#         print(df)
#         print("")   

In [15]:
serieAdf = []
for file in os.listdir("data/whole/fillna"):
    if "csv" in file:
        print(file)
        df = pd.read_csv("./data/whole/fillna/"+file)
        serieAdf.append(df)
        print("")

whole-2018,2019.csv-Bundesliga-with_value.csv

whole-2018,2019.csv-La-Liga-with_value.csv

whole-2018,2019.csv-Ligue-1-with_value.csv

whole-2018,2019.csv-Premier-League-with_value.csv

whole-2018,2019.csv-Serie-A-with_value.csv

whole-2019,2020.csv-Bundesliga-with_value.csv

whole-2019,2020.csv-La-Liga-with_value.csv

whole-2019,2020.csv-Ligue-1-with_value.csv

whole-2019,2020.csv-Premier-League-with_value.csv

whole-2019,2020.csv-Serie-A-with_value.csv

whole-2020,2021.csv-Bundesliga-with_value.csv

whole-2020,2021.csv-La-Liga-with_value.csv

whole-2020,2021.csv-Ligue-1-with_value.csv

whole-2020,2021.csv-Premier-League-with_value.csv

whole-2020,2021.csv-Serie-A-with_value.csv

whole-2021,2022.csv-Bundesliga-with_value.csv

whole-2021,2022.csv-La-Liga-with_value.csv

whole-2021,2022.csv-Ligue-1-with_value.csv

whole-2021,2022.csv-Premier-League-with_value.csv

whole-2021,2022.csv-Serie-A-with_value.csv

whole-2022,2023.csv-Bundesliga-with_value.csv

whole-2022,2023.csv-La-Liga-with_

In [16]:
serieAdf = pd.concat(serieAdf)

In [17]:
serieAdf.columns

Index(['Player', 'Nation', 'Pos', 'Age', 'MP', 'Playing Time_Starts',
       'Playing Time.1_Min', 'Playing Time.2_90s', 'Performance_Gls',
       'Performance.1_Ast', 'Performance.2_G+A', 'Performance.3_G-PK',
       'Performance.4_PK', 'Performance.5_PKatt', 'Performance.6_CrdY',
       'Performance.7_CrdR', 'Expected_xG', 'Expected.1_npxG',
       'Expected.2_xAG', 'Expected.3_npxG+xAG', 'Progression_PrgC',
       'Progression.1_PrgP', 'Progression.2_PrgR', 'Per 90 Minutes_Gls',
       'Per 90 Minutes.1_Ast', 'Per 90 Minutes.2_G+A', 'Per 90 Minutes.3_G-PK',
       'Per 90 Minutes.4_G+A-PK', 'Per 90 Minutes.5_xG',
       'Per 90 Minutes.6_xAG', 'Per 90 Minutes.7_xG+xAG',
       'Per 90 Minutes.8_npxG', 'Per 90 Minutes.9_npxG+xAG', 'Playing Time_MP',
       'Playing Time.1_Starts', 'Playing Time.2_Min', 'Playing Time.3_90s',
       'value', '0'],
      dtype='object')

In [18]:
serieAdf = serieAdf.fillna(-1)
serieAdf = serieAdf.drop(columns=['Player', 'Nation'])

In [19]:
def secondPosition(x):
    if "," in x:
        return x.split(",")[1]
    else:
        return None

serieAdf['SecondaryPos'] = serieAdf['Pos'].apply(secondPosition)
serieAdf['Pos'] = serieAdf['Pos'].apply(lambda x: x.split(",")[0])
serieAdf.reset_index(drop=True, inplace=True)

In [20]:
corr = serieAdf.corr(numeric_only=True)
hc = findCorrelation(corr, cutoff=0.75)
serieAdfTrimmed = serieAdf.drop(columns=hc)

In [21]:
serieAdfTrimmed

Unnamed: 0,Pos,Age,Playing Time_Starts,Performance.3_G-PK,Performance.5_PKatt,Performance.6_CrdY,Performance.7_CrdR,Progression.1_PrgP,Progression.2_PrgR,Per 90 Minutes.4_G+A-PK,Per 90 Minutes.8_npxG,Playing Time.1_Starts,value,0,SecondaryPos
0,MF,34.0,36.0,0.0,0.0,7.0,0.0,187.0,34.0,0.03,0.03,-1.0,-1.0,-1,
1,DF,24.0,34.0,4.0,0.0,7.0,0.0,72.0,27.0,0.15,0.05,-1.0,10000000.0,-1,MF
2,DF,24.0,34.0,4.0,0.0,2.0,0.0,89.0,155.0,0.24,0.08,-1.0,17000000.0,-1,MF
3,MF,24.0,32.0,7.0,1.0,5.0,0.0,102.0,145.0,0.28,0.34,-1.0,7000000.0,-1,FW
4,DF,28.0,29.0,3.0,0.0,1.0,0.0,109.0,87.0,0.34,0.03,-1.0,5000000.0,-1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17115,GK,18.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.00,-1.00,-1.0,50000.0,-1,
17116,MF,18.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.00,-1.00,-1.0,10000.0,-1,
17117,MF,17.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.00,-1.00,-1.0,-1.0,-1,
17118,MF,18.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.00,-1.00,-1.0,75000.0,-1,


In [43]:
# from xgboost import XGBRegressor
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OrdinalEncoder, StandardScaler

X = serieAdfTrimmed.drop(columns=['value'])
X['Pos'] = X['Pos'].astype(str).fillna("Sconosciuta")
X['SecondaryPos'] = X['SecondaryPos'].astype(str).fillna("Sconosciuta")
y = serieAdfTrimmed['value']
scaler = StandardScaler()
pos = OrdinalEncoder().fit_transform(X['Pos'].values.reshape(-1, 1))
spos = OrdinalEncoder().fit_transform(X['SecondaryPos'].values.reshape(-1, 1))
X['Pos'] = pd.Series(pos.flatten())
X['SecondaryPos'] = pd.Series(spos.flatten())
X = scaler.fit_transform(X)
y = scaler.fit_transform(y.values.reshape(-1, 1))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

model = ElasticNet()
model = model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(mean_squared_error(y_test, y_pred))
print(r2_score(y_test, y_pred))


ValueError: could not convert string to float: 'Player                       Dominique Heintz\nNation                                 de GER\nPos                                        DF\nAge                                        28\nMP                                       12.0\nPlaying Time_Starts                       8.0\nPlaying Time.1_Min                      822.0\nPlaying Time.2_90s                        9.1\nPerformance_Gls                           0.0\nPerformance.1_Ast                         0.0\nPerformance.2_G+A                         0.0\nPerformance.3_G-PK                        0.0\nPerformance.4_PK                          0.0\nPerformance.5_PKatt                       0.0\nPerformance.6_CrdY                        1.0\nPerformance.7_CrdR                        0.0\nExpected_xG                               0.3\nExpected.1_npxG                           0.3\nExpected.2_xAG                            0.2\nExpected.3_npxG+xAG                       0.5\nProgression_PrgC                          3.0\nProgression.1_PrgP                       21.0\nProgression.2_PrgR                        2.0\nPer 90 Minutes_Gls                        0.0\nPer 90 Minutes.1_Ast                      0.0\nPer 90 Minutes.2_G+A                      0.0\nPer 90 Minutes.3_G-PK                     0.0\nPer 90 Minutes.4_G+A-PK                   0.0\nPer 90 Minutes.5_xG                      0.04\nPer 90 Minutes.6_xAG                     0.02\nPer 90 Minutes.7_xG+xAG                  0.06\nPer 90 Minutes.8_npxG                    0.04\nPer 90 Minutes.9_npxG+xAG                0.06\nPlaying Time_MP                           NaN\nPlaying Time.1_Starts                     NaN\nPlaying Time.2_Min                        NaN\nPlaying Time.3_90s                        NaN\nvalue                               1500000.0\nName: 125, dtype: object'

In [None]:
from keras import Sequential
from keras import layers
from keras import ops
model = Sequential(
    [
        layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
        layers.Dense(64, activation='relu'),
        layers.Dense(64, activation='relu'),
        layers.Dense(64, activation='relu'),
        layers.Dense(1)
    ]
)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
model.summary()

In [None]:
import mlflow.keras
run = mlflow.start_run(run_name="Keras")
model.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=['mean_squared_error', "r2_score", "mean_absolute_error", "mean_absolute_percentage_error"]
)
history = model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.2, callbacks=[mlflow.keras.MlflowCallback(run)])
test_scores = model.evaluate(X_test, y_test, verbose=2,  callbacks=[mlflow.keras.MlflowCallback(run)])
mlflow.end_run()



Epoch 1/100
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - loss: 0.0706 - mean_absolute_error: 0.1593 - mean_absolute_percentage_error: 290.4974 - mean_squared_error: 0.0706 - r2_score: 0.9322 - val_loss: 0.5326 - val_mean_absolute_error: 0.3709 - val_mean_absolute_percentage_error: 551.3778 - val_mean_squared_error: 0.5326 - val_r2_score: 0.3032
Epoch 2/100
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - loss: 0.0541 - mean_absolute_error: 0.1365 - mean_absolute_percentage_error: 208.7081 - mean_squared_error: 0.0541 - r2_score: 0.9436 - val_loss: 0.5159 - val_mean_absolute_error: 0.3626 - val_mean_absolute_percentage_error: 492.6307 - val_mean_squared_error: 0.5159 - val_r2_score: 0.3251
Epoch 3/100
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - loss: 0.0475 - mean_absolute_error: 0.1279 - mean_absolute_percentage_error: 264.7182 - mean_squared_error: 0.0475 - r2_score: 0.9531 - val_loss: 0.5311 - val

In [None]:
mlflow.end_run()