### First, we start by importing useful libraries and data

In [1]:
import pandas as pd
import numpy as np

# Read CSV file
data = pd.read_csv('player_valuations.csv')

# Print first 5 rows of data
print(data.head())

         date    datetime    dateweek  player_id  current_club_id  \
0  2013-08-07  2013-08-07  2013-08-05      99946             1095   
1  2014-01-13  2014-01-13  2014-01-13      99946             1095   
2  2010-01-29  2010-01-29  2010-01-25      76948              979   
3  2010-08-20  2010-08-20  2010-08-16      76948              979   
4  2011-01-17  2011-01-17  2011-01-17      76948              979   

   market_value_in_eur player_club_domestic_competition_id  
0               150000                                 FR1  
1               100000                                 FR1  
2               125000                                 PO1  
3               250000                                 PO1  
4               350000                                 PO1  


### Next, we convert data to a more convenient form for future analysis

In [2]:
# Convert date column to datetime format
data['date'] = pd.to_datetime(data['date'])

# Create an empty DataFrame to store the new data
new_data = pd.DataFrame(columns=['player_id', 'year', 'price'])

# Group data by player ID
grouped_data = data.groupby(['player_id'])

# Loop over each group of data
for group_id, group_data in grouped_data:

    # Sort data by date
    group_data = group_data.sort_values(by='date')
    
    # Get the minimum and maximum years for the player
    min_year = group_data['date'].dt.year.min()
    max_year = group_data['date'].dt.year.max()
    
    # Loop over each year for the player
    for year in range(min_year, max_year+1):
        
        # Get the price for the year
        try:
            price = group_data[group_data['date'].dt.year == year]['market_value_in_eur'].values[0]
        except IndexError:
            price = None
        
        # Add a new row to the new DataFrame
        new_row = {'player_id': group_id,
                   'year': year,
                   'price': price}
        
        # Convert the dictionary to a DataFrame
        new_row_df = pd.DataFrame([new_row])

        # Concatenate the new row to the new_data DataFrame
        new_data = pd.concat([new_data, new_row_df], ignore_index=True)
        
# Print the new DataFrame
print(new_data.head())

# Save new DataFrame
new_data.to_csv('new_data.csv', index=False)

  for group_id, group_data in grouped_data:


  player_id  year     price
0        10  2004   7000000
1        10  2005   9000000
2        10  2006  20000000
3        10  2007  23000000
4        10  2008  20000000


### Now we will extract birth year data

In [3]:
# load information about players to get their birth years
player_data = pd.read_csv('players.csv')

# Convert date column to datetime format
player_data['date_of_birth'] = pd.to_datetime(player_data['date_of_birth'])
# Extract player IDs and birth years
ids = []
birth_years = []
for index, row in player_data.iterrows():
    player_id = row['player_id']
    birth_year = row['date_of_birth'].year
    if player_id not in ids and birth_year != None:
        ids.append(player_id)
        birth_years.append(birth_year)

# Create a new DataFrame with the extracted data
birth_years = pd.DataFrame({'player_id': ids, 'birth_year': birth_years})

### Using birth year data we group data points by the ages of players at the time of measurement

In [4]:
# load new_data DataFrame
df =  pd.read_csv('new_data.csv')
temp = np.arange(10, 50)

# Create dictionary to store prices for each age group
ages = {i: [] for i in temp}
for index, row in df.iterrows():
    
    age = row['year'] - birth_years.loc[birth_years['player_id'] == row['player_id'], 'birth_year'].values[0]
    if age in temp and row['price'] > 0:
        ages[age].append(row['price'])

### We finish preparing data by computing percentiles and dividing data into features and labels

In [5]:
from scipy.stats import percentileofscore

# Add a 'percentile' column to the dataframe, which will be populated with percentiles computed based on the player's age and price
df['percentile'] = ''

# Iterate over each row in the dataframe
for index, row in df.iterrows():
    
    # Compute the player's age by subtracting their birth year from the year in which the valuation was made
    age = row['year'] - birth_years.loc[birth_years['player_id'] == row['player_id'], 'birth_year'].values[0]
    
    # Check if the player's age is in the set of ages for which we have computed percentiles, and if the player's price is positive
    if age in temp and row['price'] > 0:  
        
        # Compute the percentile of the player's price based on their age
        df.loc[index, 'percentile'] = percentileofscore(ages[age], row['price'])

# Write the dataframe to a CSV file
df.to_csv('percentiles.csv', index=False)

# Read the percentiles CSV file back into a new dataframe
df = pd.read_csv('percentiles.csv')

# Initialize two numpy arrays to store the input and output for the LSTM model
x = np.empty((0, 4))
y = np.empty((0, 1))

# Initialize variables to keep track of the player ID and the number of percentiles seen so far for the current player
id_ = df.iloc[0]['player_id']
count = 0 

# Initialize a numpy array to store the most recent 5 percentiles for the current player
short_list = np.zeros(5)

# Iterate over each row in the dataframe
for index, row in df.iterrows():
    
    # If the current row corresponds to the same player as the previous row, and we have not yet seen 5 percentiles for the player, and the percentile is not missing
    if id_ == row['player_id'] and count < 5 and row['percentile']:
        
        # Add the percentile to the short_list
        short_list[count] = row['percentile']
        count += 1
    
    # If we have seen 5 percentiles for the current player, and they are all positive
    elif count == 5 and np.all(short_list > 0):
        
        # Append the first 4 percentiles to x, and the 5th percentile to y
        x = np.vstack([x, [short_list[0], short_list[1], short_list[2], short_list[3]]])
        y = np.vstack([y, [short_list[4]]])
        
        # Reset the count and short_list for the next player
        count = 0 
        id_ = row['player_id']
    
    # If the current row corresponds to a new player, or we have not yet seen 5 positive percentiles for the current player
    else:
        
        # Reset the count and ID for the new player
        id_ = row['player_id']
        count = 0

### Finally, we train the machine learning model

In [6]:
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import LSTM, Dense


# Split training and test data
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Reshape the input data to include a third dimension for the features
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

# Define the LSTM model architecture
model = Sequential()
model.add(LSTM(units=64, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dense(units=1))

# Compile the model
model.compile(optimizer='adam', loss='mse')

# Train the model
model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.1)

# Predict the fifth state using the trained model
y_pred = model.predict(X_test)

# Evaluate the model performance
print(np.mean((y_test - y_pred)**2))


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
111.26074572059954


### Now we can predict players' future percentile ranking based on data from the last 4 years.

#### Prediction for failed transfers of the past

In [10]:
# https://www.transfermarkt.com/godfrey-oboabona/profil/spieler/217656
x_Oboabona = np.array([[percentileofscore(ages[25], 2500000), percentileofscore(ages[26], 1800000), percentileofscore(ages[27], 1800000), percentileofscore(ages[28], 650000)]]).reshape(1, X_train.shape[1], 1)  
print("Last four years: ", x_Oboabona.reshape(1, -1))
print("Prediction for the season with Dinamo Batumi:        ", model.predict(x_Oboabona, verbose=0))
print()
# https://www.transfermarkt.com/abraham-frimpong/profil/spieler/189402
x_Frimpong = np.array([[percentileofscore(ages[25], 2500000), percentileofscore(ages[26], 600000), percentileofscore(ages[27], 550000), percentileofscore(ages[28], 500000)]]).reshape(1, X_train.shape[1], 1)  
print("Last four years: ", x_Frimpong.reshape(1, -1))
print("Prediction for the season with Dinamo Batumi:        ", model.predict(x_Frimpong, verbose=0))
print()
# https://www.transfermarkt.com/lukas-grozurek/profil/spieler/75829
x_Grozurek = np.array([[percentileofscore(ages[25], 300000), percentileofscore(ages[26], 500000), percentileofscore(ages[27], 800000), percentileofscore(ages[28], 400000)]]).reshape(1, X_train.shape[1], 1)  
print("Last four years: ", x_Grozurek.reshape(1, -1))
print("Prediction for the season with Dinamo Batumi:        ", model.predict(x_Grozurek, verbose=0))

Last four years:  [[79.39946416 71.66619975 70.09111617 43.49641009]]
Prediction for the season with Dinamo Batumi:         [[39.386738]]

Last four years:  [[79.39946416 44.95377504 40.44419134 36.1245617 ]]
Prediction for the season with Dinamo Batumi:         [[33.73462]]

Last four years:  [[27.77559956 40.00560303 51.42748671 29.08665887]]
Prediction for the season with Dinamo Batumi:         [[28.297705]]


#### Prediction for the new Dinamo Batumi player for the current season

In [12]:
# https://www.transfermarkt.com/moussa-konate/profil/spieler/192774
x_Konaté = np.array([[percentileofscore(ages[26], 7000000), percentileofscore(ages[27], 4000000), percentileofscore(ages[28], 2500000), percentileofscore(ages[29], 1200000)]]).reshape(1, X_train.shape[1], 1)    
print("Last four years: ", x_Konaté.reshape(1, -1))
print("Prediction for the season with Dinamo Batumi:        ", model.predict(x_Konaté, verbose=0))

Last four years:  [[90.88457767 84.22551253 75.75972616 60.63432836]]
Prediction for the season with Dinamo Batumi:         [[52.63216]]
