### First, we start by importing useful libraries and data

In [1]:
import pandas as pd
import numpy as np

# Read CSV file
data = pd.read_csv('player_valuations.csv')

# Print first 5 rows of data
print(data.head())

         date    datetime    dateweek  player_id  current_club_id  \
0  2013-08-07  2013-08-07  2013-08-05      99946             1095   
1  2014-01-13  2014-01-13  2014-01-13      99946             1095   
2  2010-01-29  2010-01-29  2010-01-25      76948              979   
3  2010-08-20  2010-08-20  2010-08-16      76948              979   
4  2011-01-17  2011-01-17  2011-01-17      76948              979   

   market_value_in_eur player_club_domestic_competition_id  
0               150000                                 FR1  
1               100000                                 FR1  
2               125000                                 PO1  
3               250000                                 PO1  
4               350000                                 PO1  


### Next, we convert data to a more convenient form for future analysis

In [2]:
# Convert date column to datetime format
data['date'] = pd.to_datetime(data['date'])

# Create an empty DataFrame to store the new data
new_data = pd.DataFrame(columns=['player_id', 'year', 'price'])

# Group data by player ID
grouped_data = data.groupby(['player_id'])

# Loop over each group of data
for group_id, group_data in grouped_data:

    # Sort data by date
    group_data = group_data.sort_values(by='date')
    
    # Get the minimum and maximum years for the player
    min_year = group_data['date'].dt.year.min()
    max_year = group_data['date'].dt.year.max()
    
    # Loop over each year for the player
    for year in range(min_year, max_year+1):
        
        # Get the price for the year
        try:
            price = group_data[group_data['date'].dt.year == year]['market_value_in_eur'].values[0]
        except IndexError:
            price = None
        
        # Add a new row to the new DataFrame
        new_row = {'player_id': group_id,
                   'year': year,
                   'price': price}
        
        # Convert the dictionary to a DataFrame
        new_row_df = pd.DataFrame([new_row])

        # Concatenate the new row to the new_data DataFrame
        new_data = pd.concat([new_data, new_row_df], ignore_index=True)
        
# Print the new DataFrame
print(new_data.head())

# Save new DataFrame
new_data.to_csv('new_data.csv', index=False)

  for group_id, group_data in grouped_data:


  player_id  year     price
0        10  2004   7000000
1        10  2005   9000000
2        10  2006  20000000
3        10  2007  23000000
4        10  2008  20000000


### Now we will extract birth year data

In [3]:
# load information about players to get their birth years
player_data = pd.read_csv('players.csv')

# Convert date column to datetime format
player_data['date_of_birth'] = pd.to_datetime(player_data['date_of_birth'])
# Extract player IDs and birth years
ids = []
birth_years = []
for index, row in player_data.iterrows():
    player_id = row['player_id']
    birth_year = row['date_of_birth'].year
    if player_id not in ids and birth_year != None:
        ids.append(player_id)
        birth_years.append(birth_year)

# Create a new DataFrame with the extracted data
birth_years = pd.DataFrame({'player_id': ids, 'birth_year': birth_years})

### Using birth year data we group data points by the ages of players at the time of measurement

In [4]:
# load new_data DataFrame
df =  pd.read_csv('new_data.csv')
temp = np.arange(10, 50)

# Create dictionary to store prices for each age group
ages = {i: [] for i in temp}
for index, row in df.iterrows():
    
    age = row['year'] - birth_years.loc[birth_years['player_id'] == row['player_id'], 'birth_year'].values[0]
    if age in temp and row['price'] > 0:
        ages[age].append(row['price'])

### We finish preparing data by computing percentiles and dividing data into features and labels

In [5]:
from scipy.stats import percentileofscore
        
df['percentile'] = ''
for index, row in df.iterrows():
    age = row['year'] - birth_years.loc[birth_years['player_id'] == row['player_id'], 'birth_year'].values[0]
    if age in temp and row['price'] > 0:  
        df.loc[index, 'percentile'] = percentileofscore(ages[age], row['price'])


df.to_csv('percentiles.csv', index=False)

df =  pd.read_csv('percentiles.csv')
x = np.empty((0, 4))
y = np.empty((0, 1))
id_ = df.iloc[0]['player_id'] 
count = 0 
short_list = np.zeros(5)
for index, row in df.iterrows():
    if id_ ==  row['player_id'] and count < 5 and row['percentile']:
        short_list[count] = row['percentile']
        count += 1
    elif count == 5 and np.all(short_list > 0):
        x = np.vstack([x, [short_list[0], short_list[1], short_list[2], short_list[3]]])
        y = np.vstack([y, [short_list[4]]])
        count = 0 
        id_ = row['player_id']
    else:
        id_ = row['player_id']
        count = 0

### Finally, we train the machine learning model

In [44]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Initialize the LinearRegression model
model = LinearRegression()

# Train the model on the training data
model.fit(x_train, y_train)

### Now we can predict players' future percentile ranking based on data from the last 4 years.

#### Prediction for failed transfers of the past

In [41]:
# https://www.transfermarkt.com/godfrey-oboabona/profil/spieler/217656
x_Oboabona = np.array([[percentileofscore(ages[25], 2500000), percentileofscore(ages[26], 1800000), percentileofscore(ages[27], 1800000), percentileofscore(ages[28], 650000)]])  
print("Last four years: ", x_Oboabona)
print("Prediction for season with Dinamo Batumi:            ", model.predict(x_Oboabona))

# https://www.transfermarkt.com/abraham-frimpong/profil/spieler/189402
x_Frimpong = np.array([[percentileofscore(ages[25], 2500000), percentileofscore(ages[26], 600000), percentileofscore(ages[27], 550000), percentileofscore(ages[28], 500000)]])  
print("Last four years: ", x_Frimpong)
print("Prediction for season with Dinamo Batumi:            ", model.predict(x_Frimpong))

# https://www.transfermarkt.com/lukas-grozurek/profil/spieler/75829
x_Grozurek = np.array([[percentileofscore(ages[25], 300000), percentileofscore(ages[26], 500000), percentileofscore(ages[27], 800000), percentileofscore(ages[28], 400000)]])  
print("Last four years: ", x_Grozurek)
print("Prediction for season with Dinamo Batumi:            ", model.predict(x_Grozurek))

# https://www.transfermarkt.com/vamara-sanogo/profil/spieler/374667
x_Sanogo = np.array([[percentileofscore(ages[22], 175000), percentileofscore(ages[23], 225000), percentileofscore(ages[24], 600000), percentileofscore(ages[25], 200000)]])  
print("Last four years: ", x_Sanogo)
print("Prediction for season with Dinamo Batumi:            ", model.predict(x_Sanogo))

Last four years:  [[79.39946416 71.66619975 70.09111617 43.49641009]]
Prediction for season with Dinamo Batumi:             [[38.91821194]]
Last four years:  [[79.39946416 44.95377504 40.44419134 36.1245617 ]]
Prediction for season with Dinamo Batumi:             [[34.85545279]]
Last four years:  [[27.77559956 40.00560303 51.42748671 29.08665887]]
Prediction for season with Dinamo Batumi:             [[25.15464842]]
Last four years:  [[27.8234271  28.77164306 52.4715184  16.69280533]]
Prediction for season with Dinamo Batumi:             [[11.70877219]]


#### Prediction for the new Dinamo Batumi player for the current season

In [43]:
# https://www.transfermarkt.com/moussa-konate/profil/spieler/192774
x_Konaté = np.array([[percentileofscore(ages[26], 7000000), percentileofscore(ages[27], 4000000), percentileofscore(ages[28], 2500000), percentileofscore(ages[29], 1200000)]])  
print("Last four years: ", x_Konaté)
print("Prediction for season with Dinamo Batumi:            ", model.predict(x_Konaté))

Last four years:  [[90.88457767 84.22551253 75.75972616 60.63432836]]
Prediction for season with Dinamo Batumi:             [[56.75379759]]
