In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import numpy as np

In [2]:
# Player market values; merged with the player statistics on the 'Player' column.
MV = pd.read_csv(filepath_or_buffer='Market Values.csv')

In [3]:
# Load the player statistics, remove every player whose name occurs more than
# once (keep=False discards all copies, not just the extras), and rename the
# key column to 'Player'.
PS = (
    pd.read_csv('Final_Player_Table.csv')
    .drop_duplicates(subset='player', keep=False)
    .rename(columns={'player': 'Player'})
)

In [4]:
# Inner join on the player name: only players present in both tables survive.
Dataset = PS.merge(MV, on='Player', how='inner')
Dataset

Unnamed: 0,Player,nationality,position,squad,age,birth_year,games,games_starts,minutes,goals,...,cards_yellow,cards_red,goals_per90,assists_per90,goals_assists_per90,goals_pens_per90,goals_assists_pens_per90,Foot,Market Value,Team_MV
0,Marko Alvir,hr CRO,MF,Viktoria Plzeň,26-269,1994,9,0,116,0,...,2,0,0.00,0.00,0.00,0.00,0.00,-,500,6400
1,Pavol Bajza,sk SVK,GK,Slovácko,29-131,1991,6,6,540,0,...,1,0,0.00,0.00,0.00,0.00,0.00,right,200,6600
2,David Bartek,cz CZE,MF,Bohemians 1905,32-335,1988,4,4,345,1,...,0,0,0.26,0.00,0.26,0.26,0.26,right,300,8675
3,Jean-David Beauguel,fr FRA,FW,Viktoria Plzeň,28-298,1992,14,5,604,6,...,0,0,0.89,0.15,1.04,0.75,0.89,right,1000,24600
4,Michal Beran,cz CZE,MF,Slovan Liberec,20-144,2000,10,7,610,0,...,0,0,0.00,0.30,0.30,0.00,0.30,-,300,58950
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127,Jan Vondra,cz CZE,DF,Bohemians 1905,25-122,1995,9,6,550,0,...,2,0,0.00,0.16,0.16,0.00,0.16,-,250,8675
128,Nataniel Wybraniec,pl POL,MF,Opava,20-018,2000,1,0,4,0,...,0,0,0.00,0.00,0.00,0.00,0.00,both,50,5875
129,Jakub Yunis,cz CZE,FW,Sigma Olomouc,24-294,1996,4,3,211,2,...,0,0,0.85,0.00,0.85,0.85,0.85,right,200,7925
130,Jan Zidek,cz CZE,DF,Opava,36-044,1984,11,11,990,2,...,3,0,0.18,0.00,0.18,0.09,0.09,left,150,5875


Now, we will add the current ranking in the Fortuna League by teams.

In [5]:
# League table as of 1.1.2021, listed from first place to last place.
_table_order = [
    'Slavia Prague',
    'Jablonec',
    'Sparta Prague',
    'Sigma Olomouc',
    'FK Pardubice',
    'Slovácko',
    'Slovan Liberec',
    'Baník Ostrava',
    'Karviná',
    'Viktoria Plzeň',
    'Teplice',
    'Fastav Zlín',
    'České Budĕjov.',
    'Bohemians 1905',
    'Příbram',
    'Zbrojovka Brno',
    'Mladá Boleslav',
    'Opava',
]
# Map each club name to its 1-based position in the table.
ranks = {club: place for place, club in enumerate(_table_order, start=1)}

In [6]:
# Look up every player's club in `ranks`; a club name missing from the
# dictionary raises KeyError.
s = [ranks[club] for club in Dataset['squad']]
Dataset["Team_rank"] = s

We also divide the ranks into 3 bins (bin 1 ~ ranks 1 to 5, bin 2 ~ ranks 6 to 10, bin 3 ~ the rest).

In [7]:
# Bin 1 for ranks below 6, bin 2 for ranks below 11, bin 3 for everything else.
team_rank_bins = [
    1 if rank < 6 else 2 if rank < 11 else 3
    for rank in Dataset['Team_rank']
]
Dataset['Team_rank_bin'] = team_rank_bins

Furthermore, we will scrape the latest FIFA nationality ranking and add it to the Dataset using a dictionary.

In [8]:
# Scrape the FIFA men's world ranking and build a lookup from the three-letter
# country code shown in each table row to that row's rank.
url = 'https://www.fifa.com/fifa-world-ranking/ranking-table/men/' #by 10.12.2020 (latest)
# A timeout keeps the notebook from hanging forever on an unresponsive server.
response = requests.get(url, timeout=30)
# Stop on an HTTP error instead of parsing an error page further down.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'lxml')

rank_table = soup.find("table", {"id": "rank-table"})
if rank_table is None or rank_table.tbody is None:
    raise RuntimeError(
        "Could not find the 'rank-table' element on the FIFA ranking page; "
        "the page layout may have changed."
    )
ranking_rows = rank_table.tbody.find_all('tr')

# The rank is the 1-based row position, i.e. the rows are assumed to be listed
# in ranking order -- TODO confirm against the page if the layout changes.
Nation_ranking = {}
for position, row in enumerate(ranking_rows, start=1):
    code_tag = row.find("span", {"class": "fi-t__nTri"})
    if code_tag is None:
        # Row without a country code (not a ranking entry): nothing to record.
        continue
    Nation_ranking[code_tag.text] = position

if not Nation_ranking:
    # An empty lookup would silently drop every player in the next step.
    raise RuntimeError("No country codes were extracted from the FIFA ranking table.")

In [9]:
# The country code is the last three characters of 'nationality'
# (e.g. 'hr CRO' -> 'CRO').
nation_codes = Dataset['nationality'].str[-3:]

# Codes that are not in the FIFA lookup (and missing nationalities) map to NaN.
# Only those lookup misses are treated as "no rank"; any other error now
# surfaces instead of being swallowed by a bare `except`.
Dataset['Nation_rank'] = nation_codes.map(Nation_ranking)

# Keep only players whose nation has a FIFA rank. The explicit copy makes the
# filtered frame independent, so later column assignments do not trigger
# chained-assignment warnings.
Dataset = Dataset[Dataset['Nation_rank'].notna()].copy()
Dataset['Nation_rank'] = Dataset['Nation_rank'].astype(int)

The nation ranks are again divided into bins, where bin 1 ~ ranks 1 to 10, bin 2 ~ ranks 11 to 20, bin 3 ~ ranks 21 to 50, bin 4 ~ ranks 51 to 100, bin 5 ~ the rest.

In [10]:
# Exclusive upper bounds of bins 1-4 (ranks 1-10, 11-20, 21-50, 51-100);
# a rank that is not below any bound falls into bin 5.
_nation_bin_bounds = (11, 21, 51, 101)
nat_rank_bins = [
    next(
        (bin_no for bin_no, bound in enumerate(_nation_bin_bounds, start=1) if rank < bound),
        len(_nation_bin_bounds) + 1,
    )
    for rank in Dataset['Nation_rank']
]
Dataset['Nation_rank_bin'] = nat_rank_bins

#### Renaming the positions

Since some players played in more than one position, we need to specify which one is dominant so that the variable has only
3 levels. This is done in the following step.

In [11]:
# Keep only the first two characters of each position string; a value that is
# already two characters long is left unchanged by the slice.
pos2 = [p[:2] for p in Dataset['position']]
Dataset['Pos2'] = pos2

In [12]:
# Full names for the three outfield position codes.
d_pos = {'DF':'Defender','MF':'Midfielder','FW':'Forward'}

# Drop goalkeepers and work on an independent copy of the remaining rows.
Dataset = Dataset[Dataset['Pos2']!='GK'].copy()

# Translate each remaining code; a code missing from d_pos raises KeyError.
Position = [d_pos[code] for code in Dataset['Pos2']]
Dataset['Position'] = Position

In [13]:
# The raw and two-letter position columns are superseded by 'Position'.
Dataset = Dataset.drop(['position', 'Pos2'], axis=1)

In [14]:
# 'age' is stored as '<years>-<days>' (e.g. '26-269'); keep the whole-year part.
# Splitting on '-' avoids assuming the year count is exactly two digits, and
# the cast to str keeps the cell re-runnable after 'age' has become an integer.
Dataset['age'] = (
    Dataset['age']
    .astype(str)
    .str.split('-')
    .str[0]
    .astype(int)
)
# Squared age, used for the quadratic age term in the regression.
Dataset['age_sq'] = Dataset['age'] ** 2

In [15]:
# Inspect the columns that remain after the transformations above.
Dataset.columns

Index(['Player', 'nationality', 'squad', 'age', 'birth_year', 'games',
       'games_starts', 'minutes', 'goals', 'assists', 'pens_made', 'pens_att',
       'cards_yellow', 'cards_red', 'goals_per90', 'assists_per90',
       'goals_assists_per90', 'goals_pens_per90', 'goals_assists_pens_per90',
       'Foot', 'Market Value', 'Team_MV', 'Team_rank', 'Team_rank_bin',
       'Nation_rank', 'Nation_rank_bin', 'Position', 'age_sq'],
      dtype='object')

### OLS

First of all, we would like to note that since we have many variables and a comparatively small number of observations,
OLS is not ideal. This project is, however, more of a data science exercise (i.e. getting the data, merging it, etc.)
than an econometric one (checking the validity of assumptions).

In [16]:
import statsmodels.api as sm

In [17]:
# Dependent variable and candidate regressors as named Series.
# games, nr and trb are not part of the specification estimated below.
Y = Dataset['Market Value']
age = Dataset['age']
age_sq = Dataset['age_sq']
goals = Dataset['goals']
games = Dataset['games']
assists = Dataset['assists']
tr = Dataset['Team_rank']
tmv = Dataset['Team_MV']
nr = Dataset['Nation_rank']
nrb = Dataset['Nation_rank_bin']
trb = Dataset['Team_rank_bin']

# Pass the regressors as a DataFrame (same columns, same order as before) so
# the regression summary labels each coefficient with its column name instead
# of the anonymous x1..x7 that a plain NumPy array produces.
regressor_columns = ['goals', 'assists', 'Team_MV', 'Nation_rank_bin',
                     'Team_rank', 'age', 'age_sq']
X = sm.add_constant(Dataset[regressor_columns])

# Log-linear specification: the logarithm is only defined for positive values.
assert (Y > 0).all(), "Market Value must be strictly positive to take its logarithm"
model = sm.OLS(np.log(Y), X)

In [18]:
# Estimate the OLS model defined in the previous cell.
fit = model.fit()

In [19]:
# Display the regression table (coefficients, standard errors, fit statistics).
fit.summary()

0,1,2,3
Dep. Variable:,Market Value,R-squared:,0.647
Model:,OLS,Adj. R-squared:,0.625
Method:,Least Squares,F-statistic:,29.57
Date:,"Sat, 30 Jan 2021",Prob (F-statistic):,7.709999999999999e-23
Time:,18:12:07,Log-Likelihood:,-96.988
No. Observations:,121,AIC:,210.0
Df Residuals:,113,BIC:,232.3
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.8767,1.617,-0.542,0.589,-4.081,2.327
x1,0.1162,0.038,3.088,0.003,0.042,0.191
x2,0.1057,0.058,1.830,0.070,-0.009,0.220
x3,2.888e-05,3.4e-06,8.485,0.000,2.21e-05,3.56e-05
x4,0.0233,0.077,0.301,0.764,-0.130,0.176
x5,-0.0311,0.012,-2.633,0.010,-0.055,-0.008
x6,0.4870,0.124,3.930,0.000,0.241,0.732
x7,-0.0090,0.002,-3.802,0.000,-0.014,-0.004

0,1,2,3
Omnibus:,3.809,Durbin-Watson:,1.948
Prob(Omnibus):,0.149,Jarque-Bera (JB):,4.276
Skew:,-0.118,Prob(JB):,0.118
Kurtosis:,3.89,Cond. No.,776000.0


Based on the estimated coefficients, we can conclude that:
* An increase in goals scored by a given player by one implies an increase in the player's market value by 11.6%.
* An increase in assists by one implies an increase in the market value by about 10.6%, although this effect is statistically significant only at the 10% level (p = 0.070).
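* The market value of the player's team has a highly significant positive effect on the player's value (t = 8.49, p < 0.001). The coefficient is numerically close to zero only because Team_MV is measured in large units: an increase in Team_MV by 1,000 units implies an increase in the player's market value by roughly 2.9%.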
* Nation ranking is insignificant for our market value estimation.
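* A drop of the given player's team by one place in their league table (i.e. Team_rank increasing by one) implies a decrease in the player's market value by about 3.1%; equivalently, climbing one place is associated with a roughly 3.1% higher market value.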
* Age has a concave (inverted-U) effect on the player's market value: the coefficient on age is positive (0.487) and the coefficient on age squared is negative (-0.009). Because age and its square change together, the two coefficients must be interpreted jointly: the effect of one additional year is approximately 0.487 - 2 × 0.009 × age, i.e. about +12.7% at age 20, falling to zero at roughly 27 years and turning negative afterwards.