In [1]:
import os

import pandas as pd
import plotly.express as px

import utils

# Show every column when a DataFrame is displayed (None = no column limit).
pd.set_option("display.max_columns", None)

# Step up one directory exactly once per kernel session. A bare
# os.chdir("..") climbs one more level every time this cell is re-run, which
# breaks any relative path used afterwards.
if "_STARTUP_DIR" not in globals():
    _STARTUP_DIR = os.getcwd()  # sentinel: its presence marks that the move already happened
    os.chdir("..")

In [2]:
# Load the raw dataset: a semicolon-separated file that uses a decimal comma;
# the first column of the file becomes the index.
raw_csv_path = os.path.join(utils.DATA_PATH, utils.DATA["raw"])
df = pd.read_csv(raw_csv_path, sep=";", decimal=",", index_col=0)
df.head()

Unnamed: 0,sofifa_id,short_name,overall,potential,age,height_cm,weight_kg,club_name,league_name,club_contract_valid_until,preferred_foot,pace,shooting,passing,dribbling,defending,physic,formation,player_tags,value_eur
1,255438,R. Araki,67,80,19,170,60,Kashima Antlers,Japanese J. League Division 1,2022,Right,83.0,61.0,61.0,72.0,22.0,43.0,mid,,2400000.0
2,246070,Iván Jaime,70,82,20,180,73,Futebol Clube de Famalicão,Portuguese Liga ZON SAGRES,2025,Right,72.0,70.0,65.0,71.0,50.0,60.0,mid,,3800000.0
3,203173,Noguera,68,68,31,177,65,FC Goa,Indian Super League,2022,Right,75.0,62.0,67.0,69.0,57.0,59.0,mid,,1000000.0
4,178090,D. Biseswar,75,75,33,176,79,PAOK,Greek Super League,2022,Right,71.0,69.0,76.0,78.0,33.0,60.0,mid,,3500000.0
5,223978,Manu García,73,82,23,169,66,Deportivo Alavés,Spain Primera Division,2022,Right,78.0,69.0,72.0,75.0,40.0,56.0,mid,,7000000.0


## Data overview

In [3]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(5969, 20)

In [4]:
# Percentage of missing entries per column; columns without any gaps are left out.
missing_values = (
    df.isnull()
    .sum()
    .loc[lambda null_counts: null_counts > 0]
    .div(len(df))
    .mul(100)
)

# Largest share first, one decimal place.
missing_values.round(1).sort_values(ascending=False)

player_tags    88.1
shooting        9.1
pace            9.1
passing         9.1
dribbling       9.1
defending       9.1
physic          9.1
dtype: float64

In [5]:
# Search for duplicated columns.
# read_csv renames repeated header labels ("x", "x.1", ...), so a label-only
# check can never fire on a frame loaded that way. Compare the column
# contents as well: a column is flagged when an earlier column holds exactly
# the same values.
duplicated_labels = df.columns.duplicated()
duplicated_content = df.T.duplicated().to_numpy()
duplicated_columns = df.columns[duplicated_labels | duplicated_content]
duplicated_columns.shape

(0,)

In [6]:
# Rows whose values repeat an earlier row in every column
# (the index is not part of the comparison).
duplicated_rows = df.loc[df.duplicated()]
duplicated_rows.shape

(0, 20)

In [7]:
# Summary statistics per column, rounded to one decimal place.
df.describe().round(1)

Unnamed: 0,sofifa_id,overall,potential,age,height_cm,weight_kg,club_contract_valid_until,pace,shooting,passing,dribbling,defending,physic,value_eur
count,5969.0,5969.0,5969.0,5969.0,5969.0,5969.0,5969.0,5426.0,5426.0,5426.0,5426.0,5426.0,5426.0,5969.0
mean,224226.8,69.5,72.6,26.9,181.1,75.5,2022.8,69.0,55.1,60.9,65.2,56.0,68.5,5200072.0
std,26424.5,6.3,6.3,4.3,6.9,7.1,1.3,11.6,14.6,9.8,9.7,15.8,8.5,11742494.0
min,41.0,48.0,53.0,17.0,156.0,53.0,2021.0,28.0,19.0,25.0,29.0,17.0,37.0,25000.0
25%,208455.0,65.0,68.0,24.0,176.0,70.0,2022.0,63.0,44.0,54.0,60.0,43.0,63.0,775000.0
50%,228834.0,69.0,72.0,27.0,181.0,75.0,2023.0,70.0,58.0,61.0,66.0,61.0,69.0,1600000.0
75%,243007.0,73.0,77.0,30.0,186.0,80.0,2024.0,77.0,66.0,68.0,72.0,68.0,75.0,3700000.0
max,264481.0,93.0,95.0,43.0,203.0,103.0,2031.0,97.0,94.0,93.0,95.0,91.0,90.0,194000000.0


## Target variable overview
#### Value (EUR)

In [8]:
# Histogram of the target variable `value_eur` in 50 bins.
fig = px.histogram(df, x="value_eur", nbins=50, title="Value EUR")

# Bar fill and outline colours come from the project palette.
fig.update_traces(
    marker=dict(
        color=utils.COLORS_DICT["blue"],
        line=dict(color=utils.COLORS_DICT["gray"], width=1),
    )
)

# Fixed figure size, white plot area, light-gray grid on both axes.
fig.update_layout(
    width=800,
    height=400,
    plot_bgcolor="white",
    xaxis_gridcolor="lightgray",
    yaxis_gridcolor="lightgray",
)

fig.show()

In [12]:
# Box plot of the target variable `value_eur`.
fig = px.box(df, y="value_eur", title="Value EUR")

# Marker fill and outline colours come from the project palette.
fig.update_traces(
    marker=dict(
        color=utils.COLORS_DICT["blue"],
        line=dict(color=utils.COLORS_DICT["gray"], width=1),
    )
)

# Fixed figure size, white plot area, light-gray grid on both axes.
fig.update_layout(
    width=800,
    height=400,
    plot_bgcolor="white",
    xaxis_gridcolor="lightgray",
    yaxis_gridcolor="lightgray",
)

fig.show()

In [10]:
# Skewness of the target, rounded to two decimals.
# The built-in round() is used instead of the `.round()` method so the cell
# also works if skew() returns a plain Python float (which has no `.round`),
# e.g. NaN for a series with too few observations.
round(df["value_eur"].skew(), 2)

np.float64(5.65)

In [11]:
# Record(s) holding the highest `value_eur` in the dataset.
top_value = df["value_eur"].max()
df.loc[df["value_eur"] == top_value]

Unnamed: 0,sofifa_id,short_name,overall,potential,age,height_cm,weight_kg,club_name,league_name,club_contract_valid_until,preferred_foot,pace,shooting,passing,dribbling,defending,physic,formation,player_tags,value_eur
5694,231747,K. Mbappé,91,95,22,182,73,Paris Saint-Germain,French Ligue 1,2022,Right,97.0,88.0,80.0,92.0,36.0,77.0,att,"#Speedster, #Dribbler, #Acrobat, #Clinical Fin...",194000000.0


### Conclusions

1. **Skewness**
- The distribution of `value_eur` is heavily right-skewed
    - Large number of players having low values 
    - Few players having extremely high values (up to 194 million euros)

2. **High values domination**
- The outliers with extremely high values can dominate the model
- We can either remove the outliers or transform the variable so that it does not bias further predictions
    - Extremely high player values are not errors — they represent some of the most valuable players in the world
    - Instead of removing high values, I will apply a logarithmic transformation to the variable, which will make it more manageable

3. **One player strongly dominates the dataset**
- The player with `sofifa_id` 231747 (K. Mbappé, `value_eur` of 194 million euros) seems to be an exceptional case, so I am going to remove this single record from the dataset

In [13]:
!jupyter nbconvert --to html notebooks\00_data_overview.ipynb --output=00_data_overview.html

[NbConvertApp] Converting notebook notebooks\00_data_overview.ipynb to html
  {%- elif type == 'text/vnd.mermaid' -%}
[NbConvertApp] Writing 297864 bytes to notebooks\00_data_overview.html
