# Project FIFA Moneyball
by Kevin Spurk

# 1  Description

#### | summary |

In this project, I use a dataset from the game FIFA 21 that contains data on ca. 17,000 football players. It lists a variety of information for each of them, such as club, wage, different scores for their skills and much more.

#### | objectives |

The objective is to build a linear regression model that predicts a player's market value as accurately as possible. Moreover, I want to answer the following 3 questions:

1. How much does the prediction quality vary in relation to the value? 
E.g. is there a trend of the model getting worse the higher the value is?
2. Which field position has the highest average value?
3.   How many players have a maximum overall score?

# 2 Setup 

#### | library imports |

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
from scipy.stats import norm
from scipy.special import inv_boxcox
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import OneHotEncoder, Normalizer, StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from statsmodels.formula.api import ols
import seaborn as sns
import os
import math
import random
import warnings
warnings.filterwarnings('ignore')

# Show every column and every row when a DataFrame is displayed.
# The fully qualified option name is used because the short pattern 'max_row'
# matches more than one option in recent pandas versions, which raises an OptionError.
pd.options.display.max_columns = None
pd.set_option('display.max_rows', None)


##### | Data import / overview |

In [None]:
data = pd.read_csv('data_files/fifa21_male2.csv')
data.head()

In [None]:
data.shape

In [None]:
data.dtypes

In [None]:
# check if column id is ready to be used as index column

data['ID'].nunique()


# 3 Cleaning/Wrangling

#### | clean column names | 

In [None]:
data.rename(columns={'W/F':'r_weakfoot', 'A/W':'r_attacking_work', 'D/W':'r_defensive_work', 'IR':'r_intr', 'SM':'r_skillmove'}, inplace=True)

def clean_headers(df):
    """Normalise the column names of *df* in place and return *df*.

    Names are lower-cased, stripped of surrounding whitespace, and any
    remaining spaces are replaced by underscores.
    """
    normalised = df.columns.str.lower()
    normalised = normalised.str.strip()
    df.columns = normalised.str.replace(' ', '_')
    return df

In [None]:
clean_headers(data)

data.columns

#### | Drop columns |

In [None]:
# drop unnecessary columns

data = data.drop(['player_photo','club_logo','flag_photo', 'gender'], axis=1)

In [None]:
data['loan_date_end'].count()

In [None]:
data[~data['loan_date_end'].isnull()].head()

-> the ratio of players, that are loaned to another team is very low. The column is not going to be helpful to create a model.

-> The column 'team_&_contract' is redundant because it doesn't contain any information that is not already present in other columns

In [None]:
data = data.drop(['team_&_contract','loan_date_end'], axis=1)

#### | indexing |

In [None]:
data = data.set_index('id')
data = data.reset_index(drop=True)

#### | cleaning |
parametrization of categorical columns
and cleaning columns to prepare some of them for conversion to numerical columns

In [None]:
cat_select = ['nationality', 'club', 'bp', 'position', 'foot', 'r_attacking_work', 'r_defensive_work']

for column in cat_select:
    print(column, data[column].unique())

In [None]:
# Column groups used by the cleaning steps below.
# The object dtype is selected by name because the `np.object` alias was
# removed in NumPy 1.24 and raises an AttributeError there.
col_cat = list(data.select_dtypes(include=['object']).columns.values)
col_num = list(data.select_dtypes(include=[np.number]).columns.values)
# columns holding money amounts as text
col_money = ['value', 'wage', 'release_clause']
# columns holding star ratings as text
col_ratings = ['r_weakfoot', 'r_skillmove', 'r_intr']
# columns holding the per-position scores
pos_score = ['ls', 'st', 'rs', 'lw', 'lf', 'cf', 'rf', 'rw', 'lam', 'cam', 'ram', 'lm', 'lcm', 'cm', 'rcm', 'rm', 'lwb', 'ldm', 'cdm', 'rdm', 'rwb', 'lb', 'lcb', 'cb', 'rcb', 'rb', 'gk']

# initial cleaning

def first_clean(df, columns=None, skip_columns=None):
    """Normalise the text of string columns in place and return *df*.

    Each selected column is stripped, lower-cased, and has its spaces
    replaced by underscores.

    Parameters
    ----------
    df : DataFrame to clean (modified in place).
    columns : column names to clean; defaults to the module-level
        ``col_cat`` list, looked up when the function is called.
    skip_columns : column names to leave untouched; defaults to none.
    """
    if columns is None:
        columns = col_cat
    # None instead of a shared mutable default list
    skipped = set() if skip_columns is None else set(skip_columns)

    for column in columns:
        if column not in skipped:
            df[column] = df[column].str.strip().str.lower().str.replace(' ', '_')
    return df

data = first_clean(data, skip_columns=['name', 'contract', 'joined'])


| cleaning of individual columns and conversion |

In [None]:
# Remove the HTML-escaped ampersand and collapse any run of underscores it leaves
# behind into one. (Replacing '__' before '___' never collapsed a triple underscore.)
data['nationality'] = (
    data['nationality']
    .str.replace('&amp;', '', regex=False)
    .str.replace(r'_+', '_', regex=True)
)

# height is split on the feet mark; feet * 30.48 + inches * 2.54 gives whole centimetres
data['height'] = data['height'].str.replace('"', '', regex=False)
data['height'] = data['height'].apply(lambda x: round(int(x.split("'")[0])*30.48 + int(x.split("'")[1])*2.54))

# only the unit is stripped here; the numeric conversion happens later
data['weight'] = data['weight'].str.replace('lbs', '', regex=False)

# converting contract to contract duration so 'joined' and 'contract' are not redundant and the duration might be relevant. 
# Toms reasoning regarding contract end is a good alternative 

# keep only the year of the joining date (as text)
data['joined'] = pd.to_datetime(data['joined'])
data['joined'] = data['joined'].dt.strftime('%Y')

def contract_to_num(x):
    """Convert a contract description into a contract length in years.

    Returns
    -------
    int or float
        ``end - start`` when the text contains two whole-number tokens,
        ``1`` when it contains exactly one, and ``np.nan`` for loans,
        for text without a usable year, and for non-string input such as
        missing values.
    """
    # missing values arrive as float NaN and have no .split()
    if not isinstance(x, str):
        return np.nan
    if 'loan' in x.lower():
        return np.nan

    years = [int(token) for token in x.split() if token.isdigit()]
    if len(years) == 2:
        return years[1] - years[0]
    if len(years) == 1:
        return 1
    return np.nan

data['contract'] = data['contract'].apply(contract_to_num)

In [None]:
'''
# NOT WORKING
# cleaning columns in bulk and converting to numerical values 

def money_to_num(df, columns=[]):
    for column in columns:
        df[column] = df[column].str.replace('€','')
        if ('m') in df[column]:
            df[column] = df[column].str.replace('m','')
            df[column] = int(round(float(df[column])*1000000))
        elif ('k') in df[column]:
            df[column] = df[column].str.replace('k','')
            df[column] = int(round(float(df[column])*1000))
        else:
            df[column] = int(round(float(df[column])))
    return df

def ratings_to_num(df, columns=[]):
    for column in columns:
        df[column] = int(df[column].str.replace('★','').str.replace('_',''))
    return df

def score_to_num(df, columns=[]):
    for column in columns:
        df[column] = df[column].str.replace('+-','-')
        if ('+') in df[column]:
            df[column] = int(df[column].split('+')[0]) + int(df[column].split('+')[1])
        if ('-') in df[column]:
            df[column] = int(df[column].split('-')[1]) + int(df[column].split('-')[0])
        else:
            df[column] = int(df[column])
    return df

data = money_to_num(data, columns=col_money)
data = ratings_to_num(data, columns=col_ratings)
data = score_to_num(data, columns=pos_score)

'''

| cleaning columns in bulk and converting to numerical values |

In [None]:
def money_to_num(x):
    """Convert a money string such as '€1.5m', '€500k' or '€750' to an int.

    Used for the columns value, wage and release_clause. The suffix is
    matched case-insensitively: 'm' multiplies by 1,000,000 and 'k' by
    1,000. Raises ValueError when the remaining text is not a number.
    """
    # lower-case once so that the suffix test and its removal agree
    text = str(x).replace('€', '').strip().lower()
    if 'm' in text:
        return int(round(float(text.replace('m', '')) * 1000000))
    if 'k' in text:
        return int(round(float(text.replace('k', '')) * 1000))
    return int(round(float(text)))
    
def ratings_to_num(x):
    """Turn a star rating such as '4_★' into the integer 4.

    Used for the columns r_weakfoot, r_skillmove and r_intr.
    """
    digits = str(x)
    for filler in ('★', '_'):
        digits = digits.replace(filler, '')
    return int(digits)

def score_to_num(x):
    """Convert a positional score such as '67+2' or '62+-1' to an int.

    The text is read as a base score followed by a signed modifier:
    '67+2' gives 69, while '62+-1' and '62-1' give 61. Input without a
    modifier (e.g. '70' or 70) is converted as it stands.
    """
    text = str(x).replace('+-', '-')
    if '+' in text:
        parts = text.split('+')
        return int(parts[0]) + int(parts[1])
    if '-' in text:
        base, _, modifier = text.partition('-')
        if base == '':
            # a plain negative number, not "base-modifier"
            return int(text)
        # the modifier is negative, so it is subtracted from the base
        return int(base) - int(modifier)
    return int(text)

# pair every converter with the columns it applies to (money first, then ratings, then scores)
conversions = (
    (money_to_num, col_money),
    (ratings_to_num, col_ratings),
    (score_to_num, pos_score),
)

for converter, target_columns in conversions:
    for column in target_columns:
        data[column] = data[column].apply(converter)

In [None]:
data.head()

#### | handling of null values |

In [None]:
for column in data.columns:
    print(column, data[column].isna().sum())

In [None]:
data['contract'].value_counts()
data['hits'].value_counts()
data['composure'].value_counts()
data['r_attacking_work'].value_counts()
data['r_defensive_work'].value_counts()

In [None]:
# columns grouped by the strategy used to fill their missing values
col_value_copy = ['position']
col_fill_median = ['composure']
col_fill_mode = ['hits','r_attacking_work', 'r_defensive_work']

# 'position' gets filled with the respective value from column 'bp'.

for column in col_value_copy:
    # fillna with a Series takes, for each missing entry, the 'bp' value of the same row
    data[column] = data[column].fillna(data['bp'])

'''
# NOT WORKING
# composure, r_attacking_work, r_defensive_work get filled with the median or mode value of the player tier, the player is in (top tier = players with ova > 66, mid tier = players with ova between 33 and 66), btm tier = players with ova < 30

for column in col_fill_median:
    median_toptier = data[data['ova'] > 66][column].median()
    median_midtier = data[(data['ova'] > 33) & (data['ova'] <= 66)][column].median()
    median_btmtier = data[data['ova'] <= 33][column].median()
    
    data[data['ova'] > 66][column] = data[data['ova'] > 66][column].fillna(median_toptier)
    data[(data['ova'] > 33) & (data['ova'] <= 66)][column] = data[(data['ova'] > 33) & (data['ova'] <= 66)][column].fillna(median_midtier)
    data[data['ova'] < 33][column] = data[data['ova'] < 33][column].fillna(median_btmtier)
'''    

# composure is filled with the column median; hits and the two work-rate columns with the column mode

for column in col_fill_median:
    data[column] = data[column].fillna(data[column].median())

for column in col_fill_mode:
    most_frequent = data[column].mode()[0]
    data[column] = data[column].fillna(most_frequent)

# remaining null count per column
for column in data.columns:
    print(column, data[column].isna().sum())
    

In [None]:
data['contract'].value_counts()

In [None]:
# replace nulls in column 'contract' with one of the top 10 occurring values, taking into account how often they appear

def topx_picker(df, in_column, topx):
    """Draw one value from the most frequent values of a column.

    The ``topx`` most frequent values of ``df[in_column]`` (missing values
    and empty strings excluded) are the candidates; one of them is drawn
    at random with a probability proportional to its frequency.

    Parameters
    ----------
    df : DataFrame holding the column.
    in_column : name of the column to sample from.
    topx : number of most frequent values to consider.

    Raises
    ------
    ValueError
        If there is no candidate value to draw from.
    """
    counts = df[in_column].value_counts()
    candidates = [
        (value, int(count)) for value, count in counts.items() if value != ''
    ][:topx]
    if not candidates:
        raise ValueError(f"no values to pick from in column {in_column!r}")

    values, weights = zip(*candidates)
    # weighted draw from the standard library instead of hand-built cumulative limits
    return random.choices(values, weights=weights, k=1)[0]
    
    
# Fill every missing contract length with a frequency-weighted draw from the ten
# most common values. Rows are filled one at a time, so earlier draws count towards
# the frequencies of later ones. `.loc` writes into `data` itself; the chained form
# data['contract'][row] = ... may write to a temporary copy instead.
for row in data.index[data['contract'].isna()]:
    data.loc[row, 'contract'] = topx_picker(df=data, in_column='contract', topx=10)


In [None]:
# droping line with null values in other columns. All columns with 58 missing values seem to be missing for the same players, so theres going to be neglegible data loss
data = data.dropna()

In [None]:
# conversion of object columns to numerical columns e.g. weight, height, r_sillmove    

In [None]:
# cleaning 'hits' as preparation for conversion

def k_m_to_num(x):
    """Convert a count such as '1.6k', '2m' or 372 to an int.

    The suffix is matched case-insensitively: 'm' multiplies by 1,000,000
    and 'k' by 1,000. Raises ValueError when the remaining text is not a
    number.
    """
    # work on text so that numeric input and upper-case suffixes are handled alike
    text = str(x).strip().lower()
    if 'm' in text:
        return int(round(float(text.replace('m', '')) * 1000000))
    if 'k' in text:
        return int(round(float(text.replace('k', '')) * 1000))
    return int(round(float(text)))

data['hits'] = data['hits'].apply(k_m_to_num)

In [None]:
col_numconv = ['weight', 'joined', 'contract', 'hits']

# convert each column in one vectorised call; values that cannot be parsed become NaN
for column in col_numconv:
    data[column] = pd.to_numeric(data[column], errors='coerce')

# checking if conversion produced null values
for column in data.columns:
    print(column, data[column].isna().sum())

In [None]:
def float_to_int(x):
    """Return *x* rounded to the nearest integer (Python's built-in ``round``), as an int."""
    return int(round(x))

# re-collect the numeric columns: the conversions above changed which columns are numeric
col_num = list(data.select_dtypes(include=[np.number]).columns.values)

for column in col_num:
    # round every value of the column and turn it into an int
    data[column] = data[column].apply(float_to_int)
    # NOTE(review): pd.to_numeric is applied per element here, so `downcast` sees
    # single values rather than the whole column - confirm the resulting dtype
    data[column] = data[column].apply(pd.to_numeric, downcast='integer')

In [None]:
data.dtypes

# 4 EDA

In [None]:
# working copy for EDA/modelling, split into text and numeric columns.
# The object dtype is selected by name because the `np.object` alias was
# removed in NumPy 1.24.
data_m = data.copy()
data_cat = data_m.select_dtypes('object')
data_num = data_m.select_dtypes(np.number)

#### | numerical columns |

In [None]:
# initial overview
data_num.describe().T

In [None]:
for column in data_num:
    plt.figure(figsize=(8,5))
    sns.distplot(data_num[column])
    plt.show()

In [None]:
for column in data_num:
    plt.figure(figsize=(8,5))
    sns.boxplot(x=data_num[column])
    plt.show()

In [None]:
'''
# NOT WORKING

numcol_count = len(data_num.columns)
i = 0
fig, axs = plt.subplots(numcol_count, 2)

for column in data_num:
    sns.distplot(data_num[column], ax=axs[i, 0])
    sns.boxplot(x=data_num[column], ax=axs[i, 1])
    plt.show()
    i += 1
'''

#### ! data improvements for better EDA !

#### | data splitting / grouping |

There is too much data for a sound overview of the correlations. Therefore I'm splitting data as follows:

1. Creating two separate datasets where one contains individual attributes and one umbrella attributes. 
    Umbrella attributes contain values that are aggregations or, weighted averages of individual attributes.
2. Adding columns for standard deviations of the values within umbrella attributes and accross attributes to check later on 
    if a homogeneity in skills plays a role in predicting performance/value



In [None]:
# data_umb for umbrella attributes, data_att for individual attributes
data_umb = data_num.copy()
data_att = data_num.copy()

In [None]:
def cc_stdev(df, columns, target_column):
    """Replace a group of columns by their row-wise standard deviation.

    For every row the population standard deviation (ddof=0, NaN ignored)
    across ``columns`` is rounded to 2 decimals and stored in
    ``target_column``. The target column is added to ``df`` itself; the
    returned frame is a copy without the source columns.

    Parameters
    ----------
    df : DataFrame holding the columns.
    columns : list of column names to aggregate and drop.
    target_column : name of the column that receives the result.
    """
    # one vectorised pass instead of assigning row by row through a chained .iloc
    row_std = df[columns].std(axis=1, ddof=0, skipna=True).round(2)
    df[target_column] = pd.to_numeric(row_std, downcast='float', errors='coerce')
    return df.drop(columns=columns)

def cc_mean(df, columns, target_column):
    """Replace a group of columns by their row-wise mean.

    For every row the mean across ``columns`` (NaN ignored) is rounded to
    2 decimals and stored in ``target_column``. The target column is added
    to ``df`` itself; the returned frame is a copy without the source
    columns.

    Parameters
    ----------
    df : DataFrame holding the columns.
    columns : list of column names to aggregate and drop.
    target_column : name of the column that receives the result.
    """
    # one vectorised pass instead of assigning row by row through a chained .iloc
    row_mean = df[columns].mean(axis=1, skipna=True).round(2)
    df[target_column] = pd.to_numeric(row_mean, downcast='float', errors='coerce')
    return df.drop(columns=columns)

In [None]:
# replacing attribute columns with stdev or average columns

# individual attributes grouped by the umbrella attribute they are aggregated under
att_pac = ['acceleration', 'sprint_speed']
att_sho = ['finishing', 'long_shots', 'shot_power', 'penalties', 'volleys', 'positioning']
att_pas = ['crossing', 'curve', 'fk_accuracy', 'long_passing', 'short_passing', 'vision']
att_dri = ['agility', 'balance', 'ball_control', 'composure', 'dribbling', 'reactions']
att_def = ['heading_accuracy', 'interceptions', 'sliding_tackle', 'standing_tackle', 'marking']
att_phy = ['aggression', 'jumping', 'stamina', 'strength']
att_gk = ['gk_handling', 'gk_kicking', 'gk_diving', 'gk_positioning', 'gk_reflexes']
# all outfield position score columns
pos = ['ls', 'st', 'rs','lw', 'lf', 'cf', 'rf', 'rw', 'lam', 'cam', 'ram', 'lm', 'lcm', 'cm', 'rcm', 'rm', 'lwb', 'ldm', 'cdm', 'rdm', 'rwb', 'lb', 'lcb', 'cb', 'rcb', 'rb']
# strikers/forwards: lf, cf and rf are forward positions, so they belong here and not with the backs
pos_s = ['ls', 'st', 'rs', 'lf', 'cf', 'rf']
# midfielders (wingers included)
pos_m = ['lw', 'rw', 'lam', 'cam', 'ram', 'lm', 'lcm', 'cm', 'rcm', 'rm', 'ldm', 'cdm', 'rdm']
# backs and wing-backs
pos_b = ['rwb', 'lb', 'lcb', 'cb', 'rcb', 'rb', 'lwb']

# umbrella dataset: each attribute group is replaced by the row-wise stdev of its members
stdev_groups = (
    (att_pac, 'pac_stdev'),
    (att_sho, 'sho_stdev'),
    (att_pas, 'pas_stdev'),
    (att_dri, 'dri_stdev'),
    (att_def, 'def_stdev'),
    (att_phy, 'phy_stdev'),
    (att_gk, 'gk_stdev'),
)
for group_columns, stdev_column in stdev_groups:
    data_umb = cc_stdev(data_umb, columns=group_columns, target_column=stdev_column)

# ... and all position scores by their row-wise mean
data_umb = cc_mean(data_umb, columns=pos, target_column='pos_avg')

# attribute dataset: position scores are averaged per field area
mean_groups = (
    (pos_s, 's_avg'),
    (pos_m, 'm_avg'),
    (pos_b, 'b_avg'),
)
for group_columns, mean_column in mean_groups:
    data_att = cc_mean(data_att, columns=group_columns, target_column=mean_column)


In [None]:
# dropping columns with umbrella attributes
drop_umb = ['attacking', 'skill', 'movement', 'power', 'mentality', 'pac', 'sho', 'pas', 'dri', 'def', 'phy', 'total_stats', 'base_stats']

data_att = data_att.drop(columns=drop_umb, axis=1)


In [None]:
# heatmap for data including umbrella attributes

mask = np.zeros_like(data_umb.corr())

mask[np.triu_indices_from(mask)] = True 

fig, ax = plt.subplots(figsize=(20, 18))
ax = sns.heatmap(data_umb.corr(), mask=mask, annot=True)
plt.show()

In [None]:
# heatmap for data including individual attributes

mask = np.zeros_like(data_att.corr())

mask[np.triu_indices_from(mask)] = True 

fig, ax = plt.subplots(figsize=(20, 18))
ax = sns.heatmap(data_att.corr(), mask=mask, annot=True)
plt.show()

# 5 Processing Data

#### | Baseline model |

initial modeling to compare with coming models after data changes (transformations, encoding, ...)

In [None]:
# model with data including umbrella attributes

x_1 = data_umb.drop(['value'], axis=1)
y_1 = data_umb['value']

x_1 = sm.add_constant(x_1)

model = sm.OLS(y_1,x_1).fit() 

print(model.summary())

In [None]:
# model with data including individual attributes

x_2 = data_att.drop(['value'], axis=1)
y_2 = data_att['value']

x_2 = sm.add_constant(x_2)

model = sm.OLS(y_2,x_2).fit() 

print(model.summary())

In [None]:
# train-test-split pre-engineering

x_train_pre, x_test_pre, y_train_pre, y_test_pre = train_test_split(x_1, y_1, test_size=0.25, random_state=8)

model = LinearRegression()
model.fit(x_train_pre, y_train_pre)

predictions = model.predict(x_test_pre)

# R², MAE and RMSE on the hold-out set. The root is taken explicitly because the
# `squared=False` shortcut is deprecated in newer scikit-learn releases.
r2_score(y_test_pre, predictions), mean_absolute_error(y_test_pre, predictions), np.sqrt(mean_squared_error(y_test_pre, predictions))

-> Continuing with the umbrella attribute data. Both datasets perform very similarly, but the umbrella dataset has fewer columns.


#### | feature selection (categorical) |

- dropping columns 'name', 'foot' because not relevant
- dropping column 'club'. Despite being possibly relevant, 
    it's difficult to use in this type of model because of the large amount of unique values, 
    and grouping takes too long
- dropping column 'position'. Alternatively, e.g. convert to numerical to see
    if the number of possible positions of a player is relevant, but a players versatility is arguably
    better captured by the position scores
- reducing the amount of unique values in 'nationality' and 'bp' for better encoding later 
    with grouping or picking the top 10 values and grouping the rest 


In [None]:
data_cat = data_cat.drop(columns=['name', 'foot','club', 'position'], axis=1)

In [None]:
# limiting column to the top 10 values

def topx_limit(df, in_columns, topx):
    """Keep only the most frequent values of each column, in place.

    For every column in ``in_columns`` the ``topx`` most frequent values
    (empty strings excluded) are kept; every other entry, including missing
    ones, is replaced by the string 'other'. The top values are determined
    per column.

    Parameters
    ----------
    df : DataFrame to modify (changed in place and returned).
    in_columns : list of column names to limit.
    topx : number of most frequent values to keep per column.
    """
    for col in in_columns:
        counts = df[col].value_counts()
        # built afresh for each column, so one column's values never fill another's quota
        top_values = [value for value in counts.index if value != ''][:topx]
        df[col] = df[col].where(df[col].isin(top_values), 'other')
    return df

In [None]:
data_cat = topx_limit(df=data_cat, in_columns=['nationality'], topx=10)

In [None]:
# grouping 'bp' values into values for strikers, midfielder, defense and goalkeepers

# Map every best position to its field area in one vectorised step; positions found
# in none of the lists become 'other'. Later entries win in the dict, so it is built
# in reverse priority (s is checked first, then m, then b, then gk).
bp_to_area = {
    'gk': 'gk',
    **dict.fromkeys(pos_b, 'b'),
    **dict.fromkeys(pos_m, 'm'),
    **dict.fromkeys(pos_s, 's'),
}
# assigning the whole column avoids the chained .iloc assignment, which may write to a copy
data_cat['bp'] = data_cat['bp'].map(bp_to_area).fillna('other')


In [None]:
# concat numerical and categorial data

players = pd.concat([data_umb, data_cat], axis=1)

In [None]:
players.head()

#### | feature selection (numerical) |

In [None]:
# dropping columns with p values above 0.05 or very high colinearity with other columns (e.g. pos_avg, attacking)

# columns dropped for the reasons given above; 'mentality' was listed twice and is now named once
feat_drop1 = ['height', 'weight', 'mentality', 'defending', 'goalkeeping', 'base_stats', 'r_weakfoot', 'pac', 'def', 'hits', 'sho_stdev', 'dri_stdev', 'def_stdev', 'pos_avg', 'attacking', 'skill']
players = players.drop(columns=feat_drop1)



#### | encoding categorical columns |

In [None]:
col_encode = ['nationality', 'bp', 'r_attacking_work', 'r_defensive_work']
players = pd.get_dummies(players, columns=col_encode, drop_first=True)
players.head()

#### | model comparison |

In [None]:
# model on the encoded dataset: 'value' is the target, everything else a feature
x_e = players.drop(['value'], axis=1)
y_e = players['value']

x_train_e, x_test_e, y_train_e, y_test_e = train_test_split(x_e, y_e, test_size=0.25, random_state=8)

model = LinearRegression()
model.fit(x_train_e, y_train_e)

predictions = model.predict(x_test_e)

# R², MAE and RMSE on the hold-out set. The root is taken explicitly because the
# `squared=False` shortcut is deprecated in newer scikit-learn releases.
r2_score(y_test_e, predictions), mean_absolute_error(y_test_e, predictions), np.sqrt(mean_squared_error(y_test_e, predictions))

##### | data selection |

There are double peaks present in the distribution plots of many columns, suggesting that players might fall 
into two distinct groups. An educated guess is field players and goalkeepers. 
I am therefore separating goalkeepers and field players into different datasets 
and deleting columns that are likely irrelevant for their performance/value.


In [None]:
players_filtered = players[players['bp_gk'] == 0]
players_filtered = players_filtered.drop(['gk'], axis=1)

#### | model comparison |

In [None]:
# model on the field players only: 'value' is the target, everything else a feature
x_f = players_filtered.drop(['value'], axis=1)
y_f = players_filtered['value']

x_train_f, x_test_f, y_train_f, y_test_f = train_test_split(x_f, y_f, test_size=0.25, random_state=8)

model = LinearRegression()
model.fit(x_train_f, y_train_f)

predictions = model.predict(x_test_f)

# R², MAE and RMSE on the hold-out set. The root is taken explicitly because the
# `squared=False` shortcut is deprecated in newer scikit-learn releases.
r2_score(y_test_f, predictions), mean_absolute_error(y_test_f, predictions), np.sqrt(mean_squared_error(y_test_f, predictions))

#### | transformations |

In [None]:
for column in players_filtered.columns:
    plt.figure(figsize=(8,5))
    sns.distplot(players_filtered[column])
    plt.show()

In [None]:
# boxcox transform some columns

def boxcox_transform(df, columns):
    """Box-Cox transform the given columns of *df* in place.

    Non-positive values are first replaced by the median of the remaining
    values of the column, then ``scipy.stats.boxcox`` is applied.

    Parameters
    ----------
    df : DataFrame to transform (modified in place).
    columns : list of column names to transform.

    Returns
    -------
    (DataFrame, dict)
        The transformed frame and a dict mapping each column name to a
        one-element list holding the second value returned by
        ``stats.boxcox`` for that column.
    """
    _ci = {column: None for column in columns}
    for column in columns:
        # np.nan: the upper-case aliases np.NAN / np.NaN were removed in NumPy 2.0
        df[column] = np.where(df[column] <= 0, np.nan, df[column])
        df[column] = df[column].fillna(df[column].median())
        transformed_data, ci = stats.boxcox(df[column])
        df[column] = transformed_data
        _ci[column] = [ci]
    return df, _ci


In [None]:
# col_boxcox = ['age', 'value', 'wage', 'release_clause', 'movement', 'power', 'total_stats', 'sho', 'phy']

# players_filtered, _ci = boxcox_transform(players_filtered, columns=col_boxcox)

-> not performing any transformations: after trying a few different combinations of columns, the r2_score always decreased by between .02 and .13


#### | removing outliers |

In [None]:
for column in players_filtered.columns:
    plt.figure(figsize=(8,5))
    sns.boxplot(players_filtered[column])
    plt.show()

In [None]:
def remove_outliers(df, threshold=1.5, in_columns=None, skip_columns=None):
    """Drop the rows that lie outside the IQR fences of the given columns.

    For each column the fences are ``Q1 - threshold * IQR`` and
    ``Q3 + threshold * IQR``. Rows are kept when their value lies on or
    between the fences, so a column whose IQR is 0 keeps the rows equal to
    its quartile value. Columns are processed one after another on the
    already filtered frame.

    Parameters
    ----------
    df : DataFrame to filter (not modified; a filtered frame is returned).
    threshold : multiple of the IQR used for the fences.
    in_columns : column names to check; defaults to none.
    skip_columns : column names from ``in_columns`` to ignore; defaults to none.
    """
    # None instead of shared mutable default lists
    in_columns = [] if in_columns is None else in_columns
    skipped = set() if skip_columns is None else set(skip_columns)

    for column in in_columns:
        if column in skipped:
            continue
        q3 = np.percentile(df[column], 75)
        q1 = np.percentile(df[column], 25)
        iqr = q3 - q1
        upper_limit = q3 + (threshold * iqr)
        lower_limit = q1 - (threshold * iqr)
        # inclusive bounds: with strict bounds an IQR of 0 removed every row
        df = df[(df[column] >= lower_limit) & (df[column] <= upper_limit)]
    return df


In [None]:
'''
col_outl = ['age', 'ova', 'bov', 'pot', 'growth', 'joined', 'value', 'wage', 'release_clause', 
            'movement', 'power', 'contract', 'total_stats', 'r_skillmove', 'pas', 'dri', 'phy', 
            'pac_stdev', 'pas_stdev', 'phy_stdev', 'gk_stdev']

players_filtered = remove_outliers(players_filtered, in_columns=col_outl)

'''

-> not removing outliers. After trying, r2_score dropped significantly (around .28)

# 6 Final Modeling / Validation


In [None]:
# final model on the field players: 'value' is the target, everything else a feature
x_final = players_filtered.drop(['value'], axis=1)
y_final = players_filtered['value']

x_train_final, x_test_final, y_train_final, y_test_final = train_test_split(x_final, y_final, test_size=0.25, random_state=8)

model = LinearRegression()
model.fit(x_train_final, y_train_final)

predictions = model.predict(x_test_final)

# R², MAE and RMSE on the hold-out set. The root is taken explicitly because the
# `squared=False` shortcut is deprecated in newer scikit-learn releases.
r2_score(y_test_final, predictions), mean_absolute_error(y_test_final, predictions), np.sqrt(mean_squared_error(y_test_final, predictions))

# 7 Reporting


In [None]:
# hold-out targets next to the model's predictions and the absolute error of each prediction
results = pd.DataFrame({'value': y_test_final})
results['pred_value'] = predictions
results['residual'] = (results['value'] - results['pred_value']).abs()
results.head()

-> confused about the results. Because of a lack of time I wont be able to check whats going on, but I'm going to ask in class.

-> ToDo: Add a function that produces a df with the results of the different models for comparison (name model/comb. of models, r2, ...)


# Q1 

How much does the prediction quality vary in relation to the value? 
E.g. is there a trend of the model getting worse the higher the value is (outliers)?


In [None]:
# absolute error expressed as a percentage of the true value
results['dev_percent'] = (results['residual'] / results['value'] * 100).abs()

# absolute error against the true value
plt.figure(figsize=(20,8))
sns.scatterplot(data=results, x='value', y='residual')
#sns.barplot(data=results, x=results['value'], y=results['dev_percent'], bins=50, ax=axs[1, 0])
plt.show()

In [None]:
plt.figure(figsize=(20,8))
sns.barplot(data=results, x=results['value'], y=results['dev_percent'])
plt.show()

-> The results are a little puzzling. They suggest that the model's accuracy is highly skewed towards the higher end (i.e. outliers) of the value distribution.

# Q2

Which position has the highest average value?


In [None]:
# every position, goalkeeper included
pos_gk = ['ls', 'st', 'rs','lw', 'lf', 'cf', 'rf', 'rw', 'lam', 'cam', 'ram', 'lm', 'lcm', 'cm', 'rcm', 'rm', 'lwb', 'ldm', 'cdm', 'rdm', 'rwb', 'lb', 'lcb', 'cb', 'rcb', 'rb', 'gk']
# strikers/forwards: lf, cf and rf are forward positions, so they belong here and not with the backs
pos_s = ['ls', 'st', 'rs', 'lf', 'cf', 'rf']
# midfielders (wingers included)
pos_m = ['lw', 'rw', 'lam', 'cam', 'ram', 'lm', 'lcm', 'cm', 'rcm', 'rm', 'ldm', 'cdm', 'rdm']
# backs and wing-backs
pos_b = ['rwb', 'lb', 'lcb', 'cb', 'rcb', 'rb', 'lwb']

In [None]:
data3 = data_m.copy()

# Field area of every position; positions found in none of the lists become 'other'.
# Later entries win in the dict, so it is built in reverse priority
# (s is checked first, then m, then b, then gk).
area_by_pos = {
    'gk': 'gk',
    **dict.fromkeys(pos_b, 'b'),
    **dict.fromkeys(pos_m, 'm'),
    **dict.fromkeys(pos_s, 's'),
}

# Each row gets the area of its own best position. Assigning a scalar to the whole
# column inside a row loop left every row with the area of the last player.
data3['field_area'] = data3['bp'].map(area_by_pos).fillna('other')

# One row per position with its field area and the mean market value of the players
# whose best position it is. Positions that are nobody's best position get NaN.
pos_value = pd.DataFrame({'pos': pos_gk})
pos_value['field_area'] = pos_value['pos'].map(area_by_pos).fillna('other')
pos_value['pos_avg'] = pos_value['pos'].map(data3.groupby('bp')['value'].mean())

In [None]:
plt.figure(figsize=(20,8))
sns.barplot(x="pos", y="pos_avg", data=pos_value) # x="field_area", hue="pos"
plt.show()

# Q3 
How many players have a maximum overall score?

In [None]:
ova_max = data3['ova'].max()

data3[data3['ova'] == ova_max]

In [None]:
print(len(data3[data3['ova'] == ova_max]))