In [None]:
pip install category_encoders

In [1]:
# STANDARD LIBRARIES
import os
import warnings
warnings.filterwarnings("ignore")

# THIRD PARTY LIBRARIES
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import pydataset
import scipy.stats as stats
import category_encoders as ce

#statistical tests
from scipy import stats
from scipy.stats import pearsonr, spearmanr



# default pandas decimal number display format
pd.options.display.float_format = '{:20,.2f}'.format
from sklearn import preprocessing

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
import sklearn.linear_model
import sklearn.feature_selection
import sklearn.preprocessing
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures
import plotly.express as px
from sklearn.cluster import KMeans
from sklearn.feature_selection import RFE
import matplotlib
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, QuantileTransformer
from math import sqrt

In [2]:
import acquire 
import prepare

In [3]:
# Load the raw FIFA player data through the project-local `acquire` module
# (presumably returns a pandas DataFrame — confirm in acquire.py).
df = acquire.get_fifa_data()

In [4]:
# Run the project-local preparation step. Per the cell output, it reports the
# row/column counts before and after dropping nulls and after cleaning.
# NOTE(review): this rebinds `df`, so re-running the cell feeds already-prepared
# data back into prepped_data — confirm that is safe.
df = prepare.prepped_data(df)

Before dropping nulls, 20700 rows, 111 cols
After dropping nulls. 19457 rows. 66 cols
After cleaning the data and adding additional columns there are: 15373 rows. 83 cols


In [6]:
# Unpack the four values returned by the project-local helper into the
# goalkeeper / forward / midfielder / defender frames.
# NOTE(review): the same four names are rebuilt by hand in Step 4 below.
goalkeeper_df, forward_df, midfielder_df, defender_df = prepare.acquire_players_by_position(df)

In [None]:
# Preview the first rows of the goalkeeper frame.
goalkeeper_df.head()

# Step 1: Assign all player positions to a dataframe

In [None]:
#GK
goalkeepers = df[(df.club_position == 'GK')]

In [None]:
goalkeepers.shape

In [None]:
#ST, RS, LS, LW, RW, LF, CF
forwards = df[(df.club_position == 'ST') | (df.club_position == 'RS') | (df.club_position == 'LS') | (df.club_position == 'LW') | (df.club_position == 'RW') | (df.club_position == 'LF') | (df.club_position == 'CF')]


In [None]:
forwards.shape

In [None]:
#LCM, RCM, RM, LM, CAM, LDM, RDM, CDM, LAM, RAM 
midfielders = df[(df.club_position == 'LCM') | (df.club_position == 'RCM') | (df.club_position == 'RM') | (df.club_position == 'LM') | (df.club_position == 'CAM') | (df.club_position == 'LDM') | (df.club_position == 'RDM') | (df.club_position == 'CDM') | (df.club_position == 'LAM') | (df.club_position == 'RAM')] 


In [None]:
midfielders.shape

In [None]:
#RCB, LCB, LB, RB, CB, RWB, LWB
defenders = df[(df.club_position == 'RCB') | (df.club_position == 'LCB') | (df.club_position == 'LB') | (df.club_position == 'RB') | (df.club_position == 'CB') | (df.club_position == 'RWB') | (df.club_position == 'LWB')]

In [None]:
defenders.shape

In [None]:
reserve = df[df.club_position == 'RES']


In [None]:
reserve.shape

In [None]:
substitute = df[df.club_position == 'SUB']

In [None]:
substitute.shape

In [None]:
df.shape

# Step 2: Separate RESERVE Player dataframe by player position

In [None]:
reserve.shape

In [None]:
reserve['player_positions'] = reserve['player_positions'].str.split(',').str[0]

In [None]:
df_reserve_goalkeepers = reserve[reserve.player_positions == 'GK']

In [None]:
df_reserve_goalkeepers.shape

In [None]:
#ST, RS, LS, LW, RW, LF, CF
df_reserve_forwards = reserve[(reserve.player_positions == 'ST') | (reserve.player_positions == 'RW') | (reserve.player_positions == 'LW') | (reserve.player_positions == 'CF')]

In [None]:
df_reserve_forwards.shape

In [None]:
#LCM, RCM, RM, LM, CAM, LDM, RDM, CDM, LAM, RAM 
df_reserve_midfielders = reserve[(reserve.player_positions == 'CM') | (reserve.player_positions == 'CDM') | (reserve.player_positions == 'CAM') | (reserve.player_positions == 'RM') | (reserve.player_positions == 'LM')]

In [None]:
df_reserve_midfielders.shape

In [None]:
#RCB, LCB, LB, RB, CB, RWB, LWB
df_reserve_defenders = reserve[(reserve.player_positions == 'CB') | (reserve.player_positions == 'LB') | (reserve.player_positions == 'RB') | (reserve.player_positions == 'LWB') | (reserve.player_positions == 'RWB')]

In [None]:
df_reserve_defenders.shape

# Step 3: Separate SUBSTITUTE Player dataframe by player position

In [None]:
substitute.shape

In [None]:
substitute['player_positions'] = substitute['player_positions'].str.split(',').str[0]


In [None]:
df_substitute_goalkeeper = substitute[substitute.player_positions == 'GK']

In [None]:
df_substitute_goalkeeper.shape

In [None]:
df_substitute_forward = substitute[(substitute.player_positions == 'ST') | (substitute.player_positions == 'RW') | (substitute.player_positions == 'LW') | (substitute.player_positions == 'CF')]

In [None]:
df_substitute_forward.shape

In [None]:
df_substitute_midfielders = substitute[(substitute.player_positions == 'CM') | (substitute.player_positions == 'CDM') | (substitute.player_positions == 'CAM') | (substitute.player_positions == 'LM') | (substitute.player_positions == 'RM')]

In [None]:
df_substitute_midfielders.shape

In [None]:
df_substitute_defenders = substitute[(substitute.player_positions == 'CB') | (substitute.player_positions == 'LB') | (substitute.player_positions == 'RB') | (substitute.player_positions == 'RWB') | (substitute.player_positions == 'LWB')]

In [None]:
df_substitute_defenders.shape

# Step 4: Concat Dataframes together 


In [None]:
goalkeeper_df = pd.concat([goalkeepers, df_substitute_goalkeeper, df_reserve_goalkeepers], axis=0)

In [None]:
forward_df = pd.concat([forwards, df_substitute_forward, df_reserve_forwards], axis=0)

In [None]:
midfielder_df = pd.concat([midfielders, df_substitute_midfielders, df_reserve_midfielders], axis=0)

In [None]:
defender_df = pd.concat([defenders, df_substitute_defenders, df_reserve_defenders], axis=0)

# Step 5: Correlation for each position

In [None]:
goalkeeper_df.corr()[['wage_eur']].sort_values(by='wage_eur', ascending=False).head(10)

In [None]:
forward_df.corr()[['wage_eur']].sort_values(by='wage_eur', ascending=False).head(10)

In [None]:
midfielder_df.corr()[['wage_eur']].sort_values(by='wage_eur', ascending=False).head(10)

In [None]:
defender_df.corr()[['wage_eur']].sort_values(by='wage_eur', ascending=False).head(15)

# Clusters based on Correlation

_Prepare the Data_

In [None]:
#split
train, validate, test = prepare.split(goalkeeper_df)

In [None]:
#split data
X_train = train.drop(columns=['wage_eur', 'total_wage', 'value_eur'])
y_train = train[['wage_eur']]

X_validate = validate.drop(columns=['wage_eur', 'total_wage', 'value_eur'])
y_validate = validate[['wage_eur']]

X_test = test.drop(columns=['wage_eur', 'total_wage', 'value_eur'])
y_test = test[['wage_eur']]

_Encode categorical variables_ 

In [None]:
#encode categorical variables
encoder = ce.LeaveOneOutEncoder(return_df=True)
X_train_loo = encoder.fit_transform(X_train, y_train)
X_test_loo = encoder.transform(X_test)
X_train_loo.shape

In [None]:
#scale data 
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train_loo, y_train)
X_train_scaled.shape

In [None]:
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_train_scaled_df.describe()

# Goalkeeper Clusters

#### ANOVA test on gk_handling, gk_diving, gk_reflexes

In [None]:
#significance level 
a = 0.05 
#define x 
X = X_train_scaled_df[['gk_handling', 'gk_diving', 'gk_reflexes']]
#define kmeans
kmeans = KMeans(n_clusters=4)
#fit 
kmeans.fit(X)

In [None]:
train['clusters'] = kmeans.predict(X)

In [None]:
# Find K: evaluate best k using elbow method 
with plt.style.context('seaborn-whitegrid'):
    plt.figure(figsize=(12, 6))
    pd.Series({k: KMeans(k).fit(X).inertia_ for k in range(2, 15)}).plot(marker='x')
    plt.xticks(range(2, 15))
    plt.xlabel('k')
    plt.ylabel('inertia')
    plt.title('Change in inertia as k increases')

In [None]:
train.groupby('clusters')[ 'gk_handling', 'gk_diving', 'gk_reflexes'].median()

#### _The ANOVA test will be used to measure the significance, or lack thereof, of the variance in wages between clusters_

 - H0: There is no significant difference between salaries of each cluster 
 - HA: There is a significant difference between salaries of each cluster

In [None]:
alpha = 0.05

# One wage sample per cluster label present in `train`
wage_groups = [wages for _, wages in train.groupby('clusters')['wage_eur']]
F, p = stats.f_oneway(*wage_groups)

print('Anova Test Results on goalkeeper handling/diving/reflexes Cluster')
print('F-value: ',F)
print('p-value: ',p)
# State the outcome against the significance level declared above
if p < alpha:
    print("We reject null hypothesis.")
else:
    print("We fail to reject null hypothesis.")

# Prepare Forwards

In [None]:
#split
train, validate, test = prepare.split(forward_df)

In [None]:
#split data
X_train = train.drop(columns=['wage_eur', 'total_wage', 'value_eur'])
y_train = train[['wage_eur']]

X_validate = validate.drop(columns=['wage_eur', 'total_wage', 'value_eur'])
y_validate = validate[['wage_eur']]

X_test = test.drop(columns=['wage_eur', 'total_wage', 'value_eur'])
y_test = test[['wage_eur']]

In [None]:
#encode categorical variables
encoder = ce.LeaveOneOutEncoder(return_df=True)
X_train_loo = encoder.fit_transform(X_train, y_train)
X_test_loo = encoder.transform(X_test)
X_train_loo.shape

In [None]:
#scale data 
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train_loo, y_train)
X_train_scaled.shape

In [None]:
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train.columns)

# FORWARD CLUSTERS

#### ANOVA test on ball_control, reactions, dribbling

In [None]:
#significance level 
a = 0.05 
#define x 
X = X_train_scaled_df[['ball_control', 'reactions', 'dribbling']]
#define kmeans
kmeans = KMeans(n_clusters=4)
#fit 
kmeans.fit(X)

In [None]:
train['clusters'] = kmeans.predict(X)

In [None]:
# Find K: evaluate best k using elbow method 
with plt.style.context('seaborn-whitegrid'):
    plt.figure(figsize=(12, 6))
    pd.Series({k: KMeans(k).fit(X).inertia_ for k in range(2, 15)}).plot(marker='x')
    plt.xticks(range(2, 15))
    plt.xlabel('k')
    plt.ylabel('inertia')
    plt.title('Change in inertia as k increases')

In [None]:
train.groupby('clusters')[ 'ball_control', 'reactions', 'dribbling'].median()

#### _The ANOVA test will be used to measure the significance, or lack thereof, of the variance in wages between clusters_

 - H0: There is no significant difference between salaries of each cluster 
 - HA: There is a significant difference between salaries of each cluster

In [None]:
alpha = 0.05

# One wage sample per cluster label present in `train`
wage_groups = [wages for _, wages in train.groupby('clusters')['wage_eur']]
F, p = stats.f_oneway(*wage_groups)

# Label names the forward features that were clustered (the original text was
# copied from the goalkeeper section).
print('Anova Test Results on forward ball_control/reactions/dribbling Cluster')
print('F-value: ',F)
print('p-value: ',p)
# State the outcome against the significance level declared above
if p < alpha:
    print("We reject null hypothesis.")
else:
    print("We fail to reject null hypothesis.")

# Prepare Midfielders

In [None]:
#split
train, validate, test = prepare.split(midfielder_df)

In [None]:
#split data
X_train = train.drop(columns=['wage_eur', 'total_wage', 'value_eur'])
y_train = train[['wage_eur']]

X_validate = validate.drop(columns=['wage_eur', 'total_wage', 'value_eur'])
y_validate = validate[['wage_eur']]

X_test = test.drop(columns=['wage_eur', 'total_wage', 'value_eur'])
y_test = test[['wage_eur']]

In [None]:
#encode categorical variables
encoder = ce.LeaveOneOutEncoder(return_df=True)
X_train_loo = encoder.fit_transform(X_train, y_train)
X_test_loo = encoder.transform(X_test)
X_train_loo.shape

In [None]:
#scale data 
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train_loo, y_train)
X_train_scaled.shape

In [None]:
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train.columns)

# Midfielder Cluster

#### ANOVA test on ball_control, reactions, passing

In [None]:
#significance level 
a = 0.05 
#define x 
X = X_train_scaled_df[['ball_control', 'reactions', 'passing']]
#define kmeans
kmeans = KMeans(n_clusters=4)
#fit 
kmeans.fit(X)

In [None]:
train['clusters'] = kmeans.predict(X)

In [None]:
# Find K: evaluate best k using elbow method 
with plt.style.context('seaborn-whitegrid'):
    plt.figure(figsize=(12, 6))
    pd.Series({k: KMeans(k).fit(X).inertia_ for k in range(2, 15)}).plot(marker='x')
    plt.xticks(range(2, 15))
    plt.xlabel('k')
    plt.ylabel('inertia')
    plt.title('Change in inertia as k increases')

In [None]:
train.groupby('clusters')[ 'ball_control', 'reactions', 'passing'].median()

#### _The ANOVA test will be used to measure the significance, or lack thereof, of the variance in wages between clusters_

 - H0: There is no significant difference between salaries of each cluster 
 - HA: There is a significant difference between salaries of each cluster

In [None]:
alpha = 0.05

# One wage sample per cluster label present in `train`
wage_groups = [wages for _, wages in train.groupby('clusters')['wage_eur']]
F, p = stats.f_oneway(*wage_groups)

# Label names the midfielder features that were clustered (the original text
# was copied from the goalkeeper section).
print('Anova Test Results on midfielder ball_control/reactions/passing Cluster')
print('F-value: ',F)
print('p-value: ',p)
# State the outcome against the significance level declared above
if p < alpha:
    print("We reject null hypothesis.")
else:
    print("We fail to reject null hypothesis.")

# Prepare Defenders

In [None]:
#split
train, validate, test = prepare.split(defender_df)

In [None]:
#split data
X_train = train.drop(columns=['wage_eur', 'total_wage', 'value_eur'])
y_train = train[['wage_eur']]

X_validate = validate.drop(columns=['wage_eur', 'total_wage', 'value_eur'])
y_validate = validate[['wage_eur']]

X_test = test.drop(columns=['wage_eur', 'total_wage', 'value_eur'])
y_test = test[['wage_eur']]

In [None]:
#encode categorical variables
encoder = ce.LeaveOneOutEncoder(return_df=True)
X_train_loo = encoder.fit_transform(X_train, y_train)
X_test_loo = encoder.transform(X_test)
X_train_loo.shape

In [None]:
#scale data 
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train_loo, y_train)
X_train_scaled.shape

In [None]:
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train.columns)

# Defender Clusters 

#### ANOVA test on marking, potential, short_passing

In [None]:
#significance level 
a = 0.05 
#define x 
X = X_train_scaled_df[['marking', 'potential', 'short_passing']]
#define kmeans
kmeans = KMeans(n_clusters=4)
#fit 
kmeans.fit(X)

In [None]:
train['clusters'] = kmeans.predict(X)

In [None]:
# Find K: evaluate best k using elbow method 
with plt.style.context('seaborn-whitegrid'):
    plt.figure(figsize=(12, 6))
    pd.Series({k: KMeans(k).fit(X).inertia_ for k in range(2, 15)}).plot(marker='x')
    plt.xticks(range(2, 15))
    plt.xlabel('k')
    plt.ylabel('inertia')
    plt.title('Change in inertia as k increases')

In [None]:
train.groupby('clusters')[ 'ball_control', 'reactions', 'passing'].median()

#### _The ANOVA test will be used to measure the significance, or lack thereof, of the variance in wages between clusters_

 - H0: There is no significant difference between salaries of each cluster 
 - HA: There is a significant difference between salaries of each cluster

In [None]:
alpha = 0.05

# One wage sample per cluster label present in `train`
wage_groups = [wages for _, wages in train.groupby('clusters')['wage_eur']]
F, p = stats.f_oneway(*wage_groups)

# Label names the defender features that were clustered (the original text was
# copied from the goalkeeper section).
print('Anova Test Results on defender marking/potential/short_passing Cluster')
print('F-value: ',F)
print('p-value: ',p)
# State the outcome against the significance level declared above
if p < alpha:
    print("We reject null hypothesis.")
else:
    print("We fail to reject null hypothesis.")