In [263]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as st

In [264]:
fifa_dataset = pd.read_csv("data.csv") #reads the dataset
fifa_dataset.head()

Unnamed: 0.1,Unnamed: 0,ID,Name,Age,Photo,Nationality,Flag,Overall,Potential,Club,...,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Release Clause
0,0,158023,L. Messi,31,https://cdn.sofifa.org/players/4/19/158023.png,Argentina,https://cdn.sofifa.org/flags/52.png,94,94,FC Barcelona,...,96.0,33.0,28.0,26.0,6.0,11.0,15.0,14.0,8.0,€226.5M
1,1,20801,Cristiano Ronaldo,33,https://cdn.sofifa.org/players/4/19/20801.png,Portugal,https://cdn.sofifa.org/flags/38.png,94,94,Juventus,...,95.0,28.0,31.0,23.0,7.0,11.0,15.0,14.0,11.0,€127.1M
2,2,190871,Neymar Jr,26,https://cdn.sofifa.org/players/4/19/190871.png,Brazil,https://cdn.sofifa.org/flags/54.png,92,93,Paris Saint-Germain,...,94.0,27.0,24.0,33.0,9.0,9.0,15.0,15.0,11.0,€228.1M
3,3,193080,De Gea,27,https://cdn.sofifa.org/players/4/19/193080.png,Spain,https://cdn.sofifa.org/flags/45.png,91,93,Manchester United,...,68.0,15.0,21.0,13.0,90.0,85.0,87.0,88.0,94.0,€138.6M
4,4,192985,K. De Bruyne,27,https://cdn.sofifa.org/players/4/19/192985.png,Belgium,https://cdn.sofifa.org/flags/7.png,91,92,Manchester City,...,88.0,68.0,58.0,51.0,15.0,13.0,5.0,10.0,13.0,€196.4M


In [265]:
print(fifa_dataset.size)
print(fifa_dataset.shape)

1620423
(18207, 89)


In [266]:
#fifa_dataset.info() #gives us the information of each column in the dataset and what kinds of vlaues it holds

In [267]:
fifa_dataset.drop(columns=['Unnamed: 0','Name','Photo','Nationality','Flag','Club','Club Logo','Real Face','Joined','Loaned From','Contract Valid Until','Release Clause','Preferred Foot','Weak Foot'],inplace=True)
print(fifa_dataset.size) #we drop the above fields because even without doing any calculaions, these attributes do not contribute to the value field that we want to predict
print(fifa_dataset.shape)

1365525
(18207, 75)


In [268]:
fifa_dataset.columns #after dropping we see the columns again

Index(['ID', 'Age', 'Overall', 'Potential', 'Value', 'Wage', 'Special',
       'International Reputation', 'Skill Moves', 'Work Rate', 'Body Type',
       'Position', 'Jersey Number', 'Height', 'Weight', 'LS', 'ST', 'RS', 'LW',
       'LF', 'CF', 'RF', 'RW', 'LAM', 'CAM', 'RAM', 'LM', 'LCM', 'CM', 'RCM',
       'RM', 'LWB', 'LDM', 'CDM', 'RDM', 'RWB', 'LB', 'LCB', 'CB', 'RCB', 'RB',
       'Crossing', 'Finishing', 'HeadingAccuracy', 'ShortPassing', 'Volleys',
       'Dribbling', 'Curve', 'FKAccuracy', 'LongPassing', 'BallControl',
       'Acceleration', 'SprintSpeed', 'Agility', 'Reactions', 'Balance',
       'ShotPower', 'Jumping', 'Stamina', 'Strength', 'LongShots',
       'Aggression', 'Interceptions', 'Positioning', 'Vision', 'Penalties',
       'Composure', 'Marking', 'StandingTackle', 'SlidingTackle', 'GKDiving',
       'GKHandling', 'GKKicking', 'GKPositioning', 'GKReflexes'],
      dtype='object')

In [269]:
#Some of these attributes like the Value, wage, LS, LM, etc all have string values that can be converted into numerical values
#So we convert them to numerical values

def MK_converter(values): #example take the field of Value, Christiano Ronaldo is worth 77 million pouds. The worth is given with the currency symbol and an M next to the number. So we use that to convert the values
    if values[-1] == 'M':
        worth = values[1:-1]
        worth = float(worth) * 1000000
        return worth
    elif values[-1] == 'K':
        worth = values[1:-1]
        worth = float(worth) * 1000
        return worth
    else:
        return 0
    
fifa_dataset['Value'] = fifa_dataset['Value'].apply(MK_converter)
fifa_dataset['Wage'] = fifa_dataset['Wage'].apply(MK_converter)
#fifa_dataset.describe()

In [270]:
fifa_dataset.fillna(value=0, inplace=True)
fifa_dataset.head()

Unnamed: 0,ID,Age,Overall,Potential,Value,Wage,Special,International Reputation,Skill Moves,Work Rate,...,Penalties,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes
0,158023,31,94,94,110500000.0,565000.0,2202,5.0,4.0,Medium/ Medium,...,75.0,96.0,33.0,28.0,26.0,6.0,11.0,15.0,14.0,8.0
1,20801,33,94,94,77000000.0,405000.0,2228,5.0,5.0,High/ Low,...,85.0,95.0,28.0,31.0,23.0,7.0,11.0,15.0,14.0,11.0
2,190871,26,92,93,118500000.0,290000.0,2143,5.0,5.0,High/ Medium,...,81.0,94.0,27.0,24.0,33.0,9.0,9.0,15.0,15.0,11.0
3,193080,27,91,93,72000000.0,260000.0,1471,4.0,1.0,Medium/ Medium,...,40.0,68.0,15.0,21.0,13.0,90.0,85.0,87.0,88.0,94.0
4,192985,27,91,92,102000000.0,355000.0,2281,4.0,4.0,High/ High,...,79.0,88.0,68.0,58.0,51.0,15.0,13.0,5.0,10.0,13.0


In [271]:
#The fields LS, LM, ST etc are the differet positions in a team's formation. The values in the field mean that if you were to score the player's performance in that position, it would be that value
#Example, if you put Christiano Ronaldo as a LS, ST OR RS whihc stands for Left Striker, Striker and Right Striker respectively, then his score will be in the 90's because that is the position he plays
#But if you were to put him in LCB, CB or RCB which stands for Left Center Back, Center Back and Right Center Back, then he will have a very poor score of 47 because he is not trained to play in those postions
#The fields have values of two number with a plus sign, so we add those numbers and make them integers4

def position_adder(x):
    nums = str(x)
    num1 = nums[0:2]
    num2 = nums[-1]
    return int(num1) + int(num2)

positions = ['LS','ST','RS','LW','LF','CF','RF','RW','LAM','CAM','RAM','LM','LCM','CM','RCM','RM','LWB','LDM','CDM','RDM','RWB','LB','LCB','CB','RCB','RB']

for position in positions:
    fifa_dataset[position] = fifa_dataset[position].apply(position_adder)
    
fifa_dataset[['LS','ST','RS','LW','LF','CF','RF','RW','LAM','CAM','RAM','LM','LCM','CM','RCM','RM','LWB','LDM','CDM','RDM','RWB','LB','LCB','CB','RCB','RB']].head()

Unnamed: 0,LS,ST,RS,LW,LF,CF,RF,RW,LAM,CAM,...,LWB,LDM,CDM,RDM,RWB,LB,LCB,CB,RCB,RB
0,90,90,90,94,95,95,95,94,95,95,...,66,63,63,63,66,61,49,49,49,61
1,94,94,94,92,93,93,93,92,91,91,...,68,64,64,64,68,64,56,56,56,64
2,87,87,87,92,92,92,92,92,92,92,...,68,63,63,63,68,63,50,50,50,63
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,85,85,85,90,90,90,90,90,91,91,...,80,80,80,80,80,76,69,69,69,76


In [272]:
# In the dataset we have the player's height is in terms of feet and inches and the weight is in terms of pounds
# The height needs to be converted into centimeters. We do this because using centimeneters tends give a more acuuracte measure
# The weight needs converted into an int and we need to remove the lbs

fifa_dataset[['Height','Weight']].head() #This is the current state of the dataset

Unnamed: 0,Height,Weight
0,5'7,159lbs
1,6'2,183lbs
2,5'9,150lbs
3,6'4,168lbs
4,5'11,154lbs


In [273]:
def height_to_cm(x):
    t = str(x)
    elements = t.split("'")
    height = elements[0]
    inches = elements[-1]
    
    height = int(height)
    inches = int(inches)
    
    height_in_cm = height*30.48 + inches*2.54
    return height_in_cm

fifa_dataset['Height'] = fifa_dataset['Height'].apply(height_to_cm)
fifa_dataset['Weight'] = fifa_dataset['Weight'].apply(lambda x: str(x).split("lbs")[0])

fifa_dataset[['Height','Weight']].head()

Unnamed: 0,Height,Weight
0,170.18,159
1,187.96,183
2,175.26,150
3,193.04,168
4,180.34,154


In [274]:
fifa_dataset.head()

Unnamed: 0,ID,Age,Overall,Potential,Value,Wage,Special,International Reputation,Skill Moves,Work Rate,...,Penalties,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes
0,158023,31,94,94,110500000.0,565000.0,2202,5.0,4.0,Medium/ Medium,...,75.0,96.0,33.0,28.0,26.0,6.0,11.0,15.0,14.0,8.0
1,20801,33,94,94,77000000.0,405000.0,2228,5.0,5.0,High/ Low,...,85.0,95.0,28.0,31.0,23.0,7.0,11.0,15.0,14.0,11.0
2,190871,26,92,93,118500000.0,290000.0,2143,5.0,5.0,High/ Medium,...,81.0,94.0,27.0,24.0,33.0,9.0,9.0,15.0,15.0,11.0
3,193080,27,91,93,72000000.0,260000.0,1471,4.0,1.0,Medium/ Medium,...,40.0,68.0,15.0,21.0,13.0,90.0,85.0,87.0,88.0,94.0
4,192985,27,91,92,102000000.0,355000.0,2281,4.0,4.0,High/ High,...,79.0,88.0,68.0,58.0,51.0,15.0,13.0,5.0,10.0,13.0


In [275]:
#The body type attribute has a little inconsistencies. The three types of body types that are aupposed to be in the dataset are Lean, Normal and Stocky
#Some players like Messi and Ronaldo have body types that have the same as their name which does not make sense. So we take these cases and we convert them to the three types
body_types = fifa_dataset['Body Type'].unique() #This is how the data set looks like now
print(body_types)

['Messi' 'C. Ronaldo' 'Neymar' 'Lean' 'Normal' 'Courtois' 'Stocky'
 'PLAYER_BODY_TYPE_25' 'Shaqiri' 'Akinfenwa' 0]


In [276]:
fifa_dataset[fifa_dataset['Body Type'] == 'Messi'] = 'Lean'
fifa_dataset[fifa_dataset['Body Type'] == 'C. Ronaldo'] = 'Normal'
fifa_dataset[fifa_dataset['Body Type'] == 'Courtois'] = 'Lean'
fifa_dataset[fifa_dataset['Body Type'] == 'Neymar'] = 'Lean'
fifa_dataset[fifa_dataset['Body Type'] == 'PLAYER_BODY_TYPE_25'] = 'Normal'
fifa_dataset[fifa_dataset['Body Type'] == 'Shaqiri'] = 'Stocky'
fifa_dataset[fifa_dataset['Body Type'] == 'Akinfenwa'] = 'Stocky'
body_types = fifa_dataset['Body Type'].unique() #This is how the data set looks like now
print(body_types)

['Lean' 'Normal' 'Stocky' 0]


In [277]:
fifa_dataset.head()

Unnamed: 0,ID,Age,Overall,Potential,Value,Wage,Special,International Reputation,Skill Moves,Work Rate,...,Penalties,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes
0,Lean,Lean,Lean,Lean,Lean,Lean,Lean,Lean,Lean,Lean,...,Lean,Lean,Lean,Lean,Lean,Lean,Lean,Lean,Lean,Lean
1,Normal,Normal,Normal,Normal,Normal,Normal,Normal,Normal,Normal,Normal,...,Normal,Normal,Normal,Normal,Normal,Normal,Normal,Normal,Normal,Normal
2,Lean,Lean,Lean,Lean,Lean,Lean,Lean,Lean,Lean,Lean,...,Lean,Lean,Lean,Lean,Lean,Lean,Lean,Lean,Lean,Lean
3,193080,27,91,93,7.2e+07,260000,1471,4,1,Medium/ Medium,...,40,68,15,21,13,90,85,87,88,94
4,192985,27,91,92,1.02e+08,355000,2281,4,4,High/ High,...,79,88,68,58,51,15,13,5,10,13


In [278]:
fifa_dataset.columns

Index(['ID', 'Age', 'Overall', 'Potential', 'Value', 'Wage', 'Special',
       'International Reputation', 'Skill Moves', 'Work Rate', 'Body Type',
       'Position', 'Jersey Number', 'Height', 'Weight', 'LS', 'ST', 'RS', 'LW',
       'LF', 'CF', 'RF', 'RW', 'LAM', 'CAM', 'RAM', 'LM', 'LCM', 'CM', 'RCM',
       'RM', 'LWB', 'LDM', 'CDM', 'RDM', 'RWB', 'LB', 'LCB', 'CB', 'RCB', 'RB',
       'Crossing', 'Finishing', 'HeadingAccuracy', 'ShortPassing', 'Volleys',
       'Dribbling', 'Curve', 'FKAccuracy', 'LongPassing', 'BallControl',
       'Acceleration', 'SprintSpeed', 'Agility', 'Reactions', 'Balance',
       'ShotPower', 'Jumping', 'Stamina', 'Strength', 'LongShots',
       'Aggression', 'Interceptions', 'Positioning', 'Vision', 'Penalties',
       'Composure', 'Marking', 'StandingTackle', 'SlidingTackle', 'GKDiving',
       'GKHandling', 'GKKicking', 'GKPositioning', 'GKReflexes'],
      dtype='object')

In [279]:
attack_work_rate = []
defense_work_rate = []

def work_rate_split(x):
    t = str(x)
    elements = t.split("/")
    attack_work_rate.append(elements[0])
    defense_work_rate.append(elements[-1])
    
fifa_dataset['Work Rate'] = fifa_dataset['Work Rate'].apply(work_rate_split)
fifa_dataset.drop('Work Rate', axis=1, inplace=True)
fifa_dataset.insert(13, 'Attack Work Rate', attack_work_rate)
fifa_dataset.insert(14, 'Defense Work Rate', defense_work_rate)

fifa_dataset.columns

Index(['ID', 'Age', 'Overall', 'Potential', 'Value', 'Wage', 'Special',
       'International Reputation', 'Skill Moves', 'Body Type', 'Position',
       'Jersey Number', 'Height', 'Attack Work Rate', 'Defense Work Rate',
       'Weight', 'LS', 'ST', 'RS', 'LW', 'LF', 'CF', 'RF', 'RW', 'LAM', 'CAM',
       'RAM', 'LM', 'LCM', 'CM', 'RCM', 'RM', 'LWB', 'LDM', 'CDM', 'RDM',
       'RWB', 'LB', 'LCB', 'CB', 'RCB', 'RB', 'Crossing', 'Finishing',
       'HeadingAccuracy', 'ShortPassing', 'Volleys', 'Dribbling', 'Curve',
       'FKAccuracy', 'LongPassing', 'BallControl', 'Acceleration',
       'SprintSpeed', 'Agility', 'Reactions', 'Balance', 'ShotPower',
       'Jumping', 'Stamina', 'Strength', 'LongShots', 'Aggression',
       'Interceptions', 'Positioning', 'Vision', 'Penalties', 'Composure',
       'Marking', 'StandingTackle', 'SlidingTackle', 'GKDiving', 'GKHandling',
       'GKKicking', 'GKPositioning', 'GKReflexes'],
      dtype='object')

In [283]:
#corr = st.spearmanr(fifa_dataset, fifa_dataset['Potential'])
#print(corr)