In [87]:
import pandas as pd
import numpy as np

In [88]:
## read the Dataset
# https://www.kaggle.com/code/muhammedafsalpm/eda-fifa-21

In [89]:
# Load FIFA21.csv into `data`; the relative path is resolved against the
# current working directory of the kernel.
data = pd.read_csv("FIFA21.csv")

In [90]:
#data

# Remove non-essential columns



In [91]:
# Columns to discard, kept under the "N" key of a lookup dict
drops = {"N": ['Name', "photoUrl", 'playerUrl', 'Joined', 'Loan Date End',
               'Club', 'Positions', 'LongName', 'Nationality']}

# Drop the listed columns, then keep only the 18 left-most remaining
# columns (positions 0 to 17)
data = data.drop(columns=drops['N']).iloc[:, :18]
print(data.columns)

Index(['Unnamed: 0', 'ID', 'Age', '↓OVA', 'POT', 'Contract', 'Height',
       'Weight', 'Preferred Foot', 'BOV', 'Best Position', 'Value', 'Wage',
       'Release Clause', 'Attacking', 'Crossing', 'Finishing',
       'Heading Accuracy'],
      dtype='object')


In [92]:
# Remove any non-numeric characters

In [93]:
#data

In [105]:
def _height_to_cm(height):
    """Convert one raw height value to centimetres.

    Strings containing both ' and " (e.g. 5'11") are read as feet and inches
    and converted with 1 inch = 2.54 cm. Any other string is read as a
    centimetre value with an optional 'cm' suffix (e.g. '180cm').

    Non-string values (NaN, or numbers left behind by an earlier run of this
    cell) are returned unchanged, so re-running the cell no longer raises
    TypeError. NOTE(review): this assumes numeric values are already in cm.
    """
    if not isinstance(height, str):
        return height
    if "'" in height and "\"" in height:
        # Split on the feet marker, then strip the closing inch marker
        feet, inches = height.split("'")
        inches = inches.replace("\"", "")
        total_inches = int(feet) * 12 + int(inches)
        return total_inches * 2.54
    # Already metric: drop the unit suffix and surrounding whitespace
    return int(height.replace("cm", "").strip())


converted_heights = [_height_to_cm(height) for height in data['Height']]
data['Height'] = converted_heights

In [106]:
LBS_TO_KG = 0.453592  # conversion factor applied to 'lbs' values


def _weight_to_kg(raw_weight):
    """Convert one raw weight value such as '159lbs' or '72kg' to kilograms.

    Non-string values (NaN, or numbers left behind by an earlier run of this
    cell) are returned unchanged, so the cell can be re-run safely.
    NOTE(review): this assumes numeric values are already in kg.

    Strings that name neither 'lbs' nor 'kg' yield NaN instead of silently
    reusing the weight computed for the previous row.
    """
    if not isinstance(raw_weight, str):
        return raw_weight
    if 'lbs' in raw_weight:
        factor = LBS_TO_KG
    elif 'kg' in raw_weight:
        factor = 1.0
    else:
        return np.nan
    # Keep digits and the decimal point so '72.5kg' is read as 72.5, not 725
    magnitude = ''.join(ch for ch in raw_weight if ch.isdigit() or ch == '.')
    return float(magnitude) * factor


converted_weights = [_weight_to_kg(i) for i in data['Weight']]

data['Weight'] = converted_weights

In [95]:
def clean_value(value):
    """Turn a money string such as '€2.5M' or '€560K' into a float.

    The '€' symbol is stripped. A string containing 'M' is scaled by one
    million, otherwise a string containing 'K' is scaled by one thousand,
    and anything else is converted as-is.
    """
    # 'M' is checked before 'K', mirroring the precedence of the suffixes
    for suffix, multiplier in (('M', 1_000_000), ('K', 1_000)):
        if suffix in value:
            digits = value.replace('€', '').replace(suffix, '')
            return float(digits) * multiplier
    # No recognised suffix: plain amount
    return float(value.replace('€', ''))

# Convert every entry of the 'Value' column from text to a numeric amount
data['Value'] = data['Value'].map(clean_value)

In [96]:
def clean_wage(value):
    """Turn a wage string such as '€1.5M' or '€220K' into a float.

    The '€' symbol is stripped. 'M' means millions and takes precedence over
    'K' (thousands); a string with neither suffix is converted as-is.
    """
    if 'M' in value:
        scale, suffix = 1_000_000, 'M'
    elif 'K' in value:
        scale, suffix = 1_000, 'K'
    else:
        # No recognised suffix: plain amount
        return float(value.replace('€', ''))
    number = float(value.replace('€', '').replace(suffix, ''))
    return number * scale

# Convert every entry of the 'Wage' column from text to a numeric amount
data['Wage'] = data['Wage'].map(clean_wage)

In [97]:
def clean_clause(value):
    """Turn a release-clause string such as '€1.5M' or '€560K' into a float.

    The '€' symbol is stripped. A string containing 'M' is scaled by one
    million, otherwise a string containing 'K' is scaled by one thousand.
    A string with neither suffix is parsed as a plain amount (so '€750'
    gives 750.0 rather than being discarded); if that plain amount cannot be
    parsed, 0.0 is returned as the fallback.

    Returns:
        float: the numeric amount, or 0.0 for unparseable suffix-less text.
    """
    if 'M' in value:
        return float(value.replace('€', '').replace('M', '')) * 1_000_000
    elif 'K' in value:
        return float(value.replace('€', '').replace('K', '')) * 1_000
    else:
        try:
            # Plain amount without a magnitude suffix, e.g. '€0' or '€750'
            return float(value.replace('€', ''))
        except ValueError:
            # Keep the lenient zero fallback for text that is not a number
            return 0.0

# Convert every entry of the 'Release Clause' column from text to a numeric amount
data['Release Clause'] = data['Release Clause'].map(clean_clause)

In [107]:
# Show the dtype of every column in `data`.
# NOTE(review): the recorded output lists `start`, `end` and `loan`, which are
# only created by a cell further down, so the cells were evidently run out of
# order. Re-check this output after Restart Kernel -> Run All.
data.dtypes

Unnamed: 0            int64
ID                    int64
Age                   int64
↓OVA                  int64
POT                   int64
Height              float64
Weight              float64
Preferred Foot        int64
BOV                   int64
Best Position         int64
Value               float64
Wage                float64
Release Clause      float64
Attacking             int64
Crossing              int64
Finishing             int64
Heading Accuracy      int64
start                 int32
end                   int32
loan                  int32
dtype: object

In [32]:
# Make the dataset friendly for one hot encoding

In [99]:
# Integer codes for the two foot labels; any other label becomes NaN under .map
FOOT_CODES = {'Left': 0, 'Right': 1}
data['Preferred Foot'] = data['Preferred Foot'].map(FOOT_CODES)

In [100]:
# Position labels listed in code order: each label's list index is its integer code
position_order = ['GK', 'ST', 'CF', 'LW', 'RW', 'CAM', 'CDM', 'CM',
                  'LM', 'RM', 'CB', 'LB', 'RB', 'RWB', 'LWB']
position_codes = {label: code for code, label in enumerate(position_order)}

# Labels missing from the mapping become NaN under .map
data['Best Position'] = data['Best Position'].map(position_codes)

In [80]:
# Split columns if required. 

In [101]:
# Add the three contract-derived columns with placeholder values:
# NaN for the two year columns, 0 for the loan flag
for column, placeholder in (('start', np.nan), ('end', np.nan), ('loan', 0)):
    data[column] = placeholder

# Function to process the Contract column
def process_contract(contract):
    """Split one contract string into a (start, end, loan) tuple of ints.

    Cases are checked in this order:
      * contains 'On Loan': the third-from-last whitespace-separated token
        is read as a year; returns (year, year + 2, 1).
      * contains '~': the two sides are read as years; returns (start, end, 0).
      * contains 'Free': returns (0, 0, 0).
      * otherwise the whole string is read as one year; returns
        (year, year + 2, 0).
    """
    if 'On Loan' in contract:
        tokens = contract.split()
        loan_year = int(tokens[-3])
        return loan_year, loan_year + 2, 1

    if '~' in contract:
        first, last = (int(piece) for piece in contract.split('~'))
        return first, last, 0

    if 'Free' in contract:
        # Placeholder zeros for entries marked 'Free'
        return 0, 0, 0

    # Stand-alone year
    single_year = int(contract.strip())
    return single_year, single_year + 2, 0

# Parse every contract string once and build a single frame from the resulting
# (start, end, loan) tuples. This avoids constructing one pd.Series per row
# inside apply(), which is slow, and names the columns explicitly instead of
# relying on positional assignment of columns 0/1/2.
contract_parts = pd.DataFrame(
    data['Contract'].map(process_contract).tolist(),
    index=data.index,
    columns=['start', 'end', 'loan'],
)

# Store each part as an integer column
for column in ['start', 'end', 'loan']:
    data[column] = contract_parts[column].astype(int)

In [103]:

# Remove the 'Contract' column from the frame
data = data.drop(columns=['Contract'])
