## Importing the libraries

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler

## Loading the data


In [2]:
df = pd.read_csv('data/FIFA22_official_data.csv')
df.head(2)

Unnamed: 0,ID,Name,Age,Photo,Nationality,Flag,Overall,Potential,Club,Club Logo,...,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Best Position,Best Overall Rating,Release Clause,DefensiveAwareness
0,212198,Bruno Fernandes,26,https://cdn.sofifa.com/players/212/198/22_60.png,Portugal,https://cdn.sofifa.com/flags/pt.png,88,89,Manchester United,https://cdn.sofifa.com/teams/11/30.png,...,65.0,12.0,14.0,15.0,8.0,14.0,CAM,88.0,€206.9M,72.0
1,209658,L. Goretzka,26,https://cdn.sofifa.com/players/209/658/22_60.png,Germany,https://cdn.sofifa.com/flags/de.png,87,88,FC Bayern München,https://cdn.sofifa.com/teams/21/30.png,...,77.0,13.0,8.0,15.0,11.0,9.0,CM,87.0,€160.4M,74.0


## Cleaning and transforming the data


In [26]:
# TODO AFTER APPLY THE ALGORITHMS:
# Keeping track of the players names so we can plot the names in the clusters
"""
player_names = pd.read_csv('player_names.csv')  
  
# Assigning clusters to players  
player_clusters = {name: cluster for name, cluster in zip(player_names, kmeans.labels_)} 

"""
player_names = df['Name']
player_names.to_csv('data/prepared_data/player_names.csv', index=False)  

In [3]:
def clean_data(df):
  # Drop unsued columns
  columns_to_drop = ['ID', 'Name', 'Photo', 'Nationality', 'Flag', 'Club', 'Club Logo', 'Special', 'Real Face', 'Position', 'Jersey Number', 'Joined', 'Loaned From', 'Contract Valid Until', 'Best Overall Rating']
  df_cleaned = df = df.drop(columns_to_drop, axis=1)

  # Convert types and conventions
  df_cleaned['Value'] = df_cleaned['Value'].str.replace('€', '').str.replace('M', '').str.replace('K', '').astype(float)
  df_cleaned['Wage'] = df_cleaned['Wage'].str.replace('€', '').str.replace('M', '').str.replace('K', '').astype(float)
  df_cleaned['Release Clause'] = df_cleaned['Release Clause'].str.replace('€', '').str.replace('M', '').str.replace('K', '').astype(float)
  df_cleaned['Weight'] = df_cleaned['Weight'].apply(weight_to_kg)
  df_cleaned['Height'] = df_cleaned['Height'].apply(height_to_cm)

  return df_cleaned

def weight_to_kg(s):
    if 'kg' in s:
        return float(s.replace('kg', ''))
    else:
        return float(s.strip('lbs')) * 0.453592

def height_to_cm(s):
    if 'cm' in s:
        return float(s.replace('cm', ''))
    else:
        # Convert feet'inches" to cm
        feet, inches = map(int, s.split("'"))
        return (feet * 30.48) + (inches * 2.54)

In [6]:
df_cleaned = clean_data(df)

# Preparing the data to apply the K-MEANS algorithm

In [30]:
def prepare_data_to_kmeans(df, save = True, position_name=None):
  df_numeric = df.select_dtypes(include=[np.number])

  df_numeric = df_numeric.fillna(df_numeric.mean())

  if position_name:  
    df_numeric.to_csv(f'data/cleaned_data/{position_name}_original.csv', index=False)

  scaler = MinMaxScaler(feature_range=(0,1))
  df_standard = scaler.fit_transform(df_numeric)

  df_kmeans = pd.DataFrame(df_standard, columns=df_numeric.columns)

  if save:
    df_kmeans.to_csv('data/prepared_data/fifa_data_to_kmeans2.csv')
  return df_kmeans

In [16]:
df_kmeans = prepare_data_to_kmeans(df_cleaned)

In [17]:
df_kmeans2.head(2)

Unnamed: 0,Height,Weight,Crossing,Finishing,HeadingAccuracy,ShortPassing,Volleys,Dribbling,Curve,FKAccuracy,...,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,DefensiveAwareness
0,0.470588,0.327869,0.91954,0.869565,0.670455,0.965116,0.965116,0.857143,0.920455,0.922222,...,0.892857,0.489126,0.777778,0.662921,0.122222,0.142857,0.152174,0.076087,0.146067,0.766667
1,0.666667,0.540984,0.781609,0.858696,0.920455,0.906977,0.755814,0.868132,0.795455,0.788889,...,0.833333,0.489126,0.911111,0.797753,0.133333,0.076923,0.152174,0.108696,0.089888,0.788889


## Creating a new k-means df for each position

In [38]:
def clean_data_2(df):
  # Drop unsued columns
  columns_to_drop = ['ID', 'Name', 'Photo', 'Nationality', 'Flag', 'Club', 'Club Logo', 'Special', 'Real Face', 'Position', 'Jersey Number', 'Joined', 'Loaned From', 'Contract Valid Until', 'Best Overall Rating'
                     ,'Age', 'Overall', 'Potential', 'Value', 'Wage', 'International Reputation', 'Weak Foot', 'Skill Moves', 'Release Clause']
  df_cleaned = df = df.drop(columns_to_drop, axis=1)

  # Convert types and conventions
  df_cleaned['Weight'] = df_cleaned['Weight'].apply(weight_to_kg)
  df_cleaned['Height'] = df_cleaned['Height'].apply(height_to_cm)

  return df_cleaned

In [34]:
def prepare_data_to_kmeans2(df, save = True, position_name=None):
  # Select only numeric columns
  df_numeric = df.select_dtypes(include=[np.number])

  # Fill any NaN values with the mean of the column
  df_numeric = df_numeric.fillna(df_numeric.mean())

  # Save the original data i
  df_numeric.to_csv(f'data/cleaned_data/{position_name}_original.csv', index=False)

  # Normalize the data
  scaler = MinMaxScaler(feature_range=(0,1))
  df_standard = scaler.fit_transform(df_numeric)

  df_kmeans = pd.DataFrame(df_standard, columns=df_numeric.columns)

  # Save and return the new df
  if save:
    df_kmeans.to_csv('data/prepared_data/fifa_data_to_kmeans2.csv')
  return df_kmeans

In [35]:
def clean_data_by_positions(df, positions, position_name):  
    # Extract the position after '>'  
    df['Position'] = df['Position'].apply(lambda x: x.split('>')[-1].strip() if isinstance(x, str) else x)  
      
    # Create a new dataframe for these positions  
    df_positions = df[df['Position'].isin(positions)]  

    # Save the player names to a csv file  
    player_names = df_positions['Name']  
    player_names.to_csv(f'data/prepared_data/by_position/kmeans/{position_name}_names.csv', index=False)  

    df_cleaned = clean_data(df_positions)  
  
    if df_cleaned.empty:  
        print(f"No data for positions {positions}. Skipping...")  
        return  
    
    df_kmeans = prepare_data_to_kmeans2(df_cleaned, False, position_name)  

    df_kmeans.to_csv(f'data/prepared_data/by_position/kmeans/{position_name}.csv', index=False)  

In [36]:
clean_data_by_positions(df, ['RCM', 'LCM', 'CM'], 'CM')  
clean_data_by_positions(df, ['LDM', 'RDM', 'CDM'], 'CDM')
clean_data_by_positions(df, ['RS', 'LS', 'ST'], 'ST')
clean_data_by_positions(df, ['CAM', 'RAM', 'LAM'], 'CAM')
clean_data_by_positions(df, ['LCB', 'RCB', 'CB'], 'CB')
clean_data_by_positions(df, ['CF', 'LF', 'RF'], 'CF')
clean_data_by_positions(df, ['RWB', 'RB'], 'RB')
clean_data_by_positions(df, ['LWB', 'LB'], 'LB')
clean_data_by_positions(df, ['GK'], 'GK')

In [29]:
def clean_data_by_positions2(df, positions, position_name):  
    # Extract the position after '>'  
    df['Position'] = df['Position'].apply(lambda x: x.split('>')[-1].strip() if isinstance(x, str) else x)  
      
    # Create a new dataframe for these positions  
    df_positions = df[df['Position'].isin(positions)]  

    # Save the player names to a csv file  
    player_names = df_positions['Name']  
    player_names.to_csv(f'data/prepared_data_2/by_position/kmeans/{position_name}_names.csv', index=False)  

    df_cleaned = clean_data_2(df_positions)  
  
    if df_cleaned.empty:  
        print(f"No data for positions {positions}. Skipping...")  
        return  
    
    df_kmeans = prepare_data_to_kmeans(df_cleaned, False)  

    df_kmeans.to_csv(f'data/prepared_data_2/by_position/kmeans/{position_name}.csv', index=False)  

In [27]:
clean_data_by_positions2(df, ['RCM', 'LCM', 'CM'], 'CM')  
clean_data_by_positions2(df, ['LDM', 'RDM', 'CDM'], 'CDM')
clean_data_by_positions2(df, ['RS', 'LS', 'ST'], 'ST')
clean_data_by_positions2(df, ['CAM', 'RAM', 'LAM'], 'CAM')
clean_data_by_positions2(df, ['LCB', 'RCB', 'CB'], 'CB')
clean_data_by_positions2(df, ['CF', 'LF', 'RF'], 'CF')
clean_data_by_positions2(df, ['RWB', 'RB'], 'RB')
clean_data_by_positions2(df, ['LWB', 'LB'], 'LB')
clean_data_by_positions2(df, ['GK'], 'GK')