In [1]:
# imports

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()
import pandas as pd
import numpy as np
import re

In [2]:
# Load the player dataset; the first CSV column is used as the DataFrame index.
football = pd.read_csv(filepath_or_buffer='./football_data.csv', index_col=0)

In [3]:
# Print each column's name, non-null count and dtype for the raw frame.
football.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18207 entries, 0 to 18206
Data columns (total 88 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   ID                        18207 non-null  int64  
 1   Name                      18207 non-null  object 
 2   Age                       18207 non-null  int64  
 3   Photo                     18207 non-null  object 
 4   Nationality               18207 non-null  object 
 5   Flag                      18207 non-null  object 
 6   Overall                   18207 non-null  int64  
 7   Potential                 18207 non-null  int64  
 8   Club                      17966 non-null  object 
 9   Club Logo                 18207 non-null  object 
 10  Value                     18207 non-null  object 
 11  Wage                      18207 non-null  object 
 12  Special                   18207 non-null  int64  
 13  Preferred Foot            18159 non-null  object 
 14  Intern

### Processing the features of interest.

#### Value. Convert to a number by stripping the leading currency symbol and the trailing K/M suffix. The result is expressed in thousands, so values with an M suffix are multiplied by 1000.

In [4]:
def _value_to_thousands(amount):
    """Convert one raw ``Value`` entry to a float expressed in thousands.

    The first character and the trailing scale letter are stripped (entries
    are assumed to look like ``€110.5M`` / ``€60K`` -- TODO confirm against
    the CSV).  A ``K`` suffix keeps the number as is, an ``M`` suffix
    multiplies it by 1000, and any other ending (for example a bare ``€0``)
    maps to 0.0.  Non-string entries, i.e. missing values, become NaN.
    """
    if not isinstance(amount, str):
        return np.nan
    scale = amount[-1:]  # slice, so an empty string cannot raise IndexError
    if scale == 'K':
        return float(amount[1:-1])
    if scale == 'M':
        return float(amount[1:-1]) * 1000
    return 0.0


# Series.map keeps football's own index, so the new column lines up row by row
# even if the frame was filtered or re-ordered before this cell runs.
football['processed_value'] = football['Value'].map(_value_to_thousands)

#### Similarly, release clause

In [5]:
def _release_clause_to_thousands(amount):
    """Convert one raw ``Release Clause`` entry to a float in thousands.

    Non-string entries (missing values) become NaN.  For strings, the first
    character and the trailing scale letter are stripped: a ``K`` suffix
    keeps the number as is, an ``M`` suffix multiplies it by 1000, and any
    other ending maps to 0.0.
    """
    if not isinstance(amount, str):
        return np.nan
    scale = amount[-1:]  # slice, so an empty string cannot raise IndexError
    if scale == 'K':
        return float(amount[1:-1])
    if scale == 'M':
        return float(amount[1:-1]) * 1000
    return 0.0


# Series.map keeps football's own index, so the new column lines up row by row
# even if the frame was filtered or re-ordered before this cell runs.
football['processed_release_clause'] = football['Release Clause'].map(
    _release_clause_to_thousands
)

#### Wage. Convert to numerical by extracting number out of it. Normalize the number properly based on M/K.

In [6]:
def _wage_to_thousands(amount):
    """Convert one raw ``Wage`` entry to a float expressed in thousands.

    Uses the same rule as the ``Value`` and ``Release Clause`` columns: the
    first character and the trailing scale letter are stripped, a ``K``
    suffix keeps the number as is, an ``M`` suffix multiplies it by 1000, and
    any other ending (for example a two-character zero amount) maps to 0.0.
    Non-string entries, i.e. missing values, become NaN.
    """
    if not isinstance(amount, str):
        return np.nan
    scale = amount[-1:]  # slice, so an empty string cannot raise IndexError
    if scale == 'K':
        return float(amount[1:-1])
    if scale == 'M':
        # Previously the suffix was ignored, so an M wage was read 1000x too small.
        return float(amount[1:-1]) * 1000
    return 0.0


# Series.map keeps football's own index, so the new column lines up row by row
# even if the frame was filtered or re-ordered before this cell runs.
football['processed_wage'] = football['Wage'].map(_wage_to_thousands)

#### Joined. Convert data to only year format

In [7]:
# The year is taken from the last four characters of each date string;
# non-string entries (missing values) become NaN.  Series.map keeps
# football's own index, so the result stays aligned row by row instead of
# relying on the frame having a default 0..n-1 index.
football['processed_joining'] = football['Joined'].map(
    lambda date: float(date[-4:]) if isinstance(date, str) else np.nan
)

#### Contract Valid Until. Convert data to only year format

In [8]:
# The year is taken from the last four characters of each entry;
# non-string entries (missing values) become NaN.  Series.map keeps
# football's own index, so the result stays aligned row by row instead of
# relying on the frame having a default 0..n-1 index.
football['processed_contract_valid'] = football['Contract Valid Until'].map(
    lambda date: float(date[-4:]) if isinstance(date, str) else np.nan
)

#### Height. Convert from feet and inches to centimetres.

In [9]:
def process_height(value):
    """Convert a ``feet'inches`` height string to centimetres.

    Parameters
    ----------
    value : str or float
        Text such as ``5'7``; a float is treated as a missing entry.

    Returns
    -------
    float
        Height in centimetres, or NaN when ``value`` is a float.
    """
    if isinstance(value, float):
        return np.nan
    pieces = value.split('\'')
    feet = float(pieces[0])
    inches = float(pieces[1])
    # 12 inches per foot, 2.54 centimetres per inch.
    total_inches = inches + feet * 12
    return total_inches * 2.54

# Run the height converter on every entry of the Height column.
football['processed_height'] = football['Height'].map(process_height)

#### Weight. Convert to a numeric value by removing the 'lbs' unit.

In [11]:
# Drop the last three characters (the unit suffix) and parse the rest as a
# float; non-string entries (missing values) become NaN.  Series.map keeps
# football's own index, so the result stays aligned row by row instead of
# relying on the frame having a default 0..n-1 index.
football['processed_weight'] = football['Weight'].map(
    lambda weight: float(weight[:-3]) if isinstance(weight, str) else np.nan
)

#### Convert sport-specific features written as `90+2` to numeric features holding only the base value `90`.

In [13]:
def process_column(value):
    """Return the part of a ``base+bonus`` string before the first '+' as a float.

    A float input is treated as a missing entry and yields NaN.  Text without
    a '+' is converted in full.
    """
    if isinstance(value, float):
        return np.nan
    base, _sep, _bonus = value.partition('+')
    return float(base)
# Columns at positions 27..52 are converted; the slice is taken once, before
# any new column is added inside the loop.
# NOTE(review): positional selection depends on the CSV column order -- confirm.
columns_to_convert = football.columns[27:53]
for column in columns_to_convert:
    converted = football[column].map(process_column)
    football['processed_%s' % column] = converted

In [16]:
# Step 1: keep only the columns with a numeric dtype.
num_data = football.select_dtypes(include=np.number)
# Fill every missing entry with the mean of its own column.
data = num_data.fillna(value=num_data.mean())

In [17]:
# Print the columns of the imputed numeric frame with non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18207 entries, 0 to 18206
Data columns (total 76 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   ID                        18207 non-null  int64  
 1   Age                       18207 non-null  int64  
 2   Overall                   18207 non-null  int64  
 3   Potential                 18207 non-null  int64  
 4   Special                   18207 non-null  int64  
 5   International Reputation  18207 non-null  float64
 6   Weak Foot                 18207 non-null  float64
 7   Skill Moves               18207 non-null  float64
 8   Jersey Number             18207 non-null  float64
 9   Crossing                  18207 non-null  float64
 10  Finishing                 18207 non-null  float64
 11  HeadingAccuracy           18207 non-null  float64
 12  ShortPassing              18207 non-null  float64
 13  Volleys                   18207 non-null  float64
 14  Dribbl

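## Finally, we have converted the string-encoded attributes to numeric ones, so we can now use 76 numeric attributes (the originally numeric columns plus the derived `processed_*` columns), compared with 88 columns in the raw data. This should improve the quality of our clustering.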