# **Data Cleaning and Preparation Script for the "Transfer_Dataset.raw.csv" dataset.**

In [4]:
import os
from pathlib import Path

import numpy as np
import pandas as pd

# Load Data
# The dataset folder can be overridden with the TRANSFER_DATA_DIR environment
# variable so the notebook also runs on other machines; when it is not set, the
# original hard-coded location is used.
DATA_DIR = Path(
    os.environ.get("TRANSFER_DATA_DIR", "C:/Users/User/ET6-CDSP-group-23-repo/1_datasets")
)
df = pd.read_csv(DATA_DIR / "Transfer_Dataset.raw.csv")

## ✅ Understand the Data

In [5]:
# Preview the first rows of the raw frame.
df.head()

Unnamed: 0,Player Name,Transfer Fee,Season,From (Club),To (Club),From (League),To (League),Position
0,Krystian Bielik,2.25,2014-2015,Legia Warszawa,Arsenal FC,Ekstraklasa,Premier League,Centre-Back
1,Dame NDoye,3.96,2014-2015,Loko Moskau,Hull City,Russian Premier League,Premier League,Centre-Forward
2,Andrew Robertson,3.6,2014-2015,Dundee United,Hull City,Scottish Premiership,Premier League,Left-Back
3,Harry Maguire,3.15,2014-2015,Sheff Utd,Hull City,Championship,Premier League,Centre-Back
4,Brian Lenihan,0.26,2014-2015,Cork City,Hull City,League of Ireland,Premier League,Right-Back


In [6]:
# Show each column's dtype and non-null count for the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 489 entries, 0 to 488
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Player Name    489 non-null    object
 1   Transfer Fee   489 non-null    object
 2   Season         489 non-null    object
 3   From (Club)    489 non-null    object
 4   To (Club)      489 non-null    object
 5   From (League)  475 non-null    object
 6   To (League)    489 non-null    object
 7   Position       489 non-null    object
dtypes: object(8)
memory usage: 30.7+ KB


In [7]:
# Summary statistics for the raw frame.
df.describe()

Unnamed: 0,Player Name,Transfer Fee,Season,From (Club),To (Club),From (League),To (League),Position
count,489,489,489,489,489,475,489,489
unique,474,253,10,239,47,91,1,14
top,Alfie Mawson,0,2022/23,FC Porto,Manchester City,Championship,Premier League,Centre-Forward
freq,2,61,77,14,35,116,489,91


## 🪓 Exclude unwanted entries

In [8]:
# Exclude seasons before the 2018-2019 season.
#
# The "Season" column mixes formats ("2014-2015", "2018/2019", "2022/23"), so a
# plain string comparison depends on the separator character: "-" sorts before
# "/", which means a row labelled "2018-2019" would compare lower than
# "2018/2019" and be dropped. Compare on the numeric start year instead.
MIN_SEASON_START_YEAR = 2018

season_start_year = pd.to_numeric(
    df["Season"].astype(str).str.extract(r"^\s*(\d{4})", expand=False),
    errors="coerce",
)

# Seasons without a leading 4-digit year become NaN and are excluded, because
# NaN >= MIN_SEASON_START_YEAR evaluates to False.
df_filtered = df[
    season_start_year >= MIN_SEASON_START_YEAR
].copy()  # .copy() to avoid SettingWithCopyWarning

print(f"\nDataFrame shape after filtering (Season >= '2018/2019'): {df_filtered.shape}")
print("Unique seasons after filtering:")
print(df_filtered["Season"].unique())

# Verify some rows
print("\nFirst 5 rows of the filtered DataFrame:")
df_filtered.head()


DataFrame shape after filtering (Season >= '2018/2019'): (246, 8)
Unique seasons after filtering:
['2018/2019' '2019-2020' '2020-2021' '2021-2022' '2022/23']

First 5 rows of the filtered DataFrame:


Unnamed: 0,Player Name,Transfer Fee,Season,From (Club),To (Club),From (League),To (League),Position
189,Ante Palaversa,6.3,2018/2019,Hajduk Split,Manchester City,Croatian First League,Premier League,Defensive Midfield
190,Philippe Sandler,2.5,2018/2019,PEC Zwolle,Manchester City,Eredivisie,Premier League,Centre-Back
191,Ko Itakura,1.1,2018/2019,Kawasaki Front.,Manchester City,J1 League,Premier League,Centre-Back
192,Daniel Arzani,0.89,2018/2019,Melbourne City,Manchester City,A-League,Premier League,Right Winger
193,Fred,59.0,2018/2019,Shakhtar D.,Manchester United,Ukrainian Premier League,Premier League,Central Midfield


## 🧹 Handle Missing Values

In [9]:
# Count the missing values in each column
df_filtered.isna().sum()

Player Name      0
Transfer Fee     0
Season           0
From (Club)      0
To (Club)        0
From (League)    5
To (League)      0
Position         0
dtype: int64

In [10]:
# Convert the "-" placeholder into a proper missing value (NaN)
df_filtered = df_filtered.replace(to_replace="-", value=np.nan)

In [11]:
# Remove every row that has at least one missing value
df_filtered = df_filtered.dropna(axis=0, how="any")

In [12]:
# Confirm that no missing values remain in any column
df_filtered.isna().sum()

Player Name      0
Transfer Fee     0
Season           0
From (Club)      0
To (Club)        0
From (League)    0
To (League)      0
Position         0
dtype: int64

## 🧼 Fix Data Types

In [13]:
# Statistical summary of the "Transfer Fee" column before type conversion
df_filtered.loc[:, "Transfer Fee"].describe()

count     238
unique    141
top         0
freq       37
Name: Transfer Fee, dtype: object

In [14]:
# "Transfer Fee is treated as String so convert it to Numerical"
fee_raw = df_filtered["Transfer Fee"]
fee_numeric = pd.to_numeric(fee_raw, errors="coerce")

# errors="coerce" turns unparseable text into NaN. Rows with missing values were
# already dropped before this step, so a NaN created here would otherwise pass
# silently into the saved file. Fail loudly and list the offending values instead.
unparseable = fee_raw[fee_numeric.isna() & fee_raw.notna()]
assert unparseable.empty, (
    f"Non-numeric 'Transfer Fee' values found: {unparseable.unique().tolist()}"
)

df_filtered["Transfer Fee"] = fee_numeric

In [15]:
# Check that "Transfer Fee" now has a numeric dtype
df_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Index: 238 entries, 189 to 487
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Player Name    238 non-null    object 
 1   Transfer Fee   238 non-null    float64
 2   Season         238 non-null    object 
 3   From (Club)    238 non-null    object 
 4   To (Club)      238 non-null    object 
 5   From (League)  238 non-null    object 
 6   To (League)    238 non-null    object 
 7   Position       238 non-null    object 
dtypes: float64(1), object(7)
memory usage: 16.7+ KB


In [16]:
# Statistical summary of "Transfer Fee" again, after the type conversion
df_filtered.loc[:, "Transfer Fee"].describe()

count    238.000000
mean      12.284693
std       16.413628
min        0.000000
25%        1.525000
50%        7.650000
75%       16.420000
max      121.000000
Name: Transfer Fee, dtype: float64

In [22]:
# Save the cleaned version in a new file
# The output folder can be overridden with the TRANSFER_DATA_DIR environment
# variable; when it is not set, the original hard-coded location is used.
DATA_DIR = Path(
    os.environ.get("TRANSFER_DATA_DIR", "C:/Users/User/ET6-CDSP-group-23-repo/1_datasets")
)
df_filtered.to_csv(DATA_DIR / "Transfer_Dataset.cleaned.csv", index=False)