# **Data Cleaning and Preperation for "player_stats.raw.csv" dataset.**

In [2]:
# 1. Import Libraries
import pandas as pd

In [3]:

#  2. Load Dataset
file_path = "/workspaces/ET6-CDSP-group-23-repo/1_datasets/player_stats_data.raw.csv"  # the path of the data
df = pd.read_csv(file_path)


In [4]:
# 3. Remove duplicate rows
df = df.drop_duplicates()

In [5]:
#  4. Drop Useless or Redundant Columns
#  'URL' is not useful for analysis
df.drop(columns=['URL'], inplace=True)

In [10]:
# 5. Ensure Height is float and in meters (already the case)
df['Height'] = pd.to_numeric(df['Height'], errors='coerce')

In [12]:
# 6. Clean market value - remove € and convert to numeric (in millions)
df['Market value'] = df['Market value'].astype(str).str.replace('€', '', regex=False).str.replace('M', '', regex=False)
df['Market value'] = pd.to_numeric(df['Market value'], errors='coerce')

In [13]:
# 7. Convert percentage columns to numeric (divide by 100)
percentage_cols = [col for col in df.columns if '%' in col]
for col in percentage_cols:
    df[col] = df[col].str.replace('%', '').astype(float) / 100
    df = df.rename(columns={col: col.replace(' %', '')})

In [14]:
# 8. Clean numeric columns with commas (like "1,000" to 1000)
numeric_cols = df.select_dtypes(include=['object']).columns
for col in numeric_cols:
    df[col] = df[col].str.replace(',', '').astype(float, errors='ignore')

In [15]:
# 9. Fill missing numeric values with 0 (or you could use mean/median)
numeric_cols = df.select_dtypes(include=['number']).columns
for col in numeric_cols:
    df[col].fillna(df[col].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col].fillna(df[col].mean(), inplace=True)


In [16]:
# 10. Standardize date format in "Birth Date" column
df['Birth Date'] = pd.to_datetime(df['Birth Date'], errors='coerce')

In [17]:
# 11. Clean preferred foot - standardize values
df['Preferred foot'] = df['Preferred foot'].str.capitalize()

In [18]:
#  5. Rename Columns (Optional for easier access)
df.rename(columns=lambda x: x.strip().lower().replace(' ', '_').replace('-', '_'), inplace=True)

In [19]:
# 12. Final Inspection
print("Cleaned DataFrame Shape:", df.shape)
print("Columns:", df.columns.tolist())
print("Remaining Nulls:\n", df.isnull().sum()[df.isnull().sum() > 0])

Cleaned DataFrame Shape: (197, 52)
Columns: ['birthday', 'birth_date', 'country', 'defending___aerial_duels_won', 'defending___aerial_duels_won', 'defending___blocked', 'defending___dribbled_past', 'defending___duels_won', 'defending___duels_won', 'defending___fouls_committed', 'defending___interceptions', 'defending___penalties_conceded', 'defending___possession_won_final_3rd', 'defending___recoveries', 'defending___tackles_won', 'defending___tackles_won', 'discipline___red_cards', 'discipline___yellow_cards', 'height', 'market_value', 'passing___accurate_long_balls', 'passing___assists', 'passing___chances_created', 'passing___cross_accuracy', 'passing___expected_assists_(xa)', 'passing___long_ball_accuracy', 'passing___pass_accuracy', 'passing___successful_crosses', 'passing___successful_passes', 'possession___dispossessed', 'possession___dribble_success', 'possession___fouls_won', 'possession___penalties_awarded', 'possession___successful_dribbles', 'possession___touches', 'possess

In [35]:
# Save cleaned data to new CSV file
df.to_csv('player_stats_data_clean.csv', index=False)

print("Data cleaning complete. Cleaned data saved to 'player_stats_data_clean.csv'")

Data cleaning complete. Cleaned data saved to 'player_stats_data_clean.csv'
