# Cleaning Player and Player Attributes Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# Load the raw Player and Player_Attributes CSV files into DataFrames.
player_df = pd.read_csv('../data/raw/Player.csv')
player_attr_df = pd.read_csv('../data/raw/Player_Attributes.csv')

In [None]:
# Report the (rows, columns) shape of the attributes table, then preview
# its first five rows.
print("Player Attributes dataset:", player_attr_df.shape, sep="\n")
player_attr_df.head(5)

In [None]:
# Parse the 'date' column into datetimes.
player_attr_df['date'] = pd.to_datetime(player_attr_df['date'])

# Convert the player ids to pandas' nullable integer dtype; values that cannot
# be parsed as numbers become <NA> instead of raising.
api_ids = pd.to_numeric(player_attr_df['player_api_id'], errors='coerce')
player_attr_df['player_api_id'] = api_ids.astype('Int64')

player_attr_df.head(3)

In [None]:
# Preview the last three rows of the attributes table.
player_attr_df.tail(3)

In [None]:
# Remove the two identifier columns that are not needed downstream.
player_attr_df = player_attr_df.drop(columns=['player_fifa_api_id', 'id'])
# NOTE: no (player_api_id, date) multi-index is set here; both remain
# ordinary columns and the frame keeps its default row index.
player_attr_df

In [None]:
# Count the missing values in every column and print the per-column totals.
num_nulls = player_attr_df.isnull().sum()
print(f"Number of NaNs in each column:\n{num_nulls}")

In [None]:
# Drop, in place, every row that has at least one missing value, then print
# a summary of the remaining columns and their dtypes.
player_attr_df.dropna(axis=0, how='any', inplace=True)
player_attr_df.info()

In [None]:
# Order the rows by player id and, within each player, chronologically by date.
# 'player_api_id' and 'date' are ordinary columns (no multi-index is set on this
# frame), so the rows must be sorted by value: sort_index(level=...) orders by
# the row index rather than by these columns.
player_attr_df = player_attr_df.sort_values(['player_api_id', 'date'])
player_attr_df

# Encoding categorical variables (3 total)

## preferred_foot
We mapped "right" to 1 and "left" to 0 for preferred_foot. This is a binary encoding (equivalent to one-hot encoding with one of the two columns dropped).

In [None]:
# Encode the preferred foot as a 0/1 code: 'left' -> 0, 'right' -> 1.
# Any other label has no entry in the mapping and therefore becomes NaN.
foot_codes = {'right': 1, 'left': 0}
player_attr_df['preferred_foot'] = player_attr_df['preferred_foot'].map(foot_codes)
player_attr_df


## attacking_work_rate
We replaced the invalid entries in 'attacking_work_rate' (any value other than "high", "medium", or "low") with "medium", because "medium" was the most frequent value, accounting for more than two thirds of the rows. The same replacement rule is applied to 'defensive_work_rate' below.

In [None]:
# Snapshot the raw labels and their frequencies before recoding
# (neither is used again in this cell).
raw_attacking = player_attr_df['attacking_work_rate']
unique_values = raw_attacking.unique()
value_counts = raw_attacking.value_counts()

# Normalise to strings, then collapse every label outside the three valid
# categories to 'medium'.
valid_rates = ['high', 'medium', 'low']
attacking = raw_attacking.astype(str)
attacking = attacking.where(attacking.isin(valid_rates), 'medium')

# Ordinal integer codes: low = 0, medium = 1, high = 2.
rate_codes = {'high': 2, 'medium': 1, 'low': 0}
player_attr_df['attacking_work_rate'] = attacking.map(rate_codes).astype(int)


In [None]:
# Raw labels before recoding (not used again in this cell).
unique_values = player_attr_df['defensive_work_rate'].unique()

# Normalise to strings and show the label frequencies before cleaning.
defensive = player_attr_df['defensive_work_rate'].astype(str)
print(defensive.value_counts())

# Collapse every label outside the three valid categories to 'medium'.
allowed = ['high', 'medium', 'low']
defensive = defensive.where(defensive.isin(allowed), 'medium')

# Ordinal integer codes: low = 0, medium = 1, high = 2.
codes = {'high': 2, 'medium': 1, 'low': 0}
player_attr_df['defensive_work_rate'] = defensive.map(codes).astype(int)

# Show the frequencies again after encoding, then display the frame.
print(player_attr_df['defensive_work_rate'].value_counts())
player_attr_df

In [None]:
# Report the (rows, columns) shape of the player table, drop its 'id'
# column in place, and preview the first five rows.
print("Player dataset:", player_df.shape, sep="\n")
player_df.drop(columns=['id'], inplace=True)
player_df.head(5)

In [None]:
# Clean the player table in a single chain:
#   - parse 'birthday' into datetimes and force 'player_name' to str,
#   - sort the rows by 'player_api_id',
#   - drop the 'player_fifa_api_id' column.
player_df = (
    player_df
    .assign(
        birthday=lambda frame: pd.to_datetime(frame['birthday']),
        player_name=lambda frame: frame['player_name'].astype(str),
    )
    .sort_values('player_api_id')
    .drop(columns=['player_fifa_api_id'])
)
player_df

In [None]:
# Attach each player's name, birthday, height and weight to the attribute rows.
# reset_index() first turns the current row index into an ordinary 'index'
# column. The right join keeps every player present in player_df, so a player
# with no matching attribute rows comes through with NaN attribute values.
player_info = player_df[['player_api_id', 'player_name', 'birthday', 'height', 'weight']]
player_attr_df = pd.merge(
    player_attr_df.reset_index(),
    player_info,
    how='right',
    on='player_api_id',
)

In [None]:
# Discard every row that still contains a missing value after the merge.
player_attr_df.dropna(inplace=True)
# Remove the helper 'index' column produced by reset_index() before the merge.
player_attr_df.drop(['index'], axis=1, inplace=True)

# Put the identifying columns first: the player id, followed by the four
# columns merged in from the player table; all other columns keep their
# current relative order. Selecting by name (instead of by position) gives the
# same layout while failing loudly with a KeyError if a column is missing,
# rather than silently moving the wrong columns.
leading_cols = ['player_api_id', 'player_name', 'birthday', 'height', 'weight']
remaining_cols = [col for col in player_attr_df.columns if col not in leading_cols]
cols = leading_cols + remaining_cols
player_attr_df = player_attr_df[cols]
player_attr_df

In [None]:
# Write the cleaned table to disk without the row index, then display it.
cleaned_csv_path = '../data/cleaned/player_attributes_cleaned.csv'
player_attr_df.to_csv(cleaned_csv_path, index=False)
player_attr_df