# Exploratory Data Analysis for Cleaned Player Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

In [None]:
# Load the cleaned player-attributes CSV and move the listed columns into the
# row MultiIndex, leaving the remaining columns as the frame's data columns.
player_cleaned_df = pd.read_csv(
    '../data/cleaned/player_attributes_cleaned.csv'
).set_index(
    ['player_api_id', 'player_name', 'birthday', 'height', 'weight', 'date']
)
# Bare expression so the notebook renders a preview of the frame.
player_cleaned_df

In [None]:
# Histogram of the 'birthday' index level.
# Values that cannot be parsed as dates become NaT (errors='coerce') and are
# therefore left out of the histogram rather than raising.
# NOTE(review): the index also carries 'date', so a player with several rows
# is presumably counted once per row, not once per player -- confirm intent.
birthdays = pd.to_datetime(
    player_cleaned_df.index.get_level_values('birthday'),
    errors='coerce',
)
plt.figure(figsize=(10, 6))
pd.Series(birthdays).hist(bins=50)
plt.gca().set(
    title='Histogram of Player Birthdays',
    xlabel='Birthday',
    ylabel='Count',
)
plt.show()

In [None]:
# Histogram of the 'date' index level, split into 12 bins.
# Values that cannot be parsed as dates become NaT (errors='coerce') and are
# therefore left out of the histogram rather than raising.
dates = pd.to_datetime(
    player_cleaned_df.index.get_level_values('date'),
    errors='coerce',
)
plt.figure(figsize=(10, 6))
pd.Series(dates).hist(bins=12)
plt.gca().set(
    title='Histogram of Player Dates',
    xlabel='Date',
    ylabel='Count',
)
plt.show()

In [None]:
# Distribution of the 'potential' column: 25-bin histogram with a KDE curve
# overlaid. histplot returns the Axes it drew on, so the title and axis labels
# are applied to that Axes directly.
plt.figure(figsize=(10, 6))
sns.histplot(
    player_cleaned_df['potential'],
    bins=25,
    kde=True,
).set(
    title='Histogram and KDE of Potential',
    xlabel='Player Potential',
    ylabel='Count',
)
plt.show()

In [None]:
# Pairwise correlation between the numeric columns.
# numeric_only=True is required for the "numeric columns only" intent:
# pandas >= 2.0 no longer drops non-numeric columns silently and raises instead.
corr = player_cleaned_df.corr(numeric_only=True)

# Create the figure at its final size BEFORE drawing. Seaborn decides which
# tick labels to keep based on the axes size at draw time, so resizing
# afterwards is what caused labels to go missing.
plt.figure(figsize=(20, 15))

# xticklabels/yticklabels=True forces every feature name to be drawn instead of
# letting seaborn thin them out, replacing the manual tick workaround.
sns.heatmap(
    corr,
    annot=True,
    cmap='coolwarm',
    fmt='.2f',
    xticklabels=True,
    yticklabels=True,
)
plt.title('Player Dataset: Correlation Matrix Between Standardized Numeric Features')

# Keep row labels horizontal; slant column labels so long names do not overlap.
plt.yticks(rotation=0)
plt.xticks(rotation=45, ha='right', rotation_mode='anchor')

# Run the layout pass last so it accounts for the rotated tick labels.
plt.tight_layout()
plt.show()