In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Exploratory analysis of the audio-feature columns in the top-tracks CSV:
# load and clean the data, print a quick summary, then draw histograms, a
# correlation heatmap, boxplots, a pairplot and KDE curves.

# load data
df = pd.read_csv('data/top_tracks_audio_features.csv')

# drop rows that contain any missing value
df = df.dropna()


def _to_numeric_or_keep(column):
    """Return *column* converted to a numeric dtype, or unchanged if it cannot be parsed."""
    # Explicit fallback instead of pd.to_numeric(errors='ignore'), which is
    # deprecated in recent pandas releases.
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


# convert every column that parses cleanly to numeric; leave the rest as-is
df = df.apply(_to_numeric_or_keep)

# display first few rows of data
print(df.head())

# summary statistics
print(df.describe())

# histograms for audio features
# DataFrame.hist creates its own figure, so no plt.figure() call is needed
# here (one would only leave an empty extra figure behind).
df.hist(figsize=(12, 8), bins=15)
plt.tight_layout()
plt.suptitle('Histograms for Audio Features', y=1.02)
plt.show()

# correlation matrix of audio features
# Restrict to numeric/bool columns: DataFrame.corr() raises on non-numeric
# columns (e.g. text identifiers) in pandas >= 2.0.
numeric_df = df.select_dtypes(include=['number', 'bool'])
plt.figure(figsize=(10, 8))
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix of Audio Features')
plt.show()

# boxplots for each feature to identify outliers
features = ['danceability', 'energy', 'valence', 'tempo', 'acousticness']
plt.figure(figsize=(10, 6))
sns.boxplot(data=df[features])
plt.title('Boxplot of Audio Features')
plt.xticks(rotation=45)
plt.show()

# pairplot to see relationships between features
# sns.pairplot builds its own figure (a PairGrid), which becomes the current
# figure, so plt.suptitle below applies to it.
sns.pairplot(df[features])
plt.suptitle('Pairplot of Audio Features', y=1.02)
plt.show()

# kde plot for audio features distribution
plt.figure(figsize=(10, 6))
for feature in features:
    # `fill` replaces the deprecated `shade` keyword of sns.kdeplot
    sns.kdeplot(df[feature], fill=True, label=feature)
plt.title('KDE Plot for Audio Features')
plt.legend()
plt.show()