# EDA

#### Imports

In [52]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor

#### Load the data

In [53]:
# Load the song metadata; the path is relative to the notebook's working directory.
df = pd.read_csv('../popular_songs_metadata.csv')

#### Looking at the Data

In [54]:
# (rows, columns) of the loaded frame
df.shape

(10000, 7)

In [55]:
# Preview the first five rows
df.head()

Unnamed: 0,user,song,title,play_count,release,artist_name,year
0,b493eda768d25cfb48dec2567b0826cdb9f6eef4,SOCBSZW12AB01891C1,XRDS,256,Ritual Noise,Covenant,0
1,d50046ceb4db11dc162649a0da27733ff5194b47,SOAAAGQ12A8C1420C8,Orgelblut,1,Dolores,Bohren & Der Club Of Gore,2008
2,1e93ab57208a575ad87872ff3a25c1743632d0a1,SOAAAGQ12A8C1420C8,Orgelblut,1,Dolores,Bohren & Der Club Of Gore,2008
3,a0846981ab7c3d06cf1d966e7109774e306eb61a,SOAAAGQ12A8C1420C8,Orgelblut,1,Dolores,Bohren & Der Club Of Gore,2008
4,6e8d4e7d986e077d334da41638e63b9030710141,SOAAAGQ12A8C1420C8,Orgelblut,1,Dolores,Bohren & Der Club Of Gore,2008


In [None]:
# Summary statistics (by default pandas reports the numeric columns only)
df.describe()

In [None]:
# Column dtypes and non-null counts
df.info()

In [None]:
# Number of missing (NaN) values per column
df.isnull().sum()

In [None]:
# Peek at the first ten distinct song titles
df['title'].unique()[:10]

#### Graphing the numerical data

In [None]:
# Count the distinct songs for each release year.
# 'song' holds string IDs, so .sum() would concatenate the IDs rather than count
# them and leave nothing numeric to plot. The frame has one row per (user, song)
# pair, so nunique() is used to count each song once; switch to .count() if the
# number of listening records per year is wanted instead.
# Note: at this point year == 0 is still present, so it shows up as its own bar.
songs_per_year = df.groupby('year')['song'].nunique()

fig, ax = plt.subplots(figsize=(15, 10))
songs_per_year.plot(kind='bar', ax=ax)
ax.set_xlabel('Year')
ax.set_ylabel('Total Songs')
ax.set_title('Total Songs by Year')
plt.show()

#### Checking for zero values

In [None]:
# Per-column tally of cells that are exactly equal to 0
zero_values_count = df.eq(0).sum()
zero_values_count

#### Imputing Values

In [None]:
# A year of 0 is treated as "unknown", so mark it as missing.
# Assign the result back instead of using a chained `inplace=True`, which is
# deprecated and has no effect on `df` under pandas Copy-on-Write.
df['year'] = df['year'].replace(0, np.nan)

# IterativeImputer predicts each incomplete column from the *other* columns.
# Fitted on 'year' alone it has nothing to regress on and silently falls back to
# the initial mean fill, never using the estimator. Give it predictors:
# integer-coded artist and release. The codes are arbitrary, but a tree-based
# model can still split on them.
impute_features = pd.DataFrame(
    {
        'artist_name': pd.factorize(df['artist_name'])[0],
        'release': pd.factorize(df['release'])[0],
        'year': df['year'].to_numpy(),
    },
    index=df.index,
)

# Initialize the IterativeImputer with a RandomForestRegressor estimator.
# skip_complete=True avoids fitting models for the predictor columns, which have
# no missing values.
imputer = IterativeImputer(
    estimator=RandomForestRegressor(),
    max_iter=10,
    random_state=0,
    skip_complete=True,
)
imputed = imputer.fit_transform(impute_features)

# Fill the imputed values back into the DataFrame ('year' is the last column)
df['year'] = imputed[:, -1].round().astype(int)

# Check if the imputation is done
df['year'].isnull().sum()


#### Outliers

In [None]:
# IQR-based outlier detection helper
def detect_outliers_iqr(data, column_name):
    """Return the rows of ``data`` whose ``column_name`` value is an IQR outlier.

    A value is flagged when it lies strictly below ``Q1 - 1.5 * IQR`` or strictly
    above ``Q3 + 1.5 * IQR``, where Q1/Q3 are the 25th/75th percentiles of the
    column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.
    column_name : str
        Name of the numeric column to test.

    Returns
    -------
    pandas.DataFrame
        The flagged rows, with their original index and all columns.
    """
    values = data[column_name]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    fence = 1.5 * (q3 - q1)

    too_low = values.lt(q1 - fence)
    too_high = values.gt(q3 + fence)
    return data.loc[too_low | too_high]

# Run the IQR check on both numeric columns
outliers_play_count = detect_outliers_iqr(df, 'play_count')
outliers_year = detect_outliers_iqr(df, 'year')

# Summarise how many rows were flagged for each column
outliers_summary = dict(
    play_count_outliers=len(outliers_play_count),
    year_outliers=len(outliers_year),
)

outliers_summary

#### Plotting the outliers

In [None]:
# Use seaborn's white-grid look for the figures below
sns.set(style="whitegrid")

# One box plot per numeric column; points beyond the whiskers are the outliers
for column, heading in (
    ('play_count', 'Outliers in Play Count'),
    ('year', 'Outliers in Year'),
):
    plt.figure(figsize=(12, 6))
    sns.boxplot(x=df[column])
    plt.title(heading)
    plt.show()

- It's normal for some songs to be older and released in earlier years, so we will not remove these outliers, as they are meaningful to the data.

- As for the play count, only one record stands out, with a play count above 250, which can be considered normal for a popular song.

#### Label Encoding

In [None]:
# Categorical columns for label encoding
features = ['user', 'song', 'title', 'release', 'artist_name']

# Fit a separate LabelEncoder per column and keep each one. Re-fitting a single
# shared encoder would overwrite its learned classes on every iteration, leaving
# only the last column's mapping and making the other columns impossible to
# decode with inverse_transform.
label_encoders = {}
for col in features:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Display a preview of the encoded DataFrame rather than the whole frame
df.head()
