In [None]:
# Suppress FutureWarning messages for the rest of the session so they do not
# clutter the cell outputs.
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Load the two intermediate datasets produced by the data preparation and
# merging notebook: the hit-song table and the large non-hit song table.
HIT_SONGS_CSV = "unique_overlapping_hit_songs.csv"
NON_HIT_SONGS_CSV = "non_hit_songs_2mil.csv"

df_hits = pd.read_csv(HIT_SONGS_CSV)
df_2m = pd.read_csv(NON_HIT_SONGS_CSV)

## Distributions of hit songs

In [1]:
# Correlation heatmap of the hit-song columns.
# The song/artist identifier columns and the 'hit_song' label are dropped first,
# so only the remaining columns enter corr().
non_feature_cols = ['song', 'artist', 'song_artist', 'hit_song']
num_data = df_hits.drop(columns=non_feature_cols)
correlation_matrix = num_data.corr()

heat_fig, heat_ax = plt.subplots(figsize=(10, 6))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap="coolwarm",
    linewidths=.4,
    annot_kws={"size": 8},
    ax=heat_ax,
)
heat_ax.set_title('Correlation Matrix')
plt.show()

NameError: name 'df_hits' is not defined

### Distributions of each audio feature & release year

##### Hit Songs

In [None]:
# Box plot of every audio feature (plus release year) for the hit songs.
# Panels are filled row by row on a 5x3 grid; 14 of the 15 panels are used,
# so the bottom-right one stays empty.
hit_feature_colors = {
    "danceability": '#f7eaab',
    "energy": '#f2df9c',
    "loudness": '#edd38e',
    "mode": '#e8c880',
    "speechiness": '#e3bc73',
    "acousticness": '#dfb066',
    "instrumentalness": '#daa45a',
    "liveness": '#d6984f',
    "valence": '#d28c44',
    "tempo": '#cd803a',
    "duration_ms": '#c97331',
    "time_signature": '#c46629',
    "key": '#bf5922',
    "year": '#ba4b1c',
}

fig, axes = plt.subplots(5, 3, figsize=(8, 10))

# axes.flat walks the grid in row-major order, matching the dict order above
for panel, (feature, shade) in zip(axes.flat, hit_feature_colors.items()):
    sns.boxplot(df_hits, y=feature, color=shade, ax=panel)

fig.tight_layout()

In [None]:
# Histogram of release years for the hit songs (df_hits)
hits_year_fig, hits_year_ax = plt.subplots(figsize=(10, 3))
hits_year_ax.hist(df_hits['year'], bins=30, color='skyblue', edgecolor='black')
hits_year_ax.set_title('Distribution of Years in df_hits Dataset')
hits_year_ax.set_xlabel('Year')
hits_year_ax.set_ylabel('Frequency')
hits_year_ax.grid(True)
plt.show()

##### non-hit songs

In [None]:











# Release-year distribution of the hit songs with a KDE overlay
# (same plot as in progress report #1)
hit_songs_color = '#ba4b1c'
hit_kde_ax = sns.histplot(
    df_hits['year'],
    bins=30,
    kde=True,
    color=hit_songs_color,
    alpha=0.5,
    label='hit songs',
    line_kws={'color': hit_songs_color, 'lw': 4},
)
hit_kde_ax.set_title('distribution of release year - Billboard Top 100 hit song dataset')

# Histogram of release years for the non-hit songs (df_2m)
pool_year_fig, pool_year_ax = plt.subplots(figsize=(10, 3))
pool_year_ax.hist(df_2m['year'], bins=30, color='skyblue', edgecolor='black')
pool_year_ax.set_title('Distribution of Years in df_2m Dataset')
pool_year_ax.set_xlabel('Year')
pool_year_ax.set_ylabel('Frequency')
pool_year_ax.grid(True)
plt.show()

# The non-hit dataset contains few songs released before 1985. Judging by the
# histogram, 1985 is roughly where it starts to hold enough songs to sample from,
# so 1985 is used as the minimum year for both the hit and the non-hit datasets.

# Release-year distribution of the non-hit songs with a KDE overlay
# (same plot as in progress report #1)
non_hit_songs_color = '#1a6261'
non_hit_kde_ax = sns.histplot(
    df_2m['year'],
    bins=30,
    kde=True,
    color=non_hit_songs_color,
    alpha=0.5,
    label='non-hit songs',
    line_kws={'color': non_hit_songs_color, 'lw': 4},
)
non_hit_kde_ax.set_title('distribution of release year - 2.3 million song dataset')

# Keep only hit songs released in 1985 or later, then print the row count
# before and after the filter.
hit_from_1985 = df_hits['year'] >= 1985
df_hits_filt = df_hits.loc[hit_from_1985]
for frame in (df_hits, df_hits_filt):
    print(len(frame))

### Identify and remove outliers from the 'duration_ms' feature in the 'df_hits' dataset.

# IQR-based outlier removal on 'duration_ms' for the year-filtered hit songs.
hit_duration = df_hits_filt['duration_ms']

# First and third quartiles, and the interquartile range between them
Q1 = hit_duration.quantile(0.25)
Q3 = hit_duration.quantile(0.75)
IQR = Q3 - Q1

# Anything further than 1.5 * IQR outside the quartiles counts as an outlier
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

print(lower_bound,';', upper_bound)

# Rows strictly outside the bounds (kept in a variable for inspection)
hit_too_short = hit_duration < lower_bound
hit_too_long = hit_duration > upper_bound
outliers = df_hits_filt[hit_too_short | hit_too_long]

# Rows on or inside the bounds (between() is inclusive on both ends)
df_hits_cleaned = df_hits_filt[hit_duration.between(lower_bound, upper_bound)]

# Row counts before and after dropping the outliers
print(f"Number of rows before removing outliers: {len(df_hits_filt)}")
print(f"Number of rows after removing outliers: {len(df_hits_cleaned)}")

# Value range of each numeric column after cleaning
df_hits_cleaned.describe().loc[['min', 'max']]

# Persist the cleaned hit songs for later use
df_hits_cleaned.to_csv('16k_hits.csv', index=False)

# Histogram of release years for the cleaned hit songs (df_hits_cleaned)
cleaned_year_fig, cleaned_year_ax = plt.subplots(figsize=(10, 3))
cleaned_year_ax.hist(df_hits_cleaned['year'], bins=30, color='skyblue', edgecolor='black')
cleaned_year_ax.set_title('Distribution of Years in df_hits_cleaned Dataset')
cleaned_year_ax.set_xlabel('Year')
cleaned_year_ax.set_ylabel('Frequency')
cleaned_year_ax.grid(True)
plt.show()

### Distributions of 2 million random songs

# Box plot of every audio feature (plus release year) for the non-hit songs.
# Panels are filled row by row on a 5x3 grid; 14 of the 15 panels are used,
# so the bottom-right one stays empty.
non_hit_feature_colors = {
    "danceability": '#caf2e8',
    "energy": '#bbe5dc',
    "loudness": '#add9d0',
    "mode": '#9fccc4',
    "speechiness": '#90c0b8',
    "acousticness": '#82b4ac',
    "instrumentalness": '#74a7a1',
    "liveness": '#669b96',
    "valence": '#58908b',
    "tempo": '#4a8480',
    "duration_ms": '#3b7875',
    "time_signature": '#2c6d6b',
    "key": '#1a6261',
    "year": '#005757',
}

fig, axes = plt.subplots(5, 3, figsize=(8, 10))

# axes.flat walks the grid in row-major order, matching the dict order above
for panel, (feature, shade) in zip(axes.flat, non_hit_feature_colors.items()):
    sns.boxplot(df_2m, y=feature, color=shade, ax=panel)

fig.tight_layout()

# Minimum release year is 1985: judging by the histogram, that is roughly where
# the non-hit dataset starts to hold enough songs to sample from.
# Keep songs released in 1985 or later and print the row count before and after.
non_hit_from_1985 = df_2m['year'] >= 1985
df_2m_filt = df_2m.loc[non_hit_from_1985]
for frame in (df_2m, df_2m_filt):
    print(len(frame))

### Identify and remove outliers from the 'duration_ms' feature in the 'df_2m_filt' dataset.

# IQR-based outlier removal on 'duration_ms' for the year-filtered non-hit songs.
non_hit_duration = df_2m_filt['duration_ms']

# First and third quartiles, and the interquartile range between them
Q1 = non_hit_duration.quantile(0.25)
Q3 = non_hit_duration.quantile(0.75)
IQR = Q3 - Q1

# Anything further than 1.5 * IQR outside the quartiles counts as an outlier
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

print(lower_bound,';', upper_bound)

# Keep rows on or inside the bounds (between() is inclusive on both ends)
df_2m_cleaned = df_2m_filt[non_hit_duration.between(lower_bound, upper_bound)]

# Row counts before and after dropping the outliers
print(f"Number of rows before removing outliers: {len(df_2m_filt)}")
print(f"Number of rows after removing outliers: {len(df_2m_cleaned)}")

# Value range of each numeric column after cleaning
df_2m_cleaned.describe().loc[['min', 'max']]

#### Distribution of song years across the hits and non-hits datasets
Solution: assess the release-year distribution of the hit songs dataset, then use stratified sampling on the 2 million song dataset so that the non-hit sample reflects the same proportion of songs in each group of years.

# Release-year distribution of the cleaned hit songs, in 4 bins with a KDE overlay
cleaned_hit_color = '#ba4b1c'
cleaned_hit_ax = sns.histplot(
    df_hits_cleaned['year'],
    bins=4,
    kde=True,
    color=cleaned_hit_color,
    alpha=0.5,
    label='hit songs',
    line_kws={'color': cleaned_hit_color, 'lw': 4},
)
cleaned_hit_ax.set_title('distribution of release year - Billboard Top 100 hit song dataset')

# Release-year distribution of the cleaned non-hit songs, in 4 bins with a KDE overlay
cleaned_non_hit_color = '#1a6261'
cleaned_non_hit_ax = sns.histplot(
    df_2m_cleaned['year'],
    bins=4,
    kde=True,
    color=cleaned_non_hit_color,
    alpha=0.5,
    label='non-hit songs',
    line_kws={'color': cleaned_non_hit_color, 'lw': 4},
)
cleaned_non_hit_ax.set_title('distribution of release year - 2 million song dataset')

### Random selection of 15,373 songs from the 2 million song dataset of non-hit tracks (using the same distribution of years as the hits).

# Split the year-filtered hit songs into four release-year bins and report the
# share of songs that falls into each bin. These shares are the reference
# proportions for the stratified sampling of non-hit songs.
def _year_bin(frame, first_year, last_year):
    """Return the rows of `frame` whose 'year' lies in [first_year, last_year], inclusive."""
    return frame[(frame['year'] >= first_year) & (frame['year'] <= last_year)]


hits_1985_1994 = _year_bin(df_hits_filt, 1985, 1994)
hits_1995_2004 = _year_bin(df_hits_filt, 1995, 2004)
hits_2005_2014 = _year_bin(df_hits_filt, 2005, 2014)
hits_2015_2023 = _year_bin(df_hits_filt, 2015, 2023)

# Denominator is every year-filtered hit song; a row whose year is after 2023
# would fall outside all four bins, in which case the shares sum to less than 100.
total_songs = len(df_hits_filt)

percentage_hits_1985_1994 = (len(hits_1985_1994) / total_songs) * 100
percentage_hits_1995_2004 = (len(hits_1995_2004) / total_songs) * 100
percentage_hits_2005_2014 = (len(hits_2005_2014) / total_songs) * 100
percentage_hits_2015_2023 = (len(hits_2015_2023) / total_songs) * 100

# Fix: the second label previously read "1995 and 2024" although the bin is 1995-2004.
print(f"Percentage of songs between 1985 and 1994 in df_hits_filt: {percentage_hits_1985_1994:.2f}%")
print(f"Percentage of songs between 1995 and 2004 in df_hits_filt: {percentage_hits_1995_2004:.2f}%")
print(f"Percentage of songs between 2005 and 2014 in df_hits_filt: {percentage_hits_2005_2014:.2f}%")
print(f"Percentage of songs between 2015 and 2023 in df_hits_filt: {percentage_hits_2015_2023:.2f}%")

Analysis plan proposal: with 1985 as the minimum year, use the share of hit songs in each group of years as the reference proportions: 1985-1994 = 15.22%, 1995-2004 = 23.68%, 2005-2014 = 32.47%, and 2015-2023 = 28.64% of the number of songs.

(stratified sampling)

# Number of songs to draw from each release-year bin so that the bins keep the
# hit-song proportions: (bin percentage / 100) * number of cleaned hit songs.
# Fix: the division by 100 now happens inside round(). Previously the product was
# rounded first and then divided, which produced fractional "counts".
# NOTE(review): the percentages come from df_hits_filt while the target size is
# len(df_hits_cleaned) - confirm this mix is intended.
n_hits_cleaned = len(df_hits_cleaned)

num_songs_1985_1994 = round(percentage_hits_1985_1994 * n_hits_cleaned / 100)
num_songs_1995_2004 = round(percentage_hits_1995_2004 * n_hits_cleaned / 100)
num_songs_2005_2014 = round(percentage_hits_2005_2014 * n_hits_cleaned / 100)
num_songs_2015_2023 = round(percentage_hits_2015_2023 * n_hits_cleaned / 100)

print("1985-1994 songs count:", num_songs_1985_1994)
print("1995-2004 songs count:", num_songs_1995_2004)
print("2005-2014 songs count:", num_songs_2005_2014)
print("2015-2023 songs count:", num_songs_2015_2023)
# Each bin is rounded independently, so the total can differ slightly from the
# hit-song count printed on the line after it.
print("Total songs count:", num_songs_1985_1994 + num_songs_1995_2004 + num_songs_2005_2014 + num_songs_2015_2023)
print("Hit songs count:", n_hits_cleaned)

# Number of cleaned non-hit songs released from 1995 through 2004 (inclusive)
released_1995_2004 = df_2m_cleaned['year'].between(1995, 2004)
non_hits_1995_2004 = df_2m_cleaned[released_1995_2004]
len(non_hits_1995_2004)

# Random stratified sampling from the cleaned non-hit songs: for each release-year
# bin, keep the whole bin (non_hits_*) and a fixed-size seeded sample of it (rand_*).
# NOTE(review): the sample sizes are hard-coded and sum to 15,824 - confirm they
# still match the intended per-bin hit-song counts.
def _bin_and_sample(pool, first_year, last_year, n_songs):
    """Return (rows of `pool` with 'year' in [first_year, last_year], seeded sample of n_songs of them)."""
    year_bin = pool[pool['year'].between(first_year, last_year)]
    return year_bin, year_bin.sample(n=n_songs, random_state=42)


# Sample 2409 non-hits between 1985 and 1994
non_hits_1985_1994, rand_1985_1994 = _bin_and_sample(df_2m_cleaned, 1985, 1994, 2409)
print(len(rand_1985_1994))

# Sample 3747 non-hits between 1995 and 2004
non_hits_1995_2004, rand_1995_2004 = _bin_and_sample(df_2m_cleaned, 1995, 2004, 3747)
print(len(rand_1995_2004))

# Sample 5137 non-hits between 2005 and 2014
non_hits_2005_2014, rand_2005_2014 = _bin_and_sample(df_2m_cleaned, 2005, 2014, 5137)
print(len(rand_2005_2014))

# Sample 4531 non-hits between 2015 and 2023
non_hits_2015_2023, rand_2015_2023 = _bin_and_sample(df_2m_cleaned, 2015, 2023, 4531)
print(len(rand_2015_2023))

# Total number of sampled non-hit songs across the four bins
sampled_total = sum(
    len(part)
    for part in (rand_1985_1994, rand_1995_2004, rand_2005_2014, rand_2015_2023)
)
print(sampled_total)

# Stack the four per-bin samples into the single non-hit dataset
year_bin_samples = (rand_1985_1994, rand_1995_2004, rand_2005_2014, rand_2015_2023)
non_hits_16k = pd.concat(year_bin_samples)

# Column dtypes and non-null counts of the combined sample
non_hits_16k.info()

# Persist the sampled non-hit songs for later use
non_hits_16k.to_csv('16k_non_hits.csv', index=False)

# Release-year distribution of the stratified non-hit sample, in 4 bins with a KDE overlay
subset_color = '#1a6261'
subset_ax = sns.histplot(
    non_hits_16k['year'],
    bins=4,
    kde=True,
    color=subset_color,
    alpha=0.5,
    label='non-hit songs',
    line_kws={'color': subset_color, 'lw': 4},
)
subset_ax.set_title('distribution of release year - stratified subset from 2 million song dataset')