## Imports

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

## Data

In [None]:
# Read the track-feature table into the DataFrame `data`. The path is relative,
# so the CSV has to be in the working directory the notebook is run from.
file_path = 'SpotifyFeatures.csv'
data = pd.read_csv(file_path)
# Last expression of the cell: previews the first rows of the table.
data.head()

## Data preprocessing

### Remove outliers

In [None]:
# Remove outliers: keep only tracks shorter than 500 000 ms with loudness
# above -30. `.copy()` makes the result an independent frame, so the column
# assignments below do not write into a slice of the loaded table (which is
# what triggers pandas' SettingWithCopyWarning).
data = data[(data['duration_ms'] < 500000) & (data['loudness'] > -30)].copy()

# Add genre count column: number of rows that share the track's track_id.
# Computed before 'A Capella' is dropped, so those rows still count here.
data['genre_count'] = data.groupby('track_id')['genre'].transform('count')

# Drop the genre 'A Capella' due to its low track count
data = data[data['genre'] != 'A Capella'].copy()

# Merge the two spellings of the children's-music genre (typographic vs.
# plain apostrophe) into a single label
data['genre'] = data['genre'].replace('Children’s Music', "Children's Music")

# Convert time signature to int: keep the part before the slash
# (e.g. '4/4' -> 4). Without the cast the column would stay a string.
data['time_signature'] = data['time_signature'].str.split('/').str[0].astype(int)

# Convert the mode column to 1 for major and 0 for minor.
# A single map avoids the two chained replace() calls; any value other than
# 'Minor'/'Major' becomes NaN instead of being passed through silently.
data['mode'] = data['mode'].map({'Minor': 0, 'Major': 1})

In [None]:
# Pitch-class number for each note name, counted in semitones upward from C.
# Enharmonic spellings (e.g. 'C#' and 'Db') share a number. Defined once at
# module level so the table is not rebuilt for every row it is applied to.
_PITCH_TO_NUMBER = {
    'C': 0,
    'C#': 1, 'Db': 1,
    'D': 2,
    'D#': 3, 'Eb': 3,
    'E': 4, 'Fb': 4,
    'E#': 5, 'F': 5,
    'F#': 6, 'Gb': 6,
    'G': 7,
    'G#': 8, 'Ab': 8,
    'A': 9,
    'A#': 10, 'Bb': 10,
    'B': 11, 'Cb': 11,
}


def pitch_to_number(pitch):
    """Convert a note name to its pitch-class number.

    Args:
        pitch: Note name such as 'C', 'F#' or 'Bb'. The lookup is
            case-sensitive.

    Returns:
        An int from 0 ('C') to 11 ('B'), or None when the name is not in
        the table.
    """
    return _PITCH_TO_NUMBER.get(pitch)

# Swap each note name in the 'key' column for its pitch-class number;
# names the lookup does not know come back as missing values.
data['key'] = data['key'].map(pitch_to_number)

In [None]:
# Group the tracks by genre and take the mean popularity of each group.
# `mean_popularity` is a Series indexed by genre name.
grouped = data.groupby('genre')
mean_popularity = grouped['popularity'].mean()

# Label a single row as 'bop' or 'flop'
def label_popularity(row, mean_popularity):
    """Classify one track against the mean popularity of its genre.

    Args:
        row: Mapping-like record with 'popularity' and 'genre' entries.
        mean_popularity: Mapping from genre to that genre's mean popularity.

    Returns:
        'bop' when the track's popularity is at or above its genre's mean,
        otherwise 'flop'.
    """
    popularity = row['popularity']
    threshold = mean_popularity[row['genre']]
    return 'bop' if popularity >= threshold else 'flop'

# Initial label for every row: 'bop' when the track's popularity is at or
# above the mean of its genre, otherwise 'flop'. This is the same rule as
# label_popularity, evaluated on whole columns at once instead of calling
# the function once per row, which is much slower on a large table.
genre_mean = data['genre'].map(mean_popularity)
data['popularity_label'] = np.where(data['popularity'] >= genre_mean, 'bop', 'flop')

# Rebalance the labels so each genre is split as evenly as possible: the
# lower half by popularity becomes 'flop' and the upper half 'bop'. In a genre
# with an odd number of tracks the middle track keeps the label it already has.
#
# Both halves are cut from ONE ranking, so they can never overlap. Picking
# them independently with nlargest()/nsmallest() is not safe: both default to
# keep='first', so tracks tied at the median are selected by both calls, end
# up 'flop', and the genre is left unbalanced.
popularity_by_genre = data.groupby('genre')['popularity']
# method='first' breaks ties by row order, giving every track in a genre a
# distinct rank 1..count.
rank_in_genre = popularity_by_genre.rank(method='first')
genre_size = popularity_by_genre.transform('count')
half = genre_size // 2

data.loc[rank_in_genre <= half, 'popularity_label'] = 'flop'
data.loc[rank_in_genre > genre_size - half, 'popularity_label'] = 'bop'

data.head()  # Display the first few rows of the modified DataFrame

In [None]:
# Per-genre count of each label and the imbalance between them
# (number of 'bop' rows minus number of 'flop' rows).
# The label values are 'bop' and 'flop'; indexing the table with 'popular' /
# 'nonpopular' raises KeyError because no such columns exist.
size_difference = (
    data.groupby(['genre', 'popularity_label'])
    .size()
    .unstack(fill_value=0)
    # Guarantee both columns exist even if one label never occurs.
    .reindex(columns=['bop', 'flop'], fill_value=0)
)
size_difference['difference'] = size_difference['bop'] - size_difference['flop']
size_difference


In [None]:
# Genre overview: distinct genres and the number of tracks in each.
# Neither result is assigned or printed here.
data['genre'].unique()
data['genre'].value_counts()

# Flag tracks whose popularity is 50 or higher
data['popularity_over_50'] = data['popularity'].ge(50)
data['popularity_over_50'].value_counts()

# Stacked bar chart per genre: tracks below vs. at-or-above popularity 50
over_50_counts = data.groupby(['genre', 'popularity_over_50']).size().unstack()
over_50_counts.plot(kind='bar', stacked=True, figsize=(20, 10))

# Same chart, split by the bop/flop label instead of the fixed threshold
label_counts = data.groupby(['genre', 'popularity_label']).size().unstack()
label_counts.plot(kind='bar', stacked=True, figsize=(20, 10))

In [None]:
# Per-genre mean popularity as a plain {genre: mean} dict, displayed as the
# cell's output.
# NOTE(review): the means only set the initial labels. The 50/50 rebalancing
# step earlier in the notebook then relabels the top and bottom half of each
# genre by popularity order, so the mean is not necessarily the effective
# bop/flop boundary — confirm which one is meant to be reported here.
splitting_points = mean_popularity.to_dict()
splitting_points

## Model

In [None]:

# Train one hard-voting ensemble per genre and record how well it separates
# 'bop' from 'flop' on a held-out 20 % of that genre's tracks.
#
# NOTE: despite its name, `classifiers` maps genre -> validation accuracy (a
# float). The averaging cell that follows relies on that, so the fitted
# models themselves are not retained.
FEATURE_COLUMNS = [
    'acousticness', 'danceability', 'duration_ms', 'energy',
    'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo',
    'genre_count',
]
random_seed = 1337  # fixed so the splits and the models are reproducible

classifiers = {}

for genre in data['genre'].unique():
    # Rows of the current genre only
    genre_data = data[data['genre'] == genre]

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        genre_data[FEATURE_COLUMNS],
        genre_data['popularity_label'],
        test_size=0.2,
        random_state=random_seed,
    )

    # NOTE(review): the features are passed unscaled (duration_ms runs up to
    # 500 000 while most other columns are small); LogisticRegression and SVC
    # are sensitive to feature scale — consider standardizing for those two.
    classifier = VotingClassifier(
        estimators=[
            ('rf', RandomForestClassifier(n_estimators=50, random_state=random_seed)),
            ('gb', GradientBoostingClassifier(n_estimators=50, random_state=random_seed)),
            ('ab', AdaBoostClassifier(n_estimators=50, random_state=random_seed)),
            ('lr', LogisticRegression(random_state=random_seed)),
            # random_state only matters for SVC when probability=True; it is
            # passed here for uniformity with the other estimators.
            ('svc', SVC(random_state=random_seed)),
        ],
        voting='hard',
    )

    # Train the classifier
    classifier.fit(X_train, y_train)

    # Score each split once and reuse the value for both storing and printing
    train_accuracy = classifier.score(X_train, y_train)
    validation_accuracy = classifier.score(X_test, y_test)
    classifiers[genre] = validation_accuracy

    print("genre: ", genre, " Accuracy score (training): {0:.3f}".format(train_accuracy))
    print("genre: ", genre, " Accuracy score (validation): {0:.3f}".format(validation_accuracy))

# Display the genre -> validation accuracy table
classifiers

In [None]:
# Mean of the per-genre validation accuracies stored in `classifiers`
total = 0
for genre_score in classifiers.values():
    total += genre_score
avg = total / len(classifiers)
print(avg)


In [None]:
# Sweep the number of estimators used by the three tree-based ensembles and
# plot the mean per-genre validation accuracy for each setting.
# LogisticRegression and SVC take no n_estimators and are the same at every step.
FEATURE_COLUMNS = [
    'acousticness', 'danceability', 'duration_ms', 'energy',
    'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo',
    'genre_count',
]
random_seed = 1337
estimator_counts = range(1, 501, 50)  # 1, 51, 101, ..., 451

# The train/test split depends only on the genre and the seed, not on the
# estimator count, so build it once per genre instead of once per
# (estimator count, genre) pair.
genre_splits = {}
for genre in data['genre'].unique():
    genre_data = data[data['genre'] == genre]
    genre_splits[genre] = train_test_split(
        genre_data[FEATURE_COLUMNS],
        genre_data['popularity_label'],
        test_size=0.2,
        random_state=random_seed,
    )

plot_scores = []

for n_estimator in estimator_counts:
    # NOTE: this rebinds `classifiers` from the earlier training cell; after
    # the sweep it holds the per-genre scores of the LAST estimator count.
    classifiers = {}

    for genre, (X_train, X_test, y_train, y_test) in genre_splits.items():
        classifier = VotingClassifier(
            estimators=[
                ('rf', RandomForestClassifier(n_estimators=n_estimator, random_state=random_seed)),
                ('gb', GradientBoostingClassifier(n_estimators=n_estimator, random_state=random_seed)),
                ('ab', AdaBoostClassifier(n_estimators=n_estimator, random_state=random_seed)),
                ('lr', LogisticRegression(random_state=random_seed)),
                ('svc', SVC(random_state=random_seed)),
            ],
            voting='hard',
        )

        classifier.fit(X_train, y_train)
        classifiers[genre] = classifier.score(X_test, y_test)

    # Unweighted mean over genres: every genre counts equally, whatever its size
    avg_score = sum(classifiers.values()) / len(classifiers)
    plot_scores.append(avg_score)

plt.plot(estimator_counts, plot_scores)
plt.ylabel('Average Accuracy Score')
plt.xlabel('Number of Estimators')
plt.show()