# Predicting Top Genre Given A Year

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


### Clean and Merge Spotify-Sentiment Dataset with Billboard Dataset

In [2]:
# Columns of the Billboard file that hold one weight per genre.
genre_weight_cols = [
    'Pop', 'Rock', 'R&B', 'Hip Hop', 'Soul', 'Country', 'Folk', 'Disco',
    'Motown', 'Metal', 'Funk', 'Blues', 'Alternative', 'Electronic', 'Jazz',
]

billboard_df = (
    pd.read_csv('../data/compressed/genre_year_weights.csv')
    # top_genre = label of the genre column with the largest value in each row
    .assign(top_genre=lambda frame: frame[genre_weight_cols].idxmax(axis=1))
    # give the year column the same name as in the song-level file so it can be the join key
    .rename(columns={'Hot100 Ranking Year': 'year'})
    # keep only rows up to and including 2019
    .loc[lambda frame: frame['year'] <= 2019]
)

spotify_sentiment_df = pd.read_csv('../data/processed/lyrics_features_clusters.csv')

# Left join: every song row is kept; songs whose year has no Billboard row
# get NaN in the Billboard columns (including top_genre).
merged_df = spotify_sentiment_df.merge(billboard_df, how='left', on='year')

spotify_sentiment_df.head()



Unnamed: 0,title,artist,year,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,...,positive,surprise,trust,anger,disgust,fear,negative,sadness,cluster,Cluster Name
0,Can I Live,JAY-Z,1996,0.628,0.692,-12.365,0.437,0.0823,0.0,0.161,...,29.0,8.0,17.0,12.0,9.0,12.0,22.0,10.0,Rap_cluster_2,Energetic Thrilling Rap
1,Money On My Mind,Lil Wayne,2005,0.535,0.772,-6.503,0.37,0.0127,0.0,0.11,...,44.0,34.0,36.0,39.0,13.0,12.0,22.0,10.0,Rap_cluster_2,Energetic Thrilling Rap
2,Mr. Carter,Lil Wayne,2008,0.485,0.71,-6.288,0.364,0.0444,0.0,0.35,...,16.0,11.0,11.0,11.0,12.0,19.0,32.0,22.0,Rap_cluster_2,Energetic Thrilling Rap
3,C.R.E.A.M.,Wu-Tang Clan,1994,0.479,0.549,-10.551,0.373,0.57,0.0239,0.127,...,37.0,21.0,20.0,27.0,9.0,14.0,26.0,9.0,Hip Hop_cluster_2,Danceable Menacing Hip Hop
4,Barry Bonds,Kanye West,2007,0.48,0.624,-6.131,0.382,0.0451,0.0,0.337,...,28.0,4.0,10.0,23.0,13.0,12.0,26.0,7.0,Rap_cluster_2,Energetic Thrilling Rap


### Feature Engineering

Feature engineering is the process of creating new features (columns) from our data to make the model more effective and accurate at learning patterns.


Example: ratios that help distinguish upbeat from aggressive music, or interaction terms such as energy × danceability that help predict a specific genre.

In [3]:

# --- Genre-flavoured products -------------------------------------------------
# NOTE(review): loudness is negative in the sample rows displayed above, so
# (1 - loudness) is larger than 1 there — confirm this is the intended scale.
merged_df['lofi-ness'] = (
    merged_df['acousticness']
    .mul(1 - merged_df['energy'])
    .mul(1 - merged_df['loudness'])
)
merged_df['pop_factor'] = (
    merged_df['danceability'].mul(merged_df['valence']).mul(merged_df['positive'])
)
merged_df['rock_factor'] = (
    merged_df['energy'].mul(merged_df['loudness']).mul(merged_df['instrumentalness'])
)

# --- Time features ------------------------------------------------------------
# decade is the year rounded down to a multiple of 10; is_2010s is a 0/1 flag.
merged_df['decade'] = merged_df['year'].floordiv(10).mul(10)
merged_df['is_2010s'] = merged_df['decade'].eq(2010).astype(int)

# --- Squared terms for the key attributes ------------------------------------
for base_col in ('energy', 'speechiness', 'danceability', 'loudness', 'fear', 'anger'):
    merged_df[f'{base_col}_squared'] = merged_df[base_col] ** 2

# --- Products and ratios ------------------------------------------------------
# The 0.01 added to each denominator avoids dividing by exactly zero when the
# denominator column is 0.
merged_df['audio_intensity'] = merged_df['energy'].mul(merged_df['loudness'])
merged_df['vocal_ratio'] = merged_df['speechiness'].div(merged_df['instrumentalness'] + 0.01)
merged_df['emotional_ratio'] = merged_df['joy'].div(merged_df['sadness'] + 0.01)
merged_df['volume_ratio'] = merged_df['loudness'].div(merged_df['acousticness'] + 0.01)

# --- Composite mood / style indicators ---------------------------------------
# NOTE(review): this reads a 'positivity' column while pop_factor above reads
# 'positive' — confirm both columns exist and are meant to be different.
merged_df['emotional_intensity'] = (
    merged_df['joy']
    .add(merged_df['fear'])
    .add(merged_df['sadness'])
    .add(merged_df['positivity'])
)
merged_df['upbeatness'] = (
    merged_df['danceability'].mul(merged_df['valence']).mul(merged_df['joy'])
)
merged_df['aggressiveness'] = (
    merged_df['anger'].mul(merged_df['energy']).add(1 - merged_df['acousticness'])
)
merged_df['smooth_hip_hop_indicator'] = (
    merged_df['danceability'].mul(merged_df['speechiness']).mul(merged_df['trust'])
)
merged_df['rap_indicator'] = (
    merged_df['danceability'].mul(merged_df['instrumentalness']).mul(merged_df['fear'])
)
merged_df['craziness'] = (
    merged_df['energy'].mul(merged_df['tempo']).mul(merged_df['loudness'])
)




### Grouping Genres

In [4]:
def group_genres(genre):
    """Map a single genre label to the name of its coarse genre family.

    Parameters
    ----------
    genre : object
        The genre label to look up; matching is exact (case-sensitive).

    Returns
    -------
    str or None
        The family name when the label is listed in one of the families,
        otherwise None.
    """
    families = (
        ("Rock/Metal", ('Rock', 'Metal', 'Alternative')),
        ("Rap/Hip Hop/R&B/Soul", ('Hip Hop', 'Rap', 'R&B', 'Soul')),
        ("Pop/Dance/Electronic", ('Pop', 'Dance', 'Electronic')),
        (
            "Disco/Funk/Jazz/Blues/Motown/Folk/Country",
            ('Disco', 'Funk', 'Jazz', 'Blues', 'Motown', 'Folk', 'Country'),
        ),
    )
    # Families are checked in order; the first one containing the label wins.
    for family_name, members in families:
        if genre in members:
            return family_name
    # Unlisted labels fall through without a family.
    return None

    
# Collapse each song's mapped genre into its coarse family, then check the class
# balance. When splitting into train and test sets, stratify on grouped_genres
# (so both parts keep the same family proportions), because it is the most
# balanced of the label columns.
genre_families = merged_df['mapped_genres'].apply(group_genres)
merged_df['grouped_genres'] = genre_families
merged_df['grouped_genres'].value_counts()

grouped_genres
Rock/Metal                                   12285
Pop/Dance/Electronic                          7688
Rap/Hip Hop/R&B/Soul                          5556
Disco/Funk/Jazz/Blues/Motown/Folk/Country     2042
Name: count, dtype: int64

### Split, Train, and Test Data


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows; stratifying on grouped_genres keeps the genre-family
# proportions the same in the training and test parts.
train_df, test_df = train_test_split(
    merged_df,
    test_size=0.1,
    random_state=42,
    stratify=merged_df['grouped_genres'],
)

# One-hot encode top_genre on each part, then force the test frame onto the
# training columns: dummy columns missing from the test part are added as 0 and
# columns that exist only in the test part are dropped.
one_hot_encoded_train = pd.get_dummies(train_df, columns=['top_genre'])
one_hot_encoded_test = pd.get_dummies(test_df, columns=['top_genre']).reindex(
    columns=one_hot_encoded_train.columns,
    fill_value=0,
)

# Model inputs: the numeric features followed by every top_genre_* dummy column.
numeric_feature_names = [
    'audio_intensity', 'vocal_ratio', 'energy_squared', 'emotional_intensity',
    'upbeatness', 'aggressiveness', 'decade', 'is_2010s', 'lofi-ness',
    'pop_factor', 'rock_factor', 'danceability', 'energy', 'instrumentalness',
    'tempo', 'anger', 'disgust', 'loudness', 'speechiness', 'acousticness',
    'valence', 'joy', 'positivity',
]
top_genre_dummy_names = [
    name for name in one_hot_encoded_train.columns if name.startswith('top_genre_')
]
features = numeric_feature_names + top_genre_dummy_names

x_train, x_test = one_hot_encoded_train[features], one_hot_encoded_test[features]

### Prediction Model Level 1 - Clusters

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

# Target for this level: the 'Cluster Name' label of each song.
y_train = one_hot_encoded_train['Cluster Name']
y_test = one_hot_encoded_test['Cluster Name']

## Logistic Regression Model
# Both column groups are derived from x_train so they always match the feature
# set chosen at split time (no second hand-maintained list to keep in sync).
categorical_features = [col for col in x_train.columns if col.startswith('top_genre_')]
num_features = [col for col in x_train.columns if not col.startswith('top_genre_')]

# Standardise the numeric columns; the one-hot top_genre_* columns pass through as-is.
columnTransform = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_features),
        ('cat', 'passthrough', categorical_features),
    ]
)
pipeline = Pipeline(
    steps=[
        ('transformer', columnTransform),
        ('model', LogisticRegression(max_iter=3000)),
    ]
)

pipeline.fit(x_train, y_train)
y_test_pred = pipeline.predict(x_test)
y_train_pred = pipeline.predict(x_train)

## Random Forest Classifier
# Fitted on the unscaled features.
model = RandomForestClassifier(
    n_estimators=500,      # number of decision trees
    max_depth=10,          # maximum depth each tree may grow to
    min_samples_split=20,
    min_samples_leaf=10,
    max_features='sqrt',
    max_samples=0.8,
    random_state=42,
)

model.fit(x_train, y_train)
y_test_pred_r = model.predict(x_test)
y_train_pred_r = model.predict(x_train)

# Evaluation Metrics
# (Feature importances are printed for the later targets, not here.)
# Arguments follow the documented (y_true, y_pred) order; accuracy is symmetric,
# but metrics such as precision and recall are not.
train_acc = accuracy_score(y_train, y_train_pred)
test_acc = accuracy_score(y_test, y_test_pred)
print("Logistic Regression Training Accuracy:", train_acc)
print("Logistic Regression Test Accuracy:", test_acc)

train_acc_r = accuracy_score(y_train, y_train_pred_r)
test_acc_r = accuracy_score(y_test, y_test_pred_r)
print("Random Forest Training Accuracy:", train_acc_r)
print("Random Forest Test Accuracy:", test_acc_r)




### Prediction Model Level 2 - Standard Genres

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

# Target for this level: the 'mapped_genres' label of each song.
y_train = one_hot_encoded_train['mapped_genres']
y_test = one_hot_encoded_test['mapped_genres']

## Logistic Regression Model
# Both column groups are derived from x_train so they always match the feature
# set chosen at split time (no second hand-maintained list to keep in sync).
categorical_features = [col for col in x_train.columns if col.startswith('top_genre_')]
num_features = [col for col in x_train.columns if not col.startswith('top_genre_')]

# Standardise the numeric columns; the one-hot top_genre_* columns pass through as-is.
columnTransform = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_features),
        ('cat', 'passthrough', categorical_features),
    ]
)
pipeline = Pipeline(
    steps=[
        ('transformer', columnTransform),
        ('model', LogisticRegression(max_iter=3000)),
    ]
)

pipeline.fit(x_train, y_train)
y_test_pred_s = pipeline.predict(x_test)
y_train_pred_s = pipeline.predict(x_train)

## Random Forest Classifier
# Fitted on the unscaled features.
model = RandomForestClassifier(
    n_estimators=500,      # number of decision trees
    max_depth=10,          # maximum depth each tree may grow to
    min_samples_split=20,
    min_samples_leaf=10,
    max_features='sqrt',
    max_samples=0.8,
    random_state=42,
)

model.fit(x_train, y_train)
y_test_pred_r = model.predict(x_test)
y_train_pred_r = model.predict(x_train)

# Evaluation Metrics
# Random-forest feature importances, largest first.
feature_importances = model.feature_importances_
importance_df = pd.DataFrame({'Feature': x_train.columns, 'Importance': feature_importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)
print("Important features:\n", importance_df.head(20))
print("Least important features:\n", importance_df.tail(20))

# Arguments follow the documented (y_true, y_pred) order; accuracy is symmetric,
# but metrics such as precision and recall are not.
train_acc = accuracy_score(y_train, y_train_pred_s)
test_acc = accuracy_score(y_test, y_test_pred_s)
print("Logistic Regression Training Accuracy:", train_acc)
print("Logistic Regression Test Accuracy:", test_acc)

train_acc_r = accuracy_score(y_train, y_train_pred_r)
test_acc_r = accuracy_score(y_test, y_test_pred_r)
print("Random Forest Training Accuracy:", train_acc_r)
print("Random Forest Test Accuracy:", test_acc_r)




### Prediction Model Level 3 - Grouped Genres

In [14]:
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
import xgboost as xgb

# Target for this level: the 'grouped_genres' family label of each song.
y_train = one_hot_encoded_train['grouped_genres']
y_test = one_hot_encoded_test['grouped_genres']

## Logistic Regression Model
# Both column groups are derived from x_train so they always match the feature
# set chosen at split time (no second hand-maintained list to keep in sync).
categorical_features = [col for col in x_train.columns if col.startswith('top_genre_')]
num_features = [col for col in x_train.columns if not col.startswith('top_genre_')]

# Standardise the numeric columns; the one-hot top_genre_* columns pass through as-is.
columnTransform = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_features),
        ('cat', 'passthrough', categorical_features),
    ]
)
pipeline = Pipeline(
    steps=[
        ('transformer', columnTransform),
        ('model', LogisticRegression(max_iter=3000)),
    ]
)

pipeline.fit(x_train, y_train)
y_test_pred_g = pipeline.predict(x_test)
y_train_pred_g = pipeline.predict(x_train)

## Random Forest Classifier
# Fitted on the unscaled features.
model = RandomForestClassifier(
    n_estimators=600,      # number of decision trees
    max_depth=10,          # maximum depth each tree may grow to
    min_samples_split=20,
    min_samples_leaf=10,
    max_features='sqrt',
    max_samples=0.8,
    random_state=42,
)

model.fit(x_train, y_train)
y_test_pred_r = model.predict(x_test)
y_train_pred_r = model.predict(x_train)

## XGBoost Classifier
# The string labels are converted to integer class ids for XGBoost. The encoder
# is fitted on the training labels only and then reused for the test labels.
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.transform(y_test)

xgb_model = xgb.XGBClassifier(
    n_estimators=500,
    max_depth=6,
    learning_rate=0.01,
    subsample=0.8,
    random_state=42,
)
xgb_model.fit(x_train, y_train_encoded)
y_test_pred_x = xgb_model.predict(x_test)
y_train_pred_x = xgb_model.predict(x_train)

# Evaluation Metrics
# Random-forest feature importances, largest first.
feature_importances = model.feature_importances_
importance_df = pd.DataFrame({'Feature': x_train.columns, 'Importance': feature_importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)
print("Important features:\n", importance_df.head(20))
print("Least important features:\n", importance_df.tail(20))

# Arguments follow the documented (y_true, y_pred) order; accuracy is symmetric,
# but metrics such as precision and recall are not.
train_acc = accuracy_score(y_train, y_train_pred_g)
test_acc = accuracy_score(y_test, y_test_pred_g)
print("Logistic Regression Training Accuracy:", train_acc)
print("Logistic Regression Test Accuracy:", test_acc)

train_acc_r = accuracy_score(y_train, y_train_pred_r)
test_acc_r = accuracy_score(y_test, y_test_pred_r)
print("Random Forest Training Accuracy:", train_acc_r)
print("Random Forest Test Accuracy:", test_acc_r)

# XGBoost predicts the integer class ids, so compare against the encoded labels.
train_acc_x = accuracy_score(y_train_encoded, y_train_pred_x)
test_acc_x = accuracy_score(y_test_encoded, y_test_pred_x)
print("XGBoost Training Accuracy:", train_acc_x)
print("XGBoost Test Accuracy:", test_acc_x)




Important features:
                 Feature  Importance
1           vocal_ratio    0.110407
11         danceability    0.109802
18          speechiness    0.101191
6                decade    0.078735
8             lofi-ness    0.069502
7              is_2010s    0.056779
19         acousticness    0.052715
12               energy    0.038266
9            pop_factor    0.037527
2        energy_squared    0.035509
15                anger    0.033898
5        aggressiveness    0.033312
4            upbeatness    0.025808
23        top_genre_Pop    0.023426
10          rock_factor    0.021917
25       top_genre_Rock    0.021366
3   emotional_intensity    0.020594
16              disgust    0.020555
13     instrumentalness    0.019242
20              valence    0.016823
Least important features:
                 Feature  Importance
19         acousticness    0.052715
12               energy    0.038266
9            pop_factor    0.037527
2        energy_squared    0.035509
15              

### Accuracy

### Evaluation
MSE, MAE, Precision, Recall

### Visualizations & Conclusion