# Predicting Top Genre Given A Year

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


### Clean and Merge Spotify-Sentiment Dataset with Billboard Dataset

In [2]:
billboard_df = pd.read_csv('../data/compressed/genre_year_weights.csv')
spotify_sentiment_df = pd.read_csv('../data/processed/lyrics_features_clusters.csv')

billboard_df['top_genre'] = billboard_df[['Pop', 'Rock', 'R&B', 'Hip Hop', 'Soul', 'Country','Folk', 'Disco', 'Motown', 'Metal', 'Funk', 'Blues', 'Alternative', 'Electronic', 'Jazz']].idxmax(axis=1)

billboard_df.rename(columns={'Hot100 Ranking Year':'year'}, inplace=True)


merged_df = pd.merge(spotify_sentiment_df, billboard_df[['year','top_genre']], on='year', how='left')

merged_df.head()


Unnamed: 0,title,artist,year,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,...,positive,surprise,trust,anger,disgust,fear,negative,sadness,cluster,top_genre
0,Can I Live,JAY-Z,1996,0.628,0.692,-12.365,0.437,0.0823,0.0,0.161,...,29.0,8.0,17.0,12.0,9.0,12.0,22.0,10.0,Rap_cluster_2,Pop
1,Money On My Mind,Lil Wayne,2005,0.535,0.772,-6.503,0.37,0.0127,0.0,0.11,...,44.0,34.0,36.0,39.0,13.0,12.0,22.0,10.0,Rap_cluster_2,Pop
2,Mr. Carter,Lil Wayne,2008,0.485,0.71,-6.288,0.364,0.0444,0.0,0.35,...,16.0,11.0,11.0,11.0,12.0,19.0,32.0,22.0,Rap_cluster_2,Pop
3,C.R.E.A.M.,Wu-Tang Clan,1994,0.479,0.549,-10.551,0.373,0.57,0.0239,0.127,...,37.0,21.0,20.0,27.0,9.0,14.0,26.0,9.0,Hip Hop_cluster_1,Pop
4,Barry Bonds,Kanye West,2007,0.48,0.624,-6.131,0.382,0.0451,0.0,0.337,...,28.0,4.0,10.0,23.0,13.0,12.0,26.0,7.0,Rap_cluster_2,Pop


### Feature Engineering

Process of creating new features like columns from our data to make the model for effective and accurate at learning patterns 


example -> ratios for distinguishing upbeat vs aggressive music or like energy x danceability predicting a specific genre

In [3]:

merged_df['lofi-ness'] = merged_df['acousticness'] * (1 - merged_df['energy']) * (1 - merged_df['loudness'])
merged_df['pop_factor']= merged_df['danceability'] * merged_df['valence'] * merged_df['positive']
merged_df['rock_factor'] = merged_df['energy'] * merged_df['loudness'] * merged_df['instrumentalness']

merged_df['top_genre'].unique()



array(['Pop', 'R&B', 'Rock'], dtype=object)

### Split, Train, and Test Data

Training data could be -> 1960 - 2010

Testing data could be -> 2011 - 2019

In [4]:

train_df = merged_df[merged_df['year'] <= 2018]
test_df = merged_df[merged_df['year'] > 2018]

one_hot_encoded_train = pd.get_dummies(train_df, columns=['top_genre'])
one_hot_encoded_test = pd.get_dummies(test_df, columns=['top_genre'])
one_hot_encoded_test = one_hot_encoded_test.reindex(columns=one_hot_encoded_train.columns, fill_value=0)
features = ['danceability', 'lofi-ness', 'pop_factor', 'rock_factor', 'positive', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'anticipation', 'joy', 'surprise', 'trust', 'anger', 'disgust', 'fear', 'sadness'] + [col for col in one_hot_encoded_train.columns if col.startswith('top_genre_')]

x_train = one_hot_encoded_train[features]
y_train = one_hot_encoded_train['cluster']

x_test = one_hot_encoded_test[features]
y_test = one_hot_encoded_test['cluster']

In [5]:
x_train.columns

Index(['danceability', 'lofi-ness', 'pop_factor', 'rock_factor', 'positive',
       'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness',
       'liveness', 'valence', 'tempo', 'anticipation', 'joy', 'surprise',
       'trust', 'anger', 'disgust', 'fear', 'sadness', 'top_genre_Pop',
       'top_genre_R&B', 'top_genre_Rock'],
      dtype='object')

### Logistic Regression

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

categorical_features = [col for col in x_train.columns if col.startswith('top_genre_')]
num_features = ['danceability', 'lofi-ness', 'pop_factor', 'rock_factor', 'positive', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'anticipation', 'joy', 'surprise', 'trust', 'anger', 'disgust', 'fear', 'sadness']
columnTransform = ColumnTransformer([('num', StandardScaler(), num_features), ('cat', 'passthrough', categorical_features)])

pipeline = Pipeline([('transformer', columnTransform), ('model', LogisticRegression(max_iter=3000))])

pipeline.fit(x_train, y_train)
y_test_pred = pipeline.predict(x_test)
y_train_pred = pipeline.predict(x_train)



In [None]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(
    n_estimators=300,
    max_depth=15,
    min_samples_split=10,
    min_samples_leaf=5,
    random_state=42
)

model.fit(x_train, y_train)
y_test_pred_r = model.predict(x_test)
y_train_pred_r = model.predict(x_train)


### Predictions

In [8]:
from sklearn.metrics import accuracy_score

train_acc = accuracy_score(y_train_pred,y_train)
test_acc  = accuracy_score(y_test_pred,y_test)

print("Training Accuracy:", train_acc)
print("Test Accuracy:", test_acc)

train_acc_r = accuracy_score(y_train_pred_r,y_train)
test_acc_r  = accuracy_score(y_test_pred_r,y_test)
print("Random Forest Training Accuracy:", train_acc_r)
print("Random Forest Test Accuracy:", test_acc_r)

Training Accuracy: 0.5037267313382524
Test Accuracy: 0.5217889908256881
Random Forest Training Accuracy: 0.7237349713472415
Random Forest Test Accuracy: 0.5263761467889908


### Evalutation
MSE, MAE, Precision, Recall

In [9]:
num_features = ['danceability', 'upbeat', 'lofi-ness', 'pop_factor', 'rock_factor', 'positive', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'anticipation', 'joy', 'surprise', 'trust', 'anger', 'disgust', 'fear', 'sadness']
correlation_matrix = merged_df[num_features].corr()
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', square=True)
plt.title('Correlation Matrix of Numerical Features')
plt.show()


KeyError: "['upbeat'] not in index"

### Visualizations & Conclusion