# Predicting Top Genre Given A Year

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


### Clean and Merge Spotify-Sentiment Dataset with Billboard Dataset

In [None]:
# Genre-weight columns of the Billboard file. For each row, the column holding
# the largest weight becomes that row's `top_genre`.
genre_columns = ['Pop', 'Rock', 'R&B', 'Hip Hop', 'Soul', 'Country', 'Folk', 'Disco',
                 'Motown', 'Metal', 'Funk', 'Blues', 'Alternative', 'Electronic', 'Jazz']

spotify_sentiment_df = pd.read_csv('../data/processed/lyrics_features_clusters.csv')

# Build the cleaned Billboard frame in a single chain (no in-place mutation),
# so the result always derives from the raw file:
#   1. derive `top_genre` as the genre column with the maximum weight,
#   2. rename the year column to `year` so it matches the merge key,
#   3. keep only rows with year <= 2019.
billboard_df = (
    pd.read_csv('../data/compressed/genre_year_weights.csv')
    .assign(top_genre=lambda df: df[genre_columns].idxmax(axis=1))
    .rename(columns={'Hot100 Ranking Year': 'year'})
    .loc[lambda df: df['year'] <= 2019]
)

# Left join keeps every Spotify row; rows whose year has no Billboard match get
# NaN in the Billboard columns. `validate='many_to_one'` raises a MergeError if
# a year appears more than once in billboard_df, which would otherwise silently
# duplicate Spotify rows.
merged_df = pd.merge(
    spotify_sentiment_df,
    billboard_df,
    on='year',
    how='left',
    validate='many_to_one',
)

# Show the merged result (the frame every later cell works on).
merged_df.head()



### Feature Engineering

Feature engineering is the process of creating new features (columns) from our data to make the model more effective and accurate at learning patterns.


Example: ratios that help distinguish upbeat from aggressive music, or an interaction term such as energy × danceability that helps predict a specific genre.

In [None]:

# Interaction features built from the audio / sentiment columns.
# NOTE(review): the formulas below use `loudness` like a 0-1 quantity
# (e.g. `1 - loudness`); confirm the column is normalized and not raw dB.
acousticness = merged_df['acousticness']
energy = merged_df['energy']
loudness = merged_df['loudness']

merged_df['lofi-ness'] = acousticness * (1 - energy) * (1 - loudness)
merged_df['pop_factor'] = merged_df['danceability'] * merged_df['valence'] * merged_df['positive']
merged_df['rock_factor'] = energy * loudness * merged_df['instrumentalness']

# Decade of each row (e.g. 1987 -> 1980), plus one 0/1 indicator column for
# each decade from the 1980s through the 2010s.
merged_df['decade'] = merged_df['year'].floordiv(10).mul(10)

decade_flags = {'is_80s': 1980, 'is_90s': 1990, 'is_2000s': 2000, 'is_2010s': 2010}
for flag_name, decade_start in decade_flags.items():
    merged_df[flag_name] = merged_df['decade'].eq(decade_start).astype(int)


### Split, Train, and Test Data


In [None]:
from sklearn.model_selection import train_test_split, cross_val_score

# Hold out 10% of the rows for testing, keeping the proportions of the
# `cluster` column the same in both splits.
# NOTE(review): stratification uses `cluster` while the target below is
# `Cluster Name`; presumably these label the same clusters - confirm.
train_df, test_df = train_test_split(
    merged_df,
    test_size=0.1,
    random_state=42,
    stratify=merged_df['cluster'],
)

# One-hot encode `top_genre` on each split separately, then align the test
# frame to the training columns: dummy columns missing from the test split are
# added as 0 and columns not present in training are dropped.
one_hot_encoded_train = pd.get_dummies(train_df, columns=['top_genre'])
one_hot_encoded_test = (
    pd.get_dummies(test_df, columns=['top_genre'])
    .reindex(columns=one_hot_encoded_train.columns, fill_value=0)
)

# Model inputs, grouped by origin. The concatenation order defines the column
# order of x_train / x_test.
time_features = ['decade', 'is_80s', 'is_90s', 'is_2000s', 'is_2010s']
engineered_features = ['lofi-ness', 'pop_factor', 'rock_factor']
song_features = [
    'danceability', 'positive', 'energy', 'loudness', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
    'anticipation', 'joy', 'surprise', 'trust', 'anger', 'disgust', 'fear', 'sadness',
]
genre_dummy_features = [
    col for col in one_hot_encoded_train.columns if col.startswith('top_genre_')
]
features = time_features + engineered_features + song_features + genre_dummy_features

# Feature matrices and the `Cluster Name` target for each split.
x_train, y_train = one_hot_encoded_train[features], one_hot_encoded_train['Cluster Name']
x_test, y_test = one_hot_encoded_test[features], one_hot_encoded_test['Cluster Name']

In [None]:
# Display the final model input columns: the entries of `features`, including
# the generated top_genre_* dummy columns.
x_train.columns

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
import xgboost as xgb

# Column groups routed through the preprocessing step.
# ColumnTransformer drops every column that is not listed (remainder='drop' is
# the default), so the decade features must be named explicitly; otherwise the
# logistic regression never sees them even though they are part of x_train.
categorical_features = [col for col in x_train.columns if col.startswith('top_genre_')]
decade_flag_features = ['is_80s', 'is_90s', 'is_2000s', 'is_2010s']
num_features = ['danceability', 'lofi-ness', 'pop_factor', 'rock_factor', 'positive', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'anticipation', 'joy', 'surprise', 'trust', 'anger', 'disgust', 'fear', 'sadness']

columnTransform = ColumnTransformer([
    # Continuous columns are standardized; `decade` is scaled with them because
    # its raw values are in the thousands.
    ('num', StandardScaler(), num_features + ['decade']),
    # 0/1 indicator columns are passed through unchanged.
    ('decade_flags', 'passthrough', decade_flag_features),
    ('cat', 'passthrough', categorical_features),
])

# Preprocessing and classifier are fitted together, so the scaler statistics
# come from the training split only.
pipeline = Pipeline([('transformer', columnTransform), ('model', LogisticRegression(max_iter=3000))])

pipeline.fit(x_train, y_train)
y_test_pred = pipeline.predict(x_test)
y_train_pred = pipeline.predict(x_train)



### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest hyperparameters, collected in one place for easy tuning.
rf_params = {
    'n_estimators': 300,      # number of decision trees in the forest
    'max_depth': 8,           # maximum depth each tree may grow to
    'min_samples_split': 10,  # minimum samples required to split a node
    'min_samples_leaf': 5,    # minimum samples required in a leaf
    'random_state': 42,       # fixed seed for reproducible results
}

# Fit on the training split, then predict on both splits so training and test
# accuracy can be compared later.
model = RandomForestClassifier(**rf_params)
model.fit(x_train, y_train)

y_train_pred_r = model.predict(x_train)
y_test_pred_r = model.predict(x_test)


### Accuracy

In [None]:
from sklearn.metrics import accuracy_score

# accuracy_score is documented as accuracy_score(y_true, y_pred). Accuracy is
# symmetric in its arguments, so the values are unchanged, but keeping the
# documented order avoids wrong results if this pattern is reused for
# asymmetric metrics such as precision or recall.
train_acc = accuracy_score(y_train, y_train_pred)
test_acc = accuracy_score(y_test, y_test_pred)

print("Logistic Regression Training Accuracy:", train_acc)
print("Logistic Regression Test Accuracy:", test_acc)

# Same comparison for the random forest predictions.
train_acc_r = accuracy_score(y_train, y_train_pred_r)
test_acc_r = accuracy_score(y_test, y_test_pred_r)
print("Random Forest Training Accuracy:", train_acc_r)
print("Random Forest Test Accuracy:", test_acc_r)


### Evaluation
MSE, MAE, Precision, Recall

### Visualizations & Conclusion