In [1]:
# Import necessary libraries
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the processed track-features table.
# NOTE(review): the path looks like a Colab Google Drive mount; adjust
# DATA_PATH when running elsewhere.
DATA_PATH = "/content/drive/MyDrive/Project/tracks_features_processed.csv"
data = pd.read_csv(DATA_PATH)

In [3]:
# Label column: whether the track appears in the Billboard data (True/False).
target = data["in_billboard"]

In [4]:
# Preview the first rows to sanity-check that the CSV loaded as expected.
data.head()

Unnamed: 0.1,Unnamed: 0,index,name,album,artists,explicit,danceability,energy,key,loudness,...,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,year,release_date,in_billboard,id
0,0,0,Testify,The Battle Of Los Angeles,Rage Against The Machine,False,0.47,0.978,7,-5.399,...,1.1e-05,0.356,0.503,117.906,210133,4.0,1999,1999-11-02,False,TestifyRage Against The Machine
1,1,1,Guerrilla Radio,The Battle Of Los Angeles,Rage Against The Machine,True,0.599,0.957,11,-5.764,...,7.1e-05,0.155,0.489,103.68,206200,4.0,1999,1999-11-02,True,Guerrilla RadioRage Against The Machine
2,2,2,Calm Like a Bomb,The Battle Of Los Angeles,Rage Against The Machine,False,0.315,0.97,7,-5.424,...,2e-06,0.122,0.37,149.749,298893,4.0,1999,1999-11-02,False,Calm Like a BombRage Against The Machine
3,3,3,Mic Check,The Battle Of Los Angeles,Rage Against The Machine,True,0.44,0.967,11,-5.83,...,4e-06,0.121,0.574,96.752,213640,4.0,1999,1999-11-02,False,Mic CheckRage Against The Machine
4,4,4,Sleep Now In the Fire,The Battle Of Los Angeles,Rage Against The Machine,False,0.426,0.929,2,-6.729,...,0.105,0.0789,0.539,127.059,205600,4.0,1999,1999-11-02,False,Sleep Now In the FireRage Against The Machine


In [5]:
# List every column so the feature subset chosen below can be checked against it.
data.columns

Index(['Unnamed: 0', 'index', 'name', 'album', 'artists', 'explicit',
       'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'duration_ms', 'time_signature', 'year', 'release_date', 'in_billboard',
       'id'],
      dtype='object')

In [6]:
# Separate the model inputs (X) from the label (y).
# Only the numeric audio descriptors are used; identifiers and text columns
# such as name/album/artists are left out of the feature matrix.
FEATURE_COLUMNS = [
    'danceability',
    'energy',
    'key',
    'loudness',
    'mode',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
    'duration_ms',
    'time_signature',
]
X = data[FEATURE_COLUMNS]  # feature matrix
y = target  # label vector, row-aligned with X via the shared index

In [7]:
# Display the feature matrix; the notebook renders a truncated head/tail view.
X

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,0.470,0.978,7,-5.399,1,0.0727,0.02610,0.000011,0.3560,0.5030,117.906,210133,4.0
1,0.599,0.957,11,-5.764,1,0.1880,0.01290,0.000071,0.1550,0.4890,103.680,206200,4.0
2,0.315,0.970,7,-5.424,1,0.4830,0.02340,0.000002,0.1220,0.3700,149.749,298893,4.0
3,0.440,0.967,11,-5.830,0,0.2370,0.16300,0.000004,0.1210,0.5740,96.752,213640,4.0
4,0.426,0.929,2,-6.729,1,0.0701,0.00162,0.105000,0.0789,0.5390,127.059,205600,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
659536,0.553,0.562,9,-10.329,1,0.0305,0.22800,0.000622,0.1060,0.3500,135.919,260333,4.0
659537,0.456,0.756,6,-9.656,0,0.0466,0.29100,0.358000,0.1110,0.4970,124.935,260600,4.0
659538,0.322,0.799,7,-8.234,1,0.0368,0.03590,0.203000,0.3550,0.4360,97.526,344667,3.0
659539,0.343,0.846,9,-7.766,1,0.0730,0.00869,0.880000,0.1030,0.0933,143.404,308200,4.0


In [8]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [20]:
# Inspect the held-out feature rows; the original row labels from `data` are kept as the index.
# NOTE(review): this cell's execution count (20) is out of sequence with its
# neighbours (8 and 9) — it was run later than its position suggests.
X_test

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
265598,0.279,0.16800,4,-19.578,1,0.0356,0.909,0.870000,0.1150,0.1690,110.443,641973,3.0
219092,0.617,0.24600,5,-13.279,1,0.1090,0.965,0.006860,0.1450,0.8380,155.147,111400,4.0
288957,0.325,0.30500,0,-9.590,1,0.0436,0.909,0.000012,0.2200,0.1790,162.625,201573,3.0
463623,0.237,0.02310,1,-32.135,0,0.0419,0.753,0.956000,0.1990,0.0348,48.612,70200,4.0
342421,0.404,0.47200,9,-10.485,1,0.0435,0.510,0.062300,0.0896,0.0893,122.637,387000,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
154187,0.482,0.87100,7,-5.681,1,0.0539,0.371,0.000478,0.4410,0.5390,154.838,190333,4.0
65181,0.483,0.27000,7,-19.663,0,0.0336,0.907,0.960000,0.1340,0.1340,119.993,321053,4.0
411162,0.651,0.00002,9,-15.175,1,0.9080,0.854,0.000000,0.6280,0.2110,135.394,507246,4.0
400378,0.322,0.20800,0,-17.698,1,0.0316,0.962,0.871000,0.1320,0.1500,96.149,974173,4.0


In [9]:
# Standardize the features.
# The scaler is fitted on the training split only and then applied to both
# splits, so no statistics from the test rows leak into preprocessing.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [10]:
# Train a random forest on the standardized training features.
# random_state pins the bootstrap and feature sampling so the fitted model
# (and every probability derived from it below) is reproducible after a
# kernel restart; the seed matches the one used for the train/test split.
# n_jobs=-1 only parallelizes tree building and does not change the results.
rf_model = RandomForestClassifier(random_state=42, n_jobs=-1)
rf_model.fit(X_train_scaled, y_train)

# Per-class probabilities for the held-out rows: one column per entry in
# rf_model.classes_, rows in the same order as X_test.
y_pred_rf = rf_model.predict_proba(X_test_scaled)

# The metrics below compare class labels, so they need hard predictions;
# passing the probability matrix (y_pred_rf) to them would raise an error.
y_pred_rf_labels = rf_model.predict(X_test_scaled)

# Evaluate the model
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf_labels))

print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf_labels))

print("Random Forest Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_rf_labels))

In [11]:
# Persist the fitted classifier (file name and format unchanged).
with open('rf_model.pkl', 'wb') as file:
    pickle.dump(rf_model, file)

# The classifier was trained on standardized features, so the fitted scaler
# is required to prepare any new input before calling the reloaded model.
# It is saved to a separate file so existing readers of rf_model.pkl still work.
with open('scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)

In [13]:
# Predict on the testing data.
# predict_proba returns per-class probabilities rather than hard labels;
# a later cell takes column 1 of this array as the Billboard probability.
# NOTE(review): this repeats the call that produced y_pred_rf above on the
# same model and inputs, so the two arrays should be identical.
y_pred = rf_model.predict_proba(X_test_scaled)

# Evaluate the model
# NOTE(review): left disabled — accuracy_score compares class labels, but
# y_pred holds probabilities; use rf_model.predict(...) output if re-enabled.
#accuracy = accuracy_score(y_test, y_pred)
#print("Accuracy:", accuracy)

In [26]:
# Attach human-readable song metadata to the held-out feature rows.
# X_test keeps the original row labels from `data`, so the matching
# name/artists/album values are looked up by those labels in one pass.
info_columns = ['name', 'artists', 'album']
song_info = data.loc[X_test.index, info_columns]

# assign() returns a new frame, leaving X_test itself untouched; raw values
# are used because song_info rows are already in X_test order.
X_test_with_info = X_test.assign(
    **{column: song_info[column].values for column in info_columns}
)


In [41]:
# Display the test rows together with their song metadata.
# NOTE(review): this cell's execution count (41) is later than the cells
# below it (28, 32); the recorded output already shows the probability
# column and a descending sort, which would not be present at this point on
# a fresh Restart & Run All.
X_test_with_info

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,name,artists,album,probability_of_being_in_billboard
3465,0.507,0.8500,5,-5.679,1,0.0472,0.3780,0.000000,0.0567,0.8470,171.800,183733,4.0,Something Like That,Tim McGraw,Greatest Hits,0.92
3481,0.545,0.4980,2,-7.286,1,0.0266,0.1950,0.000000,0.1690,0.3630,127.604,270107,4.0,Angry All The Time,Tim McGraw,Set This Circus Down,0.88
269759,0.416,0.5460,7,-7.728,1,0.0297,0.4920,0.000000,0.0845,0.4180,159.929,300333,4.0,Live Like You Were Dying,Tim McGraw,Number One Hits,0.87
3462,0.479,0.6980,7,-7.914,1,0.0277,0.1980,0.000000,0.1260,0.6730,159.699,201600,4.0,Where The Green Grass Grows,Tim McGraw,Greatest Hits,0.76
433876,0.480,0.7530,6,-5.632,1,0.0270,0.0102,0.000000,0.1800,0.2630,100.181,256000,4.0,Wrong Impression,Natalie Imbruglia,Left Of The Middle / White Lillies Island,0.73
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
152825,0.891,0.5750,1,-7.728,0,0.1920,0.0950,0.001260,0.0992,0.7130,104.752,273173,4.0,Luck of Lucien,A Tribe Called Quest,The Anthology,0.00
291339,0.174,0.0518,1,-19.184,0,0.0451,0.8970,0.917000,0.0975,0.0367,117.282,670027,3.0,"Symphony No. 24 in F Minor, Op. 63: II. Molto ...","Nikolai Myaskovsky', 'Moscow Philharmonic Orch...",Myaskovsky: Symphonies Nos. 24 and 25,0.00
43192,0.784,0.9070,2,-6.297,1,0.2060,0.1680,0.000004,0.0984,0.4340,91.952,230800,4.0,Sincerely,Brand Nubian,Foundation,0.00
59721,0.600,0.4330,4,-6.664,0,0.0271,0.8990,0.413000,0.3590,0.1500,118.082,235347,4.0,Black Dog Yodel,Those Poor Bastards,Country Bullshit (Reissue),0.00


In [28]:
# Add the predicted Billboard probability to each test row.
# y_pred rows follow X_test's row order, and column 1 is the second class in
# rf_model.classes_ (True for this boolean target).
# Wrapping the values in a Series keyed by X_test.index makes pandas align
# on row labels instead of position, so this cell stays correct even if it
# is re-run after X_test_with_info has been re-sorted by a later cell.
billboard_probability = pd.Series(y_pred[:, 1], index=X_test.index)
X_test_with_info['probability_of_being_in_billboard'] = billboard_probability


In [42]:
# Display the test rows with the probability column attached.
# NOTE(review): executed as In [42], after the sort cell below (In [32]),
# which is why the recorded output is already in descending probability order.
X_test_with_info

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,name,artists,album,probability_of_being_in_billboard
3465,0.507,0.8500,5,-5.679,1,0.0472,0.3780,0.000000,0.0567,0.8470,171.800,183733,4.0,Something Like That,Tim McGraw,Greatest Hits,0.92
3481,0.545,0.4980,2,-7.286,1,0.0266,0.1950,0.000000,0.1690,0.3630,127.604,270107,4.0,Angry All The Time,Tim McGraw,Set This Circus Down,0.88
269759,0.416,0.5460,7,-7.728,1,0.0297,0.4920,0.000000,0.0845,0.4180,159.929,300333,4.0,Live Like You Were Dying,Tim McGraw,Number One Hits,0.87
3462,0.479,0.6980,7,-7.914,1,0.0277,0.1980,0.000000,0.1260,0.6730,159.699,201600,4.0,Where The Green Grass Grows,Tim McGraw,Greatest Hits,0.76
433876,0.480,0.7530,6,-5.632,1,0.0270,0.0102,0.000000,0.1800,0.2630,100.181,256000,4.0,Wrong Impression,Natalie Imbruglia,Left Of The Middle / White Lillies Island,0.73
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
152825,0.891,0.5750,1,-7.728,0,0.1920,0.0950,0.001260,0.0992,0.7130,104.752,273173,4.0,Luck of Lucien,A Tribe Called Quest,The Anthology,0.00
291339,0.174,0.0518,1,-19.184,0,0.0451,0.8970,0.917000,0.0975,0.0367,117.282,670027,3.0,"Symphony No. 24 in F Minor, Op. 63: II. Molto ...","Nikolai Myaskovsky', 'Moscow Philharmonic Orch...",Myaskovsky: Symphonies Nos. 24 and 25,0.00
43192,0.784,0.9070,2,-6.297,1,0.2060,0.1680,0.000004,0.0984,0.4340,91.952,230800,4.0,Sincerely,Brand Nubian,Foundation,0.00
59721,0.600,0.4330,4,-6.664,0,0.0271,0.8990,0.413000,0.3590,0.1500,118.082,235347,4.0,Black Dog Yodel,Those Poor Bastards,Country Bullshit (Reissue),0.00


In [32]:
# prompt: can you sort X_test_with_info by the probability_of_being_in_billboard column in descending order

X_test_with_info = X_test_with_info.sort_values(by='probability_of_being_in_billboard', ascending=False)


In [34]:
# Keep only the songs the model gives a strictly positive Billboard probability.
has_nonzero_probability = X_test_with_info['probability_of_being_in_billboard'] > 0
df_nonzero = X_test_with_info[has_nonzero_probability]


In [None]:
# Bar chart of the 100 songs with the highest predicted Billboard probability.
# nlargest selects the top rows explicitly, so the chart no longer depends on
# the sort cell having been run beforehand.
top_songs = df_nonzero.nlargest(100, 'probability_of_being_in_billboard')

# Bars are placed at integer positions and labelled afterwards. Passing the
# song names directly as x makes matplotlib treat them as categories, which
# draws songs that share a title on top of each other in a single slot.
bar_positions = list(range(len(top_songs)))

# Wider than the original 10x6 so that 100 rotated labels remain legible.
fig, ax = plt.subplots(figsize=(20, 6))
ax.bar(bar_positions, top_songs['probability_of_being_in_billboard'])
ax.set_xticks(bar_positions)
# astype(str) guards against a missing (NaN) song name breaking the labels.
ax.set_xticklabels(top_songs['name'].astype(str), rotation=90)
ax.set_title('Top 100 Songs with Probability of Being in Billboard')
ax.set_xlabel('Song Name')
ax.set_ylabel('Probability')
fig.tight_layout()
plt.show()