# ML Models

In [1]:
from pathlib import Path

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [2]:
# Load cleaned_data.csv (resolved relative to the current working directory) into a DataFrame.
# NOTE(review): the output recorded for this cell is a FileNotFoundError, yet the later
# cells were executed against a loaded frame — confirm where cleaned_data.csv lives
# relative to the notebook's working directory before relying on Restart & Run All.
df = pd.read_csv("cleaned_data.csv")
# Trailing bare expression: show the loaded frame as the cell output.
df

FileNotFoundError: [Errno 2] No such file or directory: 'cleaned_data.csv'

In [6]:
# Separate the regression target ("popularity") from the feature columns.
y = df["popularity"]
X = df.drop(columns="popularity")
# NOTE(review): the recorded display of X lists an "Unnamed: 0" header. If that is a real
# column (e.g. a row index written by an earlier to_csv) rather than a rendering artifact,
# it is being fed to the models as a feature — confirm against df.columns.

In [7]:
# Show the feature matrix as the cell output (the recorded output is a truncated view).
X

Unnamed: 0,A Capella,Alternative,Anime,Blues,Children’s Music,Classical,Comedy,Country,Dance,Electronic,...,key_F,key_F#,key_G,key_G#,time_signature_1/4,time_signature_3/4,time_signature_4/4,time_signature_5/4,mode_Major,mode_Minor
0,0,0,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,1,0
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
4,0,0,0,0,0,0,0,0,1,0,...,0,0,1,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
176763,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
176764,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,1,0
176765,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,1
176766,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0


In [8]:
# Show the target series as the cell output (the recorded output is a truncated view).
y

0         13
1          5
2         30
3         39
4         70
          ..
176763    38
176764    40
176765    28
176766    67
176767    33
Name: popularity, Length: 176768, dtype: int64

## Train/test dataset split

In [55]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the partition
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# Report the shape of each feature partition, one per line.
print(X_train.shape, X_test.shape, sep="\n")

(141414, 54)
(35354, 54)


## Save data

In [50]:
# Persist the training partition (features + target) to data/train.csv.
# X_train is already a DataFrame, so a plain copy is enough; working on the copy keeps
# the target column out of X_train itself.
training = X_train.copy()
training["popularity"] = y_train
# to_csv does not create missing parent directories, so make sure data/ exists first.
Path("data").mkdir(parents=True, exist_ok=True)
# The row index is written as the first, unnamed CSV column (pandas default index=True);
# read the file back with index_col=0 to avoid picking it up as an extra column.
training.to_csv("data/train.csv")

In [51]:
# Persist the held-out partition (features + target) to data/test.csv.
# X_test is already a DataFrame, so a plain copy is enough; working on the copy keeps
# the target column out of X_test itself.
test = X_test.copy()
test["popularity"] = y_test
# to_csv does not create missing parent directories, so make sure data/ exists first.
Path("data").mkdir(parents=True, exist_ok=True)
# The row index is written as the first, unnamed CSV column (pandas default index=True);
# read the file back with index_col=0 to avoid picking it up as an extra column.
test.to_csv("data/test.csv")

In [52]:
"popularity" in X_train.columns

False

In [53]:
"popularity" in X_test.columns

False

## Random Forest

In [57]:
# Baseline random forest with fixed hyperparameters.
# random_state seeds the bootstrap sampling and per-split feature subsampling
# (max_features=30 is fewer than the available columns), so the baseline R2 is
# reproducible across kernel restarts, matching the seeded train/test split.
base_rf = RandomForestRegressor(
    n_estimators=100,
    max_depth=15,
    min_samples_split=10,
    max_features=30,
    random_state=42,
)
# Trailing expression: fit on the training split and show the fitted estimator.
base_rf.fit(X_train, y_train)


In [58]:
# Score the baseline forest on the held-out split using R2.
base_rf_pred = base_rf.predict(X_test)
base_rf_r2 = r2_score(y_true=y_test, y_pred=base_rf_pred)
print("R2 Score:", base_rf_r2)

R2 Score: 0.7183473025678488


In [59]:
# Search space handed to GridSearchCV. Only the number of trees is searched at the moment.
param_grid = {"n_estimators": [50, 100, 200]}
# Candidate axes currently disabled (kept for reference):
#   'max_depth': [5, 10, 15]
#   'min_samples_split': [3, 5, 10, 15]
#   'max_features': [22, 30, 35]


In [60]:
# Exhaustive search over param_grid with 5-fold cross-validation on the training split.
# random_state makes every candidate forest reproducible, so reruns select the same winner.
rf = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(
    rf,
    param_grid,
    scoring="r2",  # explicit; same metric as the regressor's default score()
    cv=5,
    verbose=2,  # log one line per completed fit
    n_jobs=-1,  # run the fits on all available cores
)
# Trailing expression: run the search and show the fitted search object.
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] END ....................................n_estimators=50; total time= 1.7min
[CV] END ....................................n_estimators=50; total time= 1.7min
[CV] END ....................................n_estimators=50; total time= 1.7min
[CV] END ....................................n_estimators=50; total time= 1.7min
[CV] END ....................................n_estimators=50; total time= 1.7min
[CV] END ...................................n_estimators=100; total time= 3.5min
[CV] END ...................................n_estimators=100; total time= 3.6min
[CV] END ...................................n_estimators=100; total time= 3.6min
[CV] END ...................................n_estimators=100; total time= 3.5min
[CV] END ...................................n_estimators=100; total time= 3.5min
[CV] END ...................................n_estimators=200; total time= 6.5min
[CV] END ...................................n_est

In [61]:
# Report the winning parameter combination and its score.
# best_score_ is the mean cross-validated score of the best candidate on the training
# folds; it is not a score on the held-out X_test split.
print("Best Hyperparameters: ", grid_search.best_params_)
print("Best Score: ", grid_search.best_score_)

Best Hyperparameters:  {'n_estimators': 200}
Best Score:  0.739290044739761


In [62]:
"popularity" in list(X_train.columns)

False