<a href="https://colab.research.google.com/github/Harin22/Hit-Song-Predictor/blob/main/Feature_Scaling_and_Modelingipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


In [2]:
df = pd.read_csv("spotify_final_cleaned.csv")

In [3]:
numeric_df = df.select_dtypes(include=["int64","float64"]) #correlation
display(numeric_df.head())


Unnamed: 0,valence,year,acousticness,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,hit
0,0.0594,1921,0.982,0.279,831667,0.211,0,0.878,10,0.665,-20.096,1,0.0366,80.954,0
1,0.963,1921,0.732,0.819,180533,0.341,0,0.0,7,0.16,-12.441,1,0.415,60.936,0
2,0.0394,1921,0.961,0.328,500062,0.166,0,0.913,3,0.101,-14.85,1,0.0339,110.339,0
3,0.165,1921,0.967,0.275,210000,0.309,0,2.8e-05,5,0.381,-9.316,1,0.0354,100.109,0
4,0.253,1921,0.957,0.418,166693,0.193,0,2e-06,3,0.229,-10.096,1,0.038,101.665,0


# About Correlation
From the correlation matrix, we observed that year, energy, and loudness are the strongest indicators of a hit. There's multicollinearity between energy and loudness, which we will handle cautiously in linear models. Features like mode, key, and liveness show negligible correlation with the target and can be deprioritized in feature selection.

In [4]:
c_matrix = numeric_df.corr()
#display(c_matrix.head)
c_matrix.head(16)


Unnamed: 0,valence,year,acousticness,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,hit
valence,1.0,-0.028245,-0.184101,0.558946,-0.191813,0.353876,-0.018613,-0.198501,0.028473,0.003832,0.313512,0.015641,0.046381,0.171689,0.053182
year,-0.028245,1.0,-0.61425,0.188515,0.079713,0.530272,0.220881,-0.272371,0.00754,-0.057318,0.487697,-0.032385,-0.167816,0.141048,0.757783
acousticness,-0.184101,-0.61425,1.0,-0.266852,-0.076373,-0.749393,-0.246007,0.329819,-0.02055,-0.024482,-0.561696,0.047168,-0.04398,-0.20712,-0.572136
danceability,0.558946,0.188515,-0.266852,1.0,-0.139937,0.221967,0.241757,-0.278063,0.024439,-0.100193,0.285057,-0.045956,0.235491,0.001801,0.140922
duration_ms,-0.191813,0.079713,-0.076373,-0.139937,1.0,0.042119,-0.04888,0.08477,-0.004266,0.047168,-0.003037,-0.046085,-0.084604,-0.025472,0.086468
energy,0.353876,0.530272,-0.749393,0.221967,0.042119,1.0,0.132723,-0.281101,0.027705,0.126192,0.782362,-0.03926,-0.070555,0.250865,0.478174
explicit,-0.018613,0.220881,-0.246007,0.241757,-0.04888,0.132723,1.0,-0.140987,0.005432,0.03964,0.1403,-0.078872,0.41407,0.011969,0.086762
instrumentalness,-0.198501,-0.272371,0.329819,-0.278063,0.08477,-0.281101,-0.140987,1.0,-0.014591,-0.047193,-0.408611,-0.036543,-0.1217,-0.105361,-0.283438
key,0.028473,0.00754,-0.02055,0.024439,-0.004266,0.027705,0.005432,-0.014591,1.0,0.000205,0.017385,-0.11626,0.023784,0.002629,0.00646
liveness,0.003832,-0.057318,-0.024482,-0.100193,0.047168,0.126192,0.03964,-0.047193,0.000205,1.0,0.056422,0.002641,0.134667,0.007714,-0.04999


# Feature Scaling and Standardizing data

In [5]:
x = df.drop(columns=["hit"])
y = df["hit"]


In [8]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import normalize

scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)
display(x_scaled)


array([[-1.7828247 , -2.15247016,  1.27618658, ...,  0.64391197,
        -0.37970638, -1.16930675],
       [ 1.65068832, -2.15247016,  0.61134711, ...,  0.64391197,
         1.94548067, -1.82117959],
       [-1.858821  , -2.15247016,  1.22034007, ...,  0.64391197,
        -0.3962973 , -0.21240379],
       ...,
       [ 0.41194856,  1.66730194, -1.06670771, ..., -1.55300732,
        -0.10749235, -0.81976118],
       [-1.26756976,  1.66730194, -1.30876246, ...,  0.64391197,
         1.28798856, -1.36140375],
       [ 0.43094764,  1.66730194, -0.98426761, ...,  0.64391197,
         0.05903135, -0.71220119]])

# Splitting train and test data

In [10]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split( x_scaled, y, test_size=0.2, random_state=42, stratify = y) #spliting data

print("Training set: ", x_train, y_train )
print("Testing set:", x_test, y_test)



Training set:  [[-0.38601263 -1.57371681  0.17787178 ... -1.55300732  5.13462464
  -1.16018873]
 [-1.86870052 -0.53196078  1.13789997 ... -1.55300732 -0.36250098
  -1.13635164]
 [ 0.0699652  -0.87921279  0.43582949 ... -1.55300732  0.63049645
   2.1266272 ]
 ...
 [ 0.72733323  0.12395968 -1.33349183 ...  0.64391197 -0.22854464
   1.34895834]
 [-0.36701355 -1.65088392  1.30543952 ... -1.55300732  0.10818964
  -1.04442252]
 [ 0.58294025 -0.30045944  0.80548024 ...  0.64391197 -0.39875521
   1.57977434]] 58894     0
27208     1
95659     0
56491     1
19384     1
         ..
102685    1
56478     1
49327     1
21576     0
82119     1
Name: hit, Length: 136522, dtype: int64
Testing set:  [[-1.52215738 -0.57054434  1.08205346 ...  0.64391197 -0.39076699
   1.87256027]
 [ 0.81472897  0.2782939  -1.30889543 ...  0.64391197 -0.39261043
  -0.51127925]
 [ 0.09276409  0.39404457 -0.00828327 ... -1.55300732 -0.27155814
   0.45396018]
 ...
 [ 0.20295873  1.01138148 -0.73162861 ...  0.64391197  0.64

# Model training amd evaluation using Scikit learn

In [15]:
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression( max_iter = 1000)
log_reg.fit(x_train, y_train) #training the model


In [23]:
y_pred = log_reg.predict(x_test)
print(y_pred) #to predict


[1 1 1 ... 1 1 1]


In [25]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nAccuracy Score:")
print(accuracy_score(y_test, y_pred))



Confusion Matrix:
[[ 9394  1171]
 [ 1106 22460]]

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.89      0.89     10565
           1       0.95      0.95      0.95     23566

    accuracy                           0.93     34131
   macro avg       0.92      0.92      0.92     34131
weighted avg       0.93      0.93      0.93     34131


Accuracy Score:
0.9332864551287686


## ✅ Project Outcome Summary

- Built a machine learning model to predict hit songs based on audio features
- Used Logistic Regression as a baseline model for classification
- Dataset: 126,000+ songs from Spotify
- Features used: `danceability`, `energy`, `loudness`, `speechiness`, etc.
- Scaled data using StandardScaler
- Achieved **93.3% accuracy**
- Correlation analysis showed top predictive features as: `year`, `energy`, `acousticness`, and `loudness`

🎯 Final Verdict:  
Model is traiined, not biased toward specific class, and provides a prediction baseline.
