# Spotify Track Success Predictor - Model Training

This notebook trains a Logistic Regression model to predict whether a Spotify track is likely to be successful (popularity â‰¥ 70).

The trained model will be saved as JSON files that can be loaded directly by the NestJS backend.


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import json
import os


## 2. Load Dataset


In [2]:
# Paths - adjust BASE_DIR based on your notebook location
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath('')))
DATA_PATH = os.path.join(BASE_DIR, 'backend', 'data', 'spotify_tracks.csv')
MODEL_DIR = os.path.join(BASE_DIR, 'backend', 'model')

# Load data
df = pd.read_csv(DATA_PATH)
print(f"Loaded {len(df)} tracks")
print(f"\nDataset shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")


Loaded 114000 tracks

Dataset shape: (114000, 21)

Columns: ['Unnamed: 0', 'track_id', 'artists', 'album_name', 'track_name', 'popularity', 'duration_ms', 'explicit', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature', 'track_genre']


## 3. Create Target Variable


In [3]:
# Create target variable: success = 1 if popularity >= 70, else 0
df['success'] = (df['popularity'] >= 70).astype(int)
print(f"Success rate: {df['success'].mean() * 100:.2f}%")
print(f"\nSuccess distribution:")
print(df['success'].value_counts())


Success rate: 4.80%

Success distribution:
success
0    108528
1      5472
Name: count, dtype: int64


## 4. Select Features


In [4]:
# Select features (matching your dataset columns)
feature_columns = [
    'danceability',
    'energy',
    'loudness',
    'tempo',
    'duration_ms'
]

X = df[feature_columns].copy()
y = df['success'].copy()

print(f"Features: {feature_columns}")
print(f"\nFeature statistics:")
print(X.describe())


Features: ['danceability', 'energy', 'loudness', 'tempo', 'duration_ms']

Feature statistics:
        danceability         energy       loudness          tempo  \
count  114000.000000  114000.000000  114000.000000  114000.000000   
mean        0.566800       0.641383      -8.258960     122.147837   
std         0.173542       0.251529       5.029337      29.978197   
min         0.000000       0.000000     -49.531000       0.000000   
25%         0.456000       0.472000     -10.013000      99.218750   
50%         0.580000       0.685000      -7.004000     122.017000   
75%         0.695000       0.854000      -5.003000     140.071000   
max         0.985000       1.000000       4.532000     243.372000   

        duration_ms  
count  1.140000e+05  
mean   2.280292e+05  
std    1.072977e+05  
min    0.000000e+00  
25%    1.740660e+05  
50%    2.129060e+05  
75%    2.615060e+05  
max    5.237295e+06  


## 5. Handle Missing Values


In [5]:
# Check for missing values
missing_count = X.isnull().sum().sum()
print(f"Missing values: {missing_count}")

if missing_count > 0:
    print("\nFilling missing values with mean...")
    X = X.fillna(X.mean())
    print("Missing values filled.")
else:
    print("No missing values found.")


Missing values: 0
No missing values found.


## 6. Split Data


In [6]:
# Split data (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set: {len(X_train)} samples")
print(f"Test set: {len(X_test)} samples")
print(f"\nTraining success rate: {y_train.mean() * 100:.2f}%")
print(f"Test success rate: {y_test.mean() * 100:.2f}%")


Training set: 91200 samples
Test set: 22800 samples

Training success rate: 4.80%
Test success rate: 4.80%


## 7. Scale Features


In [7]:
# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Features scaled using StandardScaler")
print(f"\nScaler mean: {scaler.mean_}")
print(f"Scaler scale: {scaler.scale_}")


Features scaled using StandardScaler

Scaler mean: [ 5.66932471e-01  6.41719801e-01 -8.24817505e+00  1.22199313e+02
  2.27829386e+05]
Scaler scale: [1.73615621e-01 2.51580857e-01 5.02223715e+00 2.99946370e+01
 1.05553572e+05]


## 8. Train Model


In [8]:
# Train Logistic Regression model
model = LogisticRegression(random_state=42, max_iter=1000)
model.fit(X_train_scaled, y_train)

print("Model trained successfully!")
print(f"\nModel coefficients: {model.coef_[0]}")
print(f"Model intercept: {model.intercept_[0]}")


Model trained successfully!

Model coefficients: [ 0.22696789 -0.36867651  0.81913646 -0.1024276  -0.07037222]
Model intercept: -3.1464639533904264


## 9. Evaluate Model


In [9]:
# Make predictions
y_pred = model.predict(X_test_scaled)

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)

print("=" * 60)
print("ðŸ“Š Model Evaluation Metrics:")
print("=" * 60)
print(f"   Accuracy:  {accuracy:.4f}")
print(f"   Precision: {precision:.4f}")
print(f"   Recall:    {recall:.4f}")
print(f"   F1 Score:  {f1:.4f}")
print("=" * 60)


ðŸ“Š Model Evaluation Metrics:
   Accuracy:  0.9520
   Precision: 0.0000
   Recall:    0.0000
   F1 Score:  0.0000


## 10. Save Model as JSON (for NestJS)

Save model parameters and scaler parameters as JSON files that can be loaded by the NestJS backend.


In [10]:
# Create model directory if it doesn't exist
os.makedirs(MODEL_DIR, exist_ok=True)

# Prepare model data for JSON export
model_data = {
    "coefficients": model.coef_[0].tolist(),
    "intercept": model.intercept_[0].item(),
    "feature_order": feature_columns
}

scaler_data = {
    "mean": scaler.mean_.tolist(),
    "scale": scaler.scale_.tolist(),
    "feature_order": feature_columns
}

# Save as JSON files
MODEL_JSON_PATH = os.path.join(MODEL_DIR, 'model.json')
SCALER_JSON_PATH = os.path.join(MODEL_DIR, 'scaler.json')

with open(MODEL_JSON_PATH, 'w') as f:
    json.dump(model_data, f, indent=2)

with open(SCALER_JSON_PATH, 'w') as f:
    json.dump(scaler_data, f, indent=2)

print(f"âœ… Model saved to: {MODEL_JSON_PATH}")
print(f"âœ… Scaler saved to: {SCALER_JSON_PATH}")
print("\nâœ… Model training completed successfully!")
print("\nThe model is now ready to be used by the NestJS backend endpoints.")


âœ… Model saved to: /Users/hassantahir/Desktop/final-project/backend/model/model.json
âœ… Scaler saved to: /Users/hassantahir/Desktop/final-project/backend/model/scaler.json

âœ… Model training completed successfully!

The model is now ready to be used by the NestJS backend endpoints.
