## AI Music Recommendatiomn App Models

### Data Preparation

In [12]:
# 1. Load required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer

In [35]:
# 2. Load the dataset
df = pd.read_csv('music_sentiment_dataset.csv')  # Replace with your actual CSV filename
print(f"Initial shape: {df.shape}")
df.head()

Initial shape: (1000, 11)


Unnamed: 0,User_ID,User_Text,Sentiment_Label,Recommended_Song_ID,Song_Name,Artist,Genre,Tempo (BPM),Mood,Energy,Danceability
0,U1,Way ball purpose public experience recently re...,Sad,S1,Someone Like You,Adele,Pop,67,Melancholic,Low,Low
1,U2,Save officer two myself a.,Happy,S2,Happy,Pharrell Williams,Pop,160,Joyful,High,High
2,U3,Decade ahead everyone environment themselves a...,Relaxed,S3,Clair de Lune,Debussy,Classical,60,Soothing,Low,Low
3,U4,Best change letter citizen try ask quality pro...,Happy,S4,Happy,Pharrell Williams,Pop,160,Joyful,High,High
4,U5,Worker player chance kind actually.,Happy,S5,Happy,Pharrell Williams,Pop,160,Joyful,High,High


In [3]:
# 3. Drop duplicate rows
df = df.drop_duplicates()
print(f"After dropping duplicates: {df.shape}")

After dropping duplicates: (1000, 11)


In [None]:
# 4. Handle missing values
# Numeric columns: median; Categorical columns: mode
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()

imputer_num = SimpleImputer(strategy='median')
df[num_cols] = imputer_num.fit_transform(df[num_cols])

imputer_cat = SimpleImputer(strategy='most_frequent')
df[cat_cols] = imputer_cat.fit_transform(df[cat_cols])

print("Missing values handled.")

In [6]:
# 5. Encode categorical features, excluding 'Song_Name', 'User_Text', and 'Mood' for now
features_to_encode = [col for col in cat_cols if col not in ['Song_Name', 'User_Text', 'Mood']]
df = pd.get_dummies(df, columns=features_to_encode, drop_first=True)
print(f"Shape after one-hot encoding: {df.shape}")

Shape after one-hot encoding: (1000, 2020)


In [7]:
# 6. Encode the target variable 'Song_Name'
label_encoder = LabelEncoder()
df['Song_Name_encoded'] = label_encoder.fit_transform(df['Song_Name'])
print("Song_Name encoded.")

Song_Name encoded.


In [9]:
# 7. One-hot encode the 'Mood' column
mood_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
mood_encoded = mood_encoder.fit_transform(df[['Mood']])
mood_encoded_df = pd.DataFrame(mood_encoded, columns=mood_encoder.get_feature_names_out(['Mood']))
df = pd.concat([df.reset_index(drop=True), mood_encoded_df.reset_index(drop=True)], axis=1)
print("Mood one-hot encoded.")

Mood one-hot encoded.


In [10]:
# 8. Prepare features and target for modeling
feature_cols = [col for col in df.columns if col not in ['Song_Name', 'Song_Name_encoded']]
X = df[feature_cols]
y = df['Song_Name_encoded']

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Train shape: {X_train.shape}, Validation shape: {X_val.shape}")

Train shape: (800, 2026), Validation shape: (200, 2026)


In [None]:
# 9. Apply TF-IDF vectorization to User_Text
tfidf = TfidfVectorizer(
    max_features=100,  # Tune this number as needed
    stop_words='english',
    ngram_range=(1,2)  # Unigrams and bigrams
)

user_text_tfidf = tfidf.fit_transform(df['User_Text'])
user_text_tfidf_df = pd.DataFrame(
    user_text_tfidf.toarray(), 
    columns=[f"tfidf_{f}" for f in tfidf.get_feature_names_out()]
)
print("TF-IDF vectorization applied to User_Text.")

TF-IDF vectorization applied to User_Text.


In [None]:
# 10. Concatenate TF-IDF features to your main dataframe
df = pd.concat([df.reset_index(drop=True), user_text_tfidf_df.reset_index(drop=True)], axis=1)
print(f"Shape after adding TF-IDF features: {df.shape}")

Shape after adding TF-IDF features: (1000, 2128)


In [None]:
# 11. Update feature columns (exclude User_Text now since it's vectorized)
feature_cols = [col for col in df.columns if col not in ['Song_Name', 'Song_Name_encoded', 'User_Text']]
X = df[feature_cols]
y = df['Song_Name_encoded']

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"Train shape: {X_train.shape}, Validation shape: {X_val.shape}")

Train shape: (800, 2125), Validation shape: (200, 2125)


### Random Forest Classifier

In [16]:
# 1. Import the Random Forest Classifier and evaluation tools
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [17]:
# 2. Initialize the Random Forest Classifier
rf_clf = RandomForestClassifier(
    n_estimators=200,      # Number of trees (can tune)
    max_depth=None,        # Let trees expand until pure or min_samples_split
    random_state=42,       # For reproducibility
    n_jobs=-1,             # Use all CPU cores
    class_weight='balanced' # Handle class imbalance if present
)

In [None]:
# Remove 'Mood' column from features if present
if 'Mood' in feature_cols:
	feature_cols.remove('Mood')

# Recreate X, X_train, X_val with updated feature_cols
X = df[feature_cols]
X_train, X_val, y_train, y_val = train_test_split(
	X, y, test_size=0.2, random_state=42, stratify=y
)

# 3. Train the Random Forest Classifier
rf_clf.fit(X_train, y_train)
print("Random Forest training complete.")

Random Forest training complete.


In [21]:
# 4. Predict on the validation set
y_pred = rf_clf.predict(X_val)

In [28]:
# 5. Evaluate the model
acc = accuracy_score(y_val, y_pred)
print(f"Validation Accuracy: {acc:.3f}\n")
print("Classification Report:")
print(classification_report(y_val, y_pred, target_names=label_encoder.inverse_transform(sorted(y_val.unique()))))

print("Confusion Matrix:")
print(confusion_matrix(y_val, y_pred))

Validation Accuracy: 1.000

Classification Report:
                  precision    recall  f1-score   support

   Clair de Lune       1.00      1.00      1.00        27
Eye of the Tiger       1.00      1.00      1.00        26
         Fix You       1.00      1.00      1.00        25
           Happy       1.00      1.00      1.00        29
Someone Like You       1.00      1.00      1.00        24
        Stronger       1.00      1.00      1.00        26
     Uptown Funk       1.00      1.00      1.00        22
      Weightless       1.00      1.00      1.00        21

        accuracy                           1.00       200
       macro avg       1.00      1.00      1.00       200
    weighted avg       1.00      1.00      1.00       200

Confusion Matrix:
[[27  0  0  0  0  0  0  0]
 [ 0 26  0  0  0  0  0  0]
 [ 0  0 25  0  0  0  0  0]
 [ 0  0  0 29  0  0  0  0]
 [ 0  0  0  0 24  0  0  0]
 [ 0  0  0  0  0 26  0  0]
 [ 0  0  0  0  0  0 22  0]
 [ 0  0  0  0  0  0  0 21]]


### xGBoost Model

In [23]:
# 1. Install XGBoost if not already installed
# !pip install xgboost
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [24]:
# 2. Initialize the XGBoost Classifier
xgb_clf = xgb.XGBClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    objective='multi:softmax',  # For multiclass classification
    num_class=len(y.unique()),
    eval_metric='mlogloss',     # Multiclass log-loss
    use_label_encoder=False,
    random_state=42,
    n_jobs=-1
)

In [25]:
# 3. Train the XGBoost Classifier
xgb_clf.fit(X_train, y_train)
print("XGBoost training complete.")

Parameters: { "use_label_encoder" } are not used.



XGBoost training complete.


In [26]:
# 4. Predict on the validation set
y_pred_xgb = xgb_clf.predict(X_val)

In [27]:
# 5. Evaluate the model
acc_xgb = accuracy_score(y_val, y_pred_xgb)
print(f"Validation Accuracy (XGBoost): {acc_xgb:.3f}\n")
print("Classification Report:")
print(classification_report(y_val, y_pred_xgb, target_names=label_encoder.inverse_transform(sorted(y_val.unique()))))
print("Confusion Matrix:")
print(confusion_matrix(y_val, y_pred_xgb))

Validation Accuracy (XGBoost): 1.000

Classification Report:
                  precision    recall  f1-score   support

   Clair de Lune       1.00      1.00      1.00        27
Eye of the Tiger       1.00      1.00      1.00        26
         Fix You       1.00      1.00      1.00        25
           Happy       1.00      1.00      1.00        29
Someone Like You       1.00      1.00      1.00        24
        Stronger       1.00      1.00      1.00        26
     Uptown Funk       1.00      1.00      1.00        22
      Weightless       1.00      1.00      1.00        21

        accuracy                           1.00       200
       macro avg       1.00      1.00      1.00       200
    weighted avg       1.00      1.00      1.00       200

Confusion Matrix:
[[27  0  0  0  0  0  0  0]
 [ 0 26  0  0  0  0  0  0]
 [ 0  0 25  0  0  0  0  0]
 [ 0  0  0 29  0  0  0  0]
 [ 0  0  0  0 24  0  0  0]
 [ 0  0  0  0  0 26  0  0]
 [ 0  0  0  0  0  0 22  0]
 [ 0  0  0  0  0  0  0 21]]


### LightGBM

In [29]:
# 1. Install LightGBM if not already installed
# !pip install lightgbm
import lightgbm as lgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [30]:
# 2. Initialize the LightGBM Classifier
lgbm_clf = lgb.LGBMClassifier(
    n_estimators=200,
    max_depth=8,
    learning_rate=0.1,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)

In [31]:
# 3. Train the LightGBM Classifier
lgbm_clf.fit(X_train, y_train)
print("LightGBM training complete.")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000112 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 59
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 26
[LightGBM] [Info] Start training from score -2.079442
[LightGBM] [Info] Start training from score -2.079442
[LightGBM] [Info] Start training from score -2.079442
[LightGBM] [Info] Start training from score -2.079442
[LightGBM] [Info] Start training from score -2.079442
[LightGBM] [Info] Start training from score -2.079442
[LightGBM] [Info] Start training from score -2.079441
[LightGBM] [Info] Start training from score -2.079442
LightGBM training complete.


In [32]:
# 4. Predict on the validation set
y_pred_lgbm = lgbm_clf.predict(X_val)

In [33]:
# 5. Evaluate the model
acc_lgbm = accuracy_score(y_val, y_pred_lgbm)
print(f"Validation Accuracy (LightGBM): {acc_lgbm:.3f}\n")
print("Classification Report:")
print(classification_report(y_val, y_pred_lgbm, target_names=label_encoder.inverse_transform(sorted(y_val.unique()))))
print("Confusion Matrix:")
print(confusion_matrix(y_val, y_pred_lgbm))

Validation Accuracy (LightGBM): 1.000

Classification Report:
                  precision    recall  f1-score   support

   Clair de Lune       1.00      1.00      1.00        27
Eye of the Tiger       1.00      1.00      1.00        26
         Fix You       1.00      1.00      1.00        25
           Happy       1.00      1.00      1.00        29
Someone Like You       1.00      1.00      1.00        24
        Stronger       1.00      1.00      1.00        26
     Uptown Funk       1.00      1.00      1.00        22
      Weightless       1.00      1.00      1.00        21

        accuracy                           1.00       200
       macro avg       1.00      1.00      1.00       200
    weighted avg       1.00      1.00      1.00       200

Confusion Matrix:
[[27  0  0  0  0  0  0  0]
 [ 0 26  0  0  0  0  0  0]
 [ 0  0 25  0  0  0  0  0]
 [ 0  0  0 29  0  0  0  0]
 [ 0  0  0  0 24  0  0  0]
 [ 0  0  0  0  0 26  0  0]
 [ 0  0  0  0  0  0 22  0]
 [ 0  0  0  0  0  0  0 21]]


### Save models

In [34]:
# Save models and preprocessing objects after training in your notebook

import joblib

# Create a directory for saved models if not exists
import os
os.makedirs('models', exist_ok=True)

# Save Random Forest
joblib.dump(rf_clf, 'models/random_forest.pkl')

# Save XGBoost
joblib.dump(xgb_clf, 'models/xgboost.pkl')

# Save LightGBM
joblib.dump(lgbm_clf, 'models/lightgbm.pkl')

# Save encoders and vectorizer
joblib.dump(label_encoder, 'models/label_encoder.pkl')
joblib.dump(mood_encoder, 'models/mood_encoder.pkl')
joblib.dump(tfidf, 'models/tfidf_vectorizer.pkl')

['models/tfidf_vectorizer.pkl']