In [1]:
# Import necessary libraries
import pandas as pd  # For data manipulation
import numpy as np  # For numerical operations
import matplotlib.pyplot as plt  
import seaborn as sns  # For advanced visualizations
from sklearn.model_selection import train_test_split  # For splitting data
from sklearn.ensemble import RandomForestClassifier  # For random forest model
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score  # For model evaluation
from sklearn.feature_extraction.text import TfidfVectorizer  # For text vectorization
from nltk.corpus import stopwords  # For removing stopwords
from nltk.tokenize import word_tokenize  # For tokenizing text
import re  # For regex-based text cleaning
import nltk

# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('punkt')

# Configure Jupyter Notebook for inline plotting
%matplotlib inline


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


In [3]:
# Synthetic dataset (or replace with your own dataset)
data = {
    'Track': ['Song1', 'Song2', 'Song3', 'Song4', 'Song5'],
    'Artist': ['Artist1', 'Artist2', 'Artist3', 'Artist4', 'Artist5'],
    'Genre': ['Pop', 'Rock', 'Jazz', 'Hip-Hop', 'Electronic'],
    'Lyrics': [
        "I'm feeling good, it's a new dawn, a new day",
        "Rocking all night, partying all day",
        "Smooth melodies under the moonlight",
        "Rapping the truth, spitting fire on the mic",
        "Dancing through the beats, under the strobe lights"
    ],
    'Danceability': [0.8, 0.6, 0.5, 0.9, 0.95],
    'Energy': [0.7, 0.8, 0.4, 0.9, 0.85],
    'Valence': [0.6, 0.7, 0.5, 0.3, 0.9],
    'Duration': [210000, 180000, 240000, 200000, 230000],
    'Loudness': [-5.0, -6.5, -8.0, -3.0, -2.5],
    'Popularity': [70, 65, 55, 80, 90]
}

# Create a DataFrame
df = pd.DataFrame(data)

# Display the first few rows of the dataset
df.head()


Unnamed: 0,Track,Artist,Genre,Lyrics,Danceability,Energy,Valence,Duration,Loudness,Popularity
0,Song1,Artist1,Pop,"I'm feeling good, it's a new dawn, a new day",0.8,0.7,0.6,210000,-5.0,70
1,Song2,Artist2,Rock,"Rocking all night, partying all day",0.6,0.8,0.7,180000,-6.5,65
2,Song3,Artist3,Jazz,Smooth melodies under the moonlight,0.5,0.4,0.5,240000,-8.0,55
3,Song4,Artist4,Hip-Hop,"Rapping the truth, spitting fire on the mic",0.9,0.9,0.3,200000,-3.0,80
4,Song5,Artist5,Electronic,"Dancing through the beats, under the strobe li...",0.95,0.85,0.9,230000,-2.5,90


In [5]:
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\[.*?\]', '', text)  # Remove text in brackets
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = ' '.join(word for word in text.split() if word not in stopwords.words('english'))  # Remove stopwords
    return text

# Apply text cleaning
df['Cleaned_Lyrics'] = df['Lyrics'].apply(clean_text)

# Display the cleaned lyrics
df[['Track', 'Cleaned_Lyrics']].head()


Unnamed: 0,Track,Cleaned_Lyrics
0,Song1,im feeling good new dawn new day
1,Song2,rocking night partying day
2,Song3,smooth melodies moonlight
3,Song4,rapping truth spitting fire mic
4,Song5,dancing beats strobe lights


In [7]:
# Vectorize lyrics using TF-IDF
vectorizer = TfidfVectorizer(max_features=100)  # Limit to top 100 features
X_lyrics = vectorizer.fit_transform(df['Cleaned_Lyrics']).toarray()

# Convert to DataFrame for merging
lyrics_df = pd.DataFrame(X_lyrics, columns=vectorizer.get_feature_names_out())

# Merge the original DataFrame with lyrics DataFrame
df_combined = pd.concat([df, lyrics_df], axis=1)

# Display the combined DataFrame
df_combined.head()


Unnamed: 0,Track,Artist,Genre,Lyrics,Danceability,Energy,Valence,Duration,Loudness,Popularity,...,moonlight,new,night,partying,rapping,rocking,smooth,spitting,strobe,truth
0,Song1,Artist1,Pop,"I'm feeling good, it's a new dawn, a new day",0.8,0.7,0.6,210000,-5.0,70,...,0.0,0.679984,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Song2,Artist2,Rock,"Rocking all night, partying all day",0.6,0.8,0.7,180000,-6.5,65,...,0.0,0.0,0.523358,0.523358,0.0,0.523358,0.0,0.0,0.0,0.0
2,Song3,Artist3,Jazz,Smooth melodies under the moonlight,0.5,0.4,0.5,240000,-8.0,55,...,0.57735,0.0,0.0,0.0,0.0,0.0,0.57735,0.0,0.0,0.0
3,Song4,Artist4,Hip-Hop,"Rapping the truth, spitting fire on the mic",0.9,0.9,0.3,200000,-3.0,80,...,0.0,0.0,0.0,0.0,0.447214,0.0,0.0,0.447214,0.0,0.447214
4,Song5,Artist5,Electronic,"Dancing through the beats, under the strobe li...",0.95,0.85,0.9,230000,-2.5,90,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0


In [9]:
# Define features (X) and target (y)
X = df_combined.drop(['Track', 'Artist', 'Genre', 'Lyrics', 'Cleaned_Lyrics', 'Popularity'], axis=1)
y = df_combined['Popularity'] > 60  # Set a threshold for popularity (e.g., 60)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the model
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)


In [11]:
# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")


Confusion Matrix:
[[1]]

Classification Report:
              precision    recall  f1-score   support

        True       1.00      1.00      1.00         1

    accuracy                           1.00         1
   macro avg       1.00      1.00      1.00         1
weighted avg       1.00      1.00      1.00         1

Accuracy: 1.00


