In [1]:
# import statements
import os
import numpy as np
import pandas as pd
import seaborn as sns
import plotly as px
import matplotlib.pyplot as plt
import plotly.express as px
import string
import re
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder


In [2]:
# read the two CSV files from the working directory
df_one = pd.read_csv(filepath_or_buffer="train.csv")
df_two = pd.read_csv(filepath_or_buffer='test.csv')

# stack both frames into one dataframe with a fresh 0..n-1 index
df_music = pd.concat([df_one, df_two], ignore_index=True).reset_index(drop=True)

# normalize the column headers: lower-case them, then turn every run of
# whitespace into a single underscore
lowered_headers = df_music.columns.str.lower()
df_music.columns = lowered_headers.str.replace(r'\s+', '_', regex=True)

In [3]:
# defines a function to extract featured artists if there are any
def extract_featured_artist(track_name):
    """Split "(feat. artist)" credits out of a track title.

    Parameters
    ----------
    track_name : str or any
        The raw track title. Non-string values (e.g. None or NaN from a
        missing CSV cell) are returned untouched instead of raising.

    Returns
    -------
    tuple
        ``(cleaned_title, featured_artist)``. ``cleaned_title`` is the title
        with every "(feat. ...)" tag removed and surrounding whitespace
        stripped. ``featured_artist`` is the credited name, several credits
        joined with ", ", or None when the title carries no credit.
    """
    # Missing titles cannot be searched; hand them back unchanged.
    if not isinstance(track_name, str):
        return track_name, None

    # Matches "(feat. artist)"; the leading \s* also swallows the blank before
    # the tag so removing a mid-title tag does not leave a doubled space.
    pattern = r"\s*\(feat\. ([^)]+)\)"
    credits_found = re.findall(pattern, track_name, flags=re.IGNORECASE)

    if not credits_found:
        return track_name.strip(), None

    # Every tag is removed from the title, so every credit is kept as well.
    cleaned_title = re.sub(pattern, "", track_name, flags=re.IGNORECASE).strip()
    names = [name.strip() for name in credits_found if name.strip()]
    return cleaned_title, (", ".join(names) if names else None)

# cleans track name of any excess hyphenations
def track_name_cleaner(track_name):
    """Drop a trailing " - suffix" (e.g. " - Remastered") from a track title.

    Only a hyphen that has whitespace on at least one side is treated as a
    separator, so hyphenated words such as "Spider-Man" are left intact.

    Parameters
    ----------
    track_name : str or any
        The track title. Non-string values (None, NaN) are returned unchanged.

    Returns
    -------
    str or any
        The text before the first separator hyphen, stripped of surrounding
        whitespace, or the original value when it is not a string.
    """
    if not isinstance(track_name, str):
        return track_name

    # Keep everything before the first separator hyphen.
    return re.split(r"\s+-|-\s+", track_name, maxsplit=1)[0].strip()
# makes sure characters are printable values
def is_printable(text):
    """Return True when every character of ``text`` is printable ASCII.

    Printable ASCII covers code points 32 (space) through 126 ('~'); code 127
    is the DEL control character and is therefore rejected. An empty string
    is considered printable. Non-string values (e.g. None or NaN) return False.
    """
    if not isinstance(text, str):
        return False
    return all(32 <= ord(char) <= 126 for char in text)

# turns milliseconds into minutes
def convert_ms_to_min(time):
    """Return ``time`` in minutes.

    Values greater than 60 are treated as milliseconds and divided by
    60,000; anything else is returned as-is on the assumption that it is
    already expressed in minutes.
    """
    ms_per_minute = 1000 * 60
    return time / ms_per_minute if time > 60 else time


In [4]:
# start with an empty featured-artist column on the dataframe
df_music["featured_artist"] = ""

# run the extraction helper once per track and write both results back
for row_label, original_title in df_music["track_name"].items():
    cleaned_title, guest = extract_featured_artist(original_title)
    df_music.at[row_label, "track_name"] = cleaned_title
    # an empty or missing credit is stored as None
    df_music.at[row_label, "featured_artist"] = guest.strip() if guest else None
# show df
df_music.head()

Unnamed: 0,artist_name,track_name,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_in_min/ms,time_signature,class,featured_artist
0,Bruno Mars,That's What I Like,60.0,0.854,0.564,1.0,-4.964,1,0.0485,0.0171,,0.0849,0.899,134.071,234596.0,4,5.0,Gucci Mane
1,Boston,Hitch a Ride,54.0,0.382,0.814,3.0,-7.23,1,0.0406,0.0011,0.00401,0.101,0.569,116.454,251733.0,4,10.0,
2,The Raincoats,No Side to Fall In,35.0,0.434,0.614,6.0,-8.334,1,0.0525,0.486,0.000196,0.394,0.787,147.681,109667.0,4,6.0,
3,Deno,Lingo,66.0,0.853,0.597,10.0,-6.528,0,0.0555,0.0212,,0.122,0.569,107.033,173968.0,4,5.0,J.I & Chunkz
4,Red Hot Chili Peppers,Nobody Weird Like Me - Remastered,53.0,0.167,0.975,2.0,-4.279,1,0.216,0.000169,0.0161,0.172,0.0918,199.06,229960.0,4,10.0,


In [5]:
# dataframe changes
# Turn the "(feat. ...)" credit string into a list of stripped names; values
# that are not strings (None/NaN for tracks without a credit) pass through.
df_music['featured_artist'] = df_music['featured_artist'].apply(
    lambda x: [name.strip() for name in x.split(',') if name.strip()] if isinstance(x, str) else x
)

# Split artist_name on ',' and '&' in a single pass. The first credit is the
# main artist; the remaining non-empty credits are additional featured artists.
# (Splitting on ',' and on '&' separately and concatenating the results listed
# the same artist twice for names such as "A, B & C".)
artist_credits = df_music['artist_name'].apply(
    lambda x: [part.strip() for part in re.split(r'[,&]', x)]
)
df_music['main_artist'] = artist_credits.apply(lambda parts: parts[0])
df_music['new_featured_artists'] = artist_credits.apply(
    lambda parts: [part for part in parts[1:] if part]
)

# Merge the credits found in the title with those found in artist_name,
# keeping first-seen order and dropping repeated names.
df_music['featured_artist'] = [
    list(dict.fromkeys((from_title if isinstance(from_title, list) else []) + from_artist))
    for from_title, from_artist in zip(df_music['featured_artist'], df_music['new_featured_artists'])
]

# Convert the merged list back into a comma-separated string
df_music['featured_artist'] = df_music['featured_artist'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)

# Remove artist_name and new_featured_artists columns and rename main_artist as artist_name
df_music = df_music.drop(['artist_name', 'new_featured_artists'], axis=1)
df_music = df_music.rename(columns={'main_artist': 'artist_name'})

# apply the track_name_cleaner function to all rows
df_music["track_name"] = df_music["track_name"].apply(track_name_cleaner)

# Replace any track name that fails the is_printable check with 'UNKNOWN'
df_music["track_name"] = df_music["track_name"].apply(lambda x: x if is_printable(x) else 'UNKNOWN')

# Replace every remaining NaN in the dataframe with 0.
# NOTE(review): this is not limited to "instrumentalness" -- a NaN in any other
# column (for example 'class' or 'popularity', if they contain gaps) also
# becomes 0. Confirm that this is intended before trusting those columns.
df_music.fillna(0, inplace=True)

# Convert the duration column to minutes (values > 60 are treated as ms)
df_music['duration_in_min/ms'] = df_music['duration_in_min/ms'].apply(convert_ms_to_min)

# show df
df_music.head()


Unnamed: 0,track_name,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_in_min/ms,time_signature,class,featured_artist,artist_name
0,That's What I Like,60.0,0.854,0.564,1.0,-4.964,1,0.0485,0.0171,0.0,0.0849,0.899,134.071,3.909933,4,5.0,Gucci Mane,Bruno Mars
1,Hitch a Ride,54.0,0.382,0.814,3.0,-7.23,1,0.0406,0.0011,0.00401,0.101,0.569,116.454,4.19555,4,10.0,,Boston
2,No Side to Fall In,35.0,0.434,0.614,6.0,-8.334,1,0.0525,0.486,0.000196,0.394,0.787,147.681,1.827783,4,6.0,,The Raincoats
3,Lingo,66.0,0.853,0.597,10.0,-6.528,0,0.0555,0.0212,0.0,0.122,0.569,107.033,2.899467,4,5.0,J.I & Chunkz,Deno
4,Nobody Weird Like Me,53.0,0.167,0.975,2.0,-4.279,1,0.216,0.000169,0.0161,0.172,0.0918,199.06,3.832667,4,10.0,,Red Hot Chili Peppers


In [6]:
# Columns holding artist names are the categorical features
categorical_cols = ['featured_artist', 'artist_name']

# One-hot encode them (the first level of each column is dropped) and use the
# track name as the row index
df_music = pd.get_dummies(df_music, columns=categorical_cols, drop_first=True).set_index('track_name')

# 'popularity' is the target; every remaining column is a feature
y = df_music['popularity']
X = df_music.drop(columns='popularity')

# Hold out 20% of the rows for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# Logistic Regression: grid-search the regularization strength C
param_grid_lr = dict(C=[0.01, 0.1, 1, 10])

lr_model = LogisticRegression(random_state=45, max_iter=20000)
grid_search_lr = GridSearchCV(estimator=lr_model, param_grid=param_grid_lr, cv=5)

# fit() returns the search object itself, so the best estimator can be read
# straight off the fitted search
best_lr_model = grid_search_lr.fit(X_train, y_train).best_estimator_




KeyboardInterrupt: 

In [None]:
# Random Forest: grid-search the forest size and the tree-shape parameters
param_grid_rf = dict(
    n_estimators=[50, 100],
    max_depth=[None, 5, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

rf_model = RandomForestClassifier(random_state=42)
grid_search_rf = GridSearchCV(estimator=rf_model, param_grid=param_grid_rf, cv=5)

# fit() returns the search object itself; keep the refit best estimator
best_rf_model = grid_search_rf.fit(X_train, y_train).best_estimator_


In [None]:
# XGBoost: grid-search boosting rounds, learning rate, depth and sampling ratios
param_grid_xgb = dict(
    n_estimators=[50, 100],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5, 7],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
)

xgb_model = XGBClassifier(random_state=42)
grid_search_xgb = GridSearchCV(estimator=xgb_model, param_grid=param_grid_xgb, cv=5)

# fit() returns the search object itself; keep the refit best estimator
best_xgb_model = grid_search_xgb.fit(X_train, y_train).best_estimator_


In [None]:
# Decision Tree: grid-search the depth and the split/leaf size limits
param_grid_dt = dict(
    max_depth=[None, 5, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

dt_model = DecisionTreeClassifier(random_state=42)
grid_search_dt = GridSearchCV(estimator=dt_model, param_grid=param_grid_dt, cv=5)

# fit() returns the search object itself; keep the refit best estimator
best_dt_model = grid_search_dt.fit(X_train, y_train).best_estimator_

In [None]:
# Evaluation function
def evaluate_model(model, X_test, y_test):
    """Print test-set metrics for ``model`` and plot its feature importances.

    Prints the accuracy (two decimals) and the classification report of the
    model's predictions on ``X_test`` against ``y_test``. When the model
    exposes ``feature_importances_``, a bar chart of the importances, sorted
    in descending order and labelled with ``X_test.columns``, is shown.
    Returns None.
    """
    predictions = model.predict(X_test)
    score = accuracy_score(y_test, predictions)
    print(f"Accuracy: {score:.2f}")
    print(classification_report(y_test, predictions))

    # Models without feature_importances_ stop after the printed metrics
    if not hasattr(model, 'feature_importances_'):
        return

    ranked = pd.DataFrame(
        {'Feature': X_test.columns, 'Importance': model.feature_importances_}
    ).sort_values(by='Importance', ascending=False)

    plt.figure(figsize=(10, 6))
    plt.bar(ranked['Feature'], ranked['Importance'])
    plt.xticks(rotation=90)
    plt.xlabel('Features')
    plt.ylabel('Importance')
    plt.title('Feature Importance')
    plt.show()

# Report on each tuned model in turn, using the held-out test split

# -- Logistic Regression
print("Logistic Regression:")
evaluate_model(model=best_lr_model, X_test=X_test, y_test=y_test)

# -- Random Forest
print("Random Forest:")
evaluate_model(model=best_rf_model, X_test=X_test, y_test=y_test)

# -- XGBoost
print("XGBoost:")
evaluate_model(model=best_xgb_model, X_test=X_test, y_test=y_test)

# -- Decision Tree
print("Decision Tree:")
evaluate_model(model=best_dt_model, X_test=X_test, y_test=y_test)


In [None]:
# The genre label 'class' is the target; every other column is a feature
y = df_music['class']
X = df_music.drop(columns='class')

# Hold out 20% of the rows for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Logistic Regression: grid-search the regularization strength C
param_grid_lr = dict(C=[0.01, 0.1, 1, 10])

lr_model = LogisticRegression(random_state=42, max_iter=2000)
grid_search_lr = GridSearchCV(estimator=lr_model, param_grid=param_grid_lr, cv=5)

# fit() returns the search object itself, so the best estimator can be read
# straight off the fitted search
best_lr_model = grid_search_lr.fit(X_train, y_train).best_estimator_


In [None]:
# Random Forest: grid-search the forest size and the tree-shape parameters
param_grid_rf = dict(
    n_estimators=[50, 100],
    max_depth=[None, 5, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

rf_model = RandomForestClassifier(random_state=42)
grid_search_rf = GridSearchCV(estimator=rf_model, param_grid=param_grid_rf, cv=5)

# fit() returns the search object itself; keep the refit best estimator
best_rf_model = grid_search_rf.fit(X_train, y_train).best_estimator_


In [None]:
# XGBoost: grid-search boosting rounds, learning rate, depth and sampling ratios
param_grid_xgb = dict(
    n_estimators=[50, 100],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5, 7],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
)

xgb_model = XGBClassifier(random_state=42)
grid_search_xgb = GridSearchCV(estimator=xgb_model, param_grid=param_grid_xgb, cv=5)

# fit() returns the search object itself; keep the refit best estimator
best_xgb_model = grid_search_xgb.fit(X_train, y_train).best_estimator_


In [None]:
# Decision Tree: grid-search the depth and the split/leaf size limits
param_grid_dt = dict(
    max_depth=[None, 5, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

dt_model = DecisionTreeClassifier(random_state=42)
grid_search_dt = GridSearchCV(estimator=dt_model, param_grid=param_grid_dt, cv=5)

# fit() returns the search object itself; keep the refit best estimator
best_dt_model = grid_search_dt.fit(X_train, y_train).best_estimator_

In [None]:
# Evaluation function
def evaluate_model(model, X_test, y_test):
    """Print test-set metrics for ``model`` and plot its feature importances.

    Prints the accuracy (two decimals) and the classification report of the
    model's predictions on ``X_test`` against ``y_test``. When the model
    exposes ``feature_importances_``, a bar chart of the importances, sorted
    in descending order and labelled with ``X_test.columns``, is shown.
    Returns None.
    """
    # NOTE(review): a function with this name and the same body is already
    # defined in an earlier cell; running this cell rebinds the name.
    predictions = model.predict(X_test)
    score = accuracy_score(y_test, predictions)
    print(f"Accuracy: {score:.2f}")
    print(classification_report(y_test, predictions))

    # Models without feature_importances_ stop after the printed metrics
    if not hasattr(model, 'feature_importances_'):
        return

    ranked = pd.DataFrame(
        {'Feature': X_test.columns, 'Importance': model.feature_importances_}
    ).sort_values(by='Importance', ascending=False)

    plt.figure(figsize=(10, 6))
    plt.bar(ranked['Feature'], ranked['Importance'])
    plt.xticks(rotation=90)
    plt.xlabel('Features')
    plt.ylabel('Importance')
    plt.title('Feature Importance')
    plt.show()

# Report on each tuned model in turn, using the held-out test split

# -- Logistic Regression
print("Logistic Regression:")
evaluate_model(model=best_lr_model, X_test=X_test, y_test=y_test)

# -- Random Forest
print("Random Forest:")
evaluate_model(model=best_rf_model, X_test=X_test, y_test=y_test)

# -- XGBoost
print("XGBoost:")
evaluate_model(model=best_xgb_model, X_test=X_test, y_test=y_test)

# -- Decision Tree
print("Decision Tree:")
evaluate_model(model=best_dt_model, X_test=X_test, y_test=y_test)


In [None]:
# Genre labels listed in the order of their numeric codes (0 through 10)
genre_labels = [
    'Acoustic/Folk',
    'Alt_Music',
    'Blues',
    'Bollywood',
    'Country',
    'HipHop',
    'Indie Alt',
    'Instrumental',
    'Metal',
    'Pop',
    'Rock',
]

# Lookup table from numeric code to genre label
encoding_to_genre = dict(enumerate(genre_labels))

# Cast the "class" column to integers, then swap each code for its label
df_music['class'] = df_music['class'].astype(int)
df_music['class'] = df_music['class'].map(encoding_to_genre)