In [117]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [118]:
import warnings
warnings.simplefilter('ignore')
from classes.model_pipeline import ClassificationPipeline, RegressionPipeline
from classes.utils import gather_data_from_folders
from classes.text_features import TextFeatureExtractor
from dotenv import load_dotenv
from tqdm.auto import tqdm
import os
from pandas.api.types import is_numeric_dtype

load_dotenv();

In [119]:
# The playlists folder lives next to the notebook, i.e. directly under the
# kernel's working directory. os.getcwd() states that intent directly and is
# separator-agnostic.
playlists_dir = os.path.join(os.getcwd(), "playlists")

# Load every playlist CSV found under that folder into one frame.
df = gather_data_from_folders(playlists_dir)

Loading CSV file: /home/krystian/Python/Jupyter-projects/thesis/songs_analysis/playlists/final download 1/final download 1.csv
Loading CSV file: /home/krystian/Python/Jupyter-projects/thesis/songs_analysis/playlists/Every song in the world 🌍 /Every song in the world 🌍 .csv
Loading CSV file: /home/krystian/Python/Jupyter-projects/thesis/songs_analysis/playlists/final download 2/final download 2.csv


In [120]:
# Quick sanity check: (rows, columns) of the frame that was just loaded.
df.shape

(5999, 54)

In [82]:
# Fixing the dtypes
# Each column is cast exactly once, to its final dtype:
#   * duration_ms        -> int
#   * album_release_year -> float (a float column tolerates missing years,
#                           whereas an int cast raises on NaN)
#   * key / mode / time_signature / explicit -> category, which is what the
#     later select_dtypes(include=['category']) calls pick up.
int_columns = ['duration_ms']
float_columns = ['album_release_year']
category_columns = ['key', 'mode', 'time_signature', 'explicit']

# A single astype call with a column -> dtype mapping; casting again to the
# same dtypes is a no-op, so the cell can be re-run safely.
df = df.astype({
    **dict.fromkeys(int_columns, int),
    **dict.fromkeys(float_columns, float),
    **dict.fromkeys(category_columns, 'category'),
})

In [83]:
# Run the project's TextFeatureExtractor over the frame (presumably appends
# text-derived feature columns — confirm in classes/text_features).
# NOTE(review): df is overwritten with the extractor's output, so re-running this
# cell feeds the already-processed frame back in — confirm add_features tolerates that.
tfe = TextFeatureExtractor()
df = tfe.add_features(df)

# Training

In [6]:
# Column that the manual training cells in this section predict.
target = 'mode'

In [32]:
# Split the predictors by dtype; the target column is left out of both lists.
categorical_columns = df.select_dtypes(include=['category']).columns
numeric_columns = df.select_dtypes(include=['number']).columns

cat_features = [name for name in categorical_columns if name != target]
num_features = [name for name in numeric_columns if name != target]

In [33]:
# Build the classifier for `target` from the dtype-based feature lists.
# Constructing the pipeline object is all that is required before
# run_pipeline(): the automated loop further down trains successfully without
# ever calling get_classification_pipeline(), and calling it here only logged
# "Setting up the pipeline..." a second time.
model = ClassificationPipeline(
    df,
    target_column=target,
    num_features=num_features,
    cat_features=cat_features,
)

[1;32mINFO: Initializing pipeline for target variable: album_release_year[0m
[1;32mINFO: Setting up the pipeline...[0m
[1;32mINFO: Setting up the pipeline...[0m


In [34]:
%%time
# Run the pipeline end to end for the current target, passing
# dashboard/results/<target> as its output location. %%time reports how long
# the whole cell took.
model.run_pipeline(f'dashboard/results/{target}')

Pipeline Progress:   0%|          | 0/5 [00:00<?, ?step/s]

[1;32mINFO: Starting pipeline execution...[0m
[1;32mINFO: Splitting the data into training and testing sets...[0m
[1;32mINFO: Data splitting completed.[0m
[1;32mINFO: Fixing class balance...[0m


ValueError: Expected n_neighbors <= n_samples_fit, but n_neighbors = 64, n_samples_fit = 1, n_samples = 1

In [10]:
# Evaluate the fitted model; the rendered table lists Accuracy, Precision,
# Recall and F1 Score.
# NOTE(review): this needs a `model` whose run_pipeline() finished successfully —
# re-run the training cell first if it raised.
model.evaluate_model()

[1;32mINFO: Evaluating classification model...[0m
[1;32mINFO: Classification evaluation completed.[0m


Unnamed: 0,Metric,Score
0,Accuracy,0.677374
1,Precision,0.668487
2,Recall,0.677374
3,F1 Score,0.655246


# Automated

In [99]:
# Columns that must never be offered to a model as predictors.
features_to_exclude = [
    'title',
    'artist',
    'mp3_path',
    'lyrics',
]

In [100]:
# Every column that gets its own model in the automated run, in processing order.
features_to_predict = [
    'explicit',
    'album_release_year',
    'duration_ms',
    'danceability',
    'energy',
    'key',
    'loudness',
    'mode',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
    'time_signature',
    'tempo_extracted',
    'zcr',
]

In [115]:
# Encode the explicit flag as 1/0 (True -> 1, False -> 0).
# NOTE(review): the column is overwritten in place, so re-running this cell pushes
# the already-encoded values through the same dict — confirm they survive on your
# pandas version, or rebuild df before re-running.
# NOTE(review): the automated loop chooses regression vs. classification from this
# column's dtype — confirm the dtype after mapping is the one you intend.
df['explicit'] = df['explicit'].map({True: 1, False: 0})

In [116]:
# Train, evaluate and save one model per target column.
for target in tqdm(features_to_predict):
    results_dir = f"dashboard/results/{target}"

    # An existing results path means this target was handled in an earlier run.
    if os.path.exists(results_dir):
        continue

    # Predictors are all categorical / numeric columns except the target itself
    # and the columns listed in features_to_exclude.
    skipped = {target, *features_to_exclude}
    cat_features = [
        name
        for name in df.select_dtypes(include=['category']).columns
        if name not in skipped
    ]
    num_features = [
        name
        for name in df.select_dtypes(include=['number']).columns
        if name not in skipped
    ]

    # Numeric targets are regressed; everything else is classified.
    pipeline_cls = RegressionPipeline if is_numeric_dtype(df[target]) else ClassificationPipeline
    model = pipeline_cls(
        df,
        target_column=target,
        num_features=num_features,
        cat_features=cat_features,
    )

    model.run_pipeline(results_dir)

  0%|          | 0/17 [00:00<?, ?it/s]

[1;32mINFO: Initializing pipeline for target variable: explicit[0m
[1;32mINFO: Setting up the pipeline...[0m


Pipeline Progress:   0%|          | 0/5 [00:00<?, ?step/s]

[1;32mINFO: Starting pipeline execution...[0m
[1;32mINFO: Splitting the data into training and testing sets...[0m
[1;32mINFO: Data splitting completed.[0m
[1;32mINFO: Fixing class balance...[0m
[1;32mINFO: Applied SMOTE with k_neighbors=5 to fix class imbalance.[0m
[1;32mINFO: Model training completed.[0m
[1;32mINFO: Evaluating classification model...[0m
[1;32mINFO: Classification evaluation completed.[0m
[1;32mINFO: Performing SHAP analysis for classification...[0m




[1;32mINFO: SHAP analysis for classification completed.[0m
[1;32mINFO: Saving results to dashboard/results/explicit...[0m
[1;32mINFO: Evaluating classification model...[0m
[1;32mINFO: Classification evaluation completed.[0m
[1;32mINFO: Evaluation results saved to dashboard/results/explicit/evaluation_results.csv[0m
[1;32mINFO: SHAP summary plot saved to dashboard/results/explicit/shap_summary_plot.png[0m
[1;32mINFO: SHAP beeswarm plot saved to dashboard/results/explicit/shap_beeswarm_plot.png[0m
[1;32mINFO: Pipeline execution completed.[0m
[1;32mINFO: Initializing pipeline for target variable: album_release_year[0m
[1;32mINFO: Setting up the pipeline...[0m


Pipeline Progress:   0%|          | 0/5 [00:00<?, ?step/s]

[1;32mINFO: Starting pipeline execution...[0m
[1;32mINFO: Splitting the data into training and testing sets...[0m
[1;32mINFO: Data splitting completed.[0m
[1;32mINFO: Training the model...[0m
[1;32mINFO: Model training completed.[0m
[1;32mINFO: Evaluating regression model...[0m
[1;32mINFO: Regression evaluation completed.[0m
[1;32mINFO: Performing SHAP analysis for regression...[0m




[1;32mINFO: SHAP analysis for regression completed.[0m
[1;32mINFO: Saving results to dashboard/results/album_release_year...[0m
[1;32mINFO: Evaluating regression model...[0m
[1;32mINFO: Regression evaluation completed.[0m
[1;32mINFO: Evaluation results saved to dashboard/results/album_release_year/evaluation_results.csv[0m
[1;32mINFO: SHAP summary plot saved to dashboard/results/album_release_year/shap_summary_plot.png[0m
[1;32mINFO: SHAP beeswarm plot saved to dashboard/results/album_release_year/shap_beeswarm_plot.png[0m
[1;32mINFO: Pipeline execution completed.[0m
[1;32mINFO: Initializing pipeline for target variable: duration_ms[0m
[1;32mINFO: Setting up the pipeline...[0m


Pipeline Progress:   0%|          | 0/5 [00:00<?, ?step/s]

[1;32mINFO: Starting pipeline execution...[0m
[1;32mINFO: Splitting the data into training and testing sets...[0m
[1;32mINFO: Data splitting completed.[0m
[1;32mINFO: Training the model...[0m
[1;32mINFO: Model training completed.[0m
[1;32mINFO: Evaluating regression model...[0m
[1;32mINFO: Regression evaluation completed.[0m
[1;32mINFO: Performing SHAP analysis for regression...[0m




[1;32mINFO: SHAP analysis for regression completed.[0m
[1;32mINFO: Saving results to dashboard/results/duration_ms...[0m
[1;32mINFO: Evaluating regression model...[0m
[1;32mINFO: Regression evaluation completed.[0m
[1;32mINFO: Evaluation results saved to dashboard/results/duration_ms/evaluation_results.csv[0m
[1;32mINFO: SHAP summary plot saved to dashboard/results/duration_ms/shap_summary_plot.png[0m
[1;32mINFO: SHAP beeswarm plot saved to dashboard/results/duration_ms/shap_beeswarm_plot.png[0m
[1;32mINFO: Pipeline execution completed.[0m
