In [1]:
%load_ext autoreload
%autoreload 2

In [4]:
import warnings
warnings.simplefilter('ignore')
from classes.text_features import TextFeatureExtractor
from dotenv import load_dotenv
from tqdm.auto import tqdm
import os
from pandas.api.types import is_numeric_dtype
import pandas as pd

load_dotenv();

In [5]:
df = pd.read_pickle(os.path.join("datasets", "final_data.pkl"))#gather_data_from_folders(playlists_dir)

In [6]:
df.shape

(5375, 571)

# Features

In [7]:
non_feature_columns = [ 'id', 'title', 'artist', 'album_art_url', 'genres', 'mp3_path', 'lyrics' ]

In [8]:
cat_features = [ f for f in df.select_dtypes(include=['category', 'object']).columns if f not in non_feature_columns ]
num_features = [ f for f in df.select_dtypes(include=['number']).columns if f not in non_feature_columns ]

In [9]:
cat_features

['key', 'mode', 'time_signature', 'language', 'genre']

In [10]:
len(num_features)

558

# Optimal Classification

In [11]:
from classes.model_pipeline import OptimalClassificationPipeline

In [24]:
target = 'mode'
cat_features = [ f for f in cat_features if f != target ]
num_features = [ f for f in num_features if f != target ]

In [25]:
df = df.sample(400)

In [26]:
model = OptimalClassificationPipeline(
    df.dropna(subset=['genre']),
    target_column=target,
    num_features=num_features,
    cat_features=cat_features,
    n_trials=1,
    param_grid={
        "iterations": [500, 1000],
        "depth": [4, 6, 8],
        "learning_rate": (0.01, 0.1),
        "l2_leaf_reg": (1, 10),
    },
    cache_path="catboost_optuna.db"
)
model.get_pipeline()

[1;32mINFO: Initializing pipeline for target variable: mode[0m
[1;32mINFO: Setting up the pipeline...[0m
[1;32mINFO: Setting up the pipeline...[0m


In [27]:
%%time
model.run_pipeline(f'dashboard/results/{target}')

Pipeline Progress:   0%|          | 0/4 [00:00<?, ?step/s]

[1;32mINFO: Starting pipeline execution for prediction of mode...[0m
[1;32mINFO: Splitting the data into training and testing sets...[0m
[1;32mINFO: Data splitting completed.[0m
[1;32mINFO: Training the OptimalCatBoostClassifier through the pipeline...[0m


[I 2024-12-05 00:45:35,872] A new study created in RDB with name: catboost_mode
[I 2024-12-05 00:45:36,998] Trial 0 finished with value: 0.7560975609756098 and parameters: {'iterations': 1000, 'depth': 4, 'learning_rate': 0.029437170706238948, 'l2_leaf_reg': 2.001211643186846}. Best is trial 0 with value: 0.7560975609756098.


[1;32mINFO: Pipeline training completed.[0m
[1;32mINFO: Performing SHAP analysis for OptimalCatBoostClassifier...[0m
[1;32mINFO: SHAP analysis for classification completed.[0m
[1;32mINFO: Saving results to dashboard/results/mode...[0m
[1;32mINFO: Evaluation results saved to dashboard/results/mode/evaluation_results.csv[0m
[1;32mINFO: SHAP summary plot saved to dashboard/results/mode/shap_summary_plot.png[0m
[1;32mINFO: SHAP beeswarm plot saved to dashboard/results/mode/shap_beeswarm_plot.png[0m
[1;32mINFO: Pipeline execution completed.[0m
CPU times: user 24.1 s, sys: 3.08 s, total: 27.2 s
Wall time: 5.77 s


In [23]:
model.pipeline.named_steps['model'].training_results_

Unnamed: 0,Metric,Score
0,Accuracy,1.0
1,F1 Score,1.0
2,Precision,1.0
3,Recall,1.0


In [16]:
model.evaluate_model()

Unnamed: 0,Metric,Score
0,Accuracy,1.0
1,F1 Score,1.0
2,Precision,1.0
3,Recall,1.0


In [18]:
df.key.value_counts()

1     54
0     47
11    44
5     44
7     37
9     33
8     28
2     27
6     27
4     22
10    22
3     15
Name: key, dtype: int64

# Optimal Regression

In [54]:
from classes.model_pipeline import OptimalRegressionPipeline

In [55]:
target = 'acousticness'
cat_features = [ f for f in cat_features if f != target ]
num_features = [ f for f in num_features if f != target ]

In [59]:
model = OptimalRegressionPipeline(
    df.dropna(subset=['genre']),
    target_column=target,
    num_features=num_features,
    cat_features=cat_features,
    n_trials=1,
    param_grid={
        "iterations": [500, 1000],
        "depth": [4, 6, 8],
        "learning_rate": (0.01, 0.1),
        "l2_leaf_reg": (1, 10),
    },
    cache_path="catboost_optuna.db"
)
model.get_pipeline()

[1;32mINFO: Initializing pipeline for target variable: acousticness[0m
[1;32mINFO: Setting up the pipeline...[0m
[1;32mINFO: Setting up the pipeline...[0m


In [62]:
%%time
model.run_pipeline(f'dashboard/results/{target}')

Pipeline Progress:   0%|          | 0/4 [00:00<?, ?step/s]

[1;32mINFO: Starting pipeline execution for prediction of acousticness...[0m
[1;32mINFO: Splitting the data into training and testing sets...[0m
[1;32mINFO: Data splitting completed.[0m
[1;32mINFO: Training the OptimalCatBoostRegressor through the pipeline...[0m


[I 2024-12-05 01:02:19,474] Using an existing study with name 'catboost_regressor_optimization' instead of creating a new one.
[I 2024-12-05 01:02:42,243] Trial 4 finished with value: 0.5392764472315293 and parameters: {'iterations': 1000, 'depth': 8, 'learning_rate': 0.010468431140172561, 'l2_leaf_reg': 2.4582871967623214}. Best is trial 0 with value: 0.15850645009573017.


[1;32mINFO: Pipeline training completed.[0m
[1;32mINFO: Performing SHAP analysis for OptimalCatBoostRegressor...[0m
[1;32mINFO: SHAP analysis for regression completed.[0m
[1;32mINFO: Saving results to dashboard/results/acousticness...[0m
[1;32mINFO: Evaluation results saved to dashboard/results/acousticness/evaluation_results.csv[0m
[1;32mINFO: SHAP summary plot saved to dashboard/results/acousticness/shap_summary_plot.png[0m
[1;32mINFO: SHAP beeswarm plot saved to dashboard/results/acousticness/shap_beeswarm_plot.png[0m
[1;32mINFO: Pipeline execution completed.[0m
CPU times: user 2min 23s, sys: 19.6 s, total: 2min 43s
Wall time: 29.2 s


In [63]:
model.evaluate_model()

Unnamed: 0,Metric,Score
0,Mean Absolute Error,0.097809
1,Mean Squared Error,0.01619
2,Root Mean Squared Error,0.127239
3,R2 Score,0.821513


# Automated

In [99]:
features_to_exclude = ['title', 'artist', 'mp3_path', 'lyrics']

In [100]:
features_to_predict = ['explicit',
       'album_release_year', 'duration_ms',
        'danceability', 'energy', 'key',
       'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
       'liveness', 'valence', 'tempo', 'time_signature', 'tempo_extracted', 'zcr']

In [115]:
df['explicit'] = df['explicit'].map({True: 1, False: 0})

In [116]:
for target in tqdm(features_to_predict):
    if os.path.exists(f"dashboard/results/{target}"):
        continue
    cat_features = [ f for f in df.select_dtypes(include=['category']).columns if f != target and f not in features_to_exclude]
    num_features = [ f for f in df.select_dtypes(include=['number']).columns if f != target and f not in features_to_exclude]

    if is_numeric_dtype(df[target]):
        model = RegressionPipeline(df, target_column=target, num_features=num_features, cat_features=cat_features)
    else:
        model = ClassificationPipeline(df, target_column=target, num_features=num_features, cat_features=cat_features)
        
    model.run_pipeline(f'dashboard/results/{target}')

  0%|          | 0/17 [00:00<?, ?it/s]

[1;32mINFO: Initializing pipeline for target variable: explicit[0m
[1;32mINFO: Setting up the pipeline...[0m


Pipeline Progress:   0%|          | 0/5 [00:00<?, ?step/s]

[1;32mINFO: Starting pipeline execution...[0m
[1;32mINFO: Splitting the data into training and testing sets...[0m
[1;32mINFO: Data splitting completed.[0m
[1;32mINFO: Fixing class balance...[0m
[1;32mINFO: Applied SMOTE with k_neighbors=5 to fix class imbalance.[0m
[1;32mINFO: Model training completed.[0m
[1;32mINFO: Evaluating classification model...[0m
[1;32mINFO: Classification evaluation completed.[0m
[1;32mINFO: Performing SHAP analysis for classification...[0m




[1;32mINFO: SHAP analysis for classification completed.[0m
[1;32mINFO: Saving results to dashboard/results/explicit...[0m
[1;32mINFO: Evaluating classification model...[0m
[1;32mINFO: Classification evaluation completed.[0m
[1;32mINFO: Evaluation results saved to dashboard/results/explicit/evaluation_results.csv[0m
[1;32mINFO: SHAP summary plot saved to dashboard/results/explicit/shap_summary_plot.png[0m
[1;32mINFO: SHAP beeswarm plot saved to dashboard/results/explicit/shap_beeswarm_plot.png[0m
[1;32mINFO: Pipeline execution completed.[0m
[1;32mINFO: Initializing pipeline for target variable: album_release_year[0m
[1;32mINFO: Setting up the pipeline...[0m


Pipeline Progress:   0%|          | 0/5 [00:00<?, ?step/s]

[1;32mINFO: Starting pipeline execution...[0m
[1;32mINFO: Splitting the data into training and testing sets...[0m
[1;32mINFO: Data splitting completed.[0m
[1;32mINFO: Training the model...[0m
[1;32mINFO: Model training completed.[0m
[1;32mINFO: Evaluating regression model...[0m
[1;32mINFO: Regression evaluation completed.[0m
[1;32mINFO: Performing SHAP analysis for regression...[0m




[1;32mINFO: SHAP analysis for regression completed.[0m
[1;32mINFO: Saving results to dashboard/results/album_release_year...[0m
[1;32mINFO: Evaluating regression model...[0m
[1;32mINFO: Regression evaluation completed.[0m
[1;32mINFO: Evaluation results saved to dashboard/results/album_release_year/evaluation_results.csv[0m
[1;32mINFO: SHAP summary plot saved to dashboard/results/album_release_year/shap_summary_plot.png[0m
[1;32mINFO: SHAP beeswarm plot saved to dashboard/results/album_release_year/shap_beeswarm_plot.png[0m
[1;32mINFO: Pipeline execution completed.[0m
[1;32mINFO: Initializing pipeline for target variable: duration_ms[0m
[1;32mINFO: Setting up the pipeline...[0m


Pipeline Progress:   0%|          | 0/5 [00:00<?, ?step/s]

[1;32mINFO: Starting pipeline execution...[0m
[1;32mINFO: Splitting the data into training and testing sets...[0m
[1;32mINFO: Data splitting completed.[0m
[1;32mINFO: Training the model...[0m
[1;32mINFO: Model training completed.[0m
[1;32mINFO: Evaluating regression model...[0m
[1;32mINFO: Regression evaluation completed.[0m
[1;32mINFO: Performing SHAP analysis for regression...[0m




[1;32mINFO: SHAP analysis for regression completed.[0m
[1;32mINFO: Saving results to dashboard/results/duration_ms...[0m
[1;32mINFO: Evaluating regression model...[0m
[1;32mINFO: Regression evaluation completed.[0m
[1;32mINFO: Evaluation results saved to dashboard/results/duration_ms/evaluation_results.csv[0m
[1;32mINFO: SHAP summary plot saved to dashboard/results/duration_ms/shap_summary_plot.png[0m
[1;32mINFO: SHAP beeswarm plot saved to dashboard/results/duration_ms/shap_beeswarm_plot.png[0m
[1;32mINFO: Pipeline execution completed.[0m
