In [1]:
%load_ext autoreload
%autoreload 2
    
import ast
import os

import pandas as pd
import seaborn as sns
from dotenv import load_dotenv

from classes.audio_features import AudioFeatureExtractor
from classes.constants import GENRE_MAPPING
from classes.feature_processor import FeatureProcessor
from classes.lyrics_manager import LyricsManager
from classes.playlist import Playlist
from classes.song import Song
from classes.spotify_manager import SpotifyManager
from classes.text_features import TextFeatureExtractor, TfidfFeatureExtractor
from classes.utils import gather_data_from_folders
from classes.utils import find_songs_to_drop, clean_songs_to_drop
from classes.utils import reduce_genres_with_regex
load_dotenv();

# Loading the data

In [10]:
# Locate the `playlists` folder inside the notebook's working directory.
# `__name__` is "__main__" in a notebook (not a file path), so the working
# directory is taken explicitly via os.getcwd(); os.path.join keeps the path
# portable instead of splitting and re-joining on "/".
playlists_dir = os.path.join(os.getcwd(), "playlists")

# Collect the per-playlist CSV files found under that folder into one DataFrame.
df = gather_data_from_folders(playlists_dir)

[1;32mINFO: Loading CSV file: /home/krystian/Python/Jupyter-projects/thesis/songs_analysis/playlists/Best Of Global Hip-Hop 2024/Best Of Global Hip-Hop 2024.csv[0m
[1;32mINFO: Loading CSV file: /home/krystian/Python/Jupyter-projects/thesis/songs_analysis/playlists/HOUSE MIX 2024 🔥 HOUSE PARTY 2024 🔥HOUSE MUSIC 2024 🔥/HOUSE MIX 2024 🔥 HOUSE PARTY 2024 🔥HOUSE MUSIC 2024 🔥.csv[0m
[1;32mINFO: Loading CSV file: /home/krystian/Python/Jupyter-projects/thesis/songs_analysis/playlists/✨LATINO✨/✨LATINO✨.csv[0m
[1;32mINFO: Loading CSV file: /home/krystian/Python/Jupyter-projects/thesis/songs_analysis/playlists/final download 1/final download 1.csv[0m
[1;32mINFO: Loading CSV file: /home/krystian/Python/Jupyter-projects/thesis/songs_analysis/playlists/60s-70s Mix/60s-70s Mix.csv[0m
[1;32mINFO: Loading CSV file: /home/krystian/Python/Jupyter-projects/thesis/songs_analysis/playlists/Soul Jazz Mix/Soul Jazz Mix.csv[0m
[1;32mINFO: Loading CSV file: /home/krystian/Python/Jupyter-projects/th

In [11]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(6116, 24)

In [12]:
# Number of missing values in each column.
df.isna().sum()

id                      0
title                   0
artist                  0
album_art_url           0
popularity            575
explicit                0
album_release_year      0
duration_ms             0
genres                  0
lyrics                  0
mp3_path                0
csv_path                0
danceability            0
energy                  0
key                     0
loudness                0
mode                    0
speechiness             0
acousticness            0
instrumentalness        0
liveness                0
valence                 0
tempo                   0
time_signature          0
dtype: int64

# Getting rid of unusable songs

In [13]:
# Flag unusable songs; missing values are tolerated only in `popularity`.
# NOTE(review): presumably rows with NaN in any other column are flagged — confirm in classes.utils.
df_songs_to_drop = find_songs_to_drop(df, allow_nan_cols=['popularity'])

In [14]:
# Report how many songs were flagged for removal.
num_songs_to_drop = len(df_songs_to_drop)
print("Songs that will be dropped: ", num_songs_to_drop)

Songs that will be dropped:  0


In [15]:
# Hand the flagged songs to the project's cleanup helper.
# NOTE(review): presumably this deletes the songs' files/records on disk — confirm in classes.utils before re-running.
clean_songs_to_drop(df_songs_to_drop)

---

# Adding audio features

In [18]:
# Keep a random subset of 2 songs for the feature-extraction steps that follow.
# random_state pins the draw so Restart & Run All selects the same songs each time.
# NOTE(review): looks like a quick-iteration shortcut — remove this cell to process every song.
df = df.sample(n=2, random_state=42)

In [21]:
# Extend df with audio-derived columns; the column listing that follows shows
# mfcc_*, chroma_*, spectral_contrast_*, tempo_extracted and zcr being added.
afe = AudioFeatureExtractor()
df = afe.add_features(df)

Extracting Audio Features:   0%|          | 0/2 [00:00<?, ?file/s]

In [22]:
# Column names after the audio features were added.
df.columns

Index(['id', 'title', 'artist', 'album_art_url', 'popularity', 'explicit',
       'album_release_year', 'duration_ms', 'genres', 'lyrics', 'mp3_path',
       'csv_path', 'danceability', 'energy', 'key', 'loudness', 'mode',
       'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'time_signature', 'genre', 'mfcc_1', 'mfcc_2',
       'mfcc_3', 'mfcc_4', 'mfcc_5', 'mfcc_6', 'mfcc_7', 'mfcc_8', 'mfcc_9',
       'mfcc_10', 'mfcc_11', 'mfcc_12', 'mfcc_13', 'chroma_1', 'chroma_2',
       'chroma_3', 'chroma_4', 'chroma_5', 'chroma_6', 'chroma_7', 'chroma_8',
       'chroma_9', 'chroma_10', 'chroma_11', 'chroma_12',
       'spectral_contrast_1', 'spectral_contrast_2', 'spectral_contrast_3',
       'spectral_contrast_4', 'spectral_contrast_5', 'spectral_contrast_6',
       'spectral_contrast_7', 'tempo_extracted', 'zcr'],
      dtype='object')

# Adding textual features

In [23]:
# Run every lyric through the extractor's text preprocessing, replacing the column in place.
df['lyrics'] = df['lyrics'].apply(TextFeatureExtractor.preprocess_text)

In [24]:
# Extend df with lyric-derived columns (e.g. word_count, unique_word_count,
# lexical_richness — see the column listing that follows).
tfe = TextFeatureExtractor()
df = tfe.add_features(df)

In [25]:
# Column names after the textual features were added.
df.columns

Index(['id', 'title', 'artist', 'album_art_url', 'popularity', 'explicit',
       'album_release_year', 'duration_ms', 'genres', 'lyrics', 'mp3_path',
       'csv_path', 'danceability', 'energy', 'key', 'loudness', 'mode',
       'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'time_signature', 'genre', 'mfcc_1', 'mfcc_2',
       'mfcc_3', 'mfcc_4', 'mfcc_5', 'mfcc_6', 'mfcc_7', 'mfcc_8', 'mfcc_9',
       'mfcc_10', 'mfcc_11', 'mfcc_12', 'mfcc_13', 'chroma_1', 'chroma_2',
       'chroma_3', 'chroma_4', 'chroma_5', 'chroma_6', 'chroma_7', 'chroma_8',
       'chroma_9', 'chroma_10', 'chroma_11', 'chroma_12',
       'spectral_contrast_1', 'spectral_contrast_2', 'spectral_contrast_3',
       'spectral_contrast_4', 'spectral_contrast_5', 'spectral_contrast_6',
       'spectral_contrast_7', 'tempo_extracted', 'zcr', 'word_count',
       'unique_word_count', 'lexical_richness', 'semantic_depth',
       'syntactic_complexity', 'rhyme_density', 'sentime

In [26]:
# Each `genres` value is a string holding a Python literal (presumably a list of
# genre names — confirm against the CSVs). ast.literal_eval only parses literals,
# so text coming from the data files cannot execute code the way eval() would.
df['genre'] = df['genres'].apply(
    lambda raw_genres: reduce_genres_with_regex(ast.literal_eval(raw_genres), GENRE_MAPPING)
)

# Show how many songs ended up in each reduced genre.
df['genre'].value_counts()

electronic    1
pop           1
Name: genre, dtype: int64

In [27]:
# Songs whose reduced genre is missing or is the literal string 'None'.
genre_is_missing = df['genre'].isna()
genre_is_none_text = df['genre'].eq('None')
incorrect_genre_songs = df[genre_is_missing | genre_is_none_text]

In [28]:
# Report the number of songs without a usable genre, then keep only the rest (matched by id).
num_incorrect_genre = len(incorrect_genre_songs)
print("{} songs have incorrect genre and will be dropped".format(num_incorrect_genre))
has_incorrect_genre = df['id'].isin(incorrect_genre_songs['id'])
df = df.loc[~has_incorrect_genre]

0 songs have incorrect genre and will be dropped


# TF-IDF

In [30]:
# Extend df with TF-IDF based features, configured with n_pca_components=100.
# NOTE(review): presumably the TF-IDF vectors are reduced to that many PCA components — confirm in classes.text_features.
tfidffe = TfidfFeatureExtractor(n_pca_components=100)
df = tfidffe.add_features(df)

In [31]:
# Column names after the TF-IDF features were added.
df.columns

Index(['id', 'title', 'artist', 'album_art_url', 'popularity', 'explicit',
       'album_release_year', 'duration_ms', 'genres', 'lyrics', 'mp3_path',
       'csv_path', 'danceability', 'energy', 'key', 'loudness', 'mode',
       'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'time_signature', 'genre', 'mfcc_1', 'mfcc_2',
       'mfcc_3', 'mfcc_4', 'mfcc_5', 'mfcc_6', 'mfcc_7', 'mfcc_8', 'mfcc_9',
       'mfcc_10', 'mfcc_11', 'mfcc_12', 'mfcc_13', 'chroma_1', 'chroma_2',
       'chroma_3', 'chroma_4', 'chroma_5', 'chroma_6', 'chroma_7', 'chroma_8',
       'chroma_9', 'chroma_10', 'chroma_11', 'chroma_12',
       'spectral_contrast_1', 'spectral_contrast_2', 'spectral_contrast_3',
       'spectral_contrast_4', 'spectral_contrast_5', 'spectral_contrast_6',
       'spectral_contrast_7', 'tempo_extracted', 'zcr', 'word_count',
       'unique_word_count', 'lexical_richness', 'semantic_depth',
       'syntactic_complexity', 'rhyme_density', 'sentime

# Cleaning

In [32]:
# Fixing the dtypes
cast_to_int_columns = ['duration_ms', 'album_release_year', 'key', 'mode', 'time_signature', 'explicit']
cast_to_category_columns = ['key', 'mode', 'time_signature', 'explicit', 'language', 'genre']

# DataFrame.astype with a per-column mapping returns a new frame, so the cast
# never writes into a filtered slice of an earlier frame (no SettingWithCopyWarning)
# and the resulting dtypes do not depend on how multi-column assignment behaves.
df = df.astype({column: int for column in cast_to_int_columns})

# Categorical columns go through str first so every category label is a string;
# the int cast above runs first so the numeric codes are stringified as integers.
df = (
    df.astype({column: str for column in cast_to_category_columns})
      .astype({column: 'category' for column in cast_to_category_columns})
)