In [2]:
%load_ext autoreload
%autoreload 2
    
import ast
import os

import pandas as pd
import seaborn as sns
from dotenv import load_dotenv

from classes.audio_features import AudioFeatureExtractor
from classes.constants import GENRE_MAPPING
from classes.feature_processor import FeatureProcessor
from classes.lyrics_manager import LyricsManager
from classes.song import Song
from classes.spotify_manager import SpotifyManager
from classes.text_features import TextFeatureExtractor, TfidfFeatureExtractor
from classes.utils import gather_data_from_folders
from classes.utils import find_songs_to_drop, clean_songs_to_drop
from classes.utils import reduce_genres_with_regex
load_dotenv();

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Loading the data

In [3]:
# Playlists live in a "playlists" folder inside the directory the notebook is run from.
# os.getcwd() replaces the earlier realpath(__name__) + split("/") construction, which
# resolved to this same directory on POSIX but broke on Windows path separators. The
# hard-coded relative path that used to precede it was overwritten immediately and is gone.
playlists_dir = os.path.join(os.getcwd(), "playlists")
# Build one table from the playlist folders under playlists_dir (the helper logs each CSV it loads).
df_from_folders = gather_data_from_folders(playlists_dir)

[1;32mINFO: Loading CSV file: /home/krystian/Python/Jupyter-projects/thesis/songs_analysis/playlists/Best Of Global Hip-Hop 2024/Best Of Global Hip-Hop 2024.csv[0m
[1;32mINFO: Loading CSV file: /home/krystian/Python/Jupyter-projects/thesis/songs_analysis/playlists/HOUSE MIX 2024 🔥 HOUSE PARTY 2024 🔥HOUSE MUSIC 2024 🔥/HOUSE MIX 2024 🔥 HOUSE PARTY 2024 🔥HOUSE MUSIC 2024 🔥.csv[0m
[1;32mINFO: Loading CSV file: /home/krystian/Python/Jupyter-projects/thesis/songs_analysis/playlists/✨LATINO✨/✨LATINO✨.csv[0m
[1;32mINFO: Loading CSV file: /home/krystian/Python/Jupyter-projects/thesis/songs_analysis/playlists/final download 1/final download 1.csv[0m
[1;32mINFO: Loading CSV file: /home/krystian/Python/Jupyter-projects/thesis/songs_analysis/playlists/60s-70s Mix/60s-70s Mix.csv[0m
[1;32mINFO: Loading CSV file: /home/krystian/Python/Jupyter-projects/thesis/songs_analysis/playlists/Soul Jazz Mix/Soul Jazz Mix.csv[0m
[1;32mINFO: Loading CSV file: /home/krystian/Python/Jupyter-projects/th

In [4]:
# Keep only the first row seen for every song id.
df_from_folders = df_from_folders.drop_duplicates(subset="id", keep="first")

In [5]:
df_from_folders.shape

(5386, 24)

In [134]:
df = pd.read_pickle(os.path.join("datasets", "final_data.pkl"))

In [135]:
df.shape

(5387, 178)

In [136]:
# Songs found in the playlist folders whose id is not yet present in the saved dataset.
already_in_dataset = df_from_folders['id'].isin(df['id'])
df_to_process = df_from_folders[~already_in_dataset]

In [137]:
df_to_process.shape

(0, 24)

In [110]:
df_to_process.isna().sum()

id                    0.0
title                 0.0
artist                0.0
album_art_url         0.0
popularity            0.0
explicit              0.0
album_release_year    0.0
duration_ms           0.0
genres                0.0
lyrics                0.0
mp3_path              0.0
csv_path              0.0
danceability          0.0
energy                0.0
key                   0.0
loudness              0.0
mode                  0.0
speechiness           0.0
acousticness          0.0
instrumentalness      0.0
liveness              0.0
valence               0.0
tempo                 0.0
time_signature        0.0
dtype: float64

# Getting rid of unusable songs

In [111]:
# Select the songs that cannot be used downstream. Judging by the keyword name, missing
# values are presumably tolerated in `popularity` only — verify against find_songs_to_drop.
df_songs_to_drop = find_songs_to_drop(df_to_process, allow_nan_cols=['popularity'])

In [112]:
print("Songs that will be dropped: ", len(df_songs_to_drop))

Songs that will be dropped:  0


In [113]:
df_songs_to_drop.head()

Unnamed: 0,id,title,artist,album_art_url,popularity,explicit,album_release_year,duration_ms,genres,lyrics,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,genre


In [114]:
clean_songs_to_drop(df_songs_to_drop)

In [115]:
# Exclude every song whose id was flagged in df_songs_to_drop.
flagged_for_removal = df_to_process['id'].isin(df_songs_to_drop['id'])
df_to_process = df_to_process[~flagged_for_removal]

In [116]:
df_to_process.shape[0]

0

In [117]:
df_to_process.to_pickle(os.path.join("datasets", "onboarding_step1.pkl"))

---

# Adding audio features

In [118]:
# Audio stage: FeatureProcessor is configured with onboarding_step1.pkl as input_file,
# onboarding_step2.pkl as output_file, and a batch size of 10.
audio_stage_input = os.path.join("datasets", "onboarding_step1.pkl")
audio_stage_output = os.path.join('datasets', "onboarding_step2.pkl")

aud_processor = FeatureProcessor(
    extractor=AudioFeatureExtractor(),
    input_file=audio_stage_input,
    output_file=audio_stage_output,
    batch_size=10,
)
aud_processor.process_batches()

# Adding textual features

In [None]:
df_onboarding = pd.read_pickle(os.path.join("datasets", "onboarding_step2.pkl"))
df_onboarding.head()

In [21]:
df_onboarding.shape

(2, 59)

In [140]:
def remove_letra_de_title(df, prefix="letra de", n_leading_words=6):
    """Strip a leading ``"letra de <title>"`` header from the ``lyrics`` column.

    For every row whose ``lyrics`` value is a string starting with ``prefix``
    (case-sensitive), the prefix is cut off and any of the first
    ``n_leading_words`` remaining words that also occur in the row's ``title``
    are removed. Words past that window are always kept. Rows whose lyrics are
    not strings, or do not start with ``prefix``, are left unchanged.

    Args:
        df: DataFrame with ``lyrics`` and ``title`` columns. Modified in place.
        prefix: Header text that marks lyrics to clean. Defaults to ``"letra de"``.
        n_leading_words: Number of leading words compared against the title
            words. Defaults to 6.

    Returns:
        The same ``df`` object, with its ``lyrics`` column replaced.
    """
    def strip_header(lyrics, title):
        # Non-string lyrics (e.g. NaN/None) and lyrics without the prefix pass through.
        if not isinstance(lyrics, str) or not lyrics.startswith(prefix):
            return lyrics
        # A non-string title contributes no words, so only the prefix is removed.
        title_words = set(title.split()) if isinstance(title, str) else set()
        words = lyrics[len(prefix):].strip().split()
        kept_head = [word for word in words[:n_leading_words] if word not in title_words]
        return " ".join(kept_head + words[n_leading_words:])

    # A zero-row frame has nothing to clean. Returning early also avoids the row-wise
    # DataFrame.apply pitfall where an empty input yields a DataFrame rather than a
    # Series, which cannot be assigned to the single `lyrics` column.
    if len(df) == 0:
        return df

    # Walk the two columns directly instead of building a Series per row.
    df['lyrics'] = [
        strip_header(lyrics, title)
        for lyrics, title in zip(df['lyrics'], df['title'])
    ]
    return df

df_onboarding = remove_letra_de_title(df_onboarding)

In [22]:
# Apply TextFeatureExtractor.preprocess_text to every lyrics value; the function is
# handed to apply directly rather than through a wrapper lambda.
df_onboarding['lyrics'] = df_onboarding['lyrics'].apply(TextFeatureExtractor.preprocess_text)

In [24]:
df_onboarding.to_pickle(os.path.join("datasets", "onboarding_step3.pkl"))

In [74]:
# Text-feature stage: FeatureProcessor is configured with onboarding_step3.pkl as
# input_file, onboarding_step4.pkl as output_file, and a batch size of 100.
text_stage_input = os.path.join("datasets", "onboarding_step3.pkl")
text_stage_output = os.path.join('datasets', "onboarding_step4.pkl")

tf_processor = FeatureProcessor(
    extractor=TextFeatureExtractor(),
    input_file=text_stage_input,
    output_file=text_stage_output,
    batch_size=100,
)
tf_processor.process_batches()

Processing Batches:   0%|          | 0/1 [00:00<?, ?batch/s]

# Concatenating

In [86]:
df_onboarding = pd.read_pickle(os.path.join("datasets", "onboarding_step4.pkl"))
df_onboarding.shape

(2, 78)

In [87]:
# Reload the saved dataset and discard its existing tfidf_* columns.
df = pd.read_pickle(os.path.join("datasets", "final_data.pkl"))
tfidf_columns = [column for column in df.columns if column.startswith('tfidf_')]
df = df.drop(columns=tfidf_columns)
df.shape

(5888, 78)

In [88]:
# Each `genres` value is a stringified Python list. ast.literal_eval parses literals
# only, whereas the previous eval() would execute any expression found in values that
# originate from external playlist data. The parsed list is then reduced to the
# `genre` label through GENRE_MAPPING.
df_onboarding['genre'] = df_onboarding['genres'].apply(
    lambda raw_genres: reduce_genres_with_regex(ast.literal_eval(raw_genres), GENRE_MAPPING)
)
df_onboarding['genre'].value_counts()

rock    2
Name: genre, dtype: int64

In [89]:
# Rows of the reloaded dataset whose genre is missing or is the literal string 'None'.
# NOTE(review): only `df` is inspected here; the `genre` values computed for
# `df_onboarding` are not validated before the two frames are concatenated — confirm
# that this is intended.
genre_is_missing = df['genre'].isna()
genre_is_none_label = df['genre'] == 'None'
incorrect_genre_songs = df[genre_is_missing | genre_is_none_label]

In [90]:
# Report how many rows are affected, then keep only the songs whose id was not flagged.
n_incorrect = len(incorrect_genre_songs)
print("{} songs have incorrect genre and will be dropped".format(n_incorrect))
df = df[~df['id'].isin(incorrect_genre_songs['id'])]

0 songs have incorrect genre and will be dropped


In [91]:
# Append the newly onboarded songs to the existing dataset. ignore_index=True gives the
# result a fresh 0..n-1 index; without it the row labels of both frames are kept and can
# collide, leaving duplicate index labels in the dataset that is saved afterwards.
df = pd.concat([df, df_onboarding], axis=0, ignore_index=True)

In [94]:
df.to_pickle(os.path.join("datasets", 'onboarding_step5.pkl'))

# TF-IDF

In [98]:
# TF-IDF stage: FeatureProcessor is configured with onboarding_step5.pkl as input_file,
# onboarding_step6.pkl as output_file, batch_size=None, and a TfidfFeatureExtractor
# created with n_pca_components=100.
tfidf_stage_input = os.path.join("datasets", "onboarding_step5.pkl")
tfidf_stage_output = os.path.join('datasets', "onboarding_step6.pkl")

tf_processor = FeatureProcessor(
    extractor=TfidfFeatureExtractor(n_pca_components=100),
    input_file=tfidf_stage_input,
    output_file=tfidf_stage_output,
    batch_size=None,
)
tf_processor.process_batches()

# Cleaning

In [99]:
df = pd.read_pickle(os.path.join("datasets", "onboarding_step6.pkl"))
df.shape

(5890, 178)

In [100]:
# Fixing the dtypes
# Integer-valued columns are cast to int first, so the category labels built below for
# the overlapping columns come from the integer form of each value.
cast_to_int_columns = ['duration_ms', 'album_release_year', 'key', 'mode', 'time_signature', 'explicit']
for int_column in cast_to_int_columns:
    df[int_column] = df[int_column].astype(int)

# Every categorical column goes through str before becoming a category, one column at a time.
cast_to_category_columns = ['key', 'mode', 'time_signature', 'explicit', 'language', 'genre']
for category_column in cast_to_category_columns:
    df[category_column] = df[category_column].astype(str).astype('category')

In [129]:
# Persist the cleaned frame as datasets/final_data.pkl. This is the same file the notebook
# reads earlier, so the previous version of the dataset is overwritten.
df.to_pickle(os.path.join("datasets", 'final_data.pkl'))

In [102]:
# Delete the intermediate files in datasets/ whose names start with 'onboarding' and end with 'pkl'.
for entry in os.listdir('datasets'):
    if entry.startswith('onboarding') and entry.endswith('pkl'):
        os.remove(os.path.join('datasets', entry))