In [33]:
%load_ext autoreload
%autoreload 2
    
import ast
import os

import pandas as pd
import seaborn as sns
from dotenv import load_dotenv

from classes.audio_features import AudioFeatureExtractor
from classes.constants import GENRE_MAPPING
from classes.feature_processor import FeatureProcessor
from classes.lyrics_manager import LyricsManager
from classes.song import Song
from classes.spotify_manager import SpotifyManager
from classes.text_features import TextFeatureExtractor, TfidfFeatureExtractor
from classes.utils import gather_data_from_folders
from classes.utils import find_songs_to_drop, clean_songs_to_drop
from classes.utils import reduce_genres_with_regex
load_dotenv();

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Loading the data

In [34]:
# Playlist exports are read from the "test" folder inside the current working
# directory. os.getcwd() is used directly so the path is built without
# splitting on "/" (which only works with POSIX separators).
playlists_dir = os.path.join(os.getcwd(), "test")
df = gather_data_from_folders(playlists_dir)

[1;32mINFO: Loading CSV file: /home/krystian/Python/Jupyter-projects/thesis/songs_analysis/test/songs.csv[0m


In [35]:
# Previously processed songs are cached in a pickle; when it exists, load it
# and keep only the songs whose id is not already in the cache.
final_df_path = 'final_data.pkl'
final_df = pd.read_pickle(final_df_path) if os.path.exists(final_df_path) else None
if final_df is not None:
    df = df[~df.id.isin(final_df.id)]

In [36]:
# Size of the frame that still has to be processed (songs already present in
# final_df, if a cache was found, have been filtered out).
df.shape

(11, 24)

In [37]:
# Add text-derived feature columns computed from the 'lyrics' column.
# NOTE(review): batch_size=3 is presumably the number of rows handled per
# batch inside TextFeatureExtractor — confirm against its implementation.
extractor = TextFeatureExtractor()
df = extractor.add_features(df, batch_size=3, text_column='lyrics')

Processing Features:   0%|          | 0/11 [00:00<?, ?it/s]

In [38]:
# Check how many columns the frame has after the text features were added.
df.shape

(11, 238)

In [39]:
# Add audio feature columns. Note that `extractor` is rebound here: it no
# longer refers to the TextFeatureExtractor created earlier.
# NOTE(review): which column/file the audio is read from is decided inside
# AudioFeatureExtractor — not visible from this notebook.
extractor = AudioFeatureExtractor()
df = extractor.add_features(df, batch_size=3)

Extracting Audio Features:   0%|          | 0/11 [00:00<?, ?file/s]

In [40]:
# Check how many columns the frame has after the audio features were added.
df.shape

(11, 272)

In [41]:
# Only touch the cache when this run actually produced rows.
if len(df) > 0:
    # Newly processed rows come first, followed by whatever was cached before.
    final_df = df if final_df is None else pd.concat([df, final_df], axis=0)
    final_df.to_pickle(final_df_path)

In [44]:
# Read the pickle back from disk and display one feature column as a quick
# check that the saved file contains the computed features.
pd.read_pickle(final_df_path).empath_family

0     0.000000
1     0.000000
2     0.005051
3     0.032544
4     0.000000
5     0.002611
6     0.000000
7     0.003125
8     0.000000
9     0.000000
10    0.000000
Name: empath_family, dtype: float64

In [140]:
def remove_letra_de_title(df, prefix="letra de", n_leading_words=6):
    """Strip a leading "letra de <title>" header from the ``lyrics`` column.

    For every row whose ``lyrics`` value is a string starting with ``prefix``,
    the prefix is cut off and, among the first ``n_leading_words`` words that
    follow, every word that also occurs in the row's ``title`` is removed.
    Words are obtained by whitespace splitting and compared exactly
    (case-sensitive). The cleaned lyrics are re-joined with single spaces.
    Rows whose lyrics are not a string, or do not start with ``prefix``, keep
    their original value.

    Args:
        df: DataFrame with ``lyrics`` and ``title`` columns. It is modified
            in place.
        prefix: Marker that identifies lyrics carrying the header.
        n_leading_words: Number of words after the prefix that are checked
            against the title words.

    Returns:
        The same DataFrame object, with its ``lyrics`` column rewritten.
    """
    def remove_prefix(row):
        lyrics = row['lyrics']
        if not (isinstance(lyrics, str) and lyrics.startswith(prefix)):
            return lyrics
        title = row['title']
        title_words = set(title.split()) if isinstance(title, str) else set()
        # split() already discards leading/trailing whitespace left by the cut.
        lyrics_words = lyrics[len(prefix):].split()
        # Title words are dropped only inside the leading window; later
        # occurrences belong to the actual lyrics and are kept.
        filtered_words = [
            word for i, word in enumerate(lyrics_words)
            if i >= n_leading_words or word not in title_words
        ]
        return " ".join(filtered_words)

    # With zero rows, apply(axis=1) returns an empty DataFrame instead of a
    # Series, which cannot be assigned to a single column — nothing to clean.
    if len(df) == 0:
        return df

    df['lyrics'] = df.apply(remove_prefix, axis=1)
    return df

# NOTE(review): df_onboarding is not assigned anywhere earlier in the visible
# notebook, so this cell relies on state from another cell/session — confirm
# where the onboarding frame is loaded before a Restart & Run All.
df_onboarding = remove_letra_de_title(df_onboarding)

In [22]:
# Run every lyrics value through the extractor's own preprocessing routine.
df_onboarding['lyrics'] = df_onboarding['lyrics'].apply(TextFeatureExtractor.preprocess_text)

In [24]:
# Checkpoint the cleaned onboarding frame to datasets/onboarding_step3.pkl.
df_onboarding.to_pickle(os.path.join("datasets", "onboarding_step3.pkl"))

In [74]:
# Configure a FeatureProcessor with a TextFeatureExtractor, the step-3
# checkpoint as input_file and the step-4 checkpoint as output_file, then
# process it in batches.
# NOTE(review): batch_size=100 is presumably the number of rows per batch —
# confirm in FeatureProcessor.
tf_processor = FeatureProcessor(
    extractor=TextFeatureExtractor(),
    input_file=os.path.join("datasets", "onboarding_step3.pkl"),
    output_file=os.path.join('datasets', "onboarding_step4.pkl"),
    batch_size=100
)
tf_processor.process_batches()

Processing Batches:   0%|          | 0/1 [00:00<?, ?batch/s]

# Concatenating

In [86]:
# Reload the onboarding songs from the step-4 checkpoint and show their shape.
df_onboarding = pd.read_pickle(os.path.join("datasets", "onboarding_step4.pkl"))
df_onboarding.shape

(2, 78)

In [87]:
# Load the accumulated dataset and discard every column whose name starts
# with 'tfidf_' before showing the resulting shape.
df = pd.read_pickle(os.path.join("datasets", "final_data.pkl"))
df = df.drop(columns=[name for name in df.columns if name.startswith('tfidf_')])
df.shape

(5888, 78)

In [88]:
# Each 'genres' value is a string that is parsed into a Python object before
# being reduced to a single genre via GENRE_MAPPING.
# ast.literal_eval is used instead of eval(): it only accepts Python literals,
# so a malformed or malicious string in the stored data cannot execute code.
# NOTE(review): assumes every value is the repr of a literal (e.g. a list of
# strings) — confirm against the data source.
df_onboarding['genre'] = df_onboarding['genres'].apply(
    lambda x: reduce_genres_with_regex(ast.literal_eval(x), GENRE_MAPPING)
)
# Distribution of the reduced genre labels among the onboarding songs.
df_onboarding['genre'].value_counts()

rock    2
Name: genre, dtype: int64

In [89]:
# Songs whose genre is missing or is the literal string 'None'.
incorrect_genre_songs = df[df["genre"].isna() | df["genre"].eq('None')]

In [90]:
# Report how many songs are affected, then keep only the songs whose id is
# not among them.
print("{} songs have incorrect genre and will be dropped".format(len(incorrect_genre_songs)))
df = df.loc[~df["id"].isin(incorrect_genre_songs["id"])]

0 songs have incorrect genre and will be dropped


In [91]:
# Append the onboarding songs below the existing dataset (row-wise concat).
# NOTE(review): each frame keeps its own index labels, so the result may
# contain duplicate labels; pass ignore_index=True if later steps need a
# unique index — confirm against FeatureProcessor.
df = pd.concat([df, df_onboarding], axis=0)

In [94]:
# Checkpoint the combined frame to datasets/onboarding_step5.pkl.
df.to_pickle(os.path.join("datasets", 'onboarding_step5.pkl'))

# TF-IDF

In [98]:
# Configure a FeatureProcessor with a TfidfFeatureExtractor (100 PCA
# components), the step-5 checkpoint as input_file and the step-6 checkpoint
# as output_file, then run it.
# NOTE(review): batch_size=None presumably means the whole file is processed
# in one pass so TF-IDF/PCA see all songs at once — confirm in FeatureProcessor.
tf_processor = FeatureProcessor(
    extractor=TfidfFeatureExtractor(n_pca_components=100),
    input_file=os.path.join("datasets", "onboarding_step5.pkl"),
    output_file=os.path.join('datasets', "onboarding_step6.pkl"),
    batch_size=None
)
tf_processor.process_batches()

# Cleaning

In [99]:
# Reload the full dataset from the step-6 checkpoint and show its shape.
df = pd.read_pickle(os.path.join("datasets", "onboarding_step6.pkl"))
df.shape

(5890, 178)

In [100]:
# Fixing the dtypes
# Step 1: cast the numeric-coded columns to int.
# NOTE(review): astype(int) raises if any of these columns contains missing
# values — assumes they are fully populated; confirm.
cast_to_int_columns = ['duration_ms', 'album_release_year', 'key', 'mode', 'time_signature', 'explicit']
df[cast_to_int_columns] = df[cast_to_int_columns].astype(int)

# Step 2: turn the discrete columns into categoricals with string labels.
# 'key', 'mode', 'time_signature' and 'explicit' were cast to int just above,
# so their labels are the string form of those integers.
cast_to_category_columns = ['key', 'mode', 'time_signature', 'explicit', 'language', 'genre']
df[cast_to_category_columns] = df[cast_to_category_columns].astype(str).astype('category')

In [129]:
# Overwrite datasets/final_data.pkl (the file loaded in the "Concatenating"
# section) with the cleaned, combined dataset.
df.to_pickle(os.path.join("datasets", 'final_data.pkl'))

In [102]:
# Delete every file in datasets/ whose name starts with 'onboarding' and ends
# with 'pkl' (the intermediate step checkpoints written by this notebook).
for pkl in filter(
    lambda name: name.startswith('onboarding') and name.endswith('pkl'),
    os.listdir('datasets'),
):
    os.remove(os.path.join('datasets', pkl))