In [11]:
%load_ext autoreload
%autoreload 2
    
import ast
import os

import pandas as pd
import seaborn as sns
from dotenv import load_dotenv

from classes.audio_features import AudioFeatureExtractor
from classes.constants import GENRE_MAPPING
from classes.feature_processor import FeatureProcessor
from classes.lyrics_manager import LyricsManager
from classes.song import Song
from classes.spotify_manager import SpotifyManager
from classes.text_features import TextFeatureExtractor, TfidfFeatureExtractor
from classes.utils import gather_data_from_folders
from classes.utils import find_songs_to_drop, clean_songs_to_drop
from classes.utils import reduce_genres_with_regex
load_dotenv();

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Loading the data

In [21]:
# The playlists live in the "test" folder under the kernel's working directory.
# This used to be derived by resolving __name__ ("__main__") as if it were a
# file path and splitting the result on "/", which is obscure and breaks on
# platforms whose path separator is not "/"; os.getcwd() states the intent.
playlists_dir = os.path.join(os.getcwd(), "test")
df = gather_data_from_folders(playlists_dir)

[1;32mINFO: Loading CSV file: /home/krystian/Python/Jupyter-projects/thesis/songs_analysis/test/songs.csv[0m


In [22]:
# Previously processed songs are cached in a pickle. When that cache exists,
# load it and keep in `df` only the songs whose id it does not contain yet.
final_df_path = 'final_data.pkl'
if not os.path.exists(final_df_path):
    final_df = None
else:
    final_df = pd.read_pickle(final_df_path)
    df = df.loc[~df['id'].isin(final_df['id'])]

In [23]:
# Size of what is left to process once ids already present in final_df have been excluded.
df.shape

(11, 24)

In [24]:
# Add text-derived feature columns computed from the `lyrics` column.
# batch_size=3 presumably controls how many songs are handled per batch - TODO confirm
# against TextFeatureExtractor.add_features.
extractor = TextFeatureExtractor()
df = extractor.add_features(df, batch_size=3, text_column='lyrics')

Processing Features:   0%|          | 0/11 [00:00<?, ?it/s]

In [25]:
# Row count should be unchanged; the additional columns are the text features added above.
df.shape

(11, 238)

In [26]:
# Add audio-derived feature columns to the same frame.
# batch_size=3 presumably controls how many songs are handled per batch - TODO confirm
# against AudioFeatureExtractor.add_features.
extractor = AudioFeatureExtractor()
df = extractor.add_features(df, batch_size=3)

Extracting Audio Features:   0%|          | 0/11 [00:00<?, ?file/s]

In [27]:
# Row count should be unchanged; the additional columns are the audio features added above.
df.shape

(11, 272)

In [28]:
# Persist the newly processed songs together with the previously cached ones.
# Only ids that final_df does not contain yet are appended, so re-running this
# cell cannot stack the same rows onto the cache a second time (the original
# concatenated `df` unconditionally and duplicated rows on every re-run).
if final_df is None:
    new_songs = df
else:
    new_songs = df[~df.id.isin(final_df.id)]

# Nothing new means nothing to write; the existing pickle is left untouched.
if new_songs.shape[0] > 0:
    if final_df is not None:
        # New songs first, cached songs after - same row order as before.
        final_df = pd.concat([new_songs, final_df], axis=0)
    else:
        final_df = new_songs
    final_df.to_pickle(final_df_path)

In [32]:
# Read back the pickle at final_df_path to inspect what is stored there.
# NOTE(review): this renders the whole frame; `.head()` would keep the output short.
pd.read_pickle(final_df_path)

Unnamed: 0,id,title,artist,album_art_url,popularity,explicit,album_release_year,duration_ms,genres,lyrics,...,chroma_12,spectral_contrast_1,spectral_contrast_2,spectral_contrast_3,spectral_contrast_4,spectral_contrast_5,spectral_contrast_6,spectral_contrast_7,tempo_extracted,zcr
0,6mSnSuOhgHHohqeBw76jRV,i was all over her,salvia palth,https://i.scdn.co/image/ab67616d0000b2734a82bd...,80,False,2013,161422,"['[\'[\\\'[""[\\\\\\\'search:country\\\\\\\'""\\...","[Verse 1]\nDon't know what I wanted\n, \nI hav...",...,0.652,17.398707,10.183668,14.943735,19.089379,20.234558,19.928241,44.187604,106.132075,0.010976
1,25leEEaz1gIpp7o21Fqyjo,here comes santa claus,gene autry,https://i.scdn.co/image/ab67616d0000b27335e9a6...,82,False,1947,150266,"['[\'[\\\'[""[\\\\\\\'search:country\\\\\\\'""\\...","Here comes Santa Claus, here comes Santa Claus...",...,0.341628,11.325815,11.214302,17.129537,19.427072,20.00133,19.391607,54.09549,96.982759,0.057525
2,4BP3uh0hFLFRb5cjsgLqDh,fortunate son,creedence clearwater revival,https://i.scdn.co/image/ab67616d0000b2739f3919...,82,False,1969,140773,"['[\'[""[\\\'search:country\\\'""\'', ' \' "" \\\...","Some folks are born, made to wave the flag\nOo...",...,0.502544,17.308924,9.733423,12.671681,15.786249,17.868257,17.810266,52.367367,133.928571,0.048801
3,0bYg9bo50gSsH3LtXe2SQn,all i want for christmas is you,mariah carey,https://i.scdn.co/image/ab67616d0000b2734246e3...,94,False,1994,241106,"['[""[\'search:pop\'""', ' "" \'dance pop\'""', ' ...",I don't want a lot for Christmas\nThere is jus...,...,0.411156,14.947797,10.464679,15.385924,18.800166,19.554925,17.591203,57.75228,148.026316,0.084413
4,2plbrEY59IikOBgBGLjaoe,die with a smile,lady gaga,https://i.scdn.co/image/ab67616d0000b27382ea2e...,99,False,2024,251667,"['[""[\'search:pop\'""', ' "" \'art pop\'""', ' "" ...","[Intro: Bruno Mars]\n(Ooh, ooh)\n[Verse 1: Bru...",...,0.338875,16.609555,11.475996,14.859217,18.167123,19.003853,18.176417,54.122924,156.25,0.051372
5,2FRnf9qhLbvw8fu4IBXx78,last christmas,wham!,https://i.scdn.co/image/ab67616d0000b273f2d2ad...,92,False,1984,262960,"['[""[\'search:pop\'""', ' "" \'new romantic\'""',...","Oh, oh-oh, ooh-oh\nAh-ah\n\nLast Christmas, I ...",...,0.490096,17.542734,11.143639,14.6511,17.832024,17.890835,16.484866,55.002171,106.132075,0.067837
6,6rCz0E0G6El6irdft7Zjgp,falochrony,roxie węgiel,https://i.scdn.co/image/ab67616d0000b273953dd2...,67,False,2024,169142,"['[""[\'search:pop\'""', ' "" \'polish pop\'""', '...","[Zwrotka 1: Roxie Węgiel]\nDzień, taki bez Cie...",...,0.422268,15.64402,11.671298,14.848017,17.419779,18.456029,18.455853,55.633655,140.625,0.064564
7,2262bWmqomIaJXwCRHr13j,sailor song,gigi perez,https://i.scdn.co/image/ab67616d0000b273e6065f...,93,False,2024,211978,"['[""[\'search:pop\'""', ' "" \'bedroom pop\']""]']",[Verse 1]\nI saw her in the rightest way\nLook...,...,0.564012,16.091741,10.55819,14.930307,18.007222,18.866767,17.555558,53.556683,125.0,0.046842
8,2EjXfH91m7f8HiJN1yQg97,rockin' around the christmas tree,brenda lee,https://i.scdn.co/image/ab67616d0000b2737845f7...,93,False,1964,126266,"[""['search:rock'"", "" 'adult standards'"", "" 'ea...",Rockin' around the Christmas tree\nAt the Chri...,...,0.317221,14.208915,9.677373,14.825332,17.437802,19.703676,19.716016,52.868684,140.625,0.063229
9,7aQViYYpIkpJwyCHPysCo4,na szczycie,grubson,https://i.scdn.co/image/ab67616d0000b27395899c...,60,False,2009,244948,"['search:reggae', 'polish hip hop', 'polish re...","Jestem tego pewny, w głębi duszy o tym wiem,\n...",...,0.614861,15.928131,10.720004,14.65088,16.416395,16.843627,15.79146,53.170024,96.982759,0.02358


In [55]:
# NOTE(review): the recorded run of this cell failed with
#   PicklingError: Can't pickle <class 'classes.audio_features.AudioFeatureExtractor'>
# Presumably %autoreload re-imported the module, so the class object changed while
# the extractor was being pickled - confirm, and restart the kernel before re-running.
# The cell also repeats the audio extraction done earlier and was executed out of
# order (In [55]); consider deleting it.
extractor = AudioFeatureExtractor()
df = extractor.add_features(df)

Processing Batches:   0%|          | 0/4 [00:00<?, ?it/s]

PicklingError: Can't pickle <class 'classes.audio_features.AudioFeatureExtractor'>: it's not the same object as classes.audio_features.AudioFeatureExtractor

In [46]:
# NOTE(review): this writes `df` (the frame holding only the songs processed in this
# session) to final_df_path - the same file the earlier cell saves the combined
# final_df to. Run as-is it replaces the accumulated data; confirm this is intended.
df.to_pickle(final_df_path)

# Adding audio features

In [118]:
# Audio-feature stage of the onboarding pipeline: step1 pickle in, step2 pickle out.
# Presumably FeatureProcessor reads input_file, applies the extractor in batches of
# `batch_size` rows and writes output_file - TODO confirm against classes.feature_processor.
aud_processor = FeatureProcessor(
    extractor=AudioFeatureExtractor(),
    input_file=os.path.join("datasets", "onboarding_step1.pkl"),
    output_file=os.path.join('datasets', "onboarding_step2.pkl"),
    batch_size=10
)
aud_processor.process_batches()

# Adding textual features

In [None]:
# Load the step2 pickle (the output_file of the audio-feature stage) and preview its first rows.
df_onboarding = pd.read_pickle(os.path.join("datasets", "onboarding_step2.pkl"))
df_onboarding.head()

In [21]:
# Quick size check of the onboarding frame (rows, columns).
df_onboarding.shape

(2, 59)

In [140]:
def remove_letra_de_title(df, prefix="letra de", n_leading_words=6):
    """Strip a leading "letra de <title>" header from the ``lyrics`` column, in place.

    For every row whose ``lyrics`` value is a string starting with ``prefix``:
    the prefix is removed, and any of the first ``n_leading_words`` words of the
    remainder that also occurs in the row's ``title`` is dropped. The surviving
    words are re-joined with single spaces, so the original whitespace
    (including line breaks) of such rows is not preserved. Rows whose lyrics
    are not a string, or do not start with ``prefix``, are left untouched.

    Args:
        df: DataFrame with a ``lyrics`` column and, optionally, a ``title``
            column. It is modified in place. An empty frame is accepted.
        prefix: Header text that marks lyrics needing cleanup.
        n_leading_words: How many words after the prefix are checked against
            the title words.

    Returns:
        The same DataFrame object that was passed in.
    """
    def remove_prefix(lyrics, title):
        # Leave anything that is not prefixed text exactly as it is (NaN/None included).
        if not (isinstance(lyrics, str) and lyrics.startswith(prefix)):
            return lyrics
        title_words = set(title.split()) if isinstance(title, str) else set()
        words = lyrics[len(prefix):].split()
        # Only the leading window is filtered; later occurrences of title words stay.
        head = [word for word in words[:n_leading_words] if word not in title_words]
        return " ".join(head + words[n_leading_words:])

    # A missing title column is treated as "no title words to remove".
    titles = df['title'] if 'title' in df.columns else [None] * len(df)
    # Iterating the two columns directly (instead of df.apply(axis=1)) also works
    # for an empty frame, where apply returns a DataFrame and the assignment fails.
    df['lyrics'] = [remove_prefix(lyrics, title) for lyrics, title in zip(df['lyrics'], titles)]
    return df

# Clean the onboarding lyrics; the function edits the frame in place and returns it.
df_onboarding = remove_letra_de_title(df_onboarding)

In [22]:
# Run every lyric through TextFeatureExtractor.preprocess_text; the function is
# passed directly instead of being wrapped in a lambda.
df_onboarding['lyrics'] = df_onboarding['lyrics'].apply(TextFeatureExtractor.preprocess_text)

In [24]:
# Checkpoint the preprocessed lyrics as step3, the input_file of the text-feature stage.
df_onboarding.to_pickle(os.path.join("datasets", "onboarding_step3.pkl"))

In [74]:
# Text-feature stage of the onboarding pipeline: step3 pickle in, step4 pickle out.
# Presumably FeatureProcessor reads input_file, applies the extractor in batches of
# `batch_size` rows and writes output_file - TODO confirm against classes.feature_processor.
tf_processor = FeatureProcessor(
    extractor=TextFeatureExtractor(),
    input_file=os.path.join("datasets", "onboarding_step3.pkl"),
    output_file=os.path.join('datasets', "onboarding_step4.pkl"),
    batch_size=100
)
tf_processor.process_batches()

Processing Batches:   0%|          | 0/1 [00:00<?, ?batch/s]

# Concatenating

In [86]:
# Load the step4 pickle (the output_file of the text-feature stage) and check its size.
df_onboarding = pd.read_pickle(os.path.join("datasets", "onboarding_step4.pkl"))
df_onboarding.shape

(2, 78)

In [87]:
# Load the existing dataset and discard its tfidf_* columns, returning a new
# frame rather than dropping in place.
df = pd.read_pickle(os.path.join("datasets", "final_data.pkl"))
df = df.drop(columns=[column for column in df.columns if column.startswith('tfidf_')])
df.shape

(5888, 78)

In [88]:
# `genres` is stored as text (presumably the repr of a Python list - confirm) and
# comes from externally collected data. ast.literal_eval only accepts literals,
# whereas the eval() used before would execute any expression found in the data.
df_onboarding['genre'] = df_onboarding['genres'].apply(
    lambda raw_genres: reduce_genres_with_regex(ast.literal_eval(raw_genres), GENRE_MAPPING)
)
df_onboarding['genre'].value_counts()

rock    2
Name: genre, dtype: int64

In [89]:
# Rows without a usable genre: either a missing value or the literal string 'None'.
incorrect_genre_songs = df.loc[df['genre'].isna() | df['genre'].eq('None')]

In [90]:
# Report how many songs are affected, then keep only the songs whose id is not among them.
print("{} songs have incorrect genre and will be dropped".format(len(incorrect_genre_songs)))
df = df.loc[~df['id'].isin(incorrect_genre_songs['id'])]

0 songs have incorrect genre and will be dropped


In [91]:
# Append the onboarding songs below the existing dataset.
# NOTE(review): concat keeps each frame's own index labels, so the combined index can
# contain repeated labels; re-running this cell also appends the onboarding rows again.
df = pd.concat([df, df_onboarding], axis=0)

In [94]:
# Checkpoint the combined frame as step5, the input_file of the TF-IDF stage.
df.to_pickle(os.path.join("datasets", 'onboarding_step5.pkl'))

# TF-IDF

In [98]:
# TF-IDF stage: step5 pickle in, step6 pickle out, with n_pca_components=100.
# batch_size=None presumably means the whole dataset is processed in a single
# pass - TODO confirm against classes.feature_processor.
tf_processor = FeatureProcessor(
    extractor=TfidfFeatureExtractor(n_pca_components=100),
    input_file=os.path.join("datasets", "onboarding_step5.pkl"),
    output_file=os.path.join('datasets', "onboarding_step6.pkl"),
    batch_size=None
)
tf_processor.process_batches()

# Cleaning

In [99]:
# Load the step6 pickle (the output_file of the TF-IDF stage) and check its size.
df = pd.read_pickle(os.path.join("datasets", "onboarding_step6.pkl"))
df.shape

(5890, 178)

In [100]:
# Fixing the dtypes
cast_to_int_columns = ['duration_ms', 'album_release_year', 'key', 'mode', 'time_signature', 'explicit']
cast_to_category_columns = ['key', 'mode', 'time_signature', 'explicit', 'language', 'genre']

# Order matters for the columns present in both lists: they become integers
# first, so their category labels are the integer strings.
df = (
    df
    .astype({name: int for name in cast_to_int_columns})
    .astype({name: str for name in cast_to_category_columns})
    .astype({name: 'category' for name in cast_to_category_columns})
)

In [129]:
# Persist the dtype-corrected frame as datasets/final_data.pkl.
df.to_pickle(os.path.join("datasets", 'final_data.pkl'))

In [102]:
# Delete the intermediate onboarding_* pickles from the datasets folder.
for entry in os.listdir('datasets'):
    if entry.startswith('onboarding') and entry.endswith('pkl'):
        os.remove(os.path.join('datasets', entry))