In [60]:
%load_ext autoreload
%autoreload 2
    
from classes.song import Song
from classes.spotify_manager import SpotifyManager
from classes.lyrics_manager import LyricsManager
from dotenv import load_dotenv
import pandas as pd
import os
import seaborn as sns
from classes.text_features import TextFeatureExtractor
from classes.word_embeddings import Word2VecFeatureExtractor, TfidfFeatureExtractor
from classes.audio_features import AudioFeatureExtractor
from classes.utils import find_songs_to_drop, clean_songs_to_drop
from classes.feature_processor import FeatureProcessor
from classes.constants import GENRE_MAPPING
from classes.utils import reduce_genres_with_regex, gather_data_from_folders, winsorize_series
load_dotenv();

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Loading the data

In [2]:
playlists_dir = os.path.join("/".join(os.path.abspath(os.path.realpath(__name__)).split("/")[:-1]), "playlists_new")
df = gather_data_from_folders(playlists_dir)

[1;32mINFO: Loading CSV file: /home/krystian/Python/Jupyter-projects/thesis/songs_analysis/playlists_new/songs.csv[0m


In [3]:
df.shape

(4451, 24)

In [4]:
final_df_path = os.path.join('datasets', 'new_data.pkl')
final_df = None
if os.path.exists(final_df_path):
    final_df = pd.read_pickle(final_df_path)
    df = df[~df.id.isin(final_df.id)]

In [5]:
df.shape

(4451, 24)

#  Feature extraction

In [6]:
extractor = TextFeatureExtractor()
df = extractor.add_features(df, batch_size=100, text_column='lyrics')

Processing Features:   0%|          | 0/4451 [00:00<?, ?it/s]

ERROR:root:Error processing row: The language 'catalan' is not supported.
ERROR:root:Error processing row: The language 'indonesian' is not supported.
ERROR:root:Error processing row: The language 'indonesian' is not supported.
ERROR:root:Error processing row: The language 'greek' is not supported.
ERROR:root:Error processing row: The language 'indonesian' is not supported.
ERROR:root:Error processing row: The language 'turkish' is not supported.


In [7]:
df.shape

(4451, 238)

In [8]:
vc = df.language.value_counts()
vc

language
english       3641
spanish        314
polish         301
portuguese      45
german          38
romanian        37
italian         16
russian         16
french          12
swedish          5
ukrainian        4
afrikaans        3
tagalog          3
japanese         2
dutch            2
hungarian        1
somali           1
czech            1
swahili          1
finnish          1
norwegian        1
Name: count, dtype: int64

In [9]:
languages_to_drop = vc[vc < 50].keys()
languages_to_drop

Index(['portuguese', 'german', 'romanian', 'italian', 'russian', 'french',
       'swedish', 'ukrainian', 'afrikaans', 'tagalog', 'japanese', 'dutch',
       'hungarian', 'somali', 'czech', 'swahili', 'finnish', 'norwegian'],
      dtype='object', name='language')

In [10]:
# Base rules
df = df[(~df.language.isin(languages_to_drop)) & (~df.language.isna())]

In [11]:
df.shape

(4256, 238)

In [12]:
extractor = AudioFeatureExtractor()
df = extractor.add_features(df, batch_size=5)

Extracting Audio Features:   0%|          | 0/4256 [00:00<?, ?file/s]

In [13]:
df.shape

(4256, 272)

In [14]:
df.to_pickle(os.path.join('datasets', 'after_audio.pkl'))

In [15]:
tfidf_ext = TfidfFeatureExtractor(n_pca_components=100)
df = tfidf_ext.add_features(df)

In [17]:
extractor = Word2VecFeatureExtractor(n_pca_components=100)

df = extractor.add_features(df, text_column='lyrics', vector_size=100)

Extracting Word2Vec Features:   0%|          | 0/4256 [00:00<?, ?it/s]

# Cleaning

In [23]:
# genre extraction
# df['genre'] = df['genres'].apply(lambda x: reduce_genres_with_regex(eval(x), GENRE_MAPPING))
df['genre'] = df['genres'].apply(lambda x: eval(x)[0].replace("genre:", ""))
df['genre'].value_counts()

genre
rock         483
metal        477
country      464
r&b          462
pop          433
reggae       427
rap          410
indie        366
reggaeton    265
hip          255
edm          214
Name: count, dtype: int64

In [25]:
# Fixing the dtypes
cast_to_int_columns = ['duration_ms', 'album_release_year', 'key', 'mode', 'time_signature', 'explicit']
df[cast_to_int_columns] = df[cast_to_int_columns].astype(int)

cast_to_category_columns = ['key', 'mode', 'time_signature', 'explicit', 'language', 'genre']
df[cast_to_category_columns] = df[cast_to_category_columns].astype(str).astype('category')

In [55]:
df =  df[(df.language == 'english') & (df.genre != 'reggaeton') & (~df.popularity.isna())]
df.genre = df.genre.astype(str).astype('category')
df.language = df.language.astype(str).astype('category')

In [56]:
df.genre = df.genre.replace({'hip':  'hip hop'})

  df.genre = df.genre.replace({'hip':  'hip hop'})


# Saving

In [27]:
if df.shape[0] > 0:
    if final_df is not None:
        final_df = pd.concat([df, final_df], axis=0)
    else:
        final_df = df
    final_df.to_pickle(final_df_path)

# Verification

In [54]:
df = pd.read_pickle(final_df_path)