In [45]:
%load_ext autoreload
%autoreload 2
    
import ast
import os

import pandas as pd
import seaborn as sns
from dotenv import load_dotenv

from classes.audio_features import AudioFeatureExtractor
from classes.constants import GENRE_MAPPING
from classes.feature_processor import FeatureProcessor
from classes.lyrics_manager import LyricsManager
from classes.song import Song
from classes.spotify_manager import SpotifyManager
from classes.text_features import TextFeatureExtractor, TfidfFeatureExtractor
from classes.utils import gather_data_from_folders
from classes.utils import find_songs_to_drop, clean_songs_to_drop
from classes.utils import reduce_genres_with_regex
load_dotenv();

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Loading the data

In [46]:
# Resolve the "test" playlists folder relative to the notebook's working directory.
# The earlier expression reached the same directory indirectly: it took the real path
# of `__name__` ("__main__" inside a notebook, not an actual file), split it on a
# hard-coded "/" and dropped the last component. os.getcwd() states the intent
# directly and does not depend on the path separator.
playlists_dir = os.path.join(os.getcwd(), "test")
# Gather the song data found under that folder (the helper logs each CSV it loads).
df = gather_data_from_folders(playlists_dir)

[1;32mINFO: Loading CSV file: /home/krystian/Python/Jupyter-projects/thesis/songs_analysis/test/songs.csv[0m


In [47]:
# Incremental processing: when a pickle with previously processed songs exists,
# load it and keep only the songs whose id is not stored there yet.
# `final_df` stays None when there is no earlier result.
final_df_path = 'final_data.pkl'
final_df = None
if os.path.exists(final_df_path):
    final_df = pd.read_pickle(final_df_path)
    df = df.loc[~df['id'].isin(final_df['id'])]

In [48]:
# (rows, columns) of the songs still to be processed after filtering against the saved pickle, if any.
df.shape

(11, 24)

# Feature extraction

In [49]:
# Add the text-derived feature columns, computed from the 'lyrics' column.
# batch_size=3 is forwarded to the extractor as-is.
extractor = TextFeatureExtractor()
df = extractor.add_features(
    df,
    text_column='lyrics',
    batch_size=3,
)

Processing Features:   0%|          | 0/11 [00:00<?, ?it/s]

In [50]:
# Shape check after the text feature columns were added.
df.shape

(11, 238)

In [51]:
# Add the audio feature columns. Note that `extractor` is rebound here: from this
# point on it refers to the audio extractor, not the text extractor.
extractor = AudioFeatureExtractor()
df = extractor.add_features(
    df,
    batch_size=3,
)

Extracting Audio Features:   0%|          | 0/11 [00:00<?, ?file/s]

In [52]:
# Shape check after the audio feature columns were added.
df.shape

(11, 272)

In [54]:
# Add TF-IDF based features; 100 PCA components are requested.
# NOTE(review): the recorded shape outputs show only 5 new columns (272 -> 277) for
# this 11-row batch — confirm how n_pca_components is capped by the extractor.
# NOTE(review): presumably the extractor is fitted on the current batch only; if so,
# these columns may not be comparable across batches once concatenated — verify.
tfidf_ext = TfidfFeatureExtractor(
    n_pca_components=100,
)
df = tfidf_ext.add_features(df)

In [55]:
# Shape check after the TF-IDF feature columns were added.
df.shape

(11, 277)

# Cleaning

In [61]:
# Genre extraction.
# Each `genres` cell is a string holding a Python literal (presumably a list of genre
# names — confirm against the CSV). It is parsed with ast.literal_eval, which accepts
# literals only, instead of eval(), which would execute arbitrary code read from the
# data files. The parsed value is then reduced to one genre label via GENRE_MAPPING.
df['genre'] = df['genres'].apply(
    lambda raw_genres: reduce_genres_with_regex(ast.literal_eval(raw_genres), GENRE_MAPPING)
)
# Show how many songs ended up in each reduced genre.
df['genre'].value_counts()

genre
pop        5
country    3
reggae     2
rock       1
Name: count, dtype: int64

In [62]:
# Fixing the dtypes
# Step 1: force the numeric-coded columns to plain integers.
cast_to_int_columns = [
    'duration_ms', 'album_release_year', 'key', 'mode', 'time_signature', 'explicit',
]
df[cast_to_int_columns] = df[cast_to_int_columns].astype(
    {column: int for column in cast_to_int_columns}
)

# Step 2: turn the discrete columns into categoricals. Columns that appear in both
# lists (key, mode, time_signature, explicit) go through int first, so their category
# labels are the string form of those integers.
cast_to_category_columns = [
    'key', 'mode', 'time_signature', 'explicit', 'language', 'genre',
]
df[cast_to_category_columns] = (
    df[cast_to_category_columns]
    .astype(str)
    .astype('category')
)

# Saving

In [63]:
# Persist only when this run actually produced new rows.
if df.shape[0] > 0:
    if final_df is not None:
        # Remember which columns are categorical before concatenating: pd.concat falls
        # back to a non-categorical dtype when the two frames' categories differ, which
        # happens as soon as a new batch contains a value the stored data lacks.
        category_columns = df.select_dtypes(include='category').columns
        # New rows go first, as before. ignore_index=True rebuilds a clean 0..n-1 index
        # so that rows from different batches never share an index label.
        final_df = pd.concat([df, final_df], axis=0, ignore_index=True)
        # Restore the categorical dtype on the merged columns.
        final_df = final_df.astype({column: 'category' for column in category_columns})
    else:
        # First run: nothing stored yet, so the current batch is the whole dataset.
        final_df = df
    final_df.to_pickle(final_df_path)

# Verification

In [64]:
# Reload the persisted dataset for verification. The path comes from `final_df_path`
# (the same variable used when saving) instead of repeating the file name literal, so
# the two cannot drift apart. Note: this rebinds `df` to the full saved dataset.
df = pd.read_pickle(final_df_path)

In [66]:
# List every column of the reloaded frame with its dtype; verbose=True requests the full per-column listing.
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11 entries, 0 to 10
Data columns (total 278 columns):
 #    Column                        Dtype   
---   ------                        -----   
 0    id                            object  
 1    title                         object  
 2    artist                        object  
 3    album_art_url                 object  
 4    popularity                    int64   
 5    explicit                      category
 6    album_release_year            int64   
 7    duration_ms                   int64   
 8    genres                        object  
 9    lyrics                        object  
 10   mp3_path                      object  
 11   csv_path                      object  
 12   danceability                  float64 
 13   energy                        float64 
 14   key                           category
 15   loudness                      float64 
 16   mode                          category
 17   speechiness                   float