In [1]:
%load_ext autoreload
%autoreload 2
    
from classes.song import Song
from classes.playlist import Playlist
from classes.spotify_manager import SpotifyManager
from classes.lyrics_manager import LyricsManager
from dotenv import load_dotenv
from classes.utils import gather_data_from_folders
import pandas as pd
import os
import seaborn as sns
from classes.text_features import TextFeatureExtractor
from classes.audio_features import AudioFeatureExtractor
from classes.utils import find_songs_to_drop, clean_songs_to_drop
from classes.feature_processor import FeatureProcessor

load_dotenv();

# Loading the data

In [30]:
playlists_dir = 'audio_sentiment_classifier/playlists'
playlists_dir = os.path.join("/".join(os.path.abspath(os.path.realpath(__name__)).split("/")[:-1]), "playlists")
df_from_folders = gather_data_from_folders(playlists_dir)

[1;32mINFO: Loading CSV file: /home/krystian/Python/Jupyter-projects/thesis/songs_analysis/playlists/final download 1/final download 1.csv[0m
[1;32mINFO: Loading CSV file: /home/krystian/Python/Jupyter-projects/thesis/songs_analysis/playlists/uu/uu.csv[0m
[1;32mINFO: Loading CSV file: /home/krystian/Python/Jupyter-projects/thesis/songs_analysis/playlists/Every song in the world 🌍 /Every song in the world 🌍 .csv[0m
[1;32mINFO: Loading CSV file: /home/krystian/Python/Jupyter-projects/thesis/songs_analysis/playlists/Changes/Changes.csv[0m
[1;32mINFO: Loading CSV file: /home/krystian/Python/Jupyter-projects/thesis/songs_analysis/playlists/Skinny Love/Skinny Love.csv[0m
[1;32mINFO: Loading CSV file: /home/krystian/Python/Jupyter-projects/thesis/songs_analysis/playlists/final download 2/final download 2.csv[0m


In [31]:
df_from_folders.shape

(5286, 24)

In [32]:
df = pd.read_pickle(os.path.join("datasets", "final_data.pkl"))

In [33]:
df.shape

(5154, 572)

In [34]:
df_to_process = df_from_folders[~df_from_folders['id'].isin(df['id'])]

In [36]:
df_to_process.shape

(0, 24)

In [39]:
df_to_process.isna().sum()

id                    0.0
title                 0.0
artist                0.0
album_art_url         0.0
popularity            0.0
explicit              0.0
album_release_year    0.0
duration_ms           0.0
genres                0.0
lyrics                0.0
mp3_path              0.0
danceability          0.0
energy                0.0
key                   0.0
loudness              0.0
mode                  0.0
speechiness           0.0
acousticness          0.0
instrumentalness      0.0
liveness              0.0
valence               0.0
tempo                 0.0
time_signature        0.0
csv_path              0.0
dtype: float64

# Getting rid of unusable songs

In [40]:
df_songs_to_drop = find_songs_to_drop(df_to_process, allow_nan_cols=['popularity'])

In [41]:
print("Songs that will be dropped: ", len(df_songs_to_drop))

Songs that will be dropped:  0


In [42]:
df_songs_to_drop.head()

Unnamed: 0,id,title,artist,album_art_url,popularity,explicit,album_release_year,duration_ms,genres,lyrics,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,csv_path


In [43]:
clean_songs_to_drop(df_songs_to_drop)

In [44]:
df_to_process = df_to_process[~df_to_process.id.isin(df_songs_to_drop.id)]

In [45]:
df_to_process.shape[0]

0

In [16]:
df_to_process.to_pickle(os.path.join("datasets", "onboarding_step1.pkl"))

---

# Adding audio features

In [21]:
aud_processor = FeatureProcessor(
    extractor=AudioFeatureExtractor(),
    input_file=os.path.join("datasets", "onboarding_step1.pkl"),
    output_file=os.path.join('datasets', "onboarding_step2.pkl"),
    batch_size=100
)
aud_processor.process_batches()

Total records to process: 2
Processing in batches of size 100...


Processing Batches:   0%|          | 0/1 [00:00<?, ?batch/s]

Extracting Audio Features:   0%|          | 0/2 [00:00<?, ?file/s]

Saved batch with 2 records to datasets/onboarding_step2.pkl


# Adding textual features

### Combining existing data with data being onboarded to calculate tfidf on the whole corpus

In [62]:
text_features = ['word_count',
 'unique_word_count',
 'avg_word_length',
 'syllable_count',
 'sentiment_polarity',
 'sentiment_subjectivity',
 'readability_score',
 'noun_count',
 'verb_count',
 'vader_compound',
 'repetition_count',
 'average_syllables_per_word',
 'language',
 'genre',
 'word_count',
 'unique_word_count',
 'avg_word_length',
 'syllable_count',
 'sentiment_polarity',
 'sentiment_subjectivity',
 'readability_score',
 'noun_count',
 'verb_count',
 'vader_compound',
 'repetition_count',
 'average_syllables_per_word',
 'language'] 

In [63]:
df_complete_data = pd.read_pickle(os.path.join("datasets", "final_data.pkl"))
df_complete_data.drop([c for c in df_complete_data.columns if c.startswith('tfidf') or c in text_features ], axis=1, inplace=True)

In [64]:
df_complete_data.lyrics

0       Come on, skinny love, just last the year\nPour...
1       Life is fleeting by the slow\nIt chills us to ...
2       And I'd give up forever to touch you\n'Cause I...
3       There is freedom within\nThere is freedom with...
4       I know you've suffered\nBut I don't want you t...
                              ...                        
5373    It must be Monday! What a dumb day!\nCan't dra...
5374    God knows what is hiding\nIn those weak and dr...
5375    Your mouth is a revolver firing bullets in the...
5376    I can see what you're looking to find\nIn the ...
5377    Bury all your secrets in my skin\nCome away wi...
Name: lyrics, Length: 5154, dtype: object

In [34]:
df_onboarding = pd.read_pickle(os.path.join("datasets", "onboarding_step2.pkl"))
df_onboarding.head()

Unnamed: 0,id,title,artist,album_art_url,popularity,explicit,album_release_year,duration_ms,genres,lyrics,...,chroma_12,spectral_contrast_1,spectral_contrast_2,spectral_contrast_3,spectral_contrast_4,spectral_contrast_5,spectral_contrast_6,spectral_contrast_7,tempo_extracted,zcr
0,4Uzv9ZmajuHh2tJzpATR3T,ticket to heaven,dire straits,https://i.scdn.co/image/ab67616d0000b273ee92e1...,57.0,False,1991.0,264200.0,"['album rock', 'classic rock', 'mellow gold', ...",I can see what you're looking to find\nIn the ...,...,0.508303,15.254438,10.797656,14.90614,17.791504,19.277332,17.169731,53.26034,117.1875,0.031138
1,3RptaQ5Xb8WvtpItZ2f9Hi,snuff,slipknot,https://i.scdn.co/image/ab67616d0000b273457163...,66.0,False,2008.0,276146.0,"['alternative metal', 'nu metal', 'rap metal',...",Bury all your secrets in my skin\nCome away wi...,...,0.445762,14.870351,10.289753,13.564273,15.741718,18.031011,17.757005,51.333696,125.0,0.047012


In [35]:
df_complete_data.shape

(5376, 58)

In [36]:
df_onboarding.shape

(2, 58)

In [38]:
df_final = pd.concat([df_complete_data, df_onboarding])
df_final

Unnamed: 0,id,title,artist,album_art_url,popularity,explicit,album_release_year,duration_ms,genres,lyrics,...,spectral_contrast_1,spectral_contrast_2,spectral_contrast_3,spectral_contrast_4,spectral_contrast_5,spectral_contrast_6,spectral_contrast_7,tempo_extracted,zcr,csv_path
0,4RL77hMWUq35NYnPLXBpih,skinny love,birdy,https://i.scdn.co/image/ab67616d0000b2733661c0...,70.0,0,2011.0,201080.0,"['neo mellow', 'uk pop', 'viral pop']","Come on, skinny love, just last the year\nPour...",...,17.627332,13.771729,19.275149,22.244659,23.802122,22.563887,46.881367,119.680851,0.031446,
1,5PsjZ21f5tMPFf1sJpokm7,deathwish,red sun rising,https://i.scdn.co/image/ab67616d0000b273612f16...,,0,2018.0,287946.0,"['modern hard rock', 'post-grunge']",Life is fleeting by the slow\nIt chills us to ...,...,16.215804,10.309580,13.744232,16.058571,17.278253,16.887822,53.087236,133.928571,0.051242,
2,6vrUTGn5p8IrfTZ0J6sIVM,iris,the goo goo dolls,https://i.scdn.co/image/ab67616d0000b273d54c4b...,71.0,0,2007.0,289906.0,"['neo mellow', 'permanent wave', 'pop rock', '...",And I'd give up forever to touch you\n'Cause I...,...,15.357601,10.201447,13.698081,17.050868,18.783636,17.473927,56.185155,152.027027,0.069683,
3,0Leo4GP0UKG4tUABm9JATe,don't dream it's over,crowded house,https://i.scdn.co/image/ab67616d0000b27380ed24...,,0,2017.0,231377.0,"['australian rock', 'permanent wave']",There is freedom within\nThere is freedom with...,...,14.941456,11.061548,15.084157,17.604808,18.362215,18.056211,51.318052,160.714286,0.044647,
4,0It6VJoMAare1zdV2wxqZq,undisclosed desires,muse,https://i.scdn.co/image/ab67616d0000b273b6d456...,65.0,0,2009.0,235000.0,"['alternative rock', 'modern rock', 'permanent...",I know you've suffered\nBut I don't want you t...,...,14.934434,11.366229,15.412446,16.330238,17.806636,17.648626,53.669779,117.187500,0.041844,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5373,46S89qBi4tDaDaWf2gHYwj,c'est la vie,shania twain,https://i.scdn.co/image/ab67616d0000b273e5edf8...,22.0,0,2002.0,218960.0,"['canadian country', 'canadian pop', 'contempo...",It must be Monday! What a dumb day!\nCan't dra...,...,14.036007,11.390483,15.766920,19.441054,21.393834,18.276284,57.088811,122.282609,0.080069,
5374,0YywjDvFudcaHG74NuWISy,people help the people,birdy,https://i.scdn.co/image/ab67616d0000b2733661c0...,68.0,0,2011.0,256236.0,"['neo mellow', 'uk pop', 'viral pop']",God knows what is hiding\nIn those weak and dr...,...,16.001677,13.129550,17.740944,19.416752,21.367145,20.205828,50.704285,148.026316,0.028913,
5387,2N0volXQpwsvjDorTTxBAU,bonfire heart,james blunt,https://i.scdn.co/image/ab67616d0000b273fcf5fe...,67.0,0,2013.0,238000.0,['neo mellow'],Your mouth is a revolver firing bullets in the...,...,12.764830,11.519237,14.631629,16.818982,18.706928,17.424538,53.491715,117.187500,0.051468,/home/krystian/Python/Jupyter-projects/thesis/...
0,4Uzv9ZmajuHh2tJzpATR3T,ticket to heaven,dire straits,https://i.scdn.co/image/ab67616d0000b273ee92e1...,57.0,0,1991.0,264200.0,"['album rock', 'classic rock', 'mellow gold', ...",I can see what you're looking to find\nIn the ...,...,15.254438,10.797656,14.906140,17.791504,19.277332,17.169731,53.260340,117.187500,0.031138,/home/krystian/Python/Jupyter-projects/thesis/...


In [66]:
df_final['lyrics'] = df_final.lyrics.apply(lambda x: TextFeatureExtractor.preprocess_text(x))

[nltk_data] Downloading package punkt to /home/krystian/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/krystian/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/krystian/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package cmudict to /home/krystian/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


In [71]:
df_final.to_pickle(os.path.join("datasets", "onboarding_step3.pkl"))

### Calculating textual features

In [79]:
tf_processor = FeatureProcessor(
    extractor=TextFeatureExtractor(n_pca_components=50),
    input_file=os.path.join("datasets", "onboarding_step3.pkl"),
    output_file=os.path.join('datasets', "onboarding_step4.pkl"),
    batch_size=None,
    overwrite_existing=True
)
tf_processor.process_batches()

Total records to process: 5154
Processing in a single batch without multiprocessing...
Saved batch with 5154 records to datasets/onboarding_step4.pkl


# Cleaning

In [80]:
df = pd.read_pickle(os.path.join("datasets", "onboarding_step4.pkl"))
# df = pd.read_pickle(os.path.join("datasets", "final_data.pkl"))
df.shape

(5154, 123)

In [81]:
from classes.constants import GENRE_MAPPING

def reduce_genres(genre_list: list, mapping: dict):
    for genre in genre_list:
        for general, specifics in mapping.items():
            if genre in specifics:
                return general
    return None

# Apply the generalization function to the DataFrame
df['genre'] = df['genres'].apply(lambda x: reduce_genres(eval(x), GENRE_MAPPING))

df['genre'].value_counts()

pop                  2644
rock                  921
hip hop               551
r&b                   215
indie                 199
folk                  129
dance                  98
electronic             97
metal                  55
experimental           44
country                39
jazz                   38
latin                  37
classical              28
grime                  21
alternative            13
reggae                 10
singer-songwriter       7
house                   6
Name: genre, dtype: int64

In [82]:
# Fixing the dtypes
cast_to_int_columns = ['duration_ms', 'album_release_year', 'key', 'mode', 'time_signature', 'explicit']
df[cast_to_int_columns] = df[cast_to_int_columns].astype(int)

cast_to_category_columns = ['key', 'mode', 'time_signature', 'explicit', 'language', 'genre']
df[cast_to_category_columns] = df[cast_to_category_columns].astype('category')

  fill_value = lib.item_from_zerodim(np.array(np.nan).astype(dtype))


In [83]:
df.to_pickle(os.path.join("datasets", 'final_data.pkl'))

In [84]:
for pkl in [ file for file in os.listdir('datasets') if file.startswith('onboarding') and file.endswith('pkl')]:
    os.remove(os.path.join('datasets', pkl))