In [2]:
import pandas as pd

In [3]:
original_dataset = pd.read_csv('./raw_data/Music_Dataset_Lyrics_and_Metadata_from_1950_to_2019/tcc_ceds_music.csv')

In [4]:
original_dataset.head()

Unnamed: 0.1,Unnamed: 0,artist_name,track_name,release_date,genre,lyrics,len,dating,violence,world/life,...,sadness,feelings,danceability,loudness,acousticness,instrumentalness,valence,energy,topic,age
0,0,mukesh,mohabbat bhi jhoothi,1950,pop,hold time feel break feel untrue convince spea...,95,0.000598,0.063746,0.000598,...,0.380299,0.117175,0.357739,0.454119,0.997992,0.901822,0.339448,0.13711,sadness,1.0
1,4,frankie laine,i believe,1950,pop,believe drop rain fall grow believe darkest ni...,51,0.035537,0.096777,0.443435,...,0.001284,0.001284,0.331745,0.64754,0.954819,2e-06,0.325021,0.26324,world/life,1.0
2,6,johnnie ray,cry,1950,pop,sweetheart send letter goodbye secret feel bet...,24,0.00277,0.00277,0.00277,...,0.00277,0.225422,0.456298,0.585288,0.840361,0.0,0.351814,0.139112,music,1.0
3,10,pérez prado,patricia,1950,pop,kiss lips want stroll charm mambo chacha merin...,54,0.048249,0.001548,0.001548,...,0.225889,0.001548,0.686992,0.744404,0.083935,0.199393,0.77535,0.743736,romantic,1.0
4,12,giorgos papadopoulos,apopse eida oneiro,1950,pop,till darling till matter know till dream live ...,48,0.00135,0.00135,0.417772,...,0.0688,0.00135,0.291671,0.646489,0.975904,0.000246,0.597073,0.394375,romantic,1.0


In [5]:
emotion_classified_dataset = pd.read_csv('lyrics_with_emotion_preds.csv')

In [6]:
emotion_classified_dataset.head()

Unnamed: 0.1,Unnamed: 0,lyrics_text,predicted_emotion_label,predicted_encoded_label
0,0,hold time feel break feel untrue convince spea...,sadness,4
1,1,believe drop rain fall grow believe darkest ni...,joy,2
2,2,sweetheart send letter goodbye secret feel bet...,sadness,4
3,3,kiss lips want stroll charm mambo chacha merin...,joy,2
4,4,till darling till matter know till dream live ...,sadness,4


In [18]:
# So how do I merge these two datasets? 
# Easy way: simply add the column. 
# To do that, we first need to make sure that the indices for both dataframes match. (i.e. lyrics in row i of one match lyrics in row i of the other)

# More general, create a new empty column for the original data frame.
# Then find the row in the second dataframe where lyrics in an index of the first dataframe match 
# Then populate the corresponding index in the new column with those lyrics. 

def check_index_match(dataframe_1, dataframe_2, dataframe_1_col, dataframe_2_col):

    """
    Checks if the items in each index of dataframe_1[dataframe_1_col] match the items in 
    each index of dataframe_2[dataframe_2_col]
    """
    
    if len(dataframe_1) != len(dataframe_2):
        return False
    
    for dataframe_2_index in range(len(dataframe_2)):
        dataframe_1_index_obj = dataframe_1.index[dataframe_1[dataframe_1_col]==dataframe_2[dataframe_2_col][dataframe_2_index]]
        if len(dataframe_1_index_obj) != 1:
            print(f"What?! @ {dataframe_2_index}")
            return False
        dataframe_1_index = dataframe_1_index_obj[0]
        if dataframe_1_index != dataframe_2_index:
            return False
        
    return True

In [19]:
check_index_match(original_dataset, emotion_classified_dataset, 'lyrics', 'lyrics_text')

True

In [25]:
combined_dataset = original_dataset.assign(predicted_emotion_label=emotion_classified_dataset['predicted_emotion_label'])

In [26]:
combined_dataset.to_csv('./processed_data/tcc_ceds_music_with_emotion_classification.csv')

# Emotion Trends over Time

In [34]:
# Check what years exist in the df
combined_dataset['release_date'].value_counts()

release_date
2017    660
2018    653
2015    617
2009    597
2012    588
       ... 
1955    106
1952     60
1951     58
1950     51
1953     48
Name: count, Length: 70, dtype: int64

In [35]:
set(combined_dataset['release_date'])

{1950,
 1951,
 1952,
 1953,
 1954,
 1955,
 1956,
 1957,
 1958,
 1959,
 1960,
 1961,
 1962,
 1963,
 1964,
 1965,
 1966,
 1967,
 1968,
 1969,
 1970,
 1971,
 1972,
 1973,
 1974,
 1975,
 1976,
 1977,
 1978,
 1979,
 1980,
 1981,
 1982,
 1983,
 1984,
 1985,
 1986,
 1987,
 1988,
 1989,
 1990,
 1991,
 1992,
 1993,
 1994,
 1995,
 1996,
 1997,
 1998,
 1999,
 2000,
 2001,
 2002,
 2003,
 2004,
 2005,
 2006,
 2007,
 2008,
 2009,
 2010,
 2011,
 2012,
 2013,
 2014,
 2015,
 2016,
 2017,
 2018,
 2019}

In [39]:
release_dates = sorted(list(set(combined_dataset['release_date'])))

In [38]:
combined_dataset.where(combined_dataset['release_date'] == 1950).dropna()['predicted_emotion_label'].value_counts().to_dict()

{'joy': 31, 'sadness': 13, 'anger': 3, 'fear': 2, 'love': 1, 'surprise': 1}

In [59]:
# Aggregate data over time
emotion_data_per_year = {}
for date in release_dates:
    emotion_counts = combined_dataset.where(combined_dataset['release_date'] == date).dropna()['predicted_emotion_label'].value_counts().to_dict()
    emotion_data_per_year[date] = emotion_counts
emotion_data_per_year

{1950: {'joy': 31,
  'sadness': 13,
  'anger': 3,
  'fear': 2,
  'love': 1,
  'surprise': 1},
 1951: {'joy': 32, 'sadness': 17, 'anger': 5, 'love': 4},
 1952: {'joy': 30,
  'sadness': 18,
  'love': 5,
  'anger': 4,
  'fear': 2,
  'surprise': 1},
 1953: {'joy': 19, 'sadness': 15, 'love': 7, 'anger': 6, 'surprise': 1},
 1954: {'joy': 62,
  'sadness': 23,
  'love': 14,
  'anger': 7,
  'fear': 2,
  'surprise': 1},
 1955: {'joy': 58,
  'sadness': 29,
  'anger': 8,
  'love': 6,
  'fear': 4,
  'surprise': 1},
 1956: {'joy': 113,
  'sadness': 51,
  'love': 15,
  'anger': 14,
  'fear': 5,
  'surprise': 2},
 1957: {'joy': 111,
  'sadness': 85,
  'love': 19,
  'anger': 14,
  'surprise': 4,
  'fear': 4},
 1958: {'joy': 144,
  'sadness': 72,
  'anger': 33,
  'love': 30,
  'fear': 5,
  'surprise': 3},
 1959: {'joy': 159,
  'sadness': 85,
  'love': 31,
  'anger': 28,
  'fear': 7,
  'surprise': 2},
 1960: {'joy': 149, 'sadness': 76, 'anger': 25, 'love': 19, 'fear': 5},
 1961: {'joy': 130,
  'sadness':

In [None]:
#TODO: rewrite this code so we're staying in dataframe. I don't like converting to dict

In [60]:
emotion_labels= set(emotion_classified_dataset['predicted_emotion_label'].values.tolist())

In [61]:
emotion_labels

{'anger', 'fear', 'joy', 'love', 'sadness', 'surprise'}

In [67]:
emotion_over_time_dfs = {}
assert release_dates #NOTE: not sure if this is good practice
for emotion in emotion_labels:
    
    emotion_counts_column = []
    for date in release_dates:
        
        if emotion not in emotion_data_per_year[date].keys():
            emotion_counts_column.append(0)
        else:
            emotion_count = emotion_data_per_year[date][emotion]
            emotion_counts_column.append(emotion_count)

    emotion_over_time_dict = {'release_date':release_dates,
                                'count': emotion_counts_column
                                }
    emotion_df = pd.DataFrame(emotion_over_time_dict)
    emotion_over_time_dfs[emotion] = emotion_df

In [68]:
emotion_over_time_dfs['anger'].head()

Unnamed: 0,release_date,count
0,1950,3
1,1951,5
2,1952,4
3,1953,6
4,1954,7


In [None]:
# import plotly.express as px

# df = px.data.gapminder().query("continent=='Oceania'")
# fig = px.line(df, x="year", y="lifeExp", color='country')
# fig.show()

In [69]:
emotion_over_time_dfs['anger']

Unnamed: 0,release_date,count
0,1950,3
1,1951,5
2,1952,4
3,1953,6
4,1954,7
...,...,...
65,2015,102
66,2016,86
67,2017,94
68,2018,115


In [70]:
from copy import deepcopy

In [81]:
emotion_over_time_dfs_to_combine = deepcopy(emotion_over_time_dfs)

In [82]:
emotion_over_time_dfs_to_combine['anger'].head()

Unnamed: 0,release_date,count
0,1950,3
1,1951,5
2,1952,4
3,1953,6
4,1954,7


In [77]:
len(['anger' for i in range(len(emotion_over_time_dfs['anger']))])

70

In [78]:

 
# Add the new column using loc
emotion_over_time_dfs_to_combine['anger'].loc[:, "emotion"] = ['anger' for i in range(len(emotion_over_time_dfs['anger']))]

In [79]:
emotion_over_time_dfs_to_combine['anger'].head()

Unnamed: 0,release_date,count,emotion
0,1950,3,anger
1,1951,5,anger
2,1952,4,anger
3,1953,6,anger
4,1954,7,anger


In [83]:
for emotion in emotion_labels:
    emotion_over_time_dfs_to_combine[emotion].loc[:, "emotion"] = [emotion for i in range(len(emotion_over_time_dfs_to_combine[emotion]))]

In [86]:
emotion_over_time_dfs_to_combine['sadness'].head()

Unnamed: 0,release_date,count,emotion
0,1950,13,sadness
1,1951,17,sadness
2,1952,18,sadness
3,1953,15,sadness
4,1954,23,sadness


In [87]:
data_frames = list(emotion_over_time_dfs_to_combine.values())

In [89]:
import pandas as pd

# Assuming df1, df2, df3 are your dataframes
# Concatenate them along the rows axis
combined_df = pd.concat(data_frames, axis=0)

# Reset index to ensure a continuous index after concatenation
combined_df.reset_index(drop=True, inplace=True)

# Display the combined dataframe
combined_df.head()


Unnamed: 0,release_date,count,emotion
0,1950,1,love
1,1951,4,love
2,1952,5,love
3,1953,7,love
4,1954,14,love


In [None]:
emotion_over_time_dfs_to_combine

In [90]:
import plotly.express as px

fig = px.line(combined_df, x="release_date", y="count", color='emotion', title='Anger Over Time')
fig.show()