In [18]:
import pandas as pd
import plotly.express as px
from nltk import FreqDist
import spacy

In [19]:
# Load the language model
nlp = spacy.load("en_core_web_sm")

In [3]:
df = pd.read_csv('dataframes/lyrics_combined.csv', index_col = 0)

In [4]:
df.head()

Unnamed: 0,Artist,all_lyrics,genre,gender
0,Aaliyah,dirty south can yall really feel me east coas...,pop,female
1,Beyoncé,ive been drinkin ive been drinkin i get filth...,pop,female
2,Britney Spears,oh baby baby oh baby baby oh baby baby how w...,pop,female
3,Carly Rae Jepsen,i threw a wish in the well dont ask me ill ne...,pop,female
4,Charli XCX,i was busy thinkin bout boys boys boys always...,pop,female


In [9]:
test_data = df.loc[0, 'all_lyrics'].strip()

In [20]:
doc = nlp(test_data)

In [28]:
filtered_data = [token.text for token in doc if not token.is_stop]
# remove one-letter errors
filtered_data = [token for token in filtered_data if len(token)>1]

In [29]:
test_freq = FreqDist(filtered_data)

In [30]:
test_freq

FreqDist({'nt': 208, 'let': 113, 'baby': 96, 'know': 89, 'love': 82, 'wanna': 77, 'come': 73, 'yeah': 64, 'forth': 63, 'oh': 61, ...})

In [31]:
test_freq.items()

dict_items([('dirty', 4), ('south', 4), ('feel', 42), ('east', 4), ('coast', 11), ('west', 4), ('boy', 23), ('ve', 16), ('watching', 1), ('like', 44), ('hawk', 1), ('sky', 1), ('flies', 1), ('prey', 2), ('promise', 10), ('bumpin', 1), ('heads', 1), ('know', 89), ('days', 3), ('gon', 16), ('hook', 1), ('prolly', 1), ('talk', 28), ('phone', 9), ('nt', 208), ('good', 11), ('holdin', 1), ('secret', 1), ('probably', 14), ('tell', 52), ('let', 113), ('talkin', 4), ('bout', 7), ('responsible', 4), ('got', 51), ('ta', 26), ('watch', 7), ('cause', 44), ('anybody', 4), ('goodygoody', 5), ('right', 10), ('naughtynaughty', 4), ('yes', 10), ('need', 38), ('somebody', 17), ('wo', 14), ('pick', 3), ('park', 1), ('block', 1), ('sleep', 9), ('ill', 33), ('waiting', 1), ('trench', 1), ('locs', 1), ('hat', 1), ('lowkey', 1), ('world', 9), ('speak', 4), ('weak', 2), ('oh', 61), ('trusting', 1), ('heart', 25), ('soul', 5), ('hope', 2), ('baby', 96), ('girl', 30), ('man', 4), ('big', 2), ('va', 1), ('come',

In [39]:
def generate_freq_dist(row):
    lyrics = row['all_lyrics'].strip()
    doc = nlp(lyrics)
    filtered_data = [token.text for token in doc if not token.is_stop]
    # remove one-letter errors
    filtered_data = [token for token in filtered_data if len(token)>1]
    freq_dist = FreqDist(filtered_data)
    return freq_dist

In [40]:
def apply_generate_freq_dist(df):
    df['freq_dist'] = df.apply(
    lambda row: generate_freq_dist(row),
    axis = 1
    )

In [41]:
apply_generate_freq_dist(df)

In [42]:
df

Unnamed: 0,Artist,all_lyrics,genre,gender,freq_dist
0,Aaliyah,dirty south can yall really feel me east coas...,pop,female,"{'dirty': 4, 'south': 4, 'feel': 42, 'east': 4..."
1,Beyoncé,ive been drinkin ive been drinkin i get filth...,pop,female,"{'ve': 19, 'drinkin': 6, 'filthy': 1, 'liquor'..."
2,Britney Spears,oh baby baby oh baby baby oh baby baby how w...,pop,female,"{'oh': 103, 'baby': 90, 'supposed': 2, 'know':..."
3,Carly Rae Jepsen,i threw a wish in the well dont ask me ill ne...,pop,female,"{'threw': 1, 'wish': 2, 'nt': 104, 'ask': 1, '..."
4,Charli XCX,i was busy thinkin bout boys boys boys always...,pop,female,"{'busy': 10, 'thinkin': 9, 'bout': 24, 'boys':..."
5,Dua Lipa,one talkin in my sleep at night makin myself...,pop,female,"{'talkin': 1, 'sleep': 5, 'night': 33, 'makin'..."
6,Madonna,life is a mystery everyone must stand alone i...,pop,female,"{'life': 13, 'mystery': 5, 'stand': 3, 'hear':..."
7,Michael Jackson,she was more like a beauty queen from a movie...,pop,male,"{'like': 51, 'beauty': 1, 'queen': 1, 'movie':..."
8,Olivia Rodrigo,i got my drivers license last week just like ...,pop,female,"{'got': 23, 'drivers': 1, 'license': 1, 'week'..."
9,Taylor Swift,i walked through the door with you the air wa...,pop,female,"{'walked': 4, 'door': 8, 'air': 3, 'cold': 2, ..."


In [53]:
def sort_freq_dist(row):
    freq_dist = row['freq_dist']
    sorted_dist = sorted(freq_dist.items(), key = lambda x: x[1], reverse=True)
    sorted_dist = dict(sorted_dist)
    return sorted_dist

In [54]:
df['sorted_dist'] = df.apply(
    lambda row: sort_freq_dist(row),
    axis = 1
)

In [55]:
df

Unnamed: 0,Artist,all_lyrics,genre,gender,freq_dist,sorted_dist
0,Aaliyah,dirty south can yall really feel me east coas...,pop,female,"{'dirty': 4, 'south': 4, 'feel': 42, 'east': 4...","{'nt': 208, 'let': 113, 'baby': 96, 'know': 89..."
1,Beyoncé,ive been drinkin ive been drinkin i get filth...,pop,female,"{'ve': 19, 'drinkin': 6, 'filthy': 1, 'liquor'...","{'nt': 200, 'like': 135, 'love': 119, 'halo': ..."
2,Britney Spears,oh baby baby oh baby baby oh baby baby how w...,pop,female,"{'oh': 103, 'baby': 90, 'supposed': 2, 'know':...","{'nt': 121, 'oh': 103, 'like': 91, 'baby': 90,..."
3,Carly Rae Jepsen,i threw a wish in the well dont ask me ill ne...,pop,female,"{'threw': 1, 'wish': 2, 'nt': 104, 'ask': 1, '...","{'love': 123, 'nt': 104, 'want': 95, 'gimmie':..."
4,Charli XCX,i was busy thinkin bout boys boys boys always...,pop,female,"{'busy': 10, 'thinkin': 9, 'bout': 24, 'boys':...","{'got': 239, 'like': 206, 'party': 166, 'shake..."
5,Dua Lipa,one talkin in my sleep at night makin myself...,pop,female,"{'talkin': 1, 'sleep': 5, 'night': 33, 'makin'...","{'nt': 232, 'know': 80, 'come': 78, 'got': 69,..."
6,Madonna,life is a mystery everyone must stand alone i...,pop,female,"{'life': 13, 'mystery': 5, 'stand': 3, 'hear':...","{'like': 120, 'nt': 116, 'got': 77, 've': 74, ..."
7,Michael Jackson,she was more like a beauty queen from a movie...,pop,male,"{'like': 51, 'beauty': 1, 'queen': 1, 'movie':...","{'nt': 190, 'ma': 174, 'yeah': 139, 'know': 93..."
8,Olivia Rodrigo,i got my drivers license last week just like ...,pop,female,"{'got': 23, 'drivers': 1, 'license': 1, 'week'...","{'nt': 121, 'like': 71, 'know': 65, 'oh': 59, ..."
9,Taylor Swift,i walked through the door with you the air wa...,pop,female,"{'walked': 4, 'door': 8, 'air': 3, 'cold': 2, ...","{'nt': 107, 'like': 88, 'know': 52, 'cause': 4..."


In [56]:
dist_1 = df.loc[0, 'sorted_dist']
dist_2 = df.loc[1, 'sorted_dist']

In [50]:
import math

In [58]:
compared = {}
for key, value in dist_1.items():
    value2 = dist_2.get(key, 0)
    if value2 == 0:
        comparison = 1000
    else:
        comparison = math.log(value / value2)
    compared[key] = comparison

In [62]:
compared_sorted = sorted(compared.items(), key = lambda x: x[1], reverse=True)
compared_sorted = dict(compared_sorted)

In [63]:
compared_sorted

{'forth': 1000,
 'change': 1000,
 'boat': 1000,
 'check': 1000,
 'huh': 1000,
 'dust': 1000,
 'supposed': 1000,
 'somebody': 1000,
 'age': 1000,
 'number': 1000,
 'probably': 1000,
 'lover': 1000,
 'yeahyeahyeahyeahyeahyeahyeah': 1000,
 'throwin': 1000,
 'position': 1000,
 'succeed': 1000,
 'coast': 1000,
 'knew': 1000,
 'wondering': 1000,
 'gone': 1000,
 'liyah': 1000,
 'million': 1000,
 'resolution': 1000,
 'dog': 1000,
 'needing': 1000,
 'letter': 1000,
 'ha': 1000,
 'wanting': 1000,
 'calling': 1000,
 'dope': 1000,
 'fool': 1000,
 'glad': 1000,
 'hand': 1000,
 'went': 1000,
 'positions': 1000,
 'grooving': 1000,
 'sending': 1000,
 'page': 1000,
 'enclosed': 1000,
 'yah': 1000,
 'unplug': 1000,
 'goodygoody': 1000,
 'soul': 1000,
 'date': 1000,
 'comfort': 1000,
 'curse': 1000,
 'bet': 1000,
 'dirty': 1000,
 'south': 1000,
 'east': 1000,
 'west': 1000,
 'responsible': 1000,
 'anybody': 1000,
 'naughtynaughty': 1000,
 'confusion': 1000,
 'school': 1000,
 'share': 1000,
 'callin': 100

In [64]:
artists = list(df['Artist'].unique())

In [80]:
def compare_sorted_dist(df):
    # backup
    comparison_dicts = []
    # new column in df
    df['comparison_dict'] = None
    
    for artist in artists:
        artist_df = df[df['Artist'] == artist]
        
        artist_compared = {}
        
        sorted_dist = artist_df['sorted_dist'].values[0]
        
        other_artists = [art for art in artists if art != artist]

        for key, value in sorted_dist.items():
            other_val = 0
            for other in other_artists:
                other_artist_dist = df[df['Artist'] == other]['sorted_dist'].values[0]
                other_artist_val = other_artist_dist.get(key, 0)
                other_val += other_artist_val
                
            if other_val == 0:
                comparison = 1000
            else:
                comparison = math.log(value / other_val)

            artist_compared[key] = comparison
            
        comparison_dicts.append(artist_compared)
        df[df['Artist'] == artist]['comparison_dict'] = artist_compared
    return comparison_dicts

In [81]:
comparison_dicts = compare_sorted_dist(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[df['Artist'] == artist]['comparison_dict'] = artist_compared
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[df['Artist'] == artist]['comparison_dict'] = artist_compared
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[df['Artist'] == artist]['comparison_dict'] = artist_compared
A value is tr

In [82]:
len(comparison_dicts)

40

In [85]:
df['comparison_dict'] = comparison_dicts

In [86]:
df

Unnamed: 0,Artist,all_lyrics,genre,gender,freq_dist,sorted_dist,comparison_dict
0,Aaliyah,dirty south can yall really feel me east coas...,pop,female,"{'dirty': 4, 'south': 4, 'feel': 42, 'east': 4...","{'nt': 208, 'let': 113, 'baby': 96, 'know': 89...","{'nt': -3.128782781341163, 'let': -1.875200073..."
1,Beyoncé,ive been drinkin ive been drinkin i get filth...,pop,female,"{'ve': 19, 'drinkin': 6, 'filthy': 1, 'liquor'...","{'nt': 200, 'like': 135, 'love': 119, 'halo': ...","{'nt': -3.169685580677429, 'like': -2.97363813..."
2,Britney Spears,oh baby baby oh baby baby oh baby baby how w...,pop,female,"{'oh': 103, 'baby': 90, 'supposed': 2, 'know':...","{'nt': 121, 'oh': 103, 'like': 91, 'baby': 90,...","{'nt': -3.6886728211965774, 'oh': -2.708697240..."
3,Carly Rae Jepsen,i threw a wish in the well dont ask me ill ne...,pop,female,"{'threw': 1, 'wish': 2, 'nt': 104, 'ask': 1, '...","{'love': 123, 'nt': 104, 'want': 95, 'gimmie':...","{'love': -2.8260138889133617, 'nt': -3.8435794..."
4,Charli XCX,i was busy thinkin bout boys boys boys always...,pop,female,"{'busy': 10, 'thinkin': 9, 'bout': 24, 'boys':...","{'got': 239, 'like': 206, 'party': 166, 'shake...","{'got': -2.1457110428861115, 'like': -2.523785..."
5,Dua Lipa,one talkin in my sleep at night makin myself...,pop,female,"{'talkin': 1, 'sleep': 5, 'night': 33, 'makin'...","{'nt': 232, 'know': 80, 'come': 78, 'got': 69,...","{'nt': -3.0145201874196244, 'know': -3.2153696..."
6,Madonna,life is a mystery everyone must stand alone i...,pop,female,"{'life': 13, 'mystery': 5, 'stand': 3, 'hear':...","{'like': 120, 'nt': 116, 'got': 77, 've': 74, ...","{'like': -3.0970847678142785, 'nt': -3.7319059..."
7,Michael Jackson,she was more like a beauty queen from a movie...,pop,male,"{'like': 51, 'beauty': 1, 'queen': 1, 'movie':...","{'nt': 190, 'ma': 174, 'yeah': 139, 'know': 93...","{'nt': -3.223077511721901, 'ma': 2.96183072187..."
8,Olivia Rodrigo,i got my drivers license last week just like ...,pop,female,"{'got': 23, 'drivers': 1, 'license': 1, 'week'...","{'nt': 121, 'like': 71, 'know': 65, 'oh': 59, ...","{'nt': -3.6886728211965774, 'like': -3.6401773..."
9,Taylor Swift,i walked through the door with you the air wa...,pop,female,"{'walked': 4, 'door': 8, 'air': 3, 'cold': 2, ...","{'nt': 107, 'like': 88, 'know': 52, 'cause': 4...","{'nt': -3.81452351494331, 'like': -3.419215887..."


In [87]:
df.to_csv('dataframes/freq_dist_comparisons_raw.csv')

In [88]:
import pickle
with open('comparison_dicts.pickle', 'wb') as handle:
    pickle.dump(comparison_dicts, handle, protocol=pickle.HIGHEST_PROTOCOL)