Find most common words from summed texts for artists and genres

In [23]:
import pandas as pd
from collections import Counter

# Import library
import spacy
# Load the language model
nlp = spacy.load("en_core_web_sm")

In [2]:
df_path = '../dataframes/genres/cleaned_combined.csv'

In [3]:
df = pd.read_csv(df_path, index_col = 0)

In [4]:
df.head()

Unnamed: 0,Artist,Song Title,Full Title,Release Date,Year,Month,Day,Pageviews,url,featured_count,producer_count,writer_count,Song Lyrics,Artist Image,gender,genre
0,Aaliyah,Are You That Somebody?,Are You That Somebody? by Aaliyah (Ft. Timbaland),1998-05-26,1998.0,5.0,26.0,373960,https://genius.com/Aaliyah-are-you-that-somebo...,1,1,2,dirty south can yall really feel me east coas...,https://images.genius.com/3fea34947a97beb226fc...,female,pop
1,Aaliyah,Enough Said,Enough Said by Aaliyah (Ft. Drake),2012-08-05,2012.0,8.0,5.0,316333,https://genius.com/Aaliyah-enough-said-lyrics,1,1,3,uh cause i yall arent ready for this ooh oh y...,https://images.genius.com/3fea34947a97beb226fc...,female,pop
2,Aaliyah,At Your Best (You Are Love),At Your Best (You Are Love) by Aaliyah,1994-08-22,1994.0,8.0,22.0,285549,https://genius.com/Aaliyah-at-your-best-you-ar...,0,1,6,let me know let me know haah haaaah let me kn...,https://images.genius.com/3fea34947a97beb226fc...,female,pop
3,Aaliyah,Miss You,Miss You by Aaliyah,2002-11-16,2002.0,11.0,16.0,245608,https://genius.com/Aaliyah-miss-you-lyrics,0,1,3,oh hey yeahyeahyeah nononononono its been to...,https://images.genius.com/3fea34947a97beb226fc...,female,pop
4,Aaliyah,Age Ain’t Nothing But a Number,Age Ain't Nothing But a Number by Aaliyah,1994-12-06,1994.0,12.0,6.0,207419,https://genius.com/Aaliyah-age-aint-nothing-bu...,0,1,1,may fifth 1993 aaliyahs diary got it age ain...,https://images.genius.com/3fea34947a97beb226fc...,female,pop


In [6]:
artists = list(df['Artist'].unique())

In [36]:
word_cols = [f'most_common_{n}' for n in range(1,11)]
cols = [*word_cols]
artist_counts_df = pd.DataFrame(columns=cols)

for artist in artists:
    artist_df = df[df['Artist'] == artist]
    artist_lyrics = ' '.join(artist_df['Song Lyrics'].str.lower())
    doc = nlp(artist_lyrics)

    filtered_tokens = [token.text for token in doc if not token.is_stop]
    
    total_words = len(artist_lyrics.split())
    
    word_counts = Counter(filtered_tokens)
    most_common = word_counts.most_common(10)

    most_common_words = [word[0] for word in most_common]

    new_row = {}
    for index, word in enumerate(most_common_words):
        col_name = word_cols[index]
        new_row[col_name] = word
    
    new_row_df = pd.DataFrame([new_row])

    new_row_df.index = [artist]

    artist_counts_df = pd.concat([artist_counts_df, new_row_df])

In [37]:
artist_counts_df

Unnamed: 0,most_common_1,most_common_2,most_common_3,most_common_4,most_common_5,most_common_6,most_common_7,most_common_8,most_common_9,most_common_10
Aaliyah,nt,,let,baby,know,love,m,wanna,come,yeah
Beyoncé,nt,like,m,,love,halo,baby,ai,know,girl
Britney Spears,,m,nt,oh,like,baby,gimme,yeah,"""",want
Carly Rae Jepsen,,love,m,nt,want,gimmie,ill,like,na,oh
Charli XCX,got,like,party,shake,nt,love,yeah,,m,know
Dua Lipa,nt,,m,know,come,got,like,yeah,love,baby
Madonna,,m,like,nt,got,ve,know,wake,sorry,love
Michael Jackson,nt,ma,,yeah,know,bad,stop,m,beat,baby
Olivia Rodrigo,nt,,m,like,know,oh,want,s,love,cause
Taylor Swift,,nt,like,"""",m,know,cause,got,look,ve


### ========== tests ===========

In [9]:
artist = artists[0]

In [10]:
artist_df = df[df['Artist'] == artist]

In [25]:
artist_lyrics = ' '.join(artist_df['Song Lyrics'].str.lower())

doc = nlp(artist_lyrics)

filtered_tokens = [token.text for token in doc if not token.is_stop]

total_words = len(artist_lyrics.split())
word_counts = Counter(filtered_tokens)
most_common = word_counts.most_common(10)

most_common_words = [word[0] for word in most_common]

In [26]:
most_common_words

['nt', ' ', 'let', 'baby', 'know', 'love', 'm', 'wanna', 'come', 'yeah']

In [27]:
artist_counts_df

Unnamed: 0,most_common_1,most_common_2,most_common_3,most_common_4,most_common_5,most_common_6,most_common_7,most_common_8,most_common_9,most_common_10


In [33]:
new_row = {}
for index, word in enumerate(most_common_words):
    col_name = word_cols[index]
    new_row[col_name] = word
    # print(col_name)
    # artist_counts_df[col_name] = word
new_row_df = pd.DataFrame([new_row])

In [34]:
new_row_df.index = [artist]

In [35]:
new_row_df

Unnamed: 0,most_common_1,most_common_2,most_common_3,most_common_4,most_common_5,most_common_6,most_common_7,most_common_8,most_common_9,most_common_10
Aaliyah,nt,,let,baby,know,love,m,wanna,come,yeah


### ========= end tests =========