### Add word group counts to dataframes in this script

Analyzing saved lyrics dataframes by counting occurences of words of topcis within songs within genres

Then sum the occurences for each artist and for each genre

In [1]:
import pandas as pd
import re
import os
import plotly
import plotly.express as px

### Specify thematic word groups

In [2]:
to_count_groups = [
    {
        'group': 'love',
        'group_words': ['love', 'lover', 'honey', 'baby', 'heart',
                      'sweetheart', 'loverboy', 'babygirl']
    },
    {
        'group': 'money',
        'group_words': ['money', 'cash', 'gold', 'bill', 'check', 'checks',
                     'dolla', 'bag', 'bags']
    },
    {
        'group': 'violence',
        'group_words': ['shoot', 'kill', 'kills', 'victim', 'whack', 'murder',
                      'gun', 'rob', 'robbin', 'steal', 'stole', 'whacked',
                      'killed', 'shot', 'robbed']
    },
    {
        'group': 'drugs',
        'group_words': ['drugs', 'weed', 'kush', 'mary jane',
                      'perc', 'cocaine', 'molly',]
    },
    {
        'group': 'gendered',
        'group_words': ['girl', 'boy', 'man', 'woman', 'him', 'her', 'he', 'she']
    },
    {
        'group': 'sadness',
        'group_words': ['sad', 'sadness', 'hate' ,'hateful', 'sadly', 'melancholy',
                       'sorry', 'bitter', 'heartbreak', 'heartbroken']
    },
    {
        'group': 'joy',
        'group_words': ['joy', 'happy', 'wonder' ,'pride', 'smile', 'joyful',
                       'laugh', 'lucky', 'glad', 'hope', 'hopeful', 'bliss',
                       'euphoria']
    },
    {
        'group': 'yes',
        'group_words': ['yes', 'agree', 'agreed', 'sure', 'confirm', 'accept']
    },
    {
        'group': 'no',
        'group_words': ['no', 'disagree', 'resist', 'reject', 'deny']
    }
]

### Function to count manual group occurrences

In [3]:
def count_occurrences(df, to_count_groups):
    df['unique_words'] = 0
    df['total_words'] = 0
    for i in df.index:
        lyrics = df['Song Lyrics'].str
        lyrics_set = set(df['Song Lyrics'][i].split())
        lyrics_len = len(lyrics_set)
        df['unique_words'][i] = lyrics_len

        lyrics_len_total = len(df['Song Lyrics'][i].split())
        df['total_words'][i] = lyrics_len_total
        
        for group in to_count_groups:
            group_count_total = 0
            for group_word in group['group_words']:
                group_word_count = lyrics.count(group_word)
                group_count_total += group_word_count
                
            df[f"manual_{group['group']}_count"] = group_count_total
    return df

### Create counts for all genres
plus save intermediate within-genre summaries

In [4]:
# define columns to be saved to the new dataframes
count_group_list = []
for group in to_count_groups:
    count_group_list.append(f"manual_{group['group']}_count")
count_group_list

['manual_love_count',
 'manual_money_count',
 'manual_violence_count',
 'manual_drugs_count',
 'manual_gendered_count',
 'manual_sadness_count',
 'manual_joy_count',
 'manual_yes_count',
 'manual_no_count']

In [5]:
# dataframes for genre comparing

to_save_cols = [*count_group_list, 'unique_words', 'total_words',
                'featured_count','producer_count', 'writer_count']

genre_sum_df = pd.DataFrame(columns=to_save_cols)
genre_mean_df = pd.DataFrame(columns=to_save_cols)
print('cols:')
print(list(genre_sum_df.columns))

cols:
['manual_love_count', 'manual_money_count', 'manual_violence_count', 'manual_drugs_count', 'manual_gendered_count', 'manual_sadness_count', 'manual_joy_count', 'manual_yes_count', 'manual_no_count', 'unique_words', 'total_words', 'featured_count', 'producer_count', 'writer_count']


In [6]:
genres = ['pop', 'rap', 'rock', 'soul']

In [7]:
for genre in genres:
    df_path = f'../dataframes/genres/cleaned_{genre}_10.csv'
    df = pd.read_csv(df_path, index_col=0)

    # count occurences and add to dataframes
    df = count_occurrences(df, to_count_groups)

    # save dataframes with counts for each song
    df.to_csv(f'../dataframes/with_counts/{genre}_count.csv')
    

    # group by artists and select only columns with value to us
    df_summed = df.groupby(['Artist']).sum()
    df_summed_cutout = df_summed[to_save_cols]

    df_mean = df.groupby(['Artist'])[to_save_cols].mean()
    df_mean_cutout = df_mean[to_save_cols]
    
    # save within-genre df summed for each artist
    df_len = len(df_summed)
    df_summed_cutout.to_csv(f'../dataframes/group_stats/sum/{genre}_{df_len}.csv')
    df_mean_cutout.to_csv(f'../dataframes/group_stats/mean/{genre}_{df_len}.csv')
    
    genre_sum = pd.DataFrame(df_summed_cutout.sum())
    genre_sum = genre_sum.T
    genre_sum.index = [genre]

    genre_mean = pd.DataFrame(df_mean_cutout.mean())
    genre_mean = genre_mean.T
    genre_mean.index = [genre]

    # add to all genres df
    genre_sum_df = pd.concat([genre_sum_df, genre_sum], axis=0)
    genre_mean_df = pd.concat([genre_mean_df, genre_mean], axis=0)

genre_sum_df.to_csv(f'../dataframes/group_stats/genre_sum_df.csv')
genre_mean_df.to_csv(f'../dataframes/group_stats/genre_mean_df.csv')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['unique_words'][i] = lyrics_len
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['total_words'][i] = lyrics_len_total
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['unique_words'][i] = lyrics_len
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['total_words'][i] = lyrics_len_total
A value is trying 

### Add word percentage count columns

In [8]:
# define columns to be saved to the new dataframes
percent_group_list = []
for group in to_count_groups:
    percent_group_list.append(f"manual_{group['group']}_word_percent")
percent_group_list

['manual_love_word_percent',
 'manual_money_word_percent',
 'manual_violence_word_percent',
 'manual_drugs_word_percent',
 'manual_gendered_word_percent',
 'manual_sadness_word_percent',
 'manual_joy_word_percent',
 'manual_yes_word_percent',
 'manual_no_word_percent']

In [9]:
for genre in genres:
    df_path = f'../dataframes/with_counts/{genre}_count.csv'
    df = pd.read_csv(df_path, index_col=0)

    # prepare columns
    for group in percent_group_list:
        df[group] = 0
    
    for i in df.index:
        for group_index, group in enumerate(percent_group_list):
            df[group][i] = round((df[count_group_list[group_index]][i] / df['total_words'][i])*100,3)

    # save dataframes with counts for each song
    df.to_csv(f'../dataframes/with_counts/{genre}_count.csv')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[group][i] = round((df[count_group_list[group_index]][i] / df['total_words'][i])*100,3)
  df[group][i] = round((df[count_group_list[group_index]][i] / df['total_words'][i])*100,3)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[group][i] = round((df[count_group_list[group_index]][i] / df['total_words'][i])*100,3)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[group][i] = round((df[count_group_list[group_index]][i] / df['total_words'][i])*100,3)
  df[group][i]

### Add word percentages to grouped counts
that is, totals for each artist

sum

In [10]:
for genre in genres:
    df_path = f'../dataframes/group_stats/sum/{genre}_10.csv'
    df = pd.read_csv(df_path, index_col=0)

    # prepare columns
    for group in percent_group_list:
        df[group] = 0
    
    for i in df.index:
        for group_index, group in enumerate(percent_group_list):
            df[group][i] = round((df[count_group_list[group_index]][i] / df['total_words'][i])*100,3)

    # save dataframes with counts for each song
    df.to_csv(f'../dataframes/group_stats/sum/{genre}_10.csv')

  df[group][i] = round((df[count_group_list[group_index]][i] / df['total_words'][i])*100,3)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[group][i] = round((df[count_group_list[group_index]][i] / df['total_words'][i])*100,3)
  df[group][i] = round((df[count_group_list[group_index]][i] / df['total_words'][i])*100,3)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[group][i] = round((df[count_group_list[group_index]][i] / df['total_words'][i])*100,3)
  df[group][i] = round((df[count_group_list[group_index]][i] / df['total_words'][i])*100,3)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.o

mean

In [11]:
for genre in genres:
    df_path = f'../dataframes/group_stats/mean/{genre}_10.csv'
    df = pd.read_csv(df_path, index_col=0)

    # prepare columns
    for group in percent_group_list:
        df[group] = 0
    
    for i in df.index:
        for group_index, group in enumerate(percent_group_list):
            df[group][i] = round((df[count_group_list[group_index]][i] / df['total_words'][i])*100,3)

    # save dataframes with counts for each song
    df.to_csv(f'../dataframes/group_stats/mean/{genre}_10.csv')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[group][i] = round((df[count_group_list[group_index]][i] / df['total_words'][i])*100,3)
  df[group][i] = round((df[count_group_list[group_index]][i] / df['total_words'][i])*100,3)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[group][i] = round((df[count_group_list[group_index]][i] / df['total_words'][i])*100,3)
  df[group][i] = round((df[count_group_list[group_index]][i] / df['total_words'][i])*100,3)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[group][i]

### Combine within-genre dataframes with counts into one df

In [12]:
combined_df = pd.DataFrame()
for genre in genres:
    genre_df_path = f'../dataframes/with_counts/{genre}_count.csv'
    genre_df = pd.read_csv(genre_df_path, index_col = 0)
    combined_df = pd.concat([combined_df, genre_df], ignore_index=True)

In [13]:
combined_df.shape

(800, 36)

In [14]:
combined_df.sample(5)

Unnamed: 0,Artist,Song Title,Full Title,Release Date,Year,Month,Day,Pageviews,url,featured_count,...,manual_no_count,manual_love_word_percent,manual_money_word_percent,manual_violence_word_percent,manual_drugs_word_percent,manual_gendered_word_percent,manual_sadness_word_percent,manual_joy_word_percent,manual_yes_word_percent,manual_no_word_percent
692,Marvin Gaye,Distant Lover,Distant Lover by Marvin Gaye,1973-08-28,1973.0,8.0,28.0,34029,https://genius.com/Marvin-gaye-distant-lover-l...,0,...,3,21.399,0.0,0.0,0.0,6.584,0.0,1.235,0.412,1.235
483,Led Zeppelin,Black Dog,Black Dog by Led Zeppelin,1971-11-08,1971.0,11.0,8.0,285291,https://genius.com/Led-zeppelin-black-dog-lyrics,0,...,5,4.11,0.457,0.0,0.0,9.589,0.0,0.457,0.457,2.283
413,Black Sabbath,Hand of Doom,Hand of Doom by Black Sabbath,1970-09-18,1970.0,9.0,18.0,65397,https://genius.com/Black-sabbath-hand-of-doom-...,0,...,12,0.429,0.0,0.429,0.0,6.867,0.0,0.429,1.288,5.15
183,Taylor Swift,Lover,Lover by Taylor Swift,2019-08-16,2019.0,8.0,16.0,2466432,https://genius.com/Taylor-swift-lover-lyrics,0,...,2,5.725,0.0,0.0,0.0,5.344,0.0,0.0,0.0,0.763
109,Dua Lipa,Homesick,Homesick by Dua Lipa,2017-06-02,2017.0,6.0,2.0,426137,https://genius.com/Dua-lipa-homesick-lyrics,0,...,23,0.664,0.0,0.0,0.0,10.631,0.997,0.0,0.0,7.641


In [15]:
combined_df.to_csv(f'../dataframes/with_counts/combined_count.csv')