# Run file for the raw* extended** song dataset with full lyrics

\* The raw lyric files have not undergone any text processing. Lyrics are taken exactly as written on https://www.azlyrics.com/

\** The extended song dataset, titled "songs-raw-extended", also includes songs on which the song artist had exactly one co-writer, excluding songs where that co-writer is featured or duets on the song. It includes all the songs in the dataset "songs-raw", which contains only songs where the song artist was the sole songwriter.

In [1]:
import os
import nltk
import numpy as np
import pandas as pd
import csv
import string
from nltk.tokenize import word_tokenize
from nltk.tokenize import wordpunct_tokenize
from nltk.tokenize import TweetTokenizer
from datetime import datetime

In [2]:
# FUNCTIONS FOR WORD FREQUENCY ANALYSIS
# get word frequencies for song
def SongTextToFreqDict(text):
    """Return a dict mapping each token in *text* to how often it occurs.

    Parameters:
        text: iterable of hashable tokens (e.g. the list of words of a song).

    Returns:
        dict of token -> occurrence count; keys appear in order of first
        occurrence in *text*. An empty *text* gives an empty dict.
    """
    # Local import so the function does not depend on the file's import cell.
    from collections import Counter

    # Count the argument itself (not a module-level variable) in a single pass.
    return dict(Counter(text))

# sort by descending frequency
def sortFreqDict(freqdict):
    """Return the entries of *freqdict* as (frequency, word) pairs, highest first.

    Pairs are compared as tuples, so words that share a frequency come out in
    descending word order.
    """
    # flip each (word, frequency) item so that sorting is driven by frequency
    flipped = ((count, word) for word, count in freqdict.items())
    # descending order in one step
    return sorted(flipped, reverse=True)

In [3]:
# DECLARING PRESETS
# Artist age-range categories (age at a song's release -> category number),
# as applied by the categorisation code further down in this file:
#   1: 15-17, 2: 18-20, 3: 21-22, 4: 23-24,
#   5: 25-27, 6: 28-30, 7: 31-34, 8: 35-40
# Any other age is labelled "out of bounds".

# first person singular pronouns
firstSG_pronouns = [
    "i",
    "my",
    "me",
    "mine",
    "myself",
]
# first person plural pronouns
firstPL_pronouns = [
    "we",
    "us",
    "our",
    "ours",
    "ourselves",
]
# second person pronouns
secondSG_pronouns = [
    "you",
    "your",
    "yours",
    "yourself",
]

In [6]:
# Build the per-song dataset: for every raw lyric file, tokenize the lyrics,
# write the token list and the word frequencies to disk, compute pronoun and
# part-of-speech frequencies, and collect one row per song. The combined
# dataset is exported to CSV once all songs have been processed.

# read in artist info dataset
artist_data = pd.read_csv('artist_data.csv')

# NOTE(review): every song is dated against the single value in the
# "Birth Year" column, so artist_data.csv is assumed to describe exactly one
# artist -- confirm. Fail loudly otherwise instead of silently picking a row.
birth_years = artist_data["Birth Year"]
if len(birth_years) != 1:
    raise ValueError(
        "artist_data.csv must contain exactly one artist row, found %d"
        % len(birth_years))
birth_year = int(birth_years.iloc[0])

# columns (and column order) of the song dataframe for frequency analysis
song_columns = ['song-title', 'artist', 'release-age-approx', 'release-age-range',
                'total-word-count',
                'i-count', 'me-count', 'my-count', 'mine-count', 'myself-count',
                'I-freq', '1stsg-count', '1stsg-freq',
                'we-count', 'us-count', 'our-count', 'ours-count', 'ourselves-count',
                '1stpl-count', '1stpl-freq',
                'you-count', 'your-count', 'yours-count', 'yourself-count',
                '2ndsg-count', '2ndsg-freq',
                'diff-1stsg-1stpl', 'diff-1stsg-1stsg2ndsg',
                'preposition_freq', 'prp_ALL_freq', 'noun-freq']

# punctuation tokens removed from the tokenized lyrics
punctuation_tokens = {',', '?', '(', ')', '.', '"', "'"}

# inclusive (low, high) release-age bounds; position + 1 is the category (1-8)
age_ranges = [(15, 17), (18, 20), (21, 22), (23, 24),
              (25, 27), (28, 30), (31, 34), (35, 40)]


def _ratio(count, total):
    """Return count / total, or 0.0 when total is 0 (a file with no lyric tokens)."""
    return count / total if total else 0.0


def _pronoun_counts(freqdict, pronouns):
    """Return ({'<pronoun>-count': n, ...}, summed count) for the given pronouns."""
    counts = {p + '-count': freqdict.get(p, 0) for p in pronouns}
    return counts, sum(counts.values())


def _age_category(release_age):
    """Return the 1-based age category for release_age, or "out of bounds"."""
    for category, (low, high) in enumerate(age_ranges, start=1):
        if low <= release_age <= high:
            return category
    return "out of bounds"


# one tokenizer is enough for all songs; preserve_case=False lower-cases tokens
tokenizer = TweetTokenizer(preserve_case=False)
song_rows = []  # one dict per song, turned into a dataframe after the loop

directory = 'songs-raw-extended'
for file in os.scandir(directory): # for each song in the directory (folder) of unprocessed lyrics
    # skip temporary ('~...') and hidden ('....') files
    if file.name.startswith(('~', '.')):
        continue
    name = file.name

    with open(directory + '/' + name, 'r') as song:
        lines = song.readlines()
    # save song info
    artist = lines[0] # singer and songwriter
    year = lines[1] # year of release
    lyrics = lines[2:] # extract only the lyrics from the file

    # tokenize every lyric line and drop punctuation tokens;
    # `s` is the module-level token list of the current song
    s = [word
         for lyric in lyrics
         for word in tokenizer.tokenize(lyric)
         if word not in punctuation_tokens]
    word_count_total = len(s)

    with open('songs-processed-extended/' + name.replace('.txt','_processed.txt'), 'w') as processedLyrics:
        processedLyrics.write(str(s))

    # FIND WORD FREQUENCIES
    freqdict = SongTextToFreqDict(s)
    with open('word-frequencies/' + name.replace('.txt','_frequencies.csv'), 'w') as wordFrequencies:
        # csv.writer quotes tokens that contain a comma (e.g. "1,000"), which
        # hand-formatted "word,count" lines would split into extra columns
        csv.writer(wordFrequencies, lineterminator='\n').writerows(freqdict.items())

    # combining instances of 'i', e.g. contractions like 'i'll', 'i'm':
    # every occurrence of each contraction is added to the count of 'i'
    # (this happens after the frequency file is written, so that file keeps
    # the unmerged counts)
    contraction_total = sum(count for word, count in freqdict.items()
                            if word.startswith("i'"))
    if contraction_total:
        freqdict['i'] = freqdict.get('i', 0) + contraction_total
    I_freq = _ratio(freqdict.get('i', 0), word_count_total)

    # calculate frequency of first person singular pronouns
    firstSG_dict, total_firstSG = _pronoun_counts(freqdict, firstSG_pronouns)
    firstSG_freq = _ratio(total_firstSG, word_count_total)
    # calculate frequency of first person plural pronouns
    firstPL_dict, total_firstPL = _pronoun_counts(freqdict, firstPL_pronouns)
    firstPL_freq = _ratio(total_firstPL, word_count_total)
    # calculate frequency of second person singular pronouns
    secondSG_dict, total_secondSG = _pronoun_counts(freqdict, secondSG_pronouns)
    secondSG_freq = _ratio(total_secondSG, word_count_total)
    # compare frequencies by computing the difference
    diff_1stsg_1stpl = firstSG_freq - firstPL_freq
    # NOTE(review): the column name suggests "1stsg + 2ndsg", but the value
    # subtracts (1stpl + 2ndsg); kept as in the original -- confirm intent.
    diff_1stsg_1stsg2ndsg = firstSG_freq - (firstPL_freq + secondSG_freq)

    # part of speech analysis
    tagged = nltk.pos_tag(s)
    pos_only = [pos for word, pos in tagged]
    preposition_freq = _ratio(pos_only.count('IN'), word_count_total)
    all_personal_pns_count = pos_only.count('PRP') + pos_only.count('PRP$')
    all_prps_freq = _ratio(all_personal_pns_count, word_count_total)
    noun_count = pos_only.count('NN') + pos_only.count('NNS')
    noun_freq = _ratio(noun_count, word_count_total)

    # UPDATE DATASETS FOR ANALYSIS
    # calculate age of artist at song's release
    release_age = int(year) - birth_year
    # categorize release age in age category (1-8)
    age_categ = _age_category(release_age)

    # collect this song's row; the per-pronoun '<pronoun>-count' entries come
    # from the three pronoun dictionaries
    row = {'song-title': name,
           'artist': artist.replace('\n',''),
           'release-age-approx': release_age,
           'release-age-range': age_categ,
           'total-word-count': word_count_total,
           'I-freq': I_freq,
           '1stsg-count': total_firstSG,
           '1stsg-freq': firstSG_freq,
           '1stpl-count': total_firstPL,
           '1stpl-freq': firstPL_freq,
           '2ndsg-count': total_secondSG,
           '2ndsg-freq': secondSG_freq,
           'diff-1stsg-1stpl': diff_1stsg_1stpl,
           'diff-1stsg-1stsg2ndsg': diff_1stsg_1stsg2ndsg,
           'preposition_freq': preposition_freq,
           'prp_ALL_freq': all_prps_freq,
           'noun-freq': noun_freq}
    row.update(firstSG_dict)
    row.update(firstPL_dict)
    row.update(secondSG_dict)
    song_rows.append(row)

# create song dataframe for frequency analysis (DataFrame.append no longer
# exists in pandas 2.x, so the frame is built once from the collected rows)
songCorpusData = pd.DataFrame(song_rows, columns=song_columns)
# export song data
songCorpusData.to_csv('all-data-extended.csv')
dataSubset_age_1stsgfreq = songCorpusData[["release-age-range", "1stsg-freq"]]
dataSubset_age_1stsgfreq.to_csv('dataSubset_age_1stsgfreq.csv',index=False)
        
# print(songCorpusData)