# Lyrics Data Analysis

### IMPORTS

In [1]:
from collections import Counter
from progress.bar import PixelBar
from nltk.corpus import stopwords
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English

import json
import re
import fasttext
import pandas as pd
import numpy as np
import yake

### Load File

In [2]:
data = None
with open('lyrics.json') as json_file:
    data = json.load(json_file)
if data is not None:
    print('Success.')
else:
    print('Could not read file.')
    

Success.


# Create  Raps

In [3]:
raps: list = list()
for key in data.keys():
    if key != 'interrupted':
        raps += data[key]
    else:
        continue


## Clean text

### Remove all non-english texts

In [4]:
fmodel = fasttext.load_model('lid.176.bin')



In [5]:
en_only = list()
for rap in raps:
    processed = re.sub(r'\n', '', rap)
    # detect language for each rap
    res = fmodel.predict(processed)[0][0]
    if res == '__label__en':
        en_only.append(rap)
raps = en_only
del en_only

### Remove brackets and respective text inside

In [8]:
regex_brackets: str = r"\[([-+&/\"'A-Za-z0-9_,\.;\s:()Ē?’$éöč!#*–ã“”{}Ž]+)\]|\[\]"

for i, rap in enumerate(raps):
    raps[i] = re.sub(regex_brackets, "", rap)

In [9]:
escaped = list()
for rap in raps:
    if bool(re.match(r'\[(.*)\]', rap)):
        escaped.append(rap)
        print(rap)

assert len(escaped) == 0, 'Texts not cleaned yet!'

### Remove empty lines

In [10]:
for i, rap in enumerate(raps):
    raps[i] = re.sub(r"\n{2,}", '', rap)


# Statistics

### Load problematic words

In [11]:
offensive_words: list = []
with open('OffWords.txt', 'r') as file:
    while (line := file.readline().rstrip()):
       offensive_words.append(line.lower())


### Count often occuring words

#### Init SpaCy

In [12]:
nlp = English()
tokenizer = Tokenizer(nlp.vocab)

In [16]:
pipe: any = tokenizer.pipe(raps, batch_size=100)

In [14]:
raps_tmp: list = []
for i, doc in enumerate(pipe):
    if len(doc) < 1000:
        raps_tmp.append(raps[i])
raps = raps_tmp
del raps_tmp

In [17]:
word_n: Counter = Counter()
word_without_stopwords: Counter = Counter()
off_word_n: Counter = Counter()
length_arr: np.ndarray = np.array([])

bar = PixelBar('Processing', max=len(raps))

for doc in pipe:
    length_arr = np.append(length_arr, len(doc))
    for token in doc:
        token_text: str = token.text
        if token_text in offensive_words:
            off_word_n[token_text] += 1
        token_lemma: str = token.lemma_
        if token_lemma == '\n':
            continue
        if token_lemma.lower() not in stopwords.words('english'):
            word_without_stopwords[token_lemma] += 1
        word_n[token_lemma] += 1
    bar.next()
bar.finish()

    

In [18]:
counter2dict: callable = lambda counter: pd.DataFrame.from_dict({'keys': counter.keys(), 'n': counter.values()})

In [19]:
df_off: pd.DataFrame = counter2dict(off_word_n)
df_off.sort_values(by=['n'], ascending=False)

Unnamed: 0,keys,n
0,nigga,6797
11,niggas,6355
15,shit,6025
4,fuck,4894
8,bitch,4217
...,...,...
345,orgies,1
430,pixie,1
346,vaginal,1
427,pisses,1


In [20]:
df_lemmas: pd.DataFrame = counter2dict(word_n)
df_lemmas.sort_values(by=['n'], ascending=False).head(50)

Unnamed: 0,keys,n
15,the,84397
18,I,67189
137,a,49087
217,you,43939
13,to,39338
88,and,31619
57,my,30882
5,in,27115
77,it,24694
233,I'm,23595


In [23]:
df_lemmas_ws: pd.DataFrame = counter2dict(word_without_stopwords)
df_lemmas_ws.sort_values(by=['n'], ascending=False).head(50)

Unnamed: 0,keys,n
176,I'm,23595
164,like,18033
45,get,12702
28,got,12516
283,know,10923
84,ain't,8813
18,nigga,6797
221,niggas,6355
346,shit,6025
539,make,4908


## Keyword-Extraction

In [22]:
kw_extractor: any = yake.KeywordExtractor()
language: str = "en"
max_ngram_size: int = 3
deduplication_threshold: float = 0.9
numOfKeywords: int = 15

keyword_list: list = []
for rap in raps:
    custom_kw_extractor = yake.KeywordExtractor(
        lan=language,
        n=max_ngram_size,
        dedupLim=deduplication_threshold,
        top=numOfKeywords,
        features=None)
    keywords = custom_kw_extractor.extract_keywords(rap)
    keywords = list(zip(*keywords))
    keyword_list.append(', '.join(list(keywords[0])) if len(keywords) > 0 else None)

df: pd.DataFrame = pd.DataFrame.from_dict({'keywords': keyword_list, 'lyrics': raps})
df = df.dropna()
df['text'] = 'KEYWORDS ' + df['keywords'].astype(str) + ' RAP_BEGIN ' + df['lyrics'].astype(str)
df[['text']].to_json('KeywordLyrics.json')


In [None]:
df_: pd.DataFrame = pd.read_json('KeywordLyrics.json')
df_

Unnamed: 0,text
0,"KEYWORDS Dresta done stepped, Gangsta Dresta, ..."
1,"KEYWORDS boy, pull your card, hard, boys, talk..."
10,"KEYWORDS Hey Yella, Nigga, bitch, shit, Ruthle..."
100,"KEYWORDS Parental discretion, Ayo Dre, Dre, Pa..."
1000,"KEYWORDS picture worth, worth a thousand, thou..."
...,...
995,"KEYWORDS rap shit, shit, grade, felt fantastic..."
996,"KEYWORDS back, Nigga, real, round, back home, ..."
997,"KEYWORDS bag, Yup, girl, Chino Hills, love, lo..."
998,"KEYWORDS racks, Yeah, racks and things, things..."
