# Corpus Analysis with spaCy

## Preparing Files

In [23]:
import os

# Split album lyric files into individual song lyrics files
def split_txt_file(input_file, album):
    output_directory = 'data/txt_files'
    os.makedirs(output_directory, exist_ok=True)

    with open(input_file, 'r', encoding='utf-8') as file:
        content = file.read()
    songs = content.split('\n\n\n\n')

    for i, song in enumerate(songs):
        output_file = os.path.join(output_directory, f"{album}_{i + 1}.txt")
        with open(output_file, 'w', encoding='utf-8') as output:
            output.write(song.strip())

albums = [('data/albums/album_1.txt', 'nightmares'),
('data/albums/album_2.txt', 'ruin'),
('data/albums/album_3.txt', 'hollow_crown'),
('data/albums/album_4.txt', 'here_and_now'),
('data/albums/album_5.txt', 'daybreaker'),
('data/albums/album_6.txt', 'lost'),
('data/albums/album_7.txt', 'abandoned'),
('data/albums/album_8.txt', 'holy_hell'),
('data/albums/album_9.txt', 'wish_to_exist')]

for input_file, album in albums:
    split_txt_file(input_file, album)

### Installing, Importing and Preprocessing

In [25]:
import spacy
!spacy download en_core_web_sm
from spacy import displacy
import pandas as pd
pd.options.mode.chained_assignment = None
import plotly.express as px

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [26]:
lyrics = []
file_names = []

for _file_name in os.listdir('data/txt_files'):
    if _file_name.endswith('.txt'):
        lyrics.append(open('data/txt_files' + '/' + _file_name, 'r', encoding='utf-8').read())
        file_names.append(_file_name)

d = {'filename':file_names,'text':lyrics}

lyrics_df = pd.DataFrame(d)
lyrics_df['text'] = lyrics_df['text'].str.replace('\n+', ' ', regex=True).str.strip()
lyrics_df.head()

Unnamed: 0,filename,text
0,here_and_now_9.txt,Burn everything you have And leave it all behi...
1,abandoned_11.txt,"As above, so below. Dismantled piece by piece,..."
2,abandoned_10.txt,"There is no endgame, So whisper the truth and ..."
3,here_and_now_8.txt,"You've done this to yourself or so, so it seem..."
4,ruin_10.txt,I won't hold my hand out to anyone but you I d...


In [27]:
# Merging Metadata
metadata_df = pd.read_csv('metadata.csv')

lyrics_df['filename'] = lyrics_df['filename'].str.replace('.txt', '', regex=True)

final_lyrics_df = metadata_df.merge(lyrics_df,on='filename')
final_lyrics_df.head()

Unnamed: 0,filename,title,album,year,text
0,nightmares_1,To the death,Nightmares,2006,When In Troy Do as the Greeks do This is treac...
1,nightmares_2,You Don’t Walk Away From Dismemberment,Nightmares,2006,I will spill blood On your filthy princess whi...
2,nightmares_3,Minesweeper,Nightmares,2006,Been searching all day For the answers Do you ...
3,nightmares_4,They'll Be Hanging Us Tonight,Nightmares,2006,Raid all the liquor stores Kidnap the kids Thi...
4,nightmares_5,This Confession Means Nothing,Nightmares,2006,I close my eyes And hold you in my arms Despit...


## Text Enrichment

In [29]:
# Creating Doc Objects
nlp = spacy.load('en_core_web_sm')

def process_text(text):
    return nlp(text)

final_lyrics_df['doc'] = final_lyrics_df['text'].apply(process_text)
final_lyrics_df.head()

Unnamed: 0,filename,title,album,year,text,doc
0,nightmares_1,To the death,Nightmares,2006,When In Troy Do as the Greeks do This is treac...,"(When, In, Troy, Do, as, the, Greeks, do, This..."
1,nightmares_2,You Don’t Walk Away From Dismemberment,Nightmares,2006,I will spill blood On your filthy princess whi...,"(I, will, spill, blood, On, your, filthy, prin..."
2,nightmares_3,Minesweeper,Nightmares,2006,Been searching all day For the answers Do you ...,"(Been, searching, all, day, For, the, answers,..."
3,nightmares_4,They'll Be Hanging Us Tonight,Nightmares,2006,Raid all the liquor stores Kidnap the kids Thi...,"(Raid, all, the, liquor, stores, Kidnap, the, ..."
4,nightmares_5,This Confession Means Nothing,Nightmares,2006,I close my eyes And hold you in my arms Despit...,"(I, close, my, eyes, And, hold, you, in, my, a..."


### Tokenisation

In [31]:
def get_token(doc):
    return [(token.text) for token in doc]

final_lyrics_df['tokens'] = final_lyrics_df['doc'].apply(get_token)
final_lyrics_df.head()

Unnamed: 0,filename,title,album,year,text,doc,tokens
0,nightmares_1,To the death,Nightmares,2006,When In Troy Do as the Greeks do This is treac...,"(When, In, Troy, Do, as, the, Greeks, do, This...","[When, In, Troy, Do, as, the, Greeks, do, This..."
1,nightmares_2,You Don’t Walk Away From Dismemberment,Nightmares,2006,I will spill blood On your filthy princess whi...,"(I, will, spill, blood, On, your, filthy, prin...","[I, will, spill, blood, On, your, filthy, prin..."
2,nightmares_3,Minesweeper,Nightmares,2006,Been searching all day For the answers Do you ...,"(Been, searching, all, day, For, the, answers,...","[Been, searching, all, day, For, the, answers,..."
3,nightmares_4,They'll Be Hanging Us Tonight,Nightmares,2006,Raid all the liquor stores Kidnap the kids Thi...,"(Raid, all, the, liquor, stores, Kidnap, the, ...","[Raid, all, the, liquor, stores, Kidnap, the, ..."
4,nightmares_5,This Confession Means Nothing,Nightmares,2006,I close my eyes And hold you in my arms Despit...,"(I, close, my, eyes, And, hold, you, in, my, a...","[I, close, my, eyes, And, hold, you, in, my, a..."


### Lemmatisation

In [33]:
def get_lemma(doc):
    return [(token.lemma_) for token in doc]

final_lyrics_df['lemmas'] = final_lyrics_df['doc'].apply(get_lemma)
final_lyrics_df.head()

Unnamed: 0,filename,title,album,year,text,doc,tokens,lemmas
0,nightmares_1,To the death,Nightmares,2006,When In Troy Do as the Greeks do This is treac...,"(When, In, Troy, Do, as, the, Greeks, do, This...","[When, In, Troy, Do, as, the, Greeks, do, This...","[when, in, Troy, do, as, the, Greeks, do, this..."
1,nightmares_2,You Don’t Walk Away From Dismemberment,Nightmares,2006,I will spill blood On your filthy princess whi...,"(I, will, spill, blood, On, your, filthy, prin...","[I, will, spill, blood, On, your, filthy, prin...","[I, will, spill, blood, on, your, filthy, prin..."
2,nightmares_3,Minesweeper,Nightmares,2006,Been searching all day For the answers Do you ...,"(Been, searching, all, day, For, the, answers,...","[Been, searching, all, day, For, the, answers,...","[be, search, all, day, for, the, answer, do, y..."
3,nightmares_4,They'll Be Hanging Us Tonight,Nightmares,2006,Raid all the liquor stores Kidnap the kids Thi...,"(Raid, all, the, liquor, stores, Kidnap, the, ...","[Raid, all, the, liquor, stores, Kidnap, the, ...","[raid, all, the, liquor, store, Kidnap, the, k..."
4,nightmares_5,This Confession Means Nothing,Nightmares,2006,I close my eyes And hold you in my arms Despit...,"(I, close, my, eyes, And, hold, you, in, my, a...","[I, close, my, eyes, And, hold, you, in, my, a...","[I, close, my, eye, and, hold, you, in, my, ar..."


### Part of Speech Tagging


In [35]:
def get_pos(doc):
    return [(token.pos_, token.tag_) for token in doc]

final_lyrics_df['POS'] = final_lyrics_df['doc'].apply(get_pos)

final_lyrics_df.head()

Unnamed: 0,filename,title,album,year,text,doc,tokens,lemmas,POS
0,nightmares_1,To the death,Nightmares,2006,When In Troy Do as the Greeks do This is treac...,"(When, In, Troy, Do, as, the, Greeks, do, This...","[When, In, Troy, Do, as, the, Greeks, do, This...","[when, in, Troy, do, as, the, Greeks, do, this...","[(SCONJ, WRB), (ADP, IN), (PROPN, NNP), (NOUN,..."
1,nightmares_2,You Don’t Walk Away From Dismemberment,Nightmares,2006,I will spill blood On your filthy princess whi...,"(I, will, spill, blood, On, your, filthy, prin...","[I, will, spill, blood, On, your, filthy, prin...","[I, will, spill, blood, on, your, filthy, prin...","[(PRON, PRP), (AUX, MD), (VERB, VB), (NOUN, NN..."
2,nightmares_3,Minesweeper,Nightmares,2006,Been searching all day For the answers Do you ...,"(Been, searching, all, day, For, the, answers,...","[Been, searching, all, day, For, the, answers,...","[be, search, all, day, for, the, answer, do, y...","[(AUX, VBN), (VERB, VBG), (DET, DT), (NOUN, NN..."
3,nightmares_4,They'll Be Hanging Us Tonight,Nightmares,2006,Raid all the liquor stores Kidnap the kids Thi...,"(Raid, all, the, liquor, stores, Kidnap, the, ...","[Raid, all, the, liquor, stores, Kidnap, the, ...","[raid, all, the, liquor, store, Kidnap, the, k...","[(VERB, VB), (DET, PDT), (DET, DT), (NOUN, NN)..."
4,nightmares_5,This Confession Means Nothing,Nightmares,2006,I close my eyes And hold you in my arms Despit...,"(I, close, my, eyes, And, hold, you, in, my, a...","[I, close, my, eyes, And, hold, you, in, my, a...","[I, close, my, eye, and, hold, you, in, my, ar...","[(PRON, PRP), (VERB, VBP), (PRON, PRP$), (NOUN..."


In [36]:
# Download Enriched Dataset
final_lyrics_df.to_csv('Architects_Lyrics_with_spaCy_tags.csv')