In [1]:
#Importing libraries

import spacy
!spacy download en_core_web_sm  

import os
import re
from spacy import displacy
from IPython.display import display, HTML
import pandas as pd
pd.options.mode.chained_assignment = None 
import plotly.express as px

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m20.6 MB/s[0m  [33m0:00:00[0m eta [36m0:00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [2]:
#Load corpus files
corpus_path = '/Users/minjacvetkovski/Documents/DH MA/Collecting data/A2/corpus'

texts = []
file_names = []

for _file_name in os.listdir(corpus_path):
    if _file_name.endswith('.txt'):
        path = os.path.join(corpus_path, _file_name)
        text = open(path, 'r', encoding='utf-8').read()
        texts.append(text)
        file_names.append(_file_name)

#Create DataFrame 
final_paper_df = pd.DataFrame({
    'Filename': file_names,
    'Document': texts 
})

In [3]:
#Preprocessing text
def preprocess_text(text):
    """Lowercase text, normalize common contractions, remove extra spaces."""
    text = text.lower()
    
    #Dictionary of contractions and informal words to normalized form
    contractions = {
        "who's": "who is", "what's": "what is", "she's": "she is", "he's": "he is",
        "it's": "it is", "that's": "that is", "there's": "there is", "i'm": "i am",
        "i've": "i have", "i'll": "i will", "you've": "you have", "you're": "you are",
        "we're": "we are", "they're": "they are", "can't": "cannot", "don't": "do not",
        "doesn't": "does not", "didn't": "did not", "won't": "will not", "wouldn't": "would not",
        "shouldn't": "should not", "couldn't": "could not", "'cause": "because", "’cause": "because",
        "gonna": "going to", "wanna": "want to", "gotta": "have got to", "owt": "anything",
        "summat": "something", "starin'": "staring", "drivin'": "driving", "givin'": "giving",
        "pullin'": "pulling", "goin'": "going", "fuckin'": "fucking", "dancin'": "dancing",
        "clingin'": "clinging", "soundin'": "sounding", "takin'": "taking", "makin'": "making",
        "smudgin'": "smudging", "askin'": "asking", "havin'": "having",
        "she dont": "she does not", "he dont": "he does not", "it dont": "it does not"
    }
    
    for contraction, full_form in contractions.items():
        text = re.sub(r'\b{}\b'.format(re.escape(contraction)), full_form, text, flags=re.IGNORECASE)
    
    #Remove extra spaces
    text = re.sub(r"\s+", " ", text).strip()
    return text

#Apply preprocessing
final_paper_df['Text'] = final_paper_df['Document'].apply(preprocess_text)

In [4]:
#Load metadata and merge
metadata_path = os.path.join(corpus_path, 'metadata.csv')
metadata_df = pd.read_csv(metadata_path)

#Remove .txt from filenames for merging
metadata_df['Filename'] = metadata_df['Filename'].str.replace('.txt', '', regex=True)
final_paper_df['Filename'] = final_paper_df['Filename'].str.replace('.txt', '', regex=True)

#Merge metadata into main DataFrame
final_paper_df = metadata_df.merge(final_paper_df, on='Filename')

In [5]:
#Load spaCy NLP Model
nlp = spacy.load('en_core_web_sm')
print(nlp.pipe_names)

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


In [6]:
#Tokenization, Lemmas, POS, NER
def annotate_text(text):
    """Return tokens, lemmas, and POS tags for a given text string."""
    doc = nlp(text)
    tokens = [token.text for token in doc]
    lemmas = [token.lemma_ for token in doc]
    pos_tags = [token.pos_ for token in doc]
    return tokens, lemmas, pos_tags

In [7]:
#Apply annotation
final_paper_df[['Tokens', 'Lemmas', 'POS']] = final_paper_df['Text'].apply(lambda x: pd.Series(annotate_text(x)))

In [8]:
#Proper nouns and named entities
def extract_proper_nouns(text):
    doc = nlp(text)
    return [token.text for token in doc if token.pos_ == 'PROPN']

def extract_named_entities(text):
    doc = nlp(text)
    return [ent.label_ for ent in doc.ents]

def extract_ne_words(text):
    doc = nlp(text)
    return [ent.text for ent in doc.ents]

final_paper_df['Proper_Nouns'] = final_paper_df['Text'].apply(extract_proper_nouns)
final_paper_df['Named_Entities'] = final_paper_df['Text'].apply(extract_named_entities)
final_paper_df['NE_Words'] = final_paper_df['Text'].apply(extract_ne_words)

In [10]:
#Save as CSV
output_csv = os.path.join(corpus_path, 'annotated_dataset.csv')
final_paper_df.to_csv(output_csv, index=False)

final_paper_df.head(6)

Unnamed: 0,Filename,Title,Author,Year,Document,Text,Tokens,Lemmas,POS,Proper_Nouns,Named_Entities,NE_Words
0,song1,When The Sun Goes Down,Arctic Monkeys,2006,"I said, who's that girl there?\nI wonder what ...","i said, who is that girl there? i wonder what ...","[i, said, ,, who, is, that, girl, there, ?, i,...","[I, say, ,, who, be, that, girl, there, ?, I, ...","[PRON, VERB, PUNCT, PRON, AUX, DET, NOUN, ADV,...","[roxanne, mr, ., inconspicuous, givin]","[CARDINAL, TIME, PERSON, ORG, PERSON, DATE, CA...","[half, the night, roxanne, ford, inconspicuous..."
1,song2,Fluorescent Adolescent,Arctic Monkeys,2007,You used to get it in your fishnets\nNow you o...,you used to get it in your fishnets now you on...,"[you, used, to, get, it, in, your, fishnets, n...","[you, use, to, get, it, in, your, fishnet, now...","[PRON, VERB, PART, VERB, PRON, ADP, PRON, NOUN...","[mary, mecca, dauber, flo, mary]","[TIME, TIME]","[night, night]"
2,song3,The Jeweller's Hands,Arctic Monkeys,2009,Fiendish wonder in the carnival's wake\nDull c...,fiendish wonder in the carnival's wake dull ca...,"[fiendish, wonder, in, the, carnival, 's, wake...","[fiendish, wonder, in, the, carnival, 's, wake...","[ADJ, NOUN, ADP, DET, NOUN, PART, ADJ, ADJ, NO...",[],"[NORP, TIME]","[fiendish, the night]"
3,song4,Piledriver Waltz,Arctic Monkeys,2011,I etched the face of a stopwatch on the back o...,i etched the face of a stopwatch on the back o...,"[i, etched, the, face, of, a, stopwatch, on, t...","[I, etch, the, face, of, a, stopwatch, on, the...","[PRON, VERB, DET, NOUN, ADP, DET, NOUN, ADP, D...",[amber],"[TIME, TIME]","[this morning, this morning]"
4,song5,R U Mine?,Arctic Monkeys,2012,"I'm a puppet on a string\nTracy Island, time-t...","i am a puppet on a string tracy island, time-t...","[i, am, a, puppet, on, a, string, tracy, islan...","[I, be, a, puppet, on, a, string, tracy, islan...","[PRON, AUX, DET, NOUN, ADP, DET, NOUN, PROPN, ...",[tracy],"[CARDINAL, DATE, DATE, DATE, TIME, DATE, TIME,...","[four, years, days, tomorrow, tonight, tomorro..."
5,song6,Star Treatment,Arctic Monkeys,2018,I just wanted to be one of The Strokes\nNow lo...,i just wanted to be one of the strokes now loo...,"[i, just, wanted, to, be, one, of, the, stroke...","[I, just, want, to, be, one, of, the, stroke, ...","[PRON, ADV, VERB, PART, AUX, NUM, ADP, DET, NO...","[bandana, jukebox]","[CARDINAL, DATE, DATE, DATE, TIME, PERSON, DAT...","[half, 1984, 2019, '70s, tonight, jukebox, '70..."
