# Analysing Data - Assignment 1
Henry Alexander Hornung (S4156145)

## Downloading and Importing Modules

In [None]:
# Ensuring that all necessary packages and resources are downloaded
!pip install matplotlib
!pip install pandas
!pip install nltk
!pip install spacy
!python -m spacy download en_core_web_sm
!python -m spacy download de_core_news_sm
!python -m spacy download nl_core_news_sm

In [None]:
# Importing packages
import os
import matplotlib.pyplot as plt
import pandas as pd
import re

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer, LancasterStemmer
nltk.download('punkt')

import spacy
from spacy import displacy

## Part 1

#### Exercise 1
Perform sentence splitting and word tokenization. Report the statistics of word frequency (per story and in total) in a python dictionary, plot the 25 most common words (per story and in total) in the form of a histogram. 

In [None]:
# Importing files for part 1
# os.listdir() returns directory entries in arbitrary order, while the rest of the
# notebook addresses the stories by position (part1[0] is used as "Text 1", ...).
# Sorting the file names makes that mapping reproducible across machines and runs.
# NOTE(review): confirm that alphabetical file order matches the Text 1-5 headings.
part1 = []
for filename in sorted(os.listdir("Part_I_1-2/")):
    if filename != ".DS_Store":
        with open("Part_I_1-2/" + filename, encoding = "utf-8") as file:
            content = file.read()
        part1.append(content)

In [None]:
# Removing title from each text, saving it under metadata list
# NOTE: this cell rewrites `part1` in place, so run it once per fresh load of the files.
metadata = []
for index, text in enumerate(part1):
    # Split on the first "---" only: a later "---" inside the story (e.g. a scene
    # break) stays in the body instead of cutting the story short.
    title, separator, body = text.partition("---")
    if separator:
        # Surrounding whitespace/newlines are dropped so the title can be reused
        # directly inside plot titles.
        metadata.append(title.strip())
        part1[index] = body
    else:
        # No separator means there is no title to remove: leave the text untouched
        # and record an empty title rather than the whole story.
        metadata.append("")

In [None]:
# Sentence tokenizing: one list of sentences per story
part1_sent = [sent_tokenize(story) for story in part1]

# Word tokenizing: one list of word-level tokens per story
part1_word = [word_tokenize(story) for story in part1]

In [None]:
# Word frequencies function
def getwf(text, top_n=25):
    """Return the relative frequencies of the most common words in a token list.

    Tokens are lower-cased before counting. Tokens that are exactly one of the
    punctuation marks listed in ``punct`` (and empty tokens) are skipped.

    Parameters
    ----------
    text : iterable of str
        Word tokens.
    top_n : int or None, default 25
        Number of most frequent words to keep; ``None`` keeps every word.

    Returns
    -------
    dict
        Maps each kept word to ``occurrences / number of counted tokens``,
        ordered from most to least frequent (ties keep first-seen order).
        Empty input gives an empty dict.
    """
    # Exact-match set: `word in "<string>"` is a substring test and would also drop
    # any token that happens to be a slice of that string.
    punct = {"“", "”", ".", ",", "?", "'", "’", "!", "`", "''", "``"}

    # Getting the total occurrences of each word
    counts = {}
    for word in text:
        if not word or word in punct:
            continue
        key = word.lower()
        counts[key] = counts.get(key, 0) + 1

    # Transforming the word counts into word frequencies.
    # The divisor is the number of counted tokens (not the number of distinct
    # words), so every value is a share of the text and all shares sum to 1.
    total_words = sum(counts.values())

    # Sorting them from highest to lowest, including only the top_n most common words.
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)

    return {word: occurrences / total_words for word, occurrences in ranked[:top_n]}

In [None]:
# Word frequencies total: pool the tokens of every story into one flat list
word_total = [word for story_tokens in part1_word for word in story_tokens]

# Frequencies over the pooled corpus
wf_total = getwf(word_total)

In [None]:
# Word frequencies per story: one getwf() dictionary for each of the first five
# token lists in part1_word
wf_01, wf_02, wf_03, wf_04, wf_05 = (
    getwf(part1_word[story]) for story in range(5)
)

In [None]:
# Word frequencies total
# `word_total` and `wf_total` are already built two cells above from the same
# `part1_word`; rebuilding them here only reproduced identical values. This cell
# therefore just checks that the pooled token list is in sync with the stories.
assert len(word_total) == sum(len(text) for text in part1_word)

In [None]:
# Function to create/export the histogram
def get_wf_hist(dictionary, title, file_name):
    """Draw a bar chart of word frequencies, save it under ``Figures/`` and show it.

    Parameters
    ----------
    dictionary : dict
        Maps each word to its frequency; bars follow the dictionary's order.
    title : str
        Title placed above the plot.
    file_name : str
        Name of the exported image inside the ``Figures`` directory.

    Returns
    -------
    tuple
        ``(result of savefig, result of show)``. The tuple carries no useful
        information and is kept only so the return shape stays unchanged.
    """
    words = list(dictionary.keys())
    frequencies = list(dictionary.values())

    fig = plt.figure(figsize=(12, 4))
    plt.bar(words, frequencies)
    plt.title(title)
    plt.xlabel("Words")
    plt.ylabel("Frequency")
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()

    # savefig does not create missing directories, so make sure the target exists.
    os.makedirs("Figures", exist_ok=True)
    filepath = os.path.join("Figures", file_name)

    saved = fig.savefig(filepath)
    shown = plt.show()
    # Release the figure explicitly; this function is called many times in a row.
    plt.close(fig)

    return (saved, shown)

In [None]:
# Function to output the frequencies in a df
def createDF(wf_list):
    """Lay several word-frequency dictionaries side by side in one DataFrame.

    Parameters
    ----------
    wf_list : iterable of (dict, str)
        Pairs of a word -> frequency dictionary and the label of its text.

    Returns
    -------
    pandas.DataFrame
        Two columns per pair, ``"<label> words"`` and ``"<label> freq"``, in the
        order of ``wf_list``. Rows follow each dictionary's own order; shorter
        dictionaries are padded with NaN. An empty ``wf_list`` gives an empty frame.
    """
    columns = []
    for dictionary, text in wf_list:
        # Saving the words and frequencies as pd series respectively
        columns.append(pd.Series(list(dictionary.keys()), name = text + " words"))
        columns.append(pd.Series(list(dictionary.values()), name = text + " freq"))

    # pd.concat raises on an empty list, so the empty case is handled explicitly.
    if not columns:
        return pd.DataFrame()

    # A single concat at the end instead of re-copying a growing frame per text.
    return pd.concat(columns, axis = 1)

In [None]:
# Pairing each frequency dictionary with the label that createDF() uses for its
# column titles: "Text 1" ... "Text 5" for the stories, "Total" for the pooled corpus
story_wfs = [wf_01, wf_02, wf_03, wf_04, wf_05]
wf_list = [(story_wf, "Text " + str(number)) for number, story_wf in enumerate(story_wfs, start=1)]
wf_list.append((wf_total, "Total"))

# Previewing the most frequent words with their frequencies (first rows only)
createDF(wf_list).head()

In [None]:
# Outputting each histogram: fig1-fig5 are the stories, titled with their metadata entry
story_wfs = (wf_01, wf_02, wf_03, wf_04, wf_05)
for position, story_wf in enumerate(story_wfs):
    get_wf_hist(story_wf, metadata[position]+" Most Frequent Words", "fig" + str(position + 1))

# fig6 is the pooled corpus (kept as the last expression of the cell)
get_wf_hist(wf_total, "Most Frequent Words Total", "fig6")

#### Exercise 2
Perform stemming on the text using the Porter and Lancaster stemmer. Re-create the statistics and the plot from 1. Compare the differences between the stemmed and unstemmed results, and discuss the difference in the results of the two stemmers.

In [None]:
# Function to apply porter stemmer to list of words
def Pstemmer(tokenized_text):
    """Return the Porter stem of every token in ``tokenized_text``, in order.

    A new ``PorterStemmer`` is created on each call; the result is a new list.
    """
    stem = PorterStemmer().stem
    return [stem(token) for token in tokenized_text]

In [None]:
# Function to apply lancaster stemmer to list of words
def Lstemmer(tokenized_text):
    """Return the Lancaster stem of every token in ``tokenized_text``, in order.

    A new ``LancasterStemmer`` is created on each call; the result is a new list.
    """
    stem = LancasterStemmer().stem
    return [stem(token) for token in tokenized_text]

In [None]:
# Applying both stemmers to the five stories and to the pooled token list
porter_01, porter_02, porter_03, porter_04, porter_05 = (
    Pstemmer(part1_word[story]) for story in range(5)
)
porter_total = Pstemmer(word_total)

lancaster_01, lancaster_02, lancaster_03, lancaster_04, lancaster_05 = (
    Lstemmer(part1_word[story]) for story in range(5)
)
lancaster_total = Lstemmer(word_total)

In [None]:
# Getting the word frequencies for each stemmed text (Porter first, then Lancaster)
(porter_wf_01, porter_wf_02, porter_wf_03,
 porter_wf_04, porter_wf_05, porter_wf_total) = map(
    getwf,
    (porter_01, porter_02, porter_03, porter_04, porter_05, porter_total),
)

(lancaster_wf_01, lancaster_wf_02, lancaster_wf_03,
 lancaster_wf_04, lancaster_wf_05, lancaster_wf_total) = map(
    getwf,
    (lancaster_01, lancaster_02, lancaster_03, lancaster_04, lancaster_05, lancaster_total),
)

In [None]:
# Outputting the porter-stemmed frequencies in df: each dictionary is paired
# with the label used for its column titles
porter_list = list(zip(
    (porter_wf_01, porter_wf_02, porter_wf_03, porter_wf_04, porter_wf_05, porter_wf_total),
    ("Text 1", "Text 2", "Text 3", "Text 4", "Text 5", "Total"),
))

createDF(porter_list).head()

In [None]:
# Generating the histograms for the porter-stemmed texts
# (fig7-fig11 are the five stories, fig12 is the pooled corpus)
porter_wfs = [porter_wf_01, porter_wf_02, porter_wf_03, porter_wf_04, porter_wf_05, porter_wf_total]
porter_titles = ["WF with porter stemmer: Text " + str(number) for number in range(1, 6)]
porter_titles.append("WF with porter stemmer: Total")

for n, (wf, title) in enumerate(zip(porter_wfs, porter_titles), start=7):
    get_wf_hist(wf, title, f"fig{n}")

In [None]:
# Doing the same for the lancaster-stemmed texts: each dictionary is paired
# with the label used for its column titles
lancaster_list = list(zip(
    (lancaster_wf_01, lancaster_wf_02, lancaster_wf_03, lancaster_wf_04, lancaster_wf_05, lancaster_wf_total),
    ("Text 1", "Text 2", "Text 3", "Text 4", "Text 5", "Total"),
))

createDF(lancaster_list).head()

In [None]:
# Histograms for the lancaster-stemmed texts
# (fig13-fig17 are the five stories, fig18 is the pooled corpus)
lancaster_wfs = [lancaster_wf_01, lancaster_wf_02, lancaster_wf_03, lancaster_wf_04, lancaster_wf_05, lancaster_wf_total]
lancaster_titles = ["WF with lancaster stemmer: Text " + str(number) for number in range(1, 6)]
lancaster_titles.append("WF with lancaster stemmer: Total")

for n, (wf, title) in enumerate(zip(lancaster_wfs, lancaster_titles), start=13):
    get_wf_hist(wf, title, f"fig{n}")

#### Exercise 3
In Brightspace, you will find three translations of Tom Sawyer by Mark Twain. After cleaning the document (e.g. removing the preamble, TOC, licensing information …), use the appropriate spacy models to derive the POS-tags of the text. Report the frequencies of the tags for the three languages. What assumptions do you make based on the findings?

In [None]:
# Importing the texts: every file in Part_I_3/ except the macOS .DS_Store entry
# is read as UTF-8 and collected in part1_3
part1_3 = []
for filename in os.listdir("Part_I_3/"):
    if filename == ".DS_Store":
        continue
    with open("Part_I_3/" + filename, encoding = "utf-8") as file:
        part1_3.append(file.read())

In [None]:
# Removing the text at the beginning and end of the documents
# Each translation is recognised by the heading of its first chapter. Everything
# from the Project Gutenberg end marker onwards is cut off, then the segment that
# follows the first occurrence of the chapter heading is kept.
en_chapter_one = "\n\n\n\nCHAPTER I\n"
for text in part1_3:
    text = text.split("*** END OF THE PROJECT GUTENBERG EBOOK")[0]
    if "Erstes Kapitel." in text:
        de_text = text.split("Erstes Kapitel.")[1]
    elif "HOOFDSTUK I." in text:
        nl_text = text.split("HOOFDSTUK I.")[1]
    elif en_chapter_one in text:
        # The test uses the very marker the split relies on. The previous condition
        # was a bare non-empty string, i.e. always true, so any unrecognised file
        # was treated as the English text.
        en_text = text.split(en_chapter_one)[1]
    else:
        raise ValueError("A file in Part_I_3/ contains none of the expected first-chapter headings")

# Removing excess line breaks: every run of newlines becomes a single space
de_text = re.sub(r'\n+', ' ', de_text)
nl_text = re.sub(r'\n+', ' ', nl_text)
en_text = re.sub(r'\n+', ' ', en_text)

In [None]:
# Tokenizing English text and getting POS tags using SpaCy
nlp = spacy.load('en_core_web_sm')

# The pipeline is run once and both lists are read from the same Doc; calling
# nlp(en_text) separately for each list processed the whole book twice.
sawyer_en_doc = nlp(en_text)
sawyer_en_tokenized = [token.text for token in sawyer_en_doc]
sawyer_en_pos = [token.pos_ for token in sawyer_en_doc]

In [None]:
# Tokenizing German text and getting POS tags using SpaCy
nlp = spacy.load('de_core_news_sm')

# The pipeline is run once and both lists are read from the same Doc; calling
# nlp(de_text) separately for each list processed the whole book twice.
sawyer_de_doc = nlp(de_text)
sawyer_de_tokenized = [token.text for token in sawyer_de_doc]
sawyer_de_pos = [token.pos_ for token in sawyer_de_doc]

In [None]:
# Tokenizing Dutch text and getting POS tags using SpaCy
nlp = spacy.load('nl_core_news_sm')

# The pipeline is run once and both lists are read from the same Doc; calling
# nlp(nl_text) separately for each list processed the whole book twice.
sawyer_nl_doc = nlp(nl_text)
sawyer_nl_tokenized = [token.text for token in sawyer_nl_doc]
sawyer_nl_pos = [token.pos_ for token in sawyer_nl_doc]

In [None]:
# Adding tokenized texts and POS tags to a df: one two-column frame per language
# (token text, POS tag), then all three placed side by side
df_en = pd.DataFrame({"text_en": sawyer_en_tokenized, "POS_en": sawyer_en_pos})
df_de = pd.DataFrame({"text_de": sawyer_de_tokenized, "POS_de": sawyer_de_pos})
df_nl = pd.DataFrame({"text_nl": sawyer_nl_tokenized, "POS_nl": sawyer_nl_pos})

language_frames = [df_en, df_de, df_nl]
sawyer_df = pd.concat(language_frames, axis=1)
sawyer_df.head()

In [None]:
# Finding the frequencies of each tag
def posfreq(pos):
    """Return the relative frequency of every tag in ``pos``.

    Parameters
    ----------
    pos : iterable
        Sequence of tags (any hashable values).

    Returns
    -------
    dict
        Maps each distinct tag to ``occurrences / total number of tags``, in
        order of first appearance. Empty input gives an empty dict.
    """
    tag_counts = {}
    for tag in pos:
        tag_counts[tag] = tag_counts.get(tag, 0) + 1

    # Total is taken from the counts so that one-shot iterables work as well.
    n_tags = sum(tag_counts.values())

    return {tag: count / n_tags for tag, count in tag_counts.items()}

# Relative tag frequencies for the English, German and Dutch text
en_freq, de_freq, nl_freq = map(posfreq, (sawyer_en_pos, sawyer_de_pos, sawyer_nl_pos))

# Presenting frequencies in a df: one row per tag, one column per language
language_freqs = {'English': en_freq, 'German': de_freq, 'Dutch': nl_freq}
pos_freq_df = pd.DataFrame(language_freqs)
pos_freq_df

In [None]:
# Defining function to plot POS tags on a histogram
def get_pos_hist(dictionary, title, file_name):
    """Draw a bar chart of POS-tag frequencies, save it under ``Figures/`` and show it.

    Parameters
    ----------
    dictionary : dict
        Maps each tag to its frequency; bars follow the dictionary's order.
    title : str
        Title placed above the plot.
    file_name : str
        Name of the exported image inside the ``Figures`` directory.

    Returns
    -------
    tuple
        ``(result of savefig, result of show)``. The tuple carries no useful
        information and is kept only so the return shape stays unchanged.
    """
    words = list(dictionary.keys())
    frequencies = list(dictionary.values())

    fig = plt.figure(figsize=(12, 4))
    plt.bar(words, frequencies)
    plt.title(title)
    plt.xlabel("Tags")
    plt.ylabel("Frequency")
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()

    # savefig does not create missing directories, so make sure the target exists.
    os.makedirs("Figures", exist_ok=True)
    filepath = os.path.join("Figures", file_name)

    saved = fig.savefig(filepath)
    shown = plt.show()
    # Release the figure explicitly once it has been saved and shown.
    plt.close(fig)

    return (saved, shown)

# Applying the function: each language's tags are ordered from most to least
# frequent before plotting
en_sorted, de_sorted, nl_sorted = (
    dict(sorted(freq.items(), key=lambda pair: pair[1], reverse=True))
    for freq in (en_freq, de_freq, nl_freq)
)
get_pos_hist(en_sorted, "English Text POS Tags", "fig19")
get_pos_hist(de_sorted, "German Text POS Tags", "fig20")
get_pos_hist(nl_sorted, "Dutch Text POS Tags", "fig21")

## Part 2

On the data for Part I (1+2), perform Named Entity Recognition using spacy. Annotate a small data set of 1-2 sentences per text with named entity categories. (The sentences are supposed to include names of characters, places and other entities.)

How good or bad is the performance of the automatic method on the manually annotated text? Report Precision, Recall, F1 Score and discuss the results.

In [None]:
# Re-loading the English model: `nlp` was last bound to the Dutch pipeline in
# Exercise 3, while the Part 1 stories analysed below are English.
nlp = spacy.load("en_core_web_sm")

#### Text 1: Another plot? - Bones_Bard 

##### Automatic NER

In [None]:
# Text 1 automatic NER: run the loaded pipeline on the first entry of part1 and
# render the recognised entities with displaCy
text_1 = part1[0]
doc = nlp(text_1)
displacy.render(doc, style="ent")

##### Manual NER

"(Missy: PER) was sitting on a chair in a house, maps and papers spread around, normally her planning was mental, but (River: PER) had suggested a physical map to refer to and it was helpful. (River: PER) walked out of the bedroom. “The people who own this house really have no good clothes.” She was wearing a t-shirt that drenched her form, and some trousers that looked about the right size, (Missy: PER) smiled, still in her (Mary Poppins: PER) looking outfit, (River: PER) looked over at the maps."

#### Text 2: You’d looked me in my eyes and told me - Bones_Bard 

##### Automatic NER

In [None]:
# Text 2 automatic NER: run the loaded pipeline on the second entry of part1 and
# render the recognised entities with displaCy
text_2 = part1[1]
doc = nlp(text_2)
displacy.render(doc, style="ent")

##### Manual NER

"She sat in her bed, thinking over how she got here, her husband, a man, and occasionally women, she loved, who also made her want to slap him, he hadn’t known who she was, she thought back to something she once told her father, well, (the Doctor: PER) not knowing who she was had killed her. She took a book from her bedside table, she was surrounded by them these day, she whispered to herself. “It’s a bit boring with those trips, isn’t it.” She didn’t expect anyone to agree, and nobody did."

#### Text 3: I Want You Safe - Beegabbagabba 

##### Automatic NER

In [None]:
# Text 3 automatic NER: run the loaded pipeline on the third entry of part1 and
# render the recognised entities with displaCy
text_3 = part1[2]
doc = nlp(text_3)
displacy.render(doc, style="ent")

##### Manual NER

"(Rose Tyler: PER)." (Rassilon: PER), how he loved to say her name. "I was going to take you to so many places. (Barcelona: LOC). Not the city (Barcelona: LOC/GPE), the planet (Barcelona: LOC). You'd love it. Fantastic place! They've got dogs with no noses! Imagine how many times a day you end up tellin' that joke and it's still funny!"

#### Text 4: Even If the Language of Flowers is Dead, Roses Always Mean Love - aubreyplvr 

##### Automatic NER

In [None]:
# Text 4 automatic NER: run the loaded pipeline on the fourth entry of part1 and
# render the recognised entities with displaCy
text_4 = part1[3]
doc = nlp(text_4)
displacy.render(doc, style="ent")

##### Manual NER

"The TARDIS wasn’t meant to translate (Gallifreyan: LANG) to other languages. So, very early on, (The Doctor: PER) had adjusted some things so she could translate (Gallifreyan: LANG) to (English: LANG). The system had to be repaired regularly, but it was worth it for people like (Rose: PER)."

#### Text 5: WWTDD: What would the Doctor do? - aboutcustardcreams 

##### Automatic NER

In [None]:
# Text 5 automatic NER: run the loaded pipeline on the fifth entry of part1 and
# render the recognised entities with displaCy
text_5 = part1[4]
doc = nlp(text_5)
displacy.render(doc, style="ent")

##### Manual NER

"You didn’t mean to be away for long, however, spending some time alone allowed you to really focus on (the Doctor: PER)’s words. And you realized all this time she had been right. She was always right and it was annoying. You couldn’t rely on violence for whatever reason, so you made use of the time alone to fix yourself. You never thought you’d do that for anything or anyone, yet you were willing to change for her. You had a vortex manipulator to move around and that’s what you used to answer distress calls all over the (Universe: LOC)."