In [None]:
# import necessary libraries
import nltk
import spacy
import pandas as pd
import matplotlib.pyplot as plt
import os 
from glob import glob
# download the small (sm) and large (lg) English spaCy models via the spaCy CLI;
# both are loaded further down with spacy.load(...)
!spacy download en_core_web_sm
!spacy download en_core_web_lg



In [None]:
# First analysis: the English texts in Part_I_1-2 (01.txt to 05.txt)

# define function for reading the text files
def read_file(filename):
    """Return the complete contents of the UTF-8 text file `filename` as one string."""
    with open(filename, encoding='utf8') as handle:
        return handle.read()


# store the content of the text files in a dictionary (file's name as KEY and content as VALUE)
corpus = {}
# glob() gives no ordering guarantee (it depends on the file system), so sort the
# paths: the texts are then loaded, processed and reported in the same order on
# every run and platform
for filename in sorted(glob('Part_I_1-2/*.txt')):
    corpus[filename] = read_file(filename)
    

In [None]:
# display the loaded corpus (file path -> full text) to check that the files were read
corpus

In [None]:
#import from nltk library
from nltk.tokenize import sent_tokenize, word_tokenize 

In [None]:
# Part I.1: sentence splitting and word tokenization 

# define function for sentence splitting
def split_sentences(text):
    """Split `text` into sentences with NLTK's `sent_tokenize` and return them as a list."""
    return sent_tokenize(text)

# define function for word tokenization
def tokenize_words(sentence):
    """Tokenize `sentence` with NLTK's `word_tokenize`, lowercase the tokens and drop punctuation.

    Returns a list of lowercased tokens in their original order.
    """
    # NOTE: the membership test below is a *substring* test against this string,
    # so multi-character tokens such as '' and `` are dropped as well as the
    # single punctuation characters.
    punctuation = ".,?!:;()[]''``*\""
    lowered_tokens = (token.lower() for token in word_tokenize(sentence))
    return [token for token in lowered_tokens if token not in punctuation]

# define function for removing .txt extension
def remove_ext(filepath):
    """Return `filepath` without its final extension (e.g. 'dir/01.txt' -> 'dir/01')."""
    return os.path.splitext(filepath)[0]

# preprocessed texts, keyed by file path without the .txt extension;
# each value holds the text's sentence list and its flat list of cleaned tokens
processed_texts = {}
for filename, text in corpus.items():
    sentences = split_sentences(text)
    # tokenize sentence by sentence and flatten the result into one word list
    words = [word for sentence in sentences for word in tokenize_words(sentence)]
    processed_texts[remove_ext(filename)] = {'sentences': sentences, 'words': words}


In [None]:
# display the preprocessing result (text name -> {'sentences': [...], 'words': [...]})
processed_texts

In [None]:
# calculate word frequency (per story and in total)

# create dictionary for storing word frequencies per story and in total
word_frequencies = {}
# initialize list to collect all words from all files
all_words = []

# Calculate word frequencies for each file
for filename, data in processed_texts.items():
    # one {word: count} dictionary per file
    story_counts = {}
    for word in data['words']:
        story_counts[word] = story_counts.get(word, 0) + 1
    word_frequencies[filename] = story_counts

    # Collect all words from all files for total frequency calculation
    all_words.extend(data['words'])

# Calculate total word frequencies across all files
total_counts = {}
for word in all_words:
    total_counts[word] = total_counts.get(word, 0) + 1
word_frequencies['total'] = total_counts

# Report the per-text frequencies by iterating over the keys that actually exist
# (sorted by name, so 01, 02, ... map to text 1, 2, ...). Hard-coded keys such as
# 'Part_I_1-2/01' raise a KeyError when the path separator differs (Windows) or
# when the folder does not contain exactly these five files.
story_names = sorted(name for name in word_frequencies if name != 'total')
for text_number, story_name in enumerate(story_names, start=1):
    print(f"Word frequencies text {text_number}: {word_frequencies[story_name]}\n")
print(f"Word frequencies in total: {word_frequencies['total']}\n")
    

In [None]:
# Part I.2: plot the 25 most frequent words (per text and in total) as bar charts

for story_name, freq_dict in word_frequencies.items():
    # build a (Word, Frequency) table, order it by descending frequency, keep the top 25
    df = (
        pd.DataFrame(list(freq_dict.items()), columns=['Word', 'Frequency'])
        .sort_values(by='Frequency', ascending=False)
        .head(25)
    )
    # one bar chart per story; title and axis labels are set in a single call
    ax = df.plot.bar(x='Word', y='Frequency')
    ax.set(
        title=f"Top 25 Words in {story_name}",
        xlabel="Words",
        ylabel="Frequency",
    )


In [None]:
#import Porter and Lancaster modules from nltk.stem package
from nltk.stem import PorterStemmer, LancasterStemmer

In [None]:
# Step four: stemming 

# initialize Porter and Lancaster stemmers once at module level;
# stem_words_porter / stem_words_lancaster below read these two globals
stemmer_p = PorterStemmer()
stemmer_l = LancasterStemmer()

# define functions to retrieve stemmed words 
def stem_words_porter(words):
    """Return a list with the Porter stem of every word in `words`, in the same order."""
    return [stemmer_p.stem(word) for word in words]

def stem_words_lancaster(words):
    """Return a list with the Lancaster stem of every word in `words`, in the same order."""
    return [stemmer_l.stem(word) for word in words]
    
# one dictionary per stemmer: text name -> list of stems
stemmed_texts_porter = {}
stemmed_texts_lancaster = {}

# stem every text's token list with both stemmers
for filename, data in processed_texts.items():
    stemmed_texts_porter[filename] = stem_words_porter(data['words'])
    stemmed_texts_lancaster[filename] = stem_words_lancaster(data['words'])

# print the Porter results first, then the Lancaster results
stemmer_reports = (
    ("Porter Stemmer Results:", stemmed_texts_porter),
    ("Lancaster Stemmer Results:", stemmed_texts_lancaster),
)
for header, stemmed_texts in stemmer_reports:
    print(header)
    for filename, stems in stemmed_texts.items():
        print(f"{filename}: {stems}")


In [None]:
# Step five: calculate word frequency and plot the 25 mfw for stems

def _count_stem_frequencies(stemmed_texts):
    """Count stem occurrences per text and across all texts.

    Parameters
    ----------
    stemmed_texts : dict
        Maps a text name to the list of stems of that text.

    Returns
    -------
    tuple
        ``(frequencies, all_stems)``. ``frequencies`` maps every text name, plus
        the extra key ``'total'``, to a ``{stem: count}`` dictionary; ``all_stems``
        is the concatenation of all stem lists in iteration order.
    """
    frequencies = {}
    all_stems = []
    for filename, stems in stemmed_texts.items():
        counts = {}
        for stem in stems:
            counts[stem] = counts.get(stem, 0) + 1
        frequencies[filename] = counts
        # collect all stems for the total frequency calculation
        all_stems.extend(stems)

    total_counts = {}
    for stem in all_stems:
        total_counts[stem] = total_counts.get(stem, 0) + 1
    frequencies['total'] = total_counts
    return frequencies, all_stems


def _print_text_frequencies(frequencies, stemmer_label):
    """Print one frequency line per text (the 'total' entry is skipped).

    Texts are numbered 1, 2, ... in sorted-name order. Iterating over the keys
    that exist avoids the KeyError that hard-coded keys such as 'Part_I_1-2/01'
    raise when the path separator differs (Windows) or the file set changes.
    """
    text_names = sorted(name for name in frequencies if name != 'total')
    for text_number, text_name in enumerate(text_names, start=1):
        print(f"Word frequencies {stemmer_label} text {text_number}: {frequencies[text_name]}\n")


# word frequencies (per text and in total) for the Porter-stemmed words
word_frequencies_porter, all_stemmed_words_porter = _count_stem_frequencies(stemmed_texts_porter)
_print_text_frequencies(word_frequencies_porter, "Porter")
print(f"Word frequencies Porter in total: {word_frequencies_porter['total']}\n")

# word frequencies (per text and in total) for the Lancaster-stemmed words
word_frequencies_lancaster, all_stemmed_words_lancaster = _count_stem_frequencies(stemmed_texts_lancaster)
_print_text_frequencies(word_frequencies_lancaster, "Lancaster")
print(f"Word frequencies Lancaster total: {word_frequencies_lancaster['total']}\n")

In [None]:
# plot the 25 most frequent Porter stems (per text and in total)
for story_name, freq_dict in word_frequencies_porter.items():
    # (Stem, Frequency) table, ordered by descending frequency, top 25 rows only
    df = (
        pd.DataFrame(list(freq_dict.items()), columns=['Stem', 'Frequency'])
        .sort_values(by='Frequency', ascending=False)
        .head(25)
    )
    ax = df.plot.bar(x='Stem', y='Frequency')
    ax.set(
        title=f"Top 25 Stems (Porter) in {story_name}",
        xlabel="Stems",
        ylabel="Frequency",
    )
   

In [None]:
# plot the 25 most frequent Lancaster stems (per text and in total)
for story_name, freq_dict in word_frequencies_lancaster.items():
    # (Stem, Frequency) table, ordered by descending frequency, top 25 rows only
    df = (
        pd.DataFrame(list(freq_dict.items()), columns=['Stem', 'Frequency'])
        .sort_values(by='Frequency', ascending=False)
        .head(25)
    )
    ax = df.plot.bar(x='Stem', y='Frequency')
    ax.set(
        title=f"Top 25 Stems (Lancaster) in {story_name}",
        xlabel="Stems",
        ylabel="Frequency",
    )

In [None]:
# Part I.3: translations of "The Adventures of Tom Sawyer"
# download the small German and Dutch spaCy models via the spaCy CLI, then load
# one pipeline per language (the small English model is downloaded in the first cell)
!spacy download de_core_news_sm
nlp_de = spacy.load("de_core_news_sm")
!spacy download nl_core_news_sm
nlp_nl = spacy.load("nl_core_news_sm")
nlp_en = spacy.load('en_core_web_sm')

In [None]:
# Define the paths of the three text files (English, German, Dutch);
# each is read and cleaned in its own cell below

file_path_en = 'Part_I_3/pg74.txt'
file_path_de = 'Part_I_3/pg30165.txt'
file_path_nl = 'Part_I_3/pg18381.txt'


def read_file(file_path):
    """Return the complete contents of the UTF-8 text file at `file_path` as one string.

    NOTE(review): this redefines the `read_file` helper declared earlier in the
    notebook; only the parameter name differs.
    """
    with open(file_path, encoding='utf8') as handle:
        return handle.read()


In [None]:
file_path_en = 'Part_I_3/pg74.txt'
# Cut away preamble, table of contents and licensing information in one chain:
#   1. keep what follows the first "PREFACE",
#   2. of that, keep what follows the first "CHAPTER I",
#   3. keep what precedes the Project Gutenberg end marker.
# A missing start marker leaves the text unchanged ([-1] of a one-element split).
text_en = (
    read_file(file_path_en)
    .split("PREFACE", 1)[-1]
    .split("CHAPTER I", 1)[-1]
    .split("*** END OF THE PROJECT GUTENBERG EBOOK", 1)[0]
)

# turn line breaks (LF and U+2028) into spaces and trim leading/trailing whitespace
cleaned_text_en = (
    text_en
    .replace('\n', ' ')
    .replace('\u2028', ' ')
    .strip()
)
cleaned_text_en

In [None]:
file_path_de = 'Part_I_3/pg30165.txt'
# keep the text between the first "Erstes Kapitel." heading and the
# Project Gutenberg end marker
text_de = (
    read_file(file_path_de)
    .split("Erstes Kapitel.", 1)[-1]
    .split("*** END OF THE PROJECT GUTENBERG EBOOK", 1)[0]
)

# turn line breaks (LF and U+2028) into spaces and trim leading/trailing whitespace
cleaned_text_de = (
    text_de
    .replace('\n', ' ')
    .replace('\u2028', ' ')
    .strip()
)
cleaned_text_de

In [None]:
file_path_nl = 'Part_I_3/pg18381.txt'
# keep the text between the first "HOOFDSTUK I." heading and the "AANTEEKENINGEN"
# (notes) section; [1] raises IndexError if the start heading is missing
text_nl = (
    read_file(file_path_nl)
    .split("HOOFDSTUK I.", 1)[1]
    .split("AANTEEKENINGEN", 1)[0]
)

# turn line breaks (LF and U+2028) into spaces and trim leading/trailing whitespace
cleaned_text_nl = (
    text_nl
    .replace('\n', ' ')
    .replace('\u2028', ' ')
    .strip()
)
cleaned_text_nl

In [None]:
# tokenize and extract POS tags from the English text

# load the small English pipeline and run it over the cleaned text
nlp_en = spacy.load("en_core_web_sm")
doc = nlp_en(cleaned_text_en)
# one "<token> - <POS tag>" line per token
for token in doc:
    print(f"{token.text} - {token.pos_}")

In [None]:
# tokenize and extract POS tags from the German translation

# load the small German pipeline and run it over the cleaned text
nlp_de = spacy.load("de_core_news_sm")
doc = nlp_de(cleaned_text_de)
# one "<token> - <POS tag>" line per token
for token in doc:
    print(f"{token.text} - {token.pos_}")

In [None]:
# tokenize and extract POS tags from the Dutch translation

# load the small Dutch pipeline and run it over the cleaned text
nlp_nl = spacy.load("nl_core_news_sm")
doc = nlp_nl(cleaned_text_nl)
# one "<token> - <POS tag>" line per token
for token in doc:
    print(f"{token.text} - {token.pos_}")

In [None]:
# Report the frequencies of the tags for the three languages- English 
# use built-in spacy function from the documentation (doc.count_by) for counting the occurrences of each POS tag

# POS is the attribute ID passed to doc.count_by; import it in this cell so the
# cell runs on a fresh kernel without relying on a later cell
from spacy.attrs import POS

nlp_en = spacy.load("en_core_web_sm")
doc = nlp_en(cleaned_text_en)
# mapping: integer POS ID -> number of tokens carrying that tag
pos_counts = doc.count_by(POS)

# convert integer IDs of POS tags to readable tags and sort by frequency
# (most frequent first; a plain sorted(...) would order by tag ID instead)
for pos, count in sorted(pos_counts.items(), key=lambda item: item[1], reverse=True):
    readable_tag = doc.vocab[pos].text
    print(f"{readable_tag}: {count}")

In [None]:
# Report the frequencies of the tags for the three languages- German 

from spacy.attrs import POS

nlp_de = spacy.load("de_core_news_sm")
doc = nlp_de(cleaned_text_de)
# mapping: integer POS ID -> number of tokens carrying that tag
pos_counts = doc.count_by(POS)

# convert integer IDs of POS tags to readable tags and sort by frequency
# (most frequent first; a plain sorted(...) would order by tag ID instead)
for pos, count in sorted(pos_counts.items(), key=lambda item: item[1], reverse=True):
    readable_tag = doc.vocab[pos].text
    print(f"{readable_tag}: {count}")

In [None]:
# Report the frequencies of the tags for the three languages- Dutch

from spacy.attrs import POS

nlp_nl = spacy.load("nl_core_news_sm")
doc = nlp_nl(cleaned_text_nl)
# mapping: integer POS ID -> number of tokens carrying that tag
pos_counts = doc.count_by(POS)

# convert integer IDs of POS tags to readable tags and sort by frequency
# (most frequent first; a plain sorted(...) would order by tag ID instead)
for pos, count in sorted(pos_counts.items(), key=lambda item: item[1], reverse=True):
    readable_tag = doc.vocab[pos].text
    print(f"{readable_tag}: {count}")

In [None]:
# Part 2: perform Named Entity Recognition on texts from Part_I_1-2
from spacy import displacy 

# Load the large English model (downloaded in the first cell); it is used for
# the named entity recognition on the five texts below
nlp_en_lg = spacy.load("en_core_web_lg")

def read_file(file_path):
    """Return the complete contents of the UTF-8 text file at `file_path` as one string.

    NOTE(review): `read_file` is already defined earlier in the notebook with the
    same behavior; this definition replaces it.
    """
    with open(file_path, encoding='utf8') as handle:
        return handle.read()
 
file_path_1 = 'Part_I_1-2/01.txt'
text_1 = read_file(file_path_1)
# map both line-break characters (LF and U+2028) to a space, then trim the ends
cleaned_text_1 = text_1.translate(str.maketrans({'\n': ' ', '\u2028': ' '})).strip()
doc = nlp_en_lg(cleaned_text_1)

# highlight the recognized named entities inline with displaCy
displacy.render(doc, style="ent")

In [None]:
# repeat the entity recognition for the remaining texts: text 2
file_path_2 = 'Part_I_1-2/02.txt'
text_2 = read_file(file_path_2)
# map both line-break characters (LF and U+2028) to a space, then trim the ends
cleaned_text_2 = text_2.translate(str.maketrans({'\n': ' ', '\u2028': ' '})).strip()
doc = nlp_en_lg(cleaned_text_2)

# highlight the recognized named entities inline with displaCy
displacy.render(doc, style="ent")

In [None]:
# entity recognition for text 3
file_path_3 = 'Part_I_1-2/03.txt'
text_3 = read_file(file_path_3)
# map both line-break characters (LF and U+2028) to a space, then trim the ends
cleaned_text_3 = text_3.translate(str.maketrans({'\n': ' ', '\u2028': ' '})).strip()
doc = nlp_en_lg(cleaned_text_3)

# highlight the recognized named entities inline with displaCy
displacy.render(doc, style="ent")

In [None]:
# entity recognition for text 4
file_path_4 = 'Part_I_1-2/04.txt'
text_4 = read_file(file_path_4)
# map both line-break characters (LF and U+2028) to a space, then trim the ends
cleaned_text_4 = text_4.translate(str.maketrans({'\n': ' ', '\u2028': ' '})).strip()
doc = nlp_en_lg(cleaned_text_4)

# highlight the recognized named entities inline with displaCy
displacy.render(doc, style="ent")

In [None]:
# entity recognition for text 5
file_path_5 = 'Part_I_1-2/05.txt'
text_5 = read_file(file_path_5)
# map both line-break characters (LF and U+2028) to a space, then trim the ends
cleaned_text_5 = text_5.translate(str.maketrans({'\n': ' ', '\u2028': ' '})).strip()
doc = nlp_en_lg(cleaned_text_5)

# highlight the recognized named entities inline with displaCy
displacy.render(doc, style="ent")

From text 1
# Manual annotation
[Missy]_PERSON was sitting on a chair in a [house]_FAC, maps and papers spread around, normally her planning was mental, but [River]_PERSON had suggested a physical map to refer to and it was helpful.
# Automatic annotation
[Missy]_PERSON was sitting on a chair in a house, maps and papers spread around, normally her planning was mental, but [River]_LOC had suggested a physical map to refer to and it was helpful.

From text 2
# Manual annotation
She laughed and smiled. She wished it could have been him after he knew who she was, because what he was mostly talking about was [1969]_DATE , she wanted to know how her parents were in [New York]_GPE , she wanted the more mature version of him, but she lent against him, sitting in the garden, gently pulling him up to show him the [three]_CARDINAL children she had been left with, it wasn’t night yet and they were playing together. 
# Automatic annotation 
She laughed and smiled. She wished it could have been him after he knew who she was, because what he was mostly talking about was [1969]_DATE , she wanted to know how her parents were in [New York]_GPE , she wanted the more mature version of him, but she lent against him, sitting in the garden, gently pulling him up to show him the [three]_CARDINAL children she had been left with, it wasn’t night yet and they were playing together. 

From text 3
# Manual annotation
Slowly, the [Bad Wolf]_ENTITY held [Rose]_PERSON's hand out, and [the Doctor]_PERSON took it. She stepped to him, inches apart. 'We will make you a deal. The [TARDIS]_PRODUCT, the [Vortex]_LOC, and I'.
# Automatic annotation
Slowly, the Bad Wolf held [Rose]_PERSON's hand out, and the Doctor took it. She stepped to him, inches apart. 'We will make you a deal. The [TARDIS]_ORG, the [Vortex]_LOC, and [I." Rose]_ORG 


From text 4
# Manual annotation
Then [The Doctor]_PERSON was repairing some of the wiring under the console. He had recently noticed that the emergency system buttons weren’t translating for [Rose]_PERSON correctly. The [TARDIS]_PRODUCT wasn’t meant to translate [Gallifreyan]_LANGUAGE to other languages.
# Automatic annotation
Then The Doctor was repairing some of the wiring under the console. He had recently noticed that the emergency system buttons weren’t translating for [Rose]_PERSON correctly. The TARDIS wasn’t meant to translate [Gallifreyan]_PRODUCT to other languages.


From text 5
# Manual annotation
You met his kind with [the Doctor]_PERSON a [few months back]_DATE. They were called [Torwash]_NORP  or [Torvash]_NORP  , you weren’t sure. You may or may not have set off a couple of explosives to blow up their building on planet [Serentara]_LOC , ruining their plans to enslave its population.
# Automatic annotation
You met his kind with the Doctor a few months back. They were called [Torwash]_PERSON  or [Torvash]_PERSON  , you weren’t sure. You may or may not have set off a couple of explosives to blow up their building on planet [Serentara]_ORG , ruining their plans to enslave its population.

In [None]:
# calculate the Precision, Recall and F1 Score

from sklearn.metrics import classification_report
# One entry per annotated span, in the order of the annotation examples above:
# y_true is the manual label, y_pred the label assigned automatically.
# The empty string '' marks "no entity assigned" on that side.
y_true = ['PERSON', 'FAC', 'PERSON', 'DATE', 'GPE', 'CARDINAL', 'ENTITY', 'PERSON', 'PERSON', 'PRODUCT', 'LOC', '', 'PERSON', 'PERSON', 'PRODUCT', 'LANGUAGE', 'PERSON', 'DATE', 'NORP', 'NORP', 'LOC']
y_pred = ['PERSON', '', 'LOC', 'DATE', 'GPE', 'CARDINAL', '', 'PERSON', '', 'ORG', 'LOC', 'ORG', '', 'PERSON', '', 'PRODUCT', '', '', 'PERSON', 'PERSON', 'ORG']
# labels in the order in which the report rows should appear
target_names = ['PERSON', 'FAC', 'DATE', 'GPE', 'CARDINAL', 'ENTITY', 'PRODUCT', 'LOC', 'LANGUAGE', 'NORP', 'ORG', '']
# `target_names` are only display names and are matched to the labels by position.
# Without `labels`, scikit-learn uses the sorted label set ('', 'CARDINAL', 'DATE', ...),
# so every row would be printed under the wrong name. Passing the same list as
# `labels` ties each row's scores to its real label.
print(classification_report(y_true, y_pred, labels=target_names, target_names=target_names, zero_division = 0))
