## Goal was to do an linguistic analysis of a text with spaCy, making use of POS tagging and NER

In [1]:
import re
import spacy
from spacy import displacy
import nltk
from nltk.corpus import stopwords
from prettytable import PrettyTable
from collections import Counter

## Clean Corpus

In [None]:
# Get the corpus
with open("sample.txt", "r", encoding="utf-8") as file:
    text = file.read()  # Reads the entire file

# Remove Illustrations comments
text = re.sub(r'\[Illustration:.*?\]', '', text, flags=re.DOTALL).strip()

# Remove underscores around words
text = re.sub(r'([^\n]+)_', r'\1', text)

# Split text into lines
lines = text.split("\n")

# Remove the first line
lines = lines[1:]

# Remove chapter numeration (Roman numerals)
filtered_lines = []
for i, line in enumerate(lines): # Iterate over lines
    if re.match(r'^\s*[IVXLCDM]+\s*$', line):  # Check if line is only Roman numerals
        continue  # Skip only the numeration line
    filtered_lines.append(line)

text = "\n".join(filtered_lines) # Text without numeration

# Lower text
text = text.lower()

# Remove punctuation
text = re.sub(r'[^\w\s]', '', text)  # Removes all non-word characters except spaces

# Remove extra spaces
text = re.sub(r'\s+', ' ', text).strip()

# Remove stopwords
nltk.download('stopwords')  # Uncomment if running for the first time
stop_words = set(stopwords.words('english'))
filtered_words = [word for word in text.split() if word not in stop_words]
text = ' '.join(filtered_words)
    
# Load SpaCy's English model
nlp = spacy.load("en_core_web_sm")

# Process the text with SpaCy
doc = nlp(text)

# Tokenize and normalize text
tokens = []
for token in doc:
    tokens.append(token.lemma_)  # Lemmatize

# Cleaned text
cleaned_text = " ".join(tokens)


## POS Tagging with most common POS tags

In [3]:
# Process text (string) with spacy again to apply POS
doc = nlp(cleaned_text)

# POS Tagging
pos_tags = [token.pos_ for token in doc]  # Collect all POS tags

# Analysis: Determine the most common POS tags
pos_counts = Counter(pos_tags)  # Count the frequency of each POS tag
sorted_pos_counts = pos_counts.most_common()  # Sort by frequency

# Create a table to display the frequency of POS tags
pos_count_table = PrettyTable(["POS Tag", "Frequency"])
for pos, count in sorted_pos_counts:
    pos_count_table.add_row([pos, count])

print("\nMost Common POS Tags:\n", pos_count_table)


Most Common POS Tags:
 +---------+-----------+
| POS Tag | Frequency |
+---------+-----------+
|   NOUN  |    2593   |
|   VERB  |    1584   |
|   ADJ   |    1217   |
|  PROPN  |    855    |
|   ADV   |    641    |
|   AUX   |    402    |
|   PRON  |    177    |
|   PART  |    170    |
|   NUM   |    112    |
|   ADP   |    108    |
|   INTJ  |     56    |
|  SCONJ  |     25    |
|   DET   |     24    |
|  CCONJ  |     8     |
|    X    |     3     |
+---------+-----------+


## NER

In [4]:
# Collect labels of named entities
entities = [(ent.label_) for ent in doc.ents]

# Analysis: Determine the most common NER labels
ner_counts = Counter(entities)  # Count the frequency of each label
sorted_ner_counts = ner_counts.most_common()  # Sort by frequency

# Create a table to display the frequency of NER labels
ner_count_table = PrettyTable(["Label", "Frequency"])
for label, count in sorted_ner_counts:
    ner_count_table.add_row([label, count])

print("\nMost Common NER Labels:\n", ner_count_table)


Most Common NER Labels:
 +----------+-----------+
|  Label   | Frequency |
+----------+-----------+
|  PERSON  |    190    |
| CARDINAL |     61    |
|   DATE   |     23    |
| ORDINAL  |     23    |
|   TIME   |     20    |
|   ORG    |     9     |
|   GPE    |     1     |
|   FAC    |     1     |
+----------+-----------+


In [5]:
# Show frequency of each named entity
# Collect text and labels of named entities
entities_2 = [(ent.text, ent.label_) for ent in doc.ents]

# Analysis: Determine the most common NER labels
ner_counts_2 = Counter(entities_2)  # Count the frequency of each label
sorted_ner_counts_2 = ner_counts_2.most_common()  # Sort by frequency

# Create a table to display the frequency of NER labels
ner_count_table_2 = PrettyTable(["Text and Label", "Frequency"])
for label, count in sorted_ner_counts_2:
    ner_count_table_2.add_row([label, count])

print("\nMost Common NER Labels (with Text):\n", ner_count_table_2)


Most Common NER Labels (with Text):
 +------------------------------------------------+-----------+
|                 Text and Label                 | Frequency |
+------------------------------------------------+-----------+
|               ('joe', 'PERSON')                |     57    |
|              ('one', 'CARDINAL')               |     43    |
|            ('joe otter', 'PERSON')             |     18    |
|              ('first', 'ORDINAL')              |     18    |
|            ('sammy jay', 'PERSON')             |     14    |
|              ('two', 'CARDINAL')               |     12    |
|              ('tomorrow', 'DATE')              |     7     |
|              ('peter', 'PERSON')               |     7     |
|               ('bush', 'PERSON')               |     7     |
|              ('morning', 'TIME')               |     6     |
|          ('jerry muskrat', 'PERSON')           |     5     |
|             ('second', 'ORDINAL')              |     5     |
|             ('t

In [6]:
print('\n NER with spaCy')

#function takes a doc (string) and prints NER table with all identified entities from the doc
def NER_spacy (doc):
    tab2 = PrettyTable(['Entity', 'Label'])
    for e in doc.ents:
        tab2.add_row([e.text, e.label_])

    print(tab2)

#run the language module
my_doc = nlp(cleaned_text)

displacy.render((my_doc), jupyter=True, style='ent') #this is for the colour NEW


 NER with spaCy
