<a href="https://colab.research.google.com/github/JonasVerbickas/test-jupyter/blob/main/NLP_CW.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [26]:
import os
import time
import nltk
import string
import re
import unicodedata

In [27]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [28]:
porter = nltk.PorterStemmer()

In [29]:
nltk.download('stopwords')
nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [30]:
class StringWithDocId:
  def __init__(self, string, doc_id):
    self.string = string
    self.doc_id = doc_id
  
  def __lt__(token_with_doc_A, token_with_doc_B):
    if token_with_doc_A.string == token_with_doc_B.string:
      return token_with_doc_A.doc_id < token_with_doc_B.doc_id
    else:
      return token_with_doc_A.string < token_with_doc_B.string
    
  def __str__(self):
    return f"{self.string}: {self.doc_id}"

  def __repr__(self):
    return f"{self.string}: {self.doc_id}"

In [31]:
class StringWithPosition(StringWithDocId):
  def __init__(self, string, doc_id, position):
    super().__init__(string, doc_id)
    self.position = position
  

In [32]:
def wikipediaPreprocessing(text):
    text = unicodedata.normalize('NFKC', text)
    text = re.sub(r'\[.*?\]+', '', text) # index documents/corpus
    text = text.replace('\n', ' ')
    text = text.replace('\t', ' ')
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'Contents\s+1\s+Plot.+Plot', '', text) 
    text = re.sub(r'From Wikipedia, the free encyclopedia.+List of episodes', '', text)
    return text

In [39]:
STOP_WORDS = [porter.stem(w) for w in nltk.corpus.stopwords.words('english')]#"the", "is", "am", "are", "will", "was", "were", "to", 'be', 'or', 'not'] 
print("as list", len(STOP_WORDS))
STOP_WORDS = set(STOP_WORDS)
print("as set", len(STOP_WORDS))

class InvertedIndex:
    """
    Construct Inverted Index
    """
    def __init__(self):
        self.inverted_index = {}

    def __getitem__(self, key):
        return self.inverted_index[key]
      
    def keys(self):
      return self.inverted_index.keys()
        
    def read_data(self, path: str) -> list:
        """
        Read files from a directory and then append the data of each file into a list.
        """
        output = []
        for file in os.listdir(path):
          with open(os.path.join(path, file), 'r') as f:
            if file[-3:] != 'txt':
              print(file, "will be skipped")
            else:
              output.append(StringWithDocId(f.read(), file))
        print(len(output))
        return output

    def process_document(self, document: str) -> list:
        """
        pre-process a document and return a list of its terms
        str->list"""
        # 1. Wikipedia hyperlinks should be removed
        # 4. Use multi-word character/location names from .csv files
        text = wikipediaPreprocessing(document)
        tokenized  = nltk.tokenize.wordpunct_tokenize(text)
        output = []
        for token in tokenized:
          # 2. porter stemmer makes everything lowercase as well
          stemmed = porter.stem(token)
          # 3. ignore stop-words
          if stemmed not in STOP_WORDS: 
            output.append(stemmed)
        return output
    
    def index_corpus(self, documents: list) -> None:
        """
        index given documents
        list->None"""
        starting_time = time.perf_counter()
        token_list = []
        # 1. Generate token sequence
        for doc in documents:
          curr_doc_id = doc.doc_id
          processed_string = self.process_document(doc.string)
          for i, token in enumerate(processed_string):
            token_with_doc_id_and_pos = StringWithPosition(token, curr_doc_id, i)
            token_list.append(token_with_doc_id_and_pos)
        # 2. Sort
        sorted_token_list = sorted(token_list)
        print("First 10 of sorted_token_list:", sorted_token_list[:10])
        # 3. Convert into dictionary of postings
        for token in sorted_token_list:
          if token.string in self.inverted_index:
            self.inverted_index[token.string].append((token.doc_id, token.position))
          else:
            self.inverted_index[token.string] = [(token.doc_id, token.position)]
        # Print out some details about the dataset
        total_time_taken = round(time.perf_counter() - starting_time, 4)
        print(f"It took: {total_time_taken} seconds to index the whole corpus.")
        print(f"It has {len(self.inverted_index)} entries in total.")
      
    def dump(self, examples: list) -> None:
        """
        provide a dump function to show index entries for a given set of terms        
        """
        for e in examples:
          processed_e = " ".join(self.process_document(e))
          try:
            print(f'Match for {e}:', self.inverted_index[processed_e])
          except KeyError:
            print(e, "was not found")

     
    def proximity_search(self, term1: str, term2: str, window_size: int = 3) -> dict:
        """
        1) check whether given two terms appear within a window
        2) calculate the number of their co-existance in a document
        3) add the document id and the number of matches into a dict
        return the dict"""
        documents_containing_both_terms = []
        term1_i = 0
        term2_i = 0
        while term1_i < len(self[term1]) and term2_i < len(self[term2]):
          if self[term1][term1_i] == self[term2][term2_i]:
            documents_containing_both_terms.append(self[term1][term1_i])
            term1_i += 1
            term2_i += 1
          else:
            if self[term1][term1_i] < self[term2][term2_i]:
              term1_i += 1
            else:
              term2_i += 1
        return documents_containing_both_terms
    
    

as list 179
as set 171


In [34]:
def testReadData():
  index = InvertedIndex()
  corpus = index.read_data('/content/drive/MyDrive/Colab Notebooks/Simpsons2022')
  return (corpus[1]).string
testReadData()

simpsons_characters - row.csv will be skipped
simpsons_locations row.csv will be skipped
118


'Mr. Lisa Goes to Washington\nFrom Wikipedia, the free encyclopedia\n\n\n\nJump to navigation\nJump to search\n"Mr. Lisa Goes to Washington"\nThe Simpsons\xa0episode\nEpisode\xa0no.\nSeason\xa03\nEpisode 2\nDirected by\nWes Archer\nWritten by\nGeorge Meyer\nProduction code\n8F01\nOriginal air date\nSeptember 26, 1991[1]\n\nEpisode features\nChalkboard gag\n"Spitwads are not free speech"\nCouch gag\nThe family sits down and Homer pulls\xa0Santa\'s Little Helper\xa0from under him.\nCommentary\nMatt Groening\nAl Jean\nMike Reiss\nJulie Kavner\nWes Archer\nDavid Silverman\nEpisode chronology\n←\xa0Previous\n"Stark Raving Dad"\nNext\xa0→\n"When Flanders Failed"\nThe Simpsons\xa0(season 3)\nList of episodes\n"Mr. Lisa Goes to Washington" is the second episode of the\xa0third season\xa0of the American animated television series\xa0The Simpsons. It originally aired on the\xa0Fox network\xa0in the United States on September 26, 1991. In the episode,\xa0Lisa\xa0wins a patriotic essay contest abo

In [35]:
def testTokenzination():
    index = InvertedIndex() # initilaise the index
    corpus = index.read_data('/content/drive/MyDrive/Colab Notebooks/Simpsons2022') # specify the directory path in which files are located
    text = (corpus[1]).string
    text = wikipediaPreprocessing(text)
    tokenized = split_into_words = nltk.tokenize.wordpunct_tokenize(text)
    return tokenized
    
testTokenzination()[:20]

simpsons_characters - row.csv will be skipped
simpsons_locations row.csv will be skipped
118


['Mr',
 'Lisa',
 'Goes',
 'to',
 'Washington',
 'From',
 'Wikipedia',
 'the',
 'free',
 'encyclopedia',
 'Jump',
 'to',
 'navigation',
 'Jump',
 'to',
 'search',
 'Mr',
 'Lisa',
 'Goes',
 'to']

In [36]:
def testStemming():
    index = InvertedIndex() # initilaise the index
    corpus = index.read_data('/content/drive/MyDrive/Colab Notebooks/Simpsons2022') # specify the directory path in which files are located
    text = (corpus[1]).string
    text = wikipediaPreprocessing(text)
    tokenized = split_into_words = nltk.tokenize.wordpunct_tokenize(text)
    output = []
    for token in tokenized:
      output.append(porter.stem(token))
    return output
    
testStemming()[:20]

simpsons_characters - row.csv will be skipped
simpsons_locations row.csv will be skipped
118


['mr',
 'lisa',
 'goe',
 'to',
 'washington',
 'from',
 'wikipedia',
 'the',
 'free',
 'encyclopedia',
 'jump',
 'to',
 'navig',
 'jump',
 'to',
 'search',
 'mr',
 'lisa',
 'goe',
 'to']

In [37]:
def main():
    "main call function"
    index = InvertedIndex() # initilaise the index
    corpus = index.read_data('/content/drive/MyDrive/Colab Notebooks/Simpsons2022') # specify the directory path in which files are located
    index.index_corpus(corpus) # index documents/corpus
    return index
    
index = main()

simpsons_characters - row.csv will be skipped
simpsons_locations row.csv will be skipped
118
First 10 of sorted_token_list: [1: 3.1.txt, 1: 3.1.txt, 1: 3.10.txt, 1: 3.17.txt, 1: 3.18.txt, 1: 3.3.txt, 1: 3.5.txt, 1: 4.1.txt, 1: 4.14.txt, 1: 4.18.txt]
It took: 6.0889 seconds to index the whole corpus.
It has 10436 entries in total.


In [38]:
dump_list = ["Bart",
  "first",
  "image",
  "montage",
  "well",
  "top",
  "arguably",
  "best",
  "number",
  "humor",
  "dollarydoos",
  "Bart Simpson",
  "Gordie Howe",
  "recalled",
  "Bart the Lover"]
index.dump(dump_list)

Match for Bart: [('3.1.txt', 147), ('3.1.txt', 292), ('3.1.txt', 323), ('3.1.txt', 373), ('3.1.txt', 395), ('3.1.txt', 400), ('3.1.txt', 413), ('3.1.txt', 427), ('3.1.txt', 575), ('3.1.txt', 585), ('3.1.txt', 846), ('3.1.txt', 849), ('3.1.txt', 1075), ('3.1.txt', 1096), ('3.1.txt', 1203), ('3.1.txt', 1737), ('3.1.txt', 1770), ('3.10.txt', 928), ('3.10.txt', 1092), ('3.10.txt', 1217), ('3.10.txt', 1295), ('3.11.txt', 422), ('3.11.txt', 613), ('3.11.txt', 707), ('3.11.txt', 735), ('3.12.txt', 105), ('3.12.txt', 145), ('3.12.txt', 151), ('3.12.txt', 214), ('3.12.txt', 240), ('3.12.txt', 247), ('3.12.txt', 276), ('3.12.txt', 281), ('3.12.txt', 486), ('3.12.txt', 489), ('3.12.txt', 507), ('3.12.txt', 562), ('3.12.txt', 662), ('3.13.txt', 1), ('3.13.txt', 10), ('3.13.txt', 77), ('3.13.txt', 97), ('3.13.txt', 121), ('3.13.txt', 132), ('3.13.txt', 165), ('3.13.txt', 203), ('3.13.txt', 206), ('3.13.txt', 216), ('3.13.txt', 241), ('3.13.txt', 288), ('3.13.txt', 300), ('3.13.txt', 306), ('3.13.tx