<a href="https://colab.research.google.com/github/JonasVerbickas/test-jupyter/blob/main/NLP_CW.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [95]:
import os
import time
import nltk
import string
import re
import unicodedata

In [96]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [97]:
porter = nltk.PorterStemmer()

In [110]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.corpus.stopwords.words('english')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [99]:
class StringWithDocId:
  def __init__(self, string, doc_id):
    self.string = string
    self.doc_id = doc_id
  
  def __lt__(token_with_doc_A, token_with_doc_B):
    if token_with_doc_A.string == token_with_doc_B.string:
      return token_with_doc_A.doc_id < token_with_doc_B.doc_id
    else:
      return token_with_doc_A.string < token_with_doc_B.string
    
  def __str__(self):
    return f"{self.string}: {self.doc_id}"

  def __repr__(self):
    return f"{self.string}: {self.doc_id}"

In [100]:
class StringWithDocIdAndPosition(StringWithDocId):
  def __init__(self, string, doc_id, position):
    super().__init__(string, doc_id)
    self.position = position
  
  def __lt__(token_with_doc_A, token_with_doc_B):
    if token_with_doc_A.string != token_with_doc_B.string:
      return token_with_doc_A.string < token_with_doc_B.string
    elif token_with_doc_A.doc_id != token_with_doc_B.doc_id:
      return token_with_doc_A.doc_id < token_with_doc_B.doc_id
    else:
      return token_with_doc_A.position < token_with_doc_B.position

In [101]:
class Posting():
  def __init__(self, doc_id, position, doc_freq):
    self.doc_id = doc_id
    self.position = position
    self.doc_freq = doc_freq

  def __str__(self):
    return f"({self.doc_id} {self.position})"
  
  def __repr__(self):
    return f"({self.doc_id} {self.position})"

In [102]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [174]:
def wikipediaPreprocessing(text):
  text = unicodedata.normalize('NFKD', text)
  text = text.replace('\n', '')
  text = re.sub(r'\[.*?\]+', '', text)
  # text = text.translate(str.maketrans('', '', string.punctuation))
  text = re.sub(r'Contents\s+1\s+Plot.+Plot', '', text) 
  text = re.sub(r'^.+From Wikipedia, the free encyclopedia.+List of episodes', '', text)
  return text

In [175]:
STOP_WORDS = nltk.corpus.stopwords.words('english')
class InvertedIndex:
    """
    Construct Inverted Index
    """
    def __init__(self):
        self.inverted_index = {}

    def __getitem__(self, key):
        return self.inverted_index[key]
      
    def keys(self):
      return self.inverted_index.keys()
        
    def read_data(self, path: str) -> list:
        """
        Read files from a directory and then append the data of each file into a list.
        """
        output = []
        for file in os.listdir(path):
          with open(os.path.join(path, file), 'r') as f:
            if file[-3:] != 'txt':
              print(file, "will be skipped")
            else:
              output.append(StringWithDocId(f.read(), file))
        print(len(output))
        return output

    def process_document(self, document: str) -> list:
        """
        pre-process a document and return a list of its terms
        str->list"""
        # 1. Wikipedia hyperlinks should be removed
        # 4. Use multi-word character/location names from .csv files
        text = wikipediaPreprocessing(document)
        tokenized  = nltk.tokenize.word_tokenize(text)
        output = []
        for token in tokenized:
          # 2. ignore stop-words
          if token in STOP_WORDS:
            continue
          # 2. porter stemmer makes everything lowercase as well
          stemmed = porter.stem(token)
          output.append(stemmed)
          
        return output
    
    def index_corpus(self, documents: list) -> None:
        """
        index given documents
        list->None"""
        starting_time = time.perf_counter()
        token_list = []
        # 1. Generate token sequence
        for doc in documents:
          curr_doc_id = doc.doc_id
          processed_string = self.process_document(doc.string)
          for i, token in enumerate(processed_string):
            token_with_doc_id_and_pos = StringWithDocIdAndPosition(token, curr_doc_id, i)
            token_list.append(token_with_doc_id_and_pos)
        # 2. Sort
        sorted_token_list = sorted(token_list)
        print("First 10 of sorted_token_list:", sorted_token_list[:10])
        # 3. Convert into dictionary of postings
        for token in sorted_token_list:
          if token.string in self.inverted_index:
            self.inverted_index[token.string].append(Posting(token.doc_id, token.position, 1))
          else:
            self.inverted_index[token.string] = [Posting(token.doc_id, token.position, 1)]
        # Print out some details about the dataset
        total_time_taken = round(time.perf_counter() - starting_time, 4)
        print(f"It took: {total_time_taken} seconds to index the whole corpus.")
        print(f"It has {len(self.inverted_index)} entries in total.")
      
    def dump(self, path: str) -> None:
        """
        provide a dump function to show index entries for a given set of terms        
        """
        if os.path.exists(path) == False:
          print("Path to file you provided doesn't exist")
          return
        with open(path, 'r') as f:
          file_contents = f.read()
          examples = file_contents.split('\n')
          for e in examples:
            processed_e = " ".join(self.process_document(e))
            try:
              print(f'Match for {e}:', self.inverted_index[processed_e])
            except KeyError:
              print(e, "was not found")

     
    def proximity_search(self, term1: str, term2: str, window_size: int = 3) -> dict:
        """
        1) check whether given two terms appear within a window
        2) calculate the number of their co-existance in a document
        3) add the document id and the number of matches into a dict
        return the dict"""
        term1 = " ".join(self.process_document(term1))
        term2 = " ".join(self.process_document(term2))
        documents_containing_both_terms = []
        term1_i = 0
        term2_i = 0
        while term1_i < len(self.inverted_index[term1]) and term2_i < len(self.inverted_index[term2]):
          if self.inverted_index[term1][term1_i].doc_id == self.inverted_index[term2][term2_i].doc_id:
            if abs(self.inverted_index[term1][term1_i].position - self.inverted_index[term2][term2_i].position) <= window_size:
              documents_containing_both_terms.append((str(self.inverted_index[term1][term1_i]), str(self.inverted_index[term2][term2_i])))
            term1_i += 1
            term2_i += 1
          else:
            if self.inverted_index[term1][term1_i].doc_id < self.inverted_index[term2][term2_i].doc_id:
              term1_i += 1
            else:
              term2_i += 1
        return documents_containing_both_terms

    

In [176]:
def testReadData():
  index = InvertedIndex()
  corpus = index.read_data('/content/drive/MyDrive/Colab Notebooks/Simpsons2022')
  return (corpus[0]).string
testReadData()

simpsons_characters - row.csv will be skipped
simpsons_locations row.csv will be skipped
118


'Mr. Lisa Goes to Washington\nFrom Wikipedia, the free encyclopedia\n\n\n\nJump to navigation\nJump to search\n"Mr. Lisa Goes to Washington"\nThe Simpsons\xa0episode\nEpisode\xa0no.\nSeason\xa03\nEpisode 2\nDirected by\nWes Archer\nWritten by\nGeorge Meyer\nProduction code\n8F01\nOriginal air date\nSeptember 26, 1991[1]\n\nEpisode features\nChalkboard gag\n"Spitwads are not free speech"\nCouch gag\nThe family sits down and Homer pulls\xa0Santa\'s Little Helper\xa0from under him.\nCommentary\nMatt Groening\nAl Jean\nMike Reiss\nJulie Kavner\nWes Archer\nDavid Silverman\nEpisode chronology\n←\xa0Previous\n"Stark Raving Dad"\nNext\xa0→\n"When Flanders Failed"\nThe Simpsons\xa0(season 3)\nList of episodes\n"Mr. Lisa Goes to Washington" is the second episode of the\xa0third season\xa0of the American animated television series\xa0The Simpsons. It originally aired on the\xa0Fox network\xa0in the United States on September 26, 1991. In the episode,\xa0Lisa\xa0wins a patriotic essay contest abo

In [177]:
def testTokenzination():
    index = InvertedIndex() # initilaise the index
    corpus = index.read_data('/content/drive/MyDrive/Colab Notebooks/Simpsons2022') # specify the directory path in which files are located
    text = (corpus[0]).string
    tokenized = index.process_document(text)
    return tokenized
    
testTokenzination()[:20]

simpsons_characters - row.csv will be skipped
simpsons_locations row.csv will be skipped
118


['``',
 'mr.',
 'lisa',
 'goe',
 'washington',
 "''",
 'second',
 'episod',
 'third',
 'season',
 'american',
 'anim',
 'televis',
 'seri',
 'the',
 'simpson',
 '.',
 'it',
 'origin',
 'air']

In [178]:
def main():
    "main call function"
    index = InvertedIndex() # initilaise the index
    corpus = index.read_data('/content/drive/MyDrive/Colab Notebooks/Simpsons2022') # specify the directory path in which files are located
    index.index_corpus(corpus) # index documents/corpus
    return index
    
index = main()

simpsons_characters - row.csv will be skipped
simpsons_locations row.csv will be skipped
118
First 10 of sorted_token_list: [!: 3.11.txt, !: 3.14.txt, !: 3.18.txt, !: 3.19.txt, !: 3.2.txt, !: 3.2.txt, !: 3.20.txt, !: 3.20.txt, !: 3.20.txt, !: 3.20.txt]
It took: 6.446 seconds to index the whole corpus.
It has 12373 entries in total.


In [179]:
index.dump("/content/drive/MyDrive/Colab Notebooks/26957722.txt")

Match for Bart: [(3.1.txt 52), (3.1.txt 250), (3.1.txt 286), (3.1.txt 349), (3.1.txt 376), (3.1.txt 381), (3.1.txt 399), (3.1.txt 416), (3.1.txt 596), (3.1.txt 608), (3.1.txt 932), (3.1.txt 936), (3.1.txt 1232), (3.1.txt 1265), (3.1.txt 1404), (3.1.txt 2133), (3.1.txt 2178), (3.10.txt 1089), (3.10.txt 1302), (3.10.txt 1478), (3.10.txt 1588), (3.11.txt 433), (3.11.txt 670), (3.11.txt 799), (3.11.txt 835), (3.12.txt 48), (3.12.txt 57), (3.12.txt 151), (3.12.txt 191), (3.12.txt 206), (3.12.txt 250), (3.12.txt 256), (3.12.txt 501), (3.12.txt 506), (3.12.txt 528), (3.12.txt 604), (3.12.txt 739), (3.13.txt 2), (3.13.txt 30), (3.13.txt 61), (3.13.txt 75), (3.13.txt 121), (3.13.txt 164), (3.13.txt 171), (3.13.txt 186), (3.13.txt 277), (3.13.txt 292), (3.13.txt 302), (3.13.txt 314), (3.13.txt 322), (3.13.txt 349), (3.13.txt 364), (3.13.txt 383), (3.13.txt 707), (3.13.txt 811), (3.13.txt 839), (3.13.txt 917), (3.13.txt 966), (3.13.txt 1062), (3.13.txt 1069), (3.13.txt 1114), (3.13.txt 1138), (3.

In [180]:
index.proximity_search('award', 'winning', 100)

[('(3.13.txt 950)', '(3.13.txt 1016)'),
 ('(3.21.txt 462)', '(3.21.txt 460)'),
 ('(3.21.txt 464)', '(3.21.txt 482)'),
 ('(3.21.txt 792)', '(3.21.txt 789)'),
 ('(3.6.txt 911)', '(3.6.txt 975)'),
 ('(4.19.txt 196)', '(4.19.txt 268)'),
 ('(4.19.txt 213)', '(4.19.txt 297)'),
 ('(5.1.txt 388)', '(5.1.txt 386)'),
 ('(5.15.txt 143)', '(5.15.txt 140)'),
 ('(5.17.txt 35)', '(5.17.txt 32)'),
 ('(5.9.txt 413)', '(5.9.txt 414)'),
 ('(6.19.txt 64)', '(6.19.txt 75)'),
 ('(6.19.txt 758)', '(6.19.txt 769)'),
 ('(6.2.txt 203)', '(6.2.txt 174)'),
 ('(6.22.txt 1250)', '(6.22.txt 1263)'),
 ('(7.12.txt 709)', '(7.12.txt 668)'),
 ('(7.8.txt 1035)', '(7.8.txt 1064)')]