# Intro to NLP - Assignment 1

In [160]:
import numpy as np
import re
import torch
import pprint as pp

In [161]:
paragraph_text = """In 'Pride and Prejudice' by Jane Austen, Elizabeth Bennett meets Mr Darcy at a ball hosted by her friend @charles_bingly. They dance, but Mr Darcy finds her behaviour "tolerable, but not handsome enough to tempt him" #rude. She later visits Pemberley, Mr Darcy's estate, where she learns more about his Character. Check out more information at https://janeausten.co.uk. This mail id: abc@mno.xyz is a test mail-id."""

## 1 - Tokenizer

In [162]:
# def sentence_tokenizer_updated(text):
#     # First, split sentences based on periods, question marks, and exclamation marks
#     sentence_pattern = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!|\n)\s')
#     sentences = re.split(sentence_pattern, text)

#     # Tokenize punctuation marks as individual tokens for each sentence
#     tokenized_sentences = []
#     for sentence in sentences:
#         lines = sentence.split('\n')
#         tokenized_sentence = []
#         for line in lines:
#             tokens = re.findall(r'\b\w+\b|[.,!?]', line)
#             tokenized_sentence.extend(tokens)
#         tokenized_sentences.append(tokenized_sentence)

#     return tokenized_sentences

# sentences_updated = sentence_tokenizer_updated(paragraph_text)
# pp.pprint(sentences_updated)

In [163]:
def sentence_tokenizer(text):
    # Define patterns for special cases
    # num or num.num or .num and not num.
    num_pattern = re.compile(r'(\d+\.?\d+|\.\d+)')
    mention_pattern = re.compile(r'@(\w+)')
    hashtag_pattern = re.compile(r'#(\w+)')
    mail_pattern = re.compile(r'(\S+@\S+\.\w+)')
    url_pattern = re.compile(r'\S+\.\w+')

    # First, split sentences based on periods, question marks, exclamation marks, and new lines
    sentence_pattern = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!|\n)\s')
    sentences = re.split(sentence_pattern, text)

    # Tokenize punctuation marks as individual tokens for each sentence
    tokenized_sentences = []
    for sentence in sentences:
        # Replace mentions, hashtags, and URLs with special markers
        sentence = re.sub(num_pattern, '<NUM>', sentence)
        sentence = re.sub(mail_pattern, '<MAILID>', sentence)
        sentence = re.sub(mention_pattern, '<MENTION>', sentence)
        sentence = re.sub(hashtag_pattern, '<HASHTAG>', sentence)
        sentence = re.sub(url_pattern, '<URL>', sentence)

        tokenized_sentences.append(sentence)

    return tokenized_sentences

    # return sentences


sentences_updated = sentence_tokenizer(paragraph_text)
# pp.pprint(sentences_updated)
print(sentences_updated)

["In 'Pride and Prejudice' by Jane Austen, Elizabeth Bennett meets Mr Darcy at a ball hosted by her friend <MENTION>.", 'They dance, but Mr Darcy finds her behaviour "tolerable, but not handsome enough to tempt him" <HASHTAG>.', "She later visits Pemberley, Mr Darcy's estate, where she learns more about his Character.", 'Check out more information at <URL>.', 'This mail id: <MAILID> is a test mail-id.']


In [164]:
def mail_detector(sentence):
    mail_pattern = re.compile(r'(\S+@\S+\.\w+)')
    sentence = re.sub(mail_pattern, '<MAILID>', sentence)
    return sentence

def mention_detector(sentence):
    mention_pattern = re.compile(r'@(\w+)')
    sentence = re.sub(mention_pattern, '<MENTION>', sentence)
    return sentence

def hashtag_detector(sentence):
    hashtag_pattern = re.compile(r'#(\w+)')
    sentence = re.sub(hashtag_pattern, '<HASHTAG>', sentence)
    return sentence

def url_detector(sentence):
    url_pattern = re.compile(r'\S+\.\w+')
    sentence = re.sub(url_pattern, '<URL>', sentence)
    return sentence

def punctuation_detector(sentence):
    punctuation_pattern = re.compile(r'\w+')

In [165]:
class Tokenizer:
    def __init__(self, path):
        self.path = path
        self.tokens = []

        self.sentence_pattern = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<![A-Z]\.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!|\n)\s')
        self.newline_pattern = re.compile(r'\n')

        self.mention_pattern = re.compile(r'@(\w+)')
        self.hashtag_pattern = re.compile(r'#(\w+)')
        self.mail_pattern = re.compile(r'(\S+@\S+\.\w+)')
        self.url_pattern = re.compile(r'\S+\.\w+')
        # self.num_pattern = re.compile(r'(\d+\.?\d+|\.\d+)')
        self.num_pattern = re.compile(r'(?<!\w)(\d+\.?\d+|\.\d+)(?!\w)')

        # self.punctuation_spacing_pattern1 = re.compile(r'(?<!\s)([\)\(\[\],;:*\'"!?])(?!\S)')
        # self.punctuation_spacing_pattern2 = re.compile(r'(?<!\S)([\)\(\[\].,;:*\'"!?])(?!\s)')
        self.punctuation_spacing_pattern1 = re.compile(r'(?<!\s)([^\w\s])(?!\S)')
        self.punctuation_spacing_pattern2 = re.compile(r'(?<!\S)([^\w\s])(?!\s)')

        self.text = self.readFile()

        self.detectAndReplaceWithTags()

        self.separateChars()
        self.fixTags()

        pp.pprint(self.tokens)

    def readFile(self):
        with open(self.path, 'r') as f:
            return f.read()

    def detectAndReplaceWithTags(self):
        sentences = self.sentenceTokenizer(self.text)

        for sentence in sentences:
            sentence = self.newlineDetector(sentence)
            sentence = self.mailDetector(sentence)
            sentence = self.mentionDetector(sentence)
            sentence = self.hashtagDetector(sentence)
            sentence = self.numDetector(sentence)
            sentence = self.urlDetector(sentence)

            self.tokens.append(sentence)

    def sentenceTokenizer(self, text):
        return re.split(self.sentence_pattern, text)

    def newlineDetector(self, sentence):
        return re.sub(self.newline_pattern, ' ', sentence)

    def mailDetector(self, sentence):
        return re.sub(self.mail_pattern, '<MAILID>', sentence)

    def mentionDetector(self, sentence):
        return re.sub(self.mention_pattern, '<MENTION>', sentence)

    def hashtagDetector(self, sentence):
        return re.sub(self.hashtag_pattern, '<HASHTAG>', sentence)

    def urlDetector(self, sentence):
        return re.sub(self.url_pattern, '<URL>', sentence)

    def numDetector(self, sentence):
        return re.sub(self.num_pattern, '<NUM>', sentence)

    def separateChars(self):
        for i in range(len(self.tokens)):
            # self.tokens[i] = re.sub(self.punctuation_spacing_pattern1, r' \1', self.tokens[i])
            # self.tokens[i] = re.sub(self.punctuation_spacing_pattern2, r'\1 ', self.tokens[i])
            length = 0
            while length != len(self.tokens[i]):
                length = len(self.tokens[i])
                self.tokens[i] = re.sub(self.punctuation_spacing_pattern1, r' \1', self.tokens[i])
                self.tokens[i] = re.sub(self.punctuation_spacing_pattern2, r'\1 ', self.tokens[i])
            self.tokens[i] = re.sub(r'\s+', ' ', self.tokens[i])
            self.tokens[i] = self.tokens[i].strip()
            self.tokens[i] = self.tokens[i].split(' ')

    def fixTags(self):
        for i in range(len(self.tokens)):
            j = 0
            while j < len(self.tokens[i]):
                if self.tokens[i][j] == '>' and self.tokens[i][j-2] == '<':
                    if self.tokens[i][j-1] == 'MAILID':
                        self.tokens[i][j-2] = '<MAILID>'
                        self.tokens[i].pop(j-1)
                        self.tokens[i].pop(j-1)
                    elif self.tokens[i][j-1] == 'MENTION':
                        self.tokens[i][j-2] = '<MENTION>'
                        self.tokens[i].pop(j-1)
                        self.tokens[i].pop(j-1)
                    elif self.tokens[i][j-1] == 'HASHTAG':
                        self.tokens[i][j-2] = '<HASHTAG>'
                        self.tokens[i].pop(j-1)
                        self.tokens[i].pop(j-1)
                    elif self.tokens[i][j-1] == 'URL':
                        self.tokens[i][j-2] = '<URL>'
                        self.tokens[i].pop(j-1)
                        self.tokens[i].pop(j-1)
                    elif self.tokens[i][j-1] == 'NUM':
                        self.tokens[i][j-2] = '<NUM>'
                        self.tokens[i].pop(j-1)
                        self.tokens[i].pop(j-1)
                j += 1

In [166]:
tokenizer = Tokenizer('Pride and Prejudice - Jane Austen.txt')

[['The',
  'Project',
  'Gutenberg',
  'eBook',
  ',',
  'Pride',
  'and',
  'Prejudice',
  ',',
  'by',
  'Jane',
  'Austen',
  ',',
  'Edited',
  'by',
  'R',
  '.',
  'W',
  '.',
  '(',
  'Robert',
  'William',
  ')',
  'Chapman'],
 [''],
 ['This',
  'eBook',
  'is',
  'for',
  'the',
  'use',
  'of',
  'anyone',
  'anywhere',
  'at',
  'no',
  'cost',
  'and',
  'with',
  'almost',
  'no',
  'restrictions',
  'whatsoever',
  '.'],
 ['You',
  'may',
  'copy',
  'it',
  ',',
  'give',
  'it',
  'away',
  'or',
  're-use',
  'it',
  'under',
  'the',
  'terms',
  'of',
  'the',
  'Project',
  'Gutenberg',
  'License',
  'included',
  'with',
  'this',
  'eBook',
  'or',
  'online',
  'at',
  '<URL>'],
 [''],
 [''],
 [''],
 [''],
 ['Title', ':', 'Pride', 'and', 'Prejudice'],
 [''],
 ['Author', ':', 'Jane', 'Austen'],
 ['Editor', ':', 'R', '.', 'W', '.', '(', 'Robert', 'William', ')', 'Chapman'],
 ['Release',
  'Date',
  ':',
  'May',
  '9',
  ',',
  '<NUM>',
  '[',
  'eBook',
  '<HASHT

## 2 - N-Gram

In [167]:
class NGram:
    # def __init__(self):

    def createNGram(self, tokens, n):
        self.tokens = tokens
        self.n = n
        self.ngrams = {}

        self.setUpTokens()

        self.generateNGrams()

    def setUpTokens(self):
        i = 0
        while i < len(self.tokens):
            if len(self.tokens[i]) == 1 and (self.tokens[i][0] == ' ' or self.tokens[i][0] == ''):
                self.tokens.pop(i)
                i -= 1
            i += 1

        for i in range(self.n-1):
            for j in range(len(self.tokens)):
                self.tokens[j].insert(0, '<s>')
                self.tokens[j].append('</s>')

    # def generateNGrams(self):
    #     for token in self.tokens:
    #         for i in range(len(token)-self.n+1):
    #             ngram = tuple(token[i:i+self.n])
    #             # print(ngram)
    #             if ngram in self.ngrams:
    #                 self.ngrams[ngram] += 1
    #             else:
    #                 self.ngrams[ngram] = 1

    #     # pp.pprint(self.ngrams)

    def generateNGrams(self):
        seqs = {}
        for token in self.tokens:
            for i in range(len(token)-self.n+1):
                seq = tuple(token[i:i+self.n])
                if seq in seqs:
                    seqs[seq] += 1
                else:
                    seqs[seq] = 1

        for seq in seqs:
            if self.ngrams.get(seq[:-1]) is None:
                self.ngrams[seq[:-1]] = {}
            self.ngrams[seq[:-1]][seq[-1]] = seqs[seq]

        # pp.pprint(self.ngrams)

    def saveNGram(self, path):
        with open(path, 'w') as f:
            ngram = str(self.ngrams)
            f.write(ngram)

    def loadNGram(self, path):
        with open(path, 'r') as f:
            ngram = f.read()
            self.ngrams = eval(ngram)

In [168]:
n_gram = NGram()
n_gram.createNGram(tokenizer.tokens, 3)
n_gram.saveNGram('ngram.txt')

{('!', '"'): {'-': 1,
              '</s>': 52,
              'accompanied': 1,
              'added': 1,
              'cried': 19,
              'exclaimed': 1,
              'observed': 1,
              'repeated': 4,
              'replied': 1,
              'said': 8,
              'she': 5,
              'thought': 2,
              'was': 1,
              'would': 1},
 ('!', ')'): {'and': 1, 'can': 1, 'on': 1, 'to': 1},
 ('!', '</s>'): {'</s>': 323},
 ('"', '"'): {'But': 1, 'Hunsford': 1, 'I': 1, 'Mr': 2, 'This': 1, '_That_': 1},
 ('"', "'"): {'Tis': 2},
 ('"', '('): {'for': 1},
 ('"', ')'): {',': 1},
 ('"', '-'): {'-': 1},
 ('"', '</s>'): {'</s>': 1207},
 ('"', 'A'): {'gamester': 1,
              'great': 2,
              'little': 2,
              'man': 3,
              'place': 1,
              'thorough': 1,
              'young': 1},
 ('"', 'About'): {'a': 2},
 ('"', 'After'): {'mentioning': 1},
 ('"', 'Ah'): {'!': 4},
 ('"', 'All'): {'this': 1, 'young': 1},
 ('"', 'All!--W