In [16]:
import re
import numpy as np

from nltk.stem.porter import PorterStemmer
from typing import List, Set, Dict, Tuple, NewType
from scipy.sparse import dok_matrix
import csv

In [20]:
class BOW():
    """Bidirectional vocabulary mapping tokens to consecutive integer ids.

    Ids are handed out in order of first insertion, starting at 0, so the
    id of a token equals the number of distinct tokens seen before it.
    """

    def __init__(self):
        # token -> id and id -> token are kept in sync by add_token().
        self.token_to_id = dict()
        self.id_to_token = dict()

    @property
    def nr_tokens(self) -> int:
        """Number of distinct tokens currently in the vocabulary."""
        return len(self.token_to_id)

    def transform_tokenized_sent(self, tokenized_sent:List[str]) -> List[int]:
        """Map one tokenized sentence to its list of token ids.

        The vocabulary is not extended here.

        Args:
            tokenized_sent (List[str]): Tokens of a single sentence.

        Returns:
            List[int]: Id of every token, in the same order.

        Raises:
            KeyError: If a token is not in the vocabulary.
        """
        return [self.token_to_id[token] for token in tokenized_sent]

    def transform_tokenized_sents(self, tokenized_sents:List[List[str]]) -> List[List[int]]:
        """Add every token of every sentence to the vocabulary and encode the sentences.

        Args:
            tokenized_sents (List[List[str]]): Tokenized sentences.

        Returns:
            List[List[int]]: One list of token ids per input sentence.
        """
        output = []
        for sent in tokenized_sents:
            # Register unseen tokens first so the lookup below cannot fail.
            self.add_list_of_tokens(sent)
            output.append(self.transform_tokenized_sent(sent))
        return output

    def reverse_transform_sent(self, id_sent:List[int]) -> List[str]:
        """Map one sentence of token ids back to its tokens.

        Args:
            id_sent (List[int]): Token ids of a single sentence.

        Returns:
            List[str]: Token for every id, in the same order.

        Raises:
            KeyError: If an id is not in the vocabulary.
        """
        return [self.id_to_token[token_id] for token_id in id_sent]

    def reverse_transform_sents(self, id_sents:List[List[int]]) -> List[List[str]]:
        """Map several sentences of token ids back to their tokens.

        Args:
            id_sents (List[List[int]]): Sentences given as lists of token ids.

        Returns:
            List[List[str]]: One list of tokens per input sentence.
        """
        return [self.reverse_transform_sent(sent) for sent in id_sents]

    def add_token(self, token:str):
        """Add a token to the vocabulary; tokens already present are left untouched.

        Args:
            token (str): Token to register.
        """
        if token not in self.token_to_id:
            # The next free id is the current vocabulary size.
            new_id = len(self.token_to_id)
            self.token_to_id[token] = new_id
            self.id_to_token[new_id] = token

    def add_list_of_tokens(self, tokens:List[str]):
        """Add every token in the list to the vocabulary.

        Args:
            tokens (List[str]): Tokens to register.
        """
        for token in tokens:
            self.add_token(token)

    def get_token_id(self, token:str) -> int:
        """Look up the id of a token without raising on a miss.

        Args:
            token (str): Token to look up.

        Returns:
            int: The token's id, or -1 (after printing a message) if it is unknown.
        """
        if token not in self.token_to_id:
            print("Token " + token + " not in the BOW")
            return -1
        return self.token_to_id[token]

    def get_token_from_id(self, token_id:int) -> str:
        """Look up the token for an id without raising on a miss.

        Args:
            token_id (int): Id to look up.

        Returns:
            str: The token, or "" (after printing a message) if the id is unknown.
        """
        if token_id not in self.id_to_token:
            print("Token id " + str(token_id) + " is not in the BOW.")
            return ""
        # Index by the id that was passed in (previously referenced an undefined name).
        return self.id_to_token[token_id]
        

class SimpleTokenizer():
    """Regular-expression tokenizer: every match of the pattern is a token."""

    def __init__(self, pattern:str):
        """Compile the pattern whose matches are treated as tokens.

        Args:
            pattern (str): Regular expression; compiled with MULTILINE and DOTALL.
        """
        flags = re.MULTILINE | re.DOTALL
        self.regexp = re.compile(pattern, flags)

    def tokenize_text_lines(self, text_lines:List[str]) -> List[str]:
        """Tokenize several strings into one flat list of tokens.

        Args:
            text_lines (List[str]): Strings to tokenize.

        Returns:
            List[str]: Matches from all strings, concatenated in input order.
        """
        find_tokens = self.regexp.findall
        return [token for line in text_lines for token in find_tokens(line)]

    def tokenize_list_of_strings(self, string_list:List[str]) -> List[List[str]]:
        """Tokenize several strings, keeping the tokens of each string separate.

        Args:
            string_list (List[str]): Strings to tokenize.

        Returns:
            List[List[str]]: One list of matches per input string.
        """
        find_tokens = self.regexp.findall
        return [find_tokens(text) for text in string_list]

def read_tsv_extract_corpora(tsv_file_name:str, corpus_names_to_int:Dict[str, int]) -> Dict[int, List[str]]:
    """Read a tab-separated file and group its second column by corpus.

    The first column of each row holds a corpus name and the second the
    document text. Blank lines are skipped.

    Args:
        tsv_file_name (str): Path of the tab-separated file.
        corpus_names_to_int (Dict[str, int]): Maps each corpus name to its integer id.

    Returns:
        Dict[int, List[str]]: For every id in corpus_names_to_int, the documents of
            that corpus in file order (an empty list if the corpus has no rows).

    Raises:
        KeyError: If a row names a corpus missing from corpus_names_to_int.
        IndexError: If a non-blank row has fewer than two columns.
    """
    corpora = {corpus_id: [] for corpus_id in corpus_names_to_int.values()}
    # newline='' hands line-ending handling to the csv module, as its documentation requires.
    with open(tsv_file_name, mode='r', newline='') as f:
        for row in csv.reader(f, delimiter="\t"):
            if not row:
                # csv.reader yields [] for a blank line; there is nothing to index.
                continue
            corpus_id = corpus_names_to_int[row[0]]
            corpora[corpus_id].append(row[1])
    return corpora

def tokenize_corpora(corpora:Dict[int, List[str]], tokenizer:SimpleTokenizer) -> Dict[int, List[List[str]]]:
    """Tokenize every document of every corpus.

    Args:
        corpora (Dict[int, List[str]]): Documents grouped by corpus id.
        tokenizer (SimpleTokenizer): Tokenizer applied to each document.

    Returns:
        Dict[int, List[List[str]]]: Same keys as corpora; each document is
            replaced by the token list from tokenizer.tokenize_text_lines.
    """
    return {
        corpus_id: [tokenizer.tokenize_text_lines([doc]) for doc in documents]
        for corpus_id, documents in corpora.items()
    }

def docs_to_bow_sents(docs:List[List[str]]) -> Tuple[BOW, List[List[int]]]:
    """Build a new vocabulary from tokenized documents and encode them with it.

    Args:
        docs (List[List[str]]): Tokenized documents.

    Returns:
        Tuple[BOW, List[List[int]]]: The freshly built BOW and, for each
            document, its list of token ids.
    """
    # A bare `-> BOW, List[...]` is not valid syntax; the pair must be spelled as Tuple[...].
    bow = BOW()
    bow_sents = bow.transform_tokenized_sents(docs)
    return bow, bow_sents

def bow_sents_to_dok(bow_sents:List[List[int]], bow:BOW) -> dok_matrix:
    """Build a sentence-by-token count matrix.

    Args:
        bow_sents (List[List[int]]): Sentences given as lists of token ids.
        bow (BOW): Vocabulary; its nr_tokens sets the number of columns.

    Returns:
        dok_matrix: Integer matrix with one row per sentence, where entry
            (i, j) counts the occurrences of token id j in sentence i.
    """
    n_cols = bow.nr_tokens
    n_rows = len(bow_sents)
    counts = dok_matrix((n_rows, n_cols), dtype='int')
    for row, token_ids in enumerate(bow_sents):
        for col in token_ids:
            counts[row, col] += 1
    return counts

In [23]:
# Toy example: tokenize two short strings and encode them with a fresh vocabulary.
# Note that `bow` and `bow_sents` are rebound to the corpus results at the end of this cell.
tokenizer = SimpleTokenizer('[a-zA-Z]+')
tokenized_sents = tokenizer.tokenize_list_of_strings(['something or other', 'other or something..'])
bow = BOW()
bow_sents = bow.transform_tokenized_sents(tokenized_sents)

# Input / output file names.
tsv_file_name = 'train_and_dev.tsv'
stopwords_file_name = "englishST.txt"
index_output_file_name = "index.txt"

# Corpus name <-> integer id; the reverse map is derived so the two cannot drift apart.
corpus_names_to_int = {'Quran':0, 'OT':1, 'NT':2}
int_to_corpus_names = {corpus_id: name for name, corpus_id in corpus_names_to_int.items()}
corpora = read_tsv_extract_corpora(tsv_file_name, corpus_names_to_int)
tokenized_corpora = tokenize_corpora(corpora, tokenizer)

# docs_to_bow_sents expects tokenized documents. corpora[...] holds raw strings in a
# list (which has no .items()), so the tokenized corpus is passed instead.
bow, bow_sents = docs_to_bow_sents(tokenized_corpora[corpus_names_to_int['Quran']])

[['Praise', 'be', 'to', 'Allah', 'Lord', 'of', 'the', 'Worlds'],
 ['the', 'Merciful', 'the', 'Most', 'Merciful'],
 ['Owner', 'of', 'the', 'Day', 'of', 'Recompense'],
 ['Guide', 'us', 'to', 'the', 'Straight', 'Path'],
 ['the',
  'Path',
  'of',
  'those',
  'upon',
  'whom',
  'You',
  'have',
  'favored',
  'not',
  'those',
  'upon',
  'whom',
  'is',
  'the',
  'anger',
  'nor',
  'the',
  'astray',
  'Amen',
  'please',
  'answer'],
 ['AlifLaamMeem'],
 ['That',
  'is',
  'the',
  'Holy',
  'Book',
  'where',
  'there',
  'is',
  'no',
  'doubt',
  'It',
  'is',
  'a',
  'guidance',
  'for',
  'the',
  'cautious',
  'of',
  'evil',
  'and',
  'Hell'],
 ['Who',
  'believe',
  'in',
  'the',
  'unseen',
  'and',
  'establish',
  'the',
  'daily',
  'prayer',
  'who',
  'spend',
  'out',
  'of',
  'what',
  'We',
  'have',
  'provided',
  'them'],
 ['Who',
  'believe',
  'in',
  'that',
  'which',
  'has',
  'been',
  'sent',
  'down',
  'to',
  'you',
  'Prophet',
  'Muhammad',
  'and'

['Praise be to Allah, Lord of the Worlds,', 'the Merciful, the Most Merciful,', 'Owner of the Day of Recompense.', 'Guide us to the Straight Path,', 'the Path of those upon whom You have favored, not those upon whom is the anger, nor the astray. (Amen please answer)', 'AlifLaamMeem.', 'That is the (Holy) Book, where there is no doubt. It is a guidance for the cautious (of evil and Hell).', 'Who believe in the unseen and establish the (daily) prayer; who spend out of what We have provided them.', 'Who believe in that which has been sent down to you (Prophet Muhammad) and what has been sent down before you (to Prophets Jesus and Moses) and firmly believe in the Everlasting Life.', 'These are guided by their Lord; these surely are the prosperous.']
