In [1]:
import re
import pickle
from nltk.stem.porter import PorterStemmer
import numpy as np
from typing import List, Set, Dict, Tuple, NewType
from operator import itemgetter
from nltk.corpus import stopwords
import csv
from gensim.models.ldamodel import LdaModel
from gensim.corpora.dictionary import Dictionary

In [2]:
class SimpleTokenizer():
    """Regex tokenizer: every match of the configured pattern becomes one token."""

    def __init__(self, pattern:str):
        """Compile the token pattern once so it can be reused for every input string.

        Args:
            pattern (str): regular expression whose matches are the tokens.
        """
        # MULTILINE / DOTALL only matter for patterns that use ^, $ or '.'.
        self.regexp = re.compile(pattern, re.MULTILINE | re.DOTALL)

    def tokenize_text_lines(self, text_lines:List[str]) -> List[str]:
        """Tokenize every string and return all tokens as one flat list.

        Args:
            text_lines (List[str]): strings to tokenize.

        Returns:
            List[str]: the matches of the pattern, in input order (strings first,
            then position inside each string).
        """
        return [
            token
            for line in text_lines
            for token in self.regexp.findall(line)
        ]

def construct_stopwords_set(stopwords_file_name:str) -> Set[str]:
    """Build the stop-word set from a file plus NLTK's English stop-word list.

    Args:
        stopwords_file_name (str): Text file with one stop word per line.

    Returns:
        Set[str]: Union of the (whitespace-stripped, non-empty) words read from the
        file and ``stopwords.words("english")``.
    """
    with open(stopwords_file_name, 'r') as f:
        file_lines = f.read().splitlines()
    # Strip surrounding whitespace and skip blank lines so that entries such as
    # "the " or "" do not end up in the set.
    stopwords_set = {line.strip() for line in file_lines if line.strip()}
    stopwords_set.update(stopwords.words("english"))
    return stopwords_set

class SimplePreprocessor():
    """Text pre-processing pipeline: tokenize, lowercase, drop stop words, then stem.
    """
    def __init__(self, tokenizer:SimpleTokenizer, stop_words_set:Set[str], stemmer:PorterStemmer):
        """Store the collaborators used by the pipeline.

        Args:
            tokenizer: object whose ``tokenize_text_lines`` turns strings into tokens.
            stop_words_set: lowercase tokens found in this set are discarded.
            stemmer: object whose ``stem`` is applied to every token that is kept.
        """
        self.tokenizer = tokenizer
        self.stop_words_set = stop_words_set
        self.stemmer = stemmer

    @staticmethod
    def lowercase_word(word:str) -> str:
        """Return ``word`` converted to lowercase."""
        return str.lower(word)

    def remove_stop_words_lowercase_and_stem(self, tokens:List[str]) -> List[str]:
        """Lowercase each token, drop it if it is a stop word, otherwise stem it.

        The stop-word check is done on the lowercased, un-stemmed token.
        """
        # Lazy generator: each token is lowercased right before it is checked,
        # exactly once.
        lowered_tokens = (SimplePreprocessor.lowercase_word(raw_token) for raw_token in tokens)
        return [
            self.stemmer.stem(lowered)
            for lowered in lowered_tokens
            if lowered not in self.stop_words_set
        ]

    def process_text_lines(self, text_lines:List[str]) -> List[str]:
        """Run the full pipeline on a list of strings and return the final tokens."""
        return self.remove_stop_words_lowercase_and_stem(
            self.tokenizer.tokenize_text_lines(text_lines)
        )

def pickle_object(obj:object, file_name:str):
    """Serialize ``obj`` with pickle into ``file_name``.

    The file is opened in binary write mode, so an existing file is overwritten.

    Args:
        obj (object): any picklable object.
        file_name (str): destination path.
    """
    with open(file_name, 'wb') as pickle_file:
        pickle.dump(obj, pickle_file)

def unpickle_object(file_name:str) -> object:
    """Load and return the object stored by pickle in ``file_name``.

    Args:
        file_name (str): path of a file containing a pickled object.

    Returns:
        object: the deserialized object.
    """
    # pickle.load can execute arbitrary code while deserializing: only use this
    # on files from a trusted source.
    with open(file_name, 'rb') as pickle_file:
        return pickle.load(pickle_file)

In [3]:
# Warning: must take into account the fact that some documents may disappear.

In [4]:
# Read the tsv file + extract the 3 corpora.
# Assumption: 3 corpora Quran, OT, NT.
def read_tsv_extract_corpora(tsv_file_name:str, corpus_names_to_int:Dict[str, int]) -> Dict[int, List[str]]:
    """Read a TSV of (corpus name, document text) rows and group the texts by corpus id.

    Args:
        tsv_file_name (str): tab-separated file; column 0 is the corpus name and
            column 1 the document text.
        corpus_names_to_int (Dict[str, int]): maps each corpus name to its integer id.

    Returns:
        Dict[int, List[str]]: corpus id -> document texts in file order. Every id in
        ``corpus_names_to_int`` is present, possibly with an empty list.

    Raises:
        KeyError: if a row names a corpus that is not in ``corpus_names_to_int``.
        IndexError: if a non-blank row has no text column.
    """
    corpora = {corpus_id: [] for corpus_id in corpus_names_to_int.values()}
    # newline='' is what the csv module asks for. QUOTE_NONE keeps '"' as an ordinary
    # character: with the default quoting, a text that starts with '"' opens a quoted
    # field that runs across the following lines, silently merging several documents
    # into one.
    with open(tsv_file_name, mode='r', newline='') as f:
        read_tsv = csv.reader(f, delimiter="\t", quoting=csv.QUOTE_NONE)
        for row in read_tsv:
            if not row:
                # Blank line in the file: nothing to record.
                continue
            corpus_id = corpus_names_to_int[row[0]]
            corpora[corpus_id].append(row[1])
    return corpora

def preprocess_corpora(corpora:Dict[int, List[str]], preprocessor:SimplePreprocessor) -> Dict[int, List[List[str]]]:
    """Turn every raw document of every corpus into its list of processed tokens.

    Args:
        corpora: corpus id -> list of raw document strings.
        preprocessor: object whose ``process_text_lines`` maps a list of strings to tokens.

    Returns:
        corpus id -> list of token lists, one per document, in the original order.
        Documents are kept even when they produce no tokens.
    """
    return {
        corpus_id: [preprocessor.process_text_lines([document]) for document in documents]
        for corpus_id, documents in corpora.items()
    }

# ----------------------------------CREATE INDEX AND PER-CORPUS DOCUMENT COUNTS----------------------------------
# index[token][corpus_id][doc_id] -> number of occurrences of token in that document.
Index = NewType('Index', Dict[str, Dict[int, Dict[int, int]]])
def read_corpora_and_create_index(corpora:Dict[int, List[List[str]]]) -> Tuple[Index, Dict[int, int]]:
    """Build a term-frequency inverted index over all corpora and count the documents per corpus.

    Args:
        corpora (Dict[int, List[List[str]]]): corpus id -> list of documents, each
            document being a list of tokens. A document's id is its position in
            its corpus' list.

    Returns:
        Tuple[Index, Dict[int, int]]:
            - index: ``index[token][corpus_id][doc_id]`` is how many times ``token``
              occurs in that document (only documents containing the token appear).
            - corpora_nr_docs: corpus id -> number of documents in that corpus,
              including documents without any token.
    """
    index = dict()
    corpora_nr_docs = dict()

    for corpus_id, documents in corpora.items():
        corpora_nr_docs[corpus_id] = len(documents)
        for (doc_id, doc_tokens) in enumerate(documents):
            for token in doc_tokens:
                # Create the token / corpus levels on first sight, then bump the count.
                doc_counts = index.setdefault(token, dict()).setdefault(corpus_id, dict())
                doc_counts[doc_id] = doc_counts.get(doc_id, 0) + 1

        print("Index construction for corpus " + str(corpus_id+1) + " finished.")

    return index, corpora_nr_docs

def calculate_freq_term(index:Index, term:str) -> int:
    """Return the total number of occurrences of ``term`` over all corpora and documents.

    Args:
        index: ``index[term][corpus_id][doc_id]`` -> occurrence count.
        term: the term to look up.

    Returns:
        Sum of the per-document counts, or 0 when the term is not in the index.
    """
    if term not in index:
        return 0

    return sum(
        occurrences
        for docs_in_corpus in index[term].values()
        for occurrences in docs_in_corpus.values()
    )


def remove_low_freq_words_from_index(corpora_index:Index, threshold_freq:int) -> Index:
    """Return a new index holding only the terms whose total frequency reaches the threshold.

    Args:
        corpora_index: the index to filter; it is not modified.
        threshold_freq: minimum total occurrence count (inclusive) for a term to be kept.

    Returns:
        A new dict; the postings of the kept terms are the same objects as in
        ``corpora_index`` (not copies).
    """
    return {
        term: postings
        for term, postings in corpora_index.items()
        if calculate_freq_term(corpora_index, term) >= threshold_freq
    }
    

def compute_MI_score_term_corpus(N:int, N_00:int, N_01:int, N_10:int, N_11:int) -> float:
    """Mutual information between "document contains the term" and "document is in the corpus".

    In ``N_ab`` the first digit says whether the term is present (1) or absent (0)
    and the second whether the document belongs to the corpus (1) or not (0).

    Args:
        N: total number of documents.
        N_00, N_01, N_10, N_11: the four cells of the contingency table.

    Returns:
        The MI score; cells equal to 0 contribute nothing (0 * log(0) = 0 by convention).
    """
    # Marginals. A non-zero cell guarantees that both of its marginals are non-zero,
    # so the divisions below are safe.
    N_1x = N_10 + N_11  # documents containing the term
    N_x1 = N_01 + N_11  # documents in the corpus
    N_0x = N_01 + N_00  # documents without the term
    N_x0 = N_10 + N_00  # documents outside the corpus

    # (cell, term marginal, corpus marginal), accumulated in this order.
    cells = (
        (N_10, N_1x, N_x0),
        (N_01, N_0x, N_x1),
        (N_11, N_1x, N_x1),
        (N_00, N_0x, N_x0),
    )

    MI_score = 0
    for cell, term_marginal, corpus_marginal in cells:
        if cell != 0:
            MI_score += (cell/N) * np.log2((N * cell)/(term_marginal * corpus_marginal))

    return MI_score

def compute_chi_score_term_corpus(N:int, N_00:int, N_01:int, N_10:int, N_11:int) -> float:
    """Chi-squared score of the 2x2 term/corpus contingency table.

    In ``N_ab`` the first digit says whether the term is present (1) or absent (0)
    and the second whether the document belongs to the corpus (1) or not (0).

    Args:
        N: total number of documents. Not used: the total is recomputed from the cells.
        N_00, N_01, N_10, N_11: the four cells of the contingency table.

    Returns:
        The chi-squared score, or 0.0 when a marginal is zero (term in every document,
        term in no document, empty corpus, or a single-corpus dataset). The statistic
        is undefined there, and 0.0 avoids a ZeroDivisionError.
    """
    chi_score_numerator = (N_11 + N_10 + N_01 + N_00) * (N_11 * N_00 - N_10 * N_01) ** 2
    chi_score_denominator = (N_11 + N_01) * (N_11 + N_10) * (N_10 + N_00) * (N_01 + N_00)
    if chi_score_denominator == 0:
        # One of the four marginals is 0: no association can be measured.
        return 0.0

    return chi_score_numerator/chi_score_denominator

def compute_MI_chi_scores(index:Index, corpora_nr_docs:Dict[int, int], corpora_ids:List[int]) -> Tuple[Dict[int, List[Tuple[str, int]]], Dict[int, List[Tuple[str, int]]]]:
    """Score every indexed term against every corpus with MI and chi-squared.

    Args:
        index: ``index[term][corpus_id][doc_id]`` -> occurrence count.
        corpora_nr_docs: corpus id -> number of documents in that corpus.
        corpora_ids: the corpus ids to produce rankings for.

    Returns:
        ``(MI_scores, chi_scores)``: each maps a corpus id to a list of
        ``(term, score)`` pairs sorted by decreasing score.
    """
    MI_scores, chi_scores = dict(), dict()
    for corpus_id in corpora_ids:
        MI_scores[corpus_id], chi_scores[corpus_id] = [], []

    # Total number of documents over all corpora.
    N = sum(corpora_nr_docs.values())

    # Number of documents (in any corpus) that contain each term.
    nr_docs_which_contain_term = {
        term: sum(len(docs_in_corpus) for docs_in_corpus in postings.values())
        for term, postings in index.items()
    }

    for term, postings in index.items():
        docs_with_term = nr_docs_which_contain_term[term]
        for corpus_id in corpora_ids:
            # Contingency table: first digit = term present, second = document in corpus.
            N_11 = len(postings[corpus_id]) if corpus_id in postings else 0
            N_01 = corpora_nr_docs[corpus_id] - N_11
            N_10 = docs_with_term - N_11
            N_00 = N - docs_with_term - N_01
            counts = (N, N_00, N_01, N_10, N_11)

            MI_scores[corpus_id].append((term, compute_MI_score_term_corpus(*counts)))
            chi_scores[corpus_id].append((term, compute_chi_score_term_corpus(*counts)))

    # The lists were created above, so sorting them in place is safe.
    for corpus_id in MI_scores:
        MI_scores[corpus_id].sort(key=itemgetter(1), reverse=True)
        chi_scores[corpus_id].sort(key=itemgetter(1), reverse=True)
    return MI_scores, chi_scores

def print_top_k_terms_for_each_corpus(MI_scores, chi_scores, int_to_corpus_names, k):
    """Write the k best-scoring terms of every corpus to CSV files in the working directory.

    For each corpus id in ``MI_scores`` two files are written:
    ``<corpus name>_MI.csv`` (header ``term,mi``, scores rounded to 5 decimals) and
    ``<corpus name>_chi.csv`` (header ``term,chisq``, scores rounded to 3 decimals).

    Args:
        MI_scores: corpus id -> list of (term, MI score), best first.
        chi_scores: corpus id -> list of (term, chi-squared score), best first.
        int_to_corpus_names: corpus id -> corpus name used as the file prefix.
        k: number of rows to write per file.
    """
    # (file suffix, header line, scores per corpus, decimals kept)
    reports = (
        ('MI.csv', "term,mi\n", MI_scores, 5),
        ('chi.csv', "term,chisq\n", chi_scores, 3),
    )
    for corpus_id in MI_scores.keys():
        corpus_name = int_to_corpus_names[corpus_id]
        for suffix, header, scores_by_corpus, decimals in reports:
            file_name = corpus_name + '_' + suffix
            rows = [
                term + ',' + str(round(score, decimals)) + '\n'
                for (term, score) in scores_by_corpus[corpus_id][:k]
            ]
            with open(file_name, 'w') as f:
                f.write(header + "".join(rows))

In [35]:
def write_topic_words_to_file(topic_words:List[Tuple[str, float]], corpus_id:int):
    """Write a topic's (term, score) pairs to ``topic_words_corpus_<corpus_id>.csv``.

    The file is created in the working directory with the header ``Term,Score``;
    scores are rounded to 3 decimals.

    Args:
        topic_words: (term, score) pairs, written in the given order.
        corpus_id: id used in the output file name.
    """
    file_name = "topic_words_corpus_" + str(corpus_id) + ".csv"
    rows = [term + "," + str(round(score, 3)) + '\n' for term, score in topic_words]
    with open(file_name, 'w') as f:
        f.write("Term,Score\n" + "".join(rows))

In [49]:
def run_topics_task(corpora:Dict[int, List[List[str]]], corpora_nr_docs:Dict[int, int], num_topics=20, random_state=None):
    """Fit one LDA model on all corpora and report the dominant topic of each corpus.

    Steps:
      1. Pool the documents of all corpora, build a gensim ``Dictionary``, prune it
         with ``filter_extremes(no_below=15, no_above=0.6)`` and train an ``LdaModel``
         (25 passes).
      2. For each corpus, average the topic probabilities of its documents.
      3. For each corpus, pick the topic with the highest average, print it and write
         its top 10 words with ``write_topic_words_to_file``.
      4. Print the per-topic average score of every corpus and the top words of
         every topic.

    Args:
        corpora: corpus id -> list of tokenized documents.
        corpora_nr_docs: corpus id -> number of documents, used as the divisor of
            the averages.
        num_topics: number of LDA topics.
        random_state: forwarded to ``LdaModel``; pass an int to make the topics
            reproducible between runs. The default ``None`` keeps the previous
            (unseeded) behaviour.

    Returns:
        The trained ``LdaModel``.
    """
    clean_docs = []
    for corpus_id in corpora:
        clean_docs += corpora[corpus_id]

    dictionary = Dictionary(clean_docs)
    dictionary.filter_extremes(no_below=15, no_above=0.6)
    corpus = [dictionary.doc2bow(text) for text in clean_docs]
    lda = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=25, random_state=random_state)

    corpora_topics_scores = {
        corpus_id: {topic_id: 0 for topic_id in range(num_topics)}
        for corpus_id in corpora
    }

    # Sum topic probs for each corpus.
    for corpus_id in corpora:
        for doc in corpora[corpus_id]:
            # 0 is the minimum probability, so low-probability topics are not filtered out.
            doc_topics = lda.get_document_topics(dictionary.doc2bow(doc), 0)
            for (topic_id, topic_prob) in doc_topics:
                corpora_topics_scores[corpus_id][topic_id] += topic_prob

    # Normalise topic probs.
    for corpus_id in corpora:
        corpus_nr_docs = corpora_nr_docs[corpus_id]
        if corpus_nr_docs == 0:
            # Empty corpus: nothing was summed, keep the scores at 0 instead of dividing by 0.
            continue
        for topic_id in range(num_topics):
            corpora_topics_scores[corpus_id][topic_id] /= corpus_nr_docs

    # Select top topic for each corpus (-1 when no topic has a positive score).
    corpora_top_topic = dict()
    for corpus_id in corpora:
        top_topic = -1
        top_score = 0
        for topic_id in range(num_topics):
            topic_score = corpora_topics_scores[corpus_id][topic_id]
            if topic_score > top_score:
                top_topic = topic_id
                top_score = topic_score
        corpora_top_topic[corpus_id] = top_topic

    for corpus_id in corpora:
        top_topic = corpora_top_topic[corpus_id]
        if top_topic < 0:
            print('No top topic for corpus: ' + str(corpus_id))
            continue
        print('Top topic for corpus: ' + str(corpus_id) + " is topic nr " + str(top_topic))
        print(lda.print_topic(top_topic, 10))
        topic_words = lda.show_topic(top_topic, 10)
        write_topic_words_to_file(topic_words, corpus_id)

    # One row per topic, one column per corpus (in the iteration order of `corpora`),
    # instead of assuming the corpus ids are exactly 0, 1 and 2.
    print('\n')
    for ii in range(num_topics):
        print('Topic ' + str(ii) + ': ' + ' '.join(
            str(round(corpora_topics_scores[corpus_id][ii], 3)) for corpus_id in corpora))

    print('\n')
    for ii in range(num_topics):
        print('Topic ' + str(ii) + ' words: ')
        print(lda.print_topic(ii, 10))

    return lda

In [50]:
# --- Configuration: input/output files and the corpus name <-> id mapping ---
tsv_file_name = 'train_and_dev.tsv'
stopwords_file_name = "englishST.txt"
index_output_file_name = "index.txt"

corpus_names_to_int = {'Quran':0, 'OT':1, 'NT':2}
# Inverse mapping, derived so the two dictionaries cannot drift apart.
int_to_corpus_names = {corpus_id: name for name, corpus_id in corpus_names_to_int.items()}

# --- Pre-processing pipeline: letters-only tokens, stop-word removal, Porter stemming ---
stopwords_set = construct_stopwords_set(stopwords_file_name)
tokenizer = SimpleTokenizer('[a-zA-Z]+')
stemmer = PorterStemmer()
preprocessor = SimplePreprocessor(tokenizer, stopwords_set, stemmer)

# --- Load and pre-process in one step: `corpora` only ever holds tokenized documents ---
corpora = preprocess_corpora(
    read_tsv_extract_corpora(tsv_file_name, corpus_names_to_int),
    preprocessor,
)

# --- Term scoring: build the index, rank terms per corpus, write the top 10 to CSV ---
index, corpora_nr_docs = read_corpora_and_create_index(corpora)

MI_scores, chi_scores = compute_MI_chi_scores(index, corpora_nr_docs, list(corpus_names_to_int.values()))
print_top_k_terms_for_each_corpus(MI_scores, chi_scores, int_to_corpus_names, 10)

# --- Topic modelling (the returned model is shown as the cell output) ---
run_topics_task(corpora, corpora_nr_docs)

Index construction for corpus 1 finished.
Index construction for corpus 2 finished.
Index construction for corpus 3 finished.
Top topic for corpus: 0 is topic nr 15
0.088*"allah" + 0.081*"receiv" + 0.053*"gate" + 0.043*"believ" + 0.040*"fear" + 0.039*"brother" + 0.038*"field" + 0.037*"enter" + 0.031*"abraham" + 0.029*"wall"
Top topic for corpus: 1 is topic nr 13
0.308*"god" + 0.087*"lord" + 0.065*"land" + 0.039*"year" + 0.038*"hear" + 0.025*"egypt" + 0.023*"peopl" + 0.021*"turn" + 0.020*"rejoic" + 0.017*"gold"
Top topic for corpus: 2 is topic nr 0
0.143*"jesu" + 0.056*"faith" + 0.048*"eye" + 0.048*"mine" + 0.040*"thing" + 0.034*"discipl" + 0.030*"jew" + 0.029*"heart" + 0.024*"bread" + 0.022*"wise"


Topic 0: 0.046 0.042 0.092
Topic 1: 0.026 0.056 0.038
Topic 2: 0.037 0.041 0.045
Topic 3: 0.105 0.049 0.05
Topic 4: 0.049 0.045 0.046
Topic 5: 0.042 0.054 0.04
Topic 6: 0.029 0.063 0.043
Topic 7: 0.048 0.034 0.038
Topic 8: 0.053 0.037 0.044
Topic 9: 0.061 0.068 0.067
Topic 10: 0.039 0.055 0

<gensim.models.ldamodel.LdaModel at 0x7f51250577c0>

In [41]:
def count_word_in_corpus(corpora, word):
    """Print, for every corpus, how many tokens are exactly equal to ``word``.

    Args:
        corpora: corpus id -> list of documents, each document an iterable of tokens.
        word: the token to count (compared with ``==``, so case and stemming matter).
    """
    for corpus_id in corpora:
        occurrences = sum(
            1
            for doc in corpora[corpus_id]
            for token in doc
            if word == token
        )
        print('Corpus ' + str(corpus_id) + ": " + str(occurrences) + ' occurrences.')

In [28]:
# Print how many times the token 'david' occurs in each corpus of `corpora`.
count_word_in_corpus(corpora, 'david')

Corpus 0: 16 occurrences.
Corpus 1: 912 occurrences.
Corpus 2: 53 occurrences.


In [48]:
# Show the number of documents recorded for each corpus id.
print(corpora_nr_docs)

{0: 5612, 1: 16720, 2: 5242}
