# Importing libraries

In [None]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

import string
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import re
import numpy as np
import math

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Reading Corpus

In [None]:
# NFCorpus
!gdown 1QscBtuFj3XW7Gl3lhWdEx6r8Yit0yJ2v

Downloading...
From: https://drive.google.com/uc?id=1QscBtuFj3XW7Gl3lhWdEx6r8Yit0yJ2v
To: /content/nfcorpus.tar.gz
100% 97.3M/97.3M [00:00<00:00, 156MB/s]


In [None]:
!tar -xvf 'nfcorpus.tar.gz'

nfcorpus/
nfcorpus/train.docs
nfcorpus/test.docs
nfcorpus/dev.docs
nfcorpus/dev.3-2-1.qrel
nfcorpus/test.3-2-1.qrel
nfcorpus/train.3-2-1.qrel
nfcorpus/raw/
nfcorpus/raw/doc_dump.txt
nfcorpus/raw/dev.docs.ids
nfcorpus/raw/dev.queries.ids
nfcorpus/raw/test.docs.ids
nfcorpus/raw/test.queries.ids
nfcorpus/raw/train.docs.ids
nfcorpus/raw/train.queries.ids
nfcorpus/raw/stopwords.large
nfcorpus/raw/nfdump.txt
nfcorpus/raw/all_videos.ids
nfcorpus/raw/nontopics.ids
nfcorpus/test.2-1-0.qrel
nfcorpus/dev.2-1-0.qrel
nfcorpus/train.2-1-0.qrel
nfcorpus/ecir2016.bib
nfcorpus/test.vid-desc.queries
nfcorpus/dev.vid-desc.queries
nfcorpus/train.vid-desc.queries
nfcorpus/dev.all.queries
nfcorpus/test.all.queries
nfcorpus/train.all.queries
nfcorpus/dev.vid-titles.queries
nfcorpus/test.vid-titles.queries
nfcorpus/train.vid-titles.queries
nfcorpus/dev.titles.queries
nfcorpus/test.titles.queries
nfcorpus/train.titles.queries
nfcorpus/dev.nontopic-titles.queries
nfcorpus/test.nontopic-titles.queries
nfcorpus/t

In [None]:
## Read the .docs file
# Every non-blank line is split on a tab into "<doc_id>\t<doc_text>".
CorpusDocs = {}

with open('/content/nfcorpus/train.docs', 'r', encoding='utf-8') as file:
    # Iterate the file lazily instead of materialising it with readlines().
    for line in file:
        line = line.strip()
        if not line:
            # A blank (e.g. trailing) line would otherwise break the unpacking below.
            continue
        # maxsplit=1 keeps any tab that occurs inside the document text.
        doc_id, doc_text = line.split('\t', 1)
        CorpusDocs[doc_id] = doc_text

# The documents are now stored in the `CorpusDocs` dictionary ({doc_id: text}).

In [None]:
# Read any queries
# Every non-blank line is split on a tab into "<query_id>\t<query_text>".
CorpusQueries = {}

with open('/content/nfcorpus/train.all.queries', 'r', encoding='utf-8') as file:
    # Iterate the file lazily instead of materialising it with readlines().
    for line in file:
        line = line.strip()
        if not line:
            # A blank (e.g. trailing) line would otherwise break the unpacking below.
            continue
        # maxsplit=1 keeps any tab that occurs inside the query text.
        query_id, query_text = line.split('\t', 1)
        CorpusQueries[query_id] = query_text

# The queries are now stored in the `CorpusQueries` dictionary, re-built here
# so that iteration follows ascending query id.
CorpusQueries = dict(sorted(CorpusQueries.items()))


In [None]:
# Read relevance judgements
# Every non-blank line holds four tab-separated fields:
# query_id, an ignored column, doc_id, relevance (parsed as int).

CorpusRelJudgements = {}

with open('/content/nfcorpus/train.3-2-1.qrel', 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        if not line:
            # A blank (e.g. trailing) line would otherwise break the unpacking below.
            continue
        query_id, _, doc_id, relevance = line.split('\t')
        CorpusRelJudgements.setdefault(query_id, {})[doc_id] = int(relevance)

# The judgements are now stored in `CorpusRelJudgements`
# ({query_id: {doc_id: relevance}}).
# `corpus_result_que` keeps only the judged doc ids per query, ordered by query id.
corpus_result_que = {
    query_id: list(judged_docs)
    for query_id, judged_docs in sorted(CorpusRelJudgements.items())
}

# Functions

## Porter Stemmer Preprocess

In [None]:
def preprocessPorterStemmerOnly(docs):
    """Lower-case, clean, tokenize and Porter-stem every document (no stop-word removal).

    Args:
        docs: dict mapping a document id to its raw text.

    Returns:
        (modified_docs, terms): `modified_docs` maps each id to its kept stems
        written as " stem1 stem2 ..." (one leading space per stem); `terms`
        is the set of every stem that was kept.
    """
    stemmer = PorterStemmer()
    vocabulary = set()
    processed = {}

    for doc_id, raw_text in docs.items():
        # Keep only lower-case letters, digits and whitespace before tokenizing.
        cleaned = re.sub(r'[^a-z0-9\s]', '', raw_text.lower())
        stems = [
            stemmer.stem(token)
            for token in word_tokenize(cleaned)
            # Skip purely numeric tokens and anything that is not alphanumeric.
            if not (token.isdigit() or token.isnumeric()) and token.isalnum()
        ]
        vocabulary.update(stems)
        processed[doc_id] = "".join(" " + stem for stem in stems)

    return processed, vocabulary

In [None]:
# Porter Stemmer version of the Preprocess
def preprocessPorterStemmer(docs):
    """Lower-case, clean, tokenize, drop English stop words and Porter-stem every document.

    Args:
        docs: dict mapping a document id to its raw text.

    Returns:
        (modified_docs, terms): `modified_docs` maps each id to its kept stems
        written as " stem1 stem2 ..." (one leading space per stem); `terms`
        is the set of every stem that was kept.
    """
    stemmer = PorterStemmer()
    stop_words = set(stopwords.words('english'))
    vocabulary = set()
    processed = {}

    def keep(token):
        # Stop words and punctuation are dropped first, then purely numeric
        # tokens, then anything that is not alphanumeric.
        if token in string.punctuation or token in stop_words:
            return False
        if token.isdigit() or token.isnumeric():
            return False
        return token.isalnum()

    for doc_id, raw_text in docs.items():
        # Keep only lower-case letters, digits and whitespace before tokenizing.
        cleaned = re.sub(r'[^a-z0-9\s]', '', raw_text.lower())
        stems = [stemmer.stem(token) for token in word_tokenize(cleaned) if keep(token)]
        vocabulary.update(stems)
        processed[doc_id] = "".join(" " + stem for stem in stems)

    return processed, vocabulary

# docs, terms = preprocessPorterStemmer(docs)

## Lemmatizer Preprocess

In [None]:
def preprocessLemmatizerOnly(docs):
    """Lower-case, clean, tokenize and WordNet-lemmatize every document (no stop-word removal).

    Args:
        docs: dict mapping a document id to its raw text.

    Returns:
        (modified_docs, terms): `modified_docs` maps each id to its kept lemmas
        written as " lemma1 lemma2 ..." (one leading space per lemma); `terms`
        is the set of every lemma that was kept.
    """
    lemmatizer = WordNetLemmatizer()
    vocabulary = set()
    processed = {}

    for doc_id, raw_text in docs.items():
        # Keep only lower-case letters, digits and whitespace before tokenizing.
        cleaned = re.sub(r'[^a-z0-9\s]', '', raw_text.lower())
        lemmas = [
            lemmatizer.lemmatize(token)
            for token in word_tokenize(cleaned)
            # Skip purely numeric tokens and anything that is not alphanumeric.
            if not (token.isdigit() or token.isnumeric()) and token.isalnum()
        ]
        vocabulary.update(lemmas)
        processed[doc_id] = "".join(" " + lemma for lemma in lemmas)

    return processed, vocabulary


In [None]:
# Lemmatizer Version of Preprocess
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import string

def preprocessLematizer(docs):
    """Lower-case, clean, tokenize, drop English stop words and WordNet-lemmatize every document.

    Args:
        docs: dict mapping a document id to its raw text.

    Returns:
        (modified_docs, terms): `modified_docs` maps each id to its kept lemmas
        written as " lemma1 lemma2 ..." (one leading space per lemma); `terms`
        is the set of every lemma that was kept.
    """
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    vocabulary = set()
    processed = {}

    def keep(token):
        # Stop words and punctuation are dropped first, then purely numeric
        # tokens, then anything that is not alphanumeric.
        if token in string.punctuation or token in stop_words:
            return False
        if token.isdigit() or token.isnumeric():
            return False
        return token.isalnum()

    for doc_id, raw_text in docs.items():
        # Keep only lower-case letters, digits and whitespace before tokenizing.
        cleaned = re.sub(r'[^a-z0-9\s]', '', raw_text.lower())
        lemmas = [lemmatizer.lemmatize(token) for token in word_tokenize(cleaned) if keep(token)]
        vocabulary.update(lemmas)
        processed[doc_id] = "".join(" " + lemma for lemma in lemmas)

    return processed, vocabulary

# docs, terms = preprocessLematizer(docs)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Indexing

In [None]:
# Create the indexing, order term and inverse order term

def indexing(docs, terms):
    """Build the flat posting list and the term <-> position lookup tables.

    Args:
        docs: dict {doc_id: preprocessed text}.
        terms: collection of accepted words; words outside it are ignored.

    Returns:
        (index, order_term, inverse_order_term):
        - index: list of {'word': word, 'doc_id': doc_id}, one entry per
          occurrence of an accepted word, so a word repeated inside a
          document yields repeated entries;
        - order_term: {word: position of the word when enumerating `terms`};
        - inverse_order_term: {position: word}, the inverse of `order_term`.
    """
    order_term = {}
    for position, term in enumerate(terms):
        order_term[term] = position
    inverse_order_term = {position: term for term, position in order_term.items()}

    postings = []
    for doc_id, text in docs.items():
        for word in re.findall(r'\b\w+\b', text):
            if word in terms:
                postings.append({'word': word, 'doc_id': doc_id})

    return postings, order_term, inverse_order_term

# indexs, order_term, inverse_order_term = indexing(docs, terms)

## Find Max ID

In [None]:
def find_max_id(docs):
    """Return the largest key of `docs`, or 0 when `docs` is empty or all keys are below 0.

    Keys are compared against the integer 0, so they must be orderable with
    an int (string ids raise TypeError).
    """
    return max([0, *docs])

# max_id = find_max_id(docs)

## Adding Frequencies

In [None]:
# Adding Frequency to the index table

def adding_frequency(docs, terms, sorted_indexs):
    """Annotate every index entry with the frequency of its word in its document.

    The frequency counts whole tokens (same `\\b\\w+\\b` tokenization as
    `indexing`). The previous `str.count` call counted substrings, so "a"
    was also counted inside "cat".

    Args:
        docs: dict {doc_id: preprocessed text}.
        terms: unused; kept for interface compatibility.
        sorted_indexs: list of {'word': ..., 'doc_id': ...} entries.

    Returns:
        A new list holding the same entry dicts; each entry is mutated in
        place to gain a 'frequency' key.
    """
    from collections import Counter

    # Tokenize each document once and reuse the counts for all of its entries.
    token_counts = {}
    new_sorted_indexs = []
    for entry in sorted_indexs:
        doc_id = entry['doc_id']
        if doc_id not in token_counts:
            token_counts[doc_id] = Counter(re.findall(r'\b\w+\b', docs[doc_id]))
        entry['frequency'] = token_counts[doc_id][entry['word']]
        new_sorted_indexs.append(entry)
    return new_sorted_indexs

# new_sorted_indexs = adding_frequency(docs, terms, sorted_indexs)

## Build Vocabs

In [None]:
# Build the vocabs

def build_vocab(docs, terms, new_sorted_indexs):
    """Aggregate the index into per-word statistics.

    Args:
        docs: dict {doc_id: text}; entries whose doc_id is not in `docs` are
            ignored (the word still gets an empty vocab record).
        terms: unused; kept for interface compatibility.
        new_sorted_indexs: list of {'word', 'doc_id', 'frequency'} entries.

    Returns:
        dict {word: {'number of doc include': number of distinct documents
        containing the word, 'frequency': sum of the per-document
        frequencies, 'position': set of positions of the word's entries in
        `new_sorted_indexs`}}.
    """
    vocab = {}
    # word -> doc_ids already counted. The index may contain several entries
    # for the same (word, document) pair; counting each of them inflated the
    # document frequency and the total frequency.
    counted_docs = {}
    for i, entry in enumerate(new_sorted_indexs):
        word = entry['word']
        doc_id = entry['doc_id']
        if word not in vocab:
            vocab[word] = {'number of doc include': 0, 'frequency': 0, 'position': set()}
            counted_docs[word] = set()
        if doc_id not in docs:
            continue
        # Every entry position is still recorded, as before.
        vocab[word]['position'].add(i)
        if doc_id in counted_docs[word]:
            continue
        counted_docs[word].add(doc_id)
        vocab[word]['number of doc include'] += 1
        vocab[word]['frequency'] += entry['frequency']

    return vocab

# vocab = build_vocab(docs, terms, sorted_indexs)

## IDF

In [None]:
# Calculate the IDF of a term with multiple options

def calculate_idf(docs, terms, vocab, new_sorted_indexs, option):
    """Attach an 'idf' value to the vocab records using the chosen scheme.

    Args:
        docs: dict of documents; only its length is used.
        terms: unused.
        vocab: dict produced by `build_vocab`; mutated in place.
        new_sorted_indexs: forwarded to the 'frequency' scheme.
        option: 'inverse', 'count', 'square', 'proba' or 'frequency'.
            Any other value leaves `vocab` untouched.

    Returns:
        The same `vocab` object.
    """
    if option in ('inverse', 'square'):
        calculate_inverse_idf(vocab, len(docs))
        if option == 'square':
            # 'square' is the inverse idf multiplied by itself.
            calculate_square_idf(vocab)
    elif option == 'count':
        calculate_count_idf(vocab)
    elif option == 'proba':
        calculate_proba_idf(vocab, len(docs))
    elif option == 'frequency':
        calculate_frequency_idf(vocab, new_sorted_indexs)
    return vocab

def calculate_inverse_idf(vocab, total_docs):
    """Set idf = ln(total_docs / df) on every vocab record whose df is positive.

    Records with a document count of 0 are left without an 'idf' key.
    """
    for stats in vocab.values():
        doc_count = stats['number of doc include']
        if doc_count <= 0:
            continue
        stats['idf'] = math.log(total_docs / doc_count)

def calculate_count_idf(vocab):
    """Set idf = 1 / df on every vocab record whose document count is positive."""
    for stats in vocab.values():
        doc_count = stats['number of doc include']
        if doc_count <= 0:
            continue
        stats['idf'] = 1 / doc_count

def calculate_square_idf(vocab):
    """Replace every existing 'idf' value by its square; records without 'idf' are skipped."""
    for stats in vocab.values():
        if 'idf' not in stats:
            continue
        current = stats['idf']
        stats['idf'] = current * current

def calculate_proba_idf(vocab, total_docs):
    """Set the probabilistic idf max(0, ln((N - df) / df)) on every record with df > 0.

    Args:
        vocab: dict of per-word records holding 'number of doc include' (df);
            mutated in place.
        total_docs: number of documents N.

    A word occurring in every document (df >= N) used to make `math.log`
    raise a domain error; it now gets an idf of 0.0. Negative logarithms
    (df > N / 2) are clamped to 0.0 as well, following the usual max(0, .)
    form of this weighting. Records with df == 0 are left without 'idf'.
    """
    for word in vocab:
        num_docs = vocab[word]['number of doc include']
        if num_docs <= 0:
            continue
        remaining = total_docs - num_docs
        if remaining <= 0:
            # The log argument would be <= 0: no discriminative power at all.
            vocab[word]['idf'] = 0.0
        else:
            vocab[word]['idf'] = max(0.0, math.log(remaining / num_docs))

def calculate_frequency_idf(vocab, new_sorted_indexs):
    """Set idf = 1 / df on every vocab record whose document count is positive.

    `new_sorted_indexs` is accepted but not used.
    NOTE(review): this computes the same value as `calculate_count_idf`;
    confirm whether a frequency-based formula was intended.
    """
    for stats in vocab.values():
        doc_count = stats['number of doc include']
        if doc_count <= 0:
            continue
        stats['idf'] = 1 / doc_count

## TF

In [None]:
# Calculate the TF of a term with multiple options

def calculate_tf(docs, terms, vocab, new_sorted_indexs, option):
    """Attach a 'tf' value to every index entry using the chosen scheme.

    Args:
        docs, terms, vocab: unused; kept for interface compatibility.
        new_sorted_indexs: list of index entries; mutated in place.
        option: 'binary', 'count', 'max', 'augmented' or 'logarithmic'.
            Any other value leaves the entries untouched.

    Returns:
        The same `new_sorted_indexs` list.
    """
    scheme = None
    if option == 'binary':
        scheme = calculate_binary_tf
    elif option == 'count':
        scheme = calculate_count_tf
    elif option == 'max':
        scheme = calculate_max_tf
    elif option == 'augmented':
        scheme = calculate_augmented_tf
    elif option == 'logarithmic':
        scheme = calculate_logarithmic_tf

    if scheme is not None:
        scheme(new_sorted_indexs)
    return new_sorted_indexs

def calculate_binary_tf(new_sorted_indexs):
    """Binary term frequency: every index entry gets tf = 1 (in place)."""
    for posting in new_sorted_indexs:
        posting.update(tf=1)

def calculate_count_tf(new_sorted_indexs):
    """Raw-count term frequency: every index entry gets tf = its 'frequency' (in place)."""
    for posting in new_sorted_indexs:
        posting.update(tf=posting['frequency'])

def calculate_max_tf(new_sorted_indexs):
    """Max-normalised term frequency: tf = frequency / largest frequency in the index.

    Entries are mutated in place. An empty index is a no-op (it used to make
    `max()` raise ValueError) and a maximum of 0 yields tf = 0.0 instead of a
    ZeroDivisionError.

    NOTE(review): the maximum is taken over the whole index, not per
    document - confirm that a global maximum is intended.
    """
    max_tf = max((entry['frequency'] for entry in new_sorted_indexs), default=0)
    for entry in new_sorted_indexs:
        entry['tf'] = entry['frequency'] / max_tf if max_tf != 0 else 0.0

def calculate_augmented_tf(new_sorted_indexs):
    """Augmented term frequency: tf = 0.5 + 0.5 * frequency / largest frequency in the index.

    Entries are mutated in place. An empty index is a no-op (it used to make
    `max()` raise ValueError) and a maximum of 0 yields the baseline 0.5
    instead of a ZeroDivisionError.

    NOTE(review): the maximum is taken over the whole index, not per
    document - confirm that a global maximum is intended.
    """
    max_tf = max((entry['frequency'] for entry in new_sorted_indexs), default=0)
    for entry in new_sorted_indexs:
        if max_tf != 0:
            entry['tf'] = 0.5 + 0.5 * entry['frequency'] / max_tf
        else:
            entry['tf'] = 0.5

def calculate_logarithmic_tf(new_sorted_indexs):
    """Logarithmic term frequency: tf = 1 + ln(frequency), or 0.0 when frequency <= 0.

    Entries are mutated in place. The zero guard avoids the math domain
    error that `math.log(0)` raises.
    """
    for entry in new_sorted_indexs:
        frequency = entry['frequency']
        entry['tf'] = 1 + math.log(frequency) if frequency > 0 else 0.0

## Calculate the weight

In [None]:
def calculate_w(docs, terms, new_idf_vocab, new_tf_sorted_indexs, option):
    """Compute the document weight w = tf * idf of every index entry, then normalise it.

    Args:
        docs, terms: unused; kept for interface compatibility.
        new_idf_vocab: {word: {'idf': ...}} as produced by `calculate_idf`.
        new_tf_sorted_indexs: index entries carrying 'word' and 'tf';
            mutated in place to gain 'w'.
        option: per-document normalisation - 'sum', 'cosine', '**4' or 'max'.
            Any other value keeps the raw tf * idf weights.

    Returns:
        The same `new_tf_sorted_indexs` list.
    """
    for entry in new_tf_sorted_indexs:
        word = entry['word']
        if word in new_idf_vocab:
            entry['w'] = new_idf_vocab[word]['idf'] * entry['tf']
        else:
            # Previously the weight was computed outside the `if`, silently
            # reusing the tf/idf of the previous entry (or raising NameError
            # on the very first one). An unknown word carries no weight.
            entry['w'] = 0.0

    if option == 'sum':
        calculate_sum_w(new_tf_sorted_indexs)
    elif option == 'cosine':
        calculate_cosine_w(new_tf_sorted_indexs)
    elif option == '**4':
        calculate_power_4_w(new_tf_sorted_indexs)
    elif option == 'max':
        calculate_max_w(new_tf_sorted_indexs)

    return new_tf_sorted_indexs

def calculate_sum_w(new_tf_sorted_indexs):
    """Normalise each entry's 'w' by the sum of the weights of its document (in place).

    A document whose weights sum to 0 gets 0.0 for all its entries, like the
    other normalisers, instead of raising ZeroDivisionError.
    """
    doc_weights = {}
    for entry in new_tf_sorted_indexs:
        doc_id = entry['doc_id']
        doc_weights[doc_id] = doc_weights.get(doc_id, 0) + entry['w']
    for entry in new_tf_sorted_indexs:
        total = doc_weights[entry['doc_id']]
        entry['w'] = entry['w'] / total if total != 0 else 0.0

def calculate_cosine_w(new_tf_sorted_indexs):
    """Cosine normalisation: divide each 'w' by the Euclidean norm of its document's weights.

    Entries are mutated in place; a document with a zero norm gets 0.0.
    """
    squared_norms = {}
    for posting in new_tf_sorted_indexs:
        key = posting['doc_id']
        squared = posting['w'] ** 2
        if key not in squared_norms:
            squared_norms[key] = squared
        else:
            squared_norms[key] = squared_norms[key] + squared

    for posting in new_tf_sorted_indexs:
        norm = math.sqrt(squared_norms[posting['doc_id']])
        if norm != 0:
            posting['w'] = posting['w'] / norm
        else:
            posting['w'] = 0.0

def calculate_power_4_w(new_tf_sorted_indexs):
    """Divide each 'w' by the sum of the fourth powers of its document's weights.

    Entries are mutated in place; a document whose sum is zero gets 0.0.
    """
    fourth_power_sums = {}
    for posting in new_tf_sorted_indexs:
        key = posting['doc_id']
        powered = posting['w'] ** 4
        if key not in fourth_power_sums:
            fourth_power_sums[key] = powered
        else:
            fourth_power_sums[key] = fourth_power_sums[key] + powered

    for posting in new_tf_sorted_indexs:
        denominator = fourth_power_sums[posting['doc_id']]
        if denominator != 0:
            posting['w'] = posting['w'] / denominator
        else:
            posting['w'] = 0.0

def calculate_max_w(new_tf_sorted_indexs):
    """Divide each 'w' by the largest weight found in its document.

    Entries are mutated in place; a document whose largest weight is zero gets 0.0.
    """
    largest = {}
    for posting in new_tf_sorted_indexs:
        key = posting['doc_id']
        weight = posting['w']
        # Keep the first value on ties, exactly like max(existing, weight).
        if key not in largest or weight > largest[key]:
            largest[key] = weight

    for posting in new_tf_sorted_indexs:
        top = largest[posting['doc_id']]
        if top != 0:
            posting['w'] = posting['w'] / top
        else:
            posting['w'] = 0.0

## Calculate the TF-IDF Score of the query

In [None]:
def calculate_tf_idf_query(query, new_tf_sorted_indexs, new_idf_vocab, order_term, option_norm):
    """Build the tf-idf vector of a raw query (no stemming or lemmatization).

    Args:
        query: raw query text.
        new_tf_sorted_indexs: unused; kept for interface compatibility.
        new_idf_vocab: {word: {'idf': ...}}.
        order_term: mapping whose keys are the known terms; query words
            outside it are ignored.
        option_norm: 'sum', 'cosine', '**4' or 'max'; any other value
            returns the raw tf * idf weights.

    Returns:
        dict {term: weight}; empty when the query has no known term.
    """
    # The vocabulary filter relies on `order_term` only; the function used to
    # read the notebook-level global `terms`, which made it depend on hidden
    # kernel state.
    tokens = [word for word in re.findall(r'\b\w+\b', query.lower()) if word in order_term]

    # tf is the number of occurrences of the term in the query.
    term_frequencies = {}
    for term in tokens:
        term_frequencies[term] = term_frequencies.get(term, 0) + 1

    tf_idf_query = {
        term: frequency * new_idf_vocab[term]['idf']
        for term, frequency in term_frequencies.items()
    }

    if option_norm == 'sum':
        denominator = sum(tf_idf_query.values())
    elif option_norm == 'cosine':
        denominator = math.sqrt(sum([tf_idf ** 2 for tf_idf in tf_idf_query.values()]))
    elif option_norm == '**4':
        denominator = sum([tf_idf ** 4 for tf_idf in tf_idf_query.values()])
    elif option_norm == 'max':
        # default=0: a query without any known term used to raise ValueError here.
        denominator = max(tf_idf_query.values(), default=0)
    else:
        denominator = 0

    # A zero denominator (or an unknown option) leaves the weights unnormalised.
    if denominator != 0:
        tf_idf_query = {term: tf_idf / denominator for term, tf_idf in tf_idf_query.items()}
    return tf_idf_query

In [None]:
def calculate_tf_idf_queryPorterStemmerOnly(query, new_tf_sorted_indexs, new_idf_vocab, order_term, option_norm):
    """Build the tf-idf vector of a query whose words are Porter-stemmed (stop words kept).

    Args:
        query: raw query text.
        new_tf_sorted_indexs: unused; kept for interface compatibility.
        new_idf_vocab: {term: {'idf': ...}}.
        order_term: mapping whose keys are the known terms; stems outside it
            are ignored.
        option_norm: 'sum', 'cosine', '**4' or 'max'; any other value
            returns the raw tf * idf weights.

    Returns:
        dict {term: weight}; empty when the query has no known term.
    """
    porter_stemmer = PorterStemmer()
    tokens = [porter_stemmer.stem(word) for word in re.findall(r'\b\w+\b', query.lower())]

    # tf is the number of occurrences of the stem in the query.
    term_frequencies = {}
    for term in tokens:
        term_frequencies[term] = term_frequencies.get(term, 0) + 1

    tf_idf_query = {
        term: frequency * new_idf_vocab[term]['idf']
        for term, frequency in term_frequencies.items()
        if term in order_term
    }

    if option_norm == 'sum':
        denominator = sum(tf_idf_query.values())
    elif option_norm == 'cosine':
        denominator = math.sqrt(sum([tf_idf ** 2 for tf_idf in tf_idf_query.values()]))
    elif option_norm == '**4':
        denominator = sum([tf_idf ** 4 for tf_idf in tf_idf_query.values()])
    elif option_norm == 'max':
        # default=0: a query without any known term used to raise ValueError here.
        denominator = max(tf_idf_query.values(), default=0)
    else:
        denominator = 0

    # A zero denominator (or an unknown option) leaves the weights unnormalised.
    if denominator != 0:
        tf_idf_query = {term: tf_idf / denominator for term, tf_idf in tf_idf_query.items()}
    return tf_idf_query

In [None]:
def calculate_tf_idf_queryPorterStemmer(query, new_tf_sorted_indexs, new_idf_vocab, order_term, option_norm):
    """Build the tf-idf vector of a query after stop-word removal and Porter stemming.

    Args:
        query: raw query text.
        new_tf_sorted_indexs: unused; kept for interface compatibility.
        new_idf_vocab: {term: {'idf': ...}}.
        order_term: mapping whose keys are the known terms; stems outside it
            are ignored.
        option_norm: 'sum', 'cosine', '**4' or 'max'; any other value
            returns the raw tf * idf weights.

    Returns:
        dict {term: weight}; empty when the query has no known term.
    """
    porter_stemmer = PorterStemmer()
    stop_words = set(stopwords.words('english'))
    # Stop words are tested on the surface form, before stemming.
    tokens = [
        porter_stemmer.stem(word)
        for word in re.findall(r'\b\w+\b', query.lower())
        if word not in stop_words
    ]

    # tf is the number of occurrences of the stem in the query.
    term_frequencies = {}
    for term in tokens:
        term_frequencies[term] = term_frequencies.get(term, 0) + 1

    tf_idf_query = {
        term: frequency * new_idf_vocab[term]['idf']
        for term, frequency in term_frequencies.items()
        if term in order_term
    }

    if option_norm == 'sum':
        denominator = sum(tf_idf_query.values())
    elif option_norm == 'cosine':
        denominator = math.sqrt(sum([tf_idf ** 2 for tf_idf in tf_idf_query.values()]))
    elif option_norm == '**4':
        denominator = sum([tf_idf ** 4 for tf_idf in tf_idf_query.values()])
    elif option_norm == 'max':
        # default=0: a query without any known term used to raise ValueError here.
        denominator = max(tf_idf_query.values(), default=0)
    else:
        denominator = 0

    # A zero denominator (or an unknown option) leaves the weights unnormalised.
    if denominator != 0:
        tf_idf_query = {term: tf_idf / denominator for term, tf_idf in tf_idf_query.items()}
    return tf_idf_query

# query = "what similarity laws must be obeyed when constructing aeroelastic models of heated high speed aircraft ."
# tf_idf_query = calculate_tf_idf_query(query, new_tf_sorted_indexs, new_idf_vocab, order_term, 'sum')

In [None]:
def calculate_tf_idf_queryLemmatizerOnly(query, new_tf_sorted_indexs, new_idf_vocab, order_term, option_norm):
    """Build the tf-idf vector of a query whose words are WordNet-lemmatized (stop words kept).

    Args:
        query: raw query text.
        new_tf_sorted_indexs: unused; kept for interface compatibility.
        new_idf_vocab: {term: {'idf': ...}}.
        order_term: mapping whose keys are the known terms; lemmas outside
            it are ignored.
        option_norm: 'sum', 'cosine', '**4' or 'max'; any other value
            returns the raw tf * idf weights.

    Returns:
        dict {term: weight}; empty when the query has no known term.
    """
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in re.findall(r'\b\w+\b', query.lower())]

    # tf is the number of occurrences of the lemma in the query.
    term_frequencies = {}
    for term in tokens:
        term_frequencies[term] = term_frequencies.get(term, 0) + 1

    tf_idf_query = {
        term: frequency * new_idf_vocab[term]['idf']
        for term, frequency in term_frequencies.items()
        if term in order_term
    }

    if option_norm == 'sum':
        denominator = sum(tf_idf_query.values())
    elif option_norm == 'cosine':
        denominator = math.sqrt(sum([tf_idf ** 2 for tf_idf in tf_idf_query.values()]))
    elif option_norm == '**4':
        denominator = sum([tf_idf ** 4 for tf_idf in tf_idf_query.values()])
    elif option_norm == 'max':
        # default=0: a query without any known term used to raise ValueError here.
        denominator = max(tf_idf_query.values(), default=0)
    else:
        denominator = 0

    # A zero denominator (or an unknown option) leaves the weights unnormalised.
    if denominator != 0:
        tf_idf_query = {term: tf_idf / denominator for term, tf_idf in tf_idf_query.items()}
    return tf_idf_query


In [None]:
def calculate_tf_idf_queryLemmatizer(query, new_tf_sorted_indexs, new_idf_vocab, order_term, option_norm):
    """Build the tf-idf vector of a query after stop-word removal and WordNet lemmatization.

    Args:
        query: raw query text.
        new_tf_sorted_indexs: unused; kept for interface compatibility.
        new_idf_vocab: {term: {'idf': ...}}.
        order_term: mapping whose keys are the known terms; lemmas outside
            it are ignored.
        option_norm: 'sum', 'cosine', '**4' or 'max'; any other value
            returns the raw tf * idf weights.

    Returns:
        dict {term: weight}; empty when the query has no known term.
    """
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    # Stop words are tested on the surface form, before lemmatization.
    tokens = [
        lemmatizer.lemmatize(word)
        for word in re.findall(r'\b\w+\b', query.lower())
        if word not in stop_words
    ]

    # tf is the number of occurrences of the lemma in the query.
    term_frequencies = {}
    for term in tokens:
        term_frequencies[term] = term_frequencies.get(term, 0) + 1

    tf_idf_query = {
        term: frequency * new_idf_vocab[term]['idf']
        for term, frequency in term_frequencies.items()
        if term in order_term
    }

    if option_norm == 'sum':
        denominator = sum(tf_idf_query.values())
    elif option_norm == 'cosine':
        denominator = math.sqrt(sum([tf_idf ** 2 for tf_idf in tf_idf_query.values()]))
    elif option_norm == '**4':
        denominator = sum([tf_idf ** 4 for tf_idf in tf_idf_query.values()])
    elif option_norm == 'max':
        # default=0: a query without any known term used to raise ValueError here.
        denominator = max(tf_idf_query.values(), default=0)
    else:
        denominator = 0

    # A zero denominator (or an unknown option) leaves the weights unnormalised.
    if denominator != 0:
        tf_idf_query = {term: tf_idf / denominator for term, tf_idf in tf_idf_query.items()}
    return tf_idf_query

# query = "what similarity laws must be obeyed when constructing aeroelastic models of heated high speed aircraft ."
# tf_idf_query = calculate_tf_idf_queryLemmatizer(query, new_tf_sorted_indexs, new_idf_vocab, order_term, 'sum')

## Information Retrieval Function (Finding Relevant Documents)

In [None]:
def information_retrieval(tf_idf_query, new_idf_vocab, new_w_sorted_indexs):
    """Score documents against a query by accumulating query-weight * document-weight.

    Args:
        tf_idf_query: {term: query weight}.
        new_idf_vocab: {term: {'position': positions of the term's entries
            in `new_w_sorted_indexs`}}; query terms missing from it are skipped.
        new_w_sorted_indexs: index entries carrying 'doc_id' and 'w'.

    Returns:
        dict {doc_id: accumulated score}; only documents reached through at
        least one query term appear.
    """
    scores = {}
    for term in tf_idf_query:
        if term in new_idf_vocab:
            query_weight = tf_idf_query[term]
            # The vocab stores where this term's entries sit in the index.
            for position in new_idf_vocab[term]['position']:
                posting = new_w_sorted_indexs[position]
                doc_id = posting['doc_id']
                contribution = query_weight * posting['w']
                scores[doc_id] = scores[doc_id] + contribution if doc_id in scores else contribution
    return scores

# retrieval_scores = information_retrieval(tf_idf_query,new_idf_vocab, new_tf_sorted_indexs)
# sorted_results = sorted(retrieval_scores.items(), key=lambda x: x[1], reverse=True)
# sorted_results[:29]

## Calculate the AP/mAP of all the queries

In [None]:
def precision(y_true, y_pred):
  """Precision and recall measured at every rank where a relevant item was retrieved.

  Args:
    y_true: collection of relevant ids.
    y_pred: ranked list of retrieved ids (best first).

  Returns:
    (prec_list, rec_list): parallel lists with one value per relevant hit,
    in rank order; both are empty when nothing relevant was retrieved.
  """
  precisions, recalls = [], []
  hits = 0
  for rank, predicted in enumerate(y_pred, start=1):
    if predicted not in y_true:
      continue
    hits += 1
    precisions.append(hits/rank)
    recalls.append(hits/len(y_true))
  return precisions, recalls
# Interpolated precision list
def prec_in(prec_list, rec_list):
  """Interpolated precision at the 11 recall levels 0.0, 0.1, ..., 1.0.

  For each level, the result is the highest precision from the first point
  whose recall reaches the level onward; it is 0 when no point reaches it.

  Args:
    prec_list, rec_list: parallel lists as returned by `precision`.

  Returns:
    List of 11 interpolated precision values.
  """
  interpolated = []
  for step in range(11):
    level = step/10
    # Index of the first point whose recall reaches this level (-1 if none).
    start = -1
    for idx, recall in enumerate(rec_list):
      if recall >= level:
        start = idx
        break
    # Take the maximum precision from that point to the end of the list.
    interpolated.append(max(prec_list[start:]) if start != -1 else 0)
  return interpolated

In [None]:
def MAP(list_y_true, list_y_pred, noisuy=True):
    """Average precision per query and its mean over the queries present in both inputs.

    Args:
        list_y_true: {query_id: relevant doc ids}.
        list_y_pred: {query_id: ranked list of retrieved doc ids}.
        noisuy: True for 11-point interpolated AP, False for non-interpolated AP.

    Returns:
        noisuy=True:  (prec_in_data, result_dict, mean_ap) where
            `prec_in_data` maps each query to its 11 interpolated precisions.
        noisuy=False: (result_dict, mean_ap).
        `result_dict` maps each query to its AP. `mean_ap` is 0.0 when the
        two inputs share no query id (this used to raise ZeroDivisionError).
    """
    common_keys = set(list_y_true.keys()).intersection(set(list_y_pred.keys()))
    result_dict = {}
    prec_in_data = {}

    for key in common_keys:
        prec_list, rec_list = precision(list_y_true[key], list_y_pred[key])
        if noisuy:
            prec_in_list = prec_in(prec_list, rec_list)
            result_dict[key] = sum(prec_in_list) / len(prec_in_list)
            prec_in_data[key] = prec_in_list
        else:
            # NOTE(review): this averages over the relevant documents that were
            # retrieved, not over all relevant documents - confirm this is the
            # intended AP definition.
            result_dict[key] = sum(prec_list) / len(prec_list) if len(prec_list) != 0 else 0

    mean_ap = sum(result_dict.values()) / len(result_dict) if result_dict else 0.0

    if noisuy:
        return prec_in_data, result_dict, mean_ap
    return result_dict, mean_ap

## P and R ver chang

In [None]:
def count_P_R(list_y_true, list_y_pred):
  """Set-based precision and recall per query, plus their means.

  Only queries present in both inputs are evaluated.

  Args:
    list_y_true: {query_id: relevant doc ids}.
    list_y_pred: {query_id: list of retrieved doc ids}.

  Returns:
    (result_P_dict, result_R_dict, mean_precision, mean_recall).
    Precision is 0 for a query with no retrieved documents and recall is 0
    for a query with no relevant documents; both means are 0.0 when there is
    no common query (these cases used to raise ZeroDivisionError).
  """
  common_keys = set(list_y_true.keys()).intersection(set(list_y_pred.keys()))
  result_P_dict = {}
  result_R_dict = {}
  for key in common_keys:
      relevant = list_y_true[key]
      retrieved = list_y_pred[key]
      count = sum(1 for doc_id in retrieved if doc_id in relevant)
      result_P_dict[key] = count / len(retrieved) if len(retrieved) != 0 else 0
      result_R_dict[key] = count / len(relevant) if len(relevant) != 0 else 0

  if not common_keys:
    return result_P_dict, result_R_dict, 0.0, 0.0
  mean_precision = sum(result_P_dict.values()) / len(result_P_dict)
  mean_recall = sum(result_R_dict.values()) / len(result_R_dict)
  return result_P_dict, result_R_dict, mean_precision, mean_recall

In [None]:
def trec_11(prec_in_data):
  """Average the 11-point interpolated precisions over all queries.

  Args:
    prec_in_data: {query_id: list of 11 interpolated precision values}.

  Returns:
    dict {recall level (0.0, 0.1, ..., 1.0): mean precision over the queries}.
  """
  n_queries = len(prec_in_data)
  averaged = {}
  for step in range(11):
    running = 0
    for query_id in prec_in_data:
      running += prec_in_data[query_id][step]/n_queries
    averaged[step/10] = running
  return averaged

# Porter Stemmer

## Porter Stem + Remove Stopwords

In [None]:
import time

# Index construction for the "Porter stemmer + stop-word removal" variant.
# Everything between start_time and total_timePSStopIndex is timed.
# Measure total execution time
start_time = time.time()

# Preprocess the raw corpus, then build the flat posting list and term tables.
docs, terms = preprocessPorterStemmer(CorpusDocs)
indexs, order_term, inverse_order_term = indexing(docs, terms)

# Number of posting entries in the index.
print(len(indexs))

## Sorting the index list in alphabetical order
sorted_indexs = sorted(indexs, key=lambda x: x['word'])
## Adding the frequencies to the terms in the index table
# adding_frequency annotates the entry dicts in place, so the entries of
# sorted_indexs carry 'frequency' afterwards as well.
new_sorted_indexs = adding_frequency(docs, terms, sorted_indexs)
# Build the vocab table using the docs, terms and the sorted indexes
vocab = build_vocab(docs, terms, sorted_indexs)

# Calculate total execution time
total_timePSStopIndex = time.time() - start_time

# Print total execution time
print("Total execution time:", total_timePSStopIndex)

527828
Total execution time: 14.156087398529053


In [None]:
# Size summary of the index that was just built: vocabulary, postings, documents.
print(
    f'number of word in terms: {len(terms)}',
    f'number of word in indexs: {len(indexs)}',
    f'number of document: {len(docs)}',
    sep='\n',
)

number of word in terms: 20978
number of word in indexs: 527828
number of document: 3612


In [None]:
# Weighting for this run: 'binary' tf, 'inverse' idf, 'cosine' normalisation.
# The three helpers annotate their inputs in place and return the object they
# were given, so new_tf_sorted_indexs and new_w_sorted_indexs both refer to
# the sorted_indexs list and new_idf_vocab refers to vocab.
new_tf_sorted_indexs = calculate_tf(docs, terms, vocab, sorted_indexs, 'binary')
new_idf_vocab = calculate_idf(docs, terms, vocab, sorted_indexs, 'inverse')
new_w_sorted_indexs = calculate_w(docs, terms, new_idf_vocab, new_tf_sorted_indexs, 'cosine')

# {query_id: [(doc_id, score), ...]} sorted by decreasing score.
retrievalResult = dict()

# Only the query loop below is timed.
start_time = time.time()

for key, value in CorpusQueries.items():
    # Query vector built with the same preprocessing as the documents.
    valueTFIDF = calculate_tf_idf_queryPorterStemmer(value, new_tf_sorted_indexs, new_idf_vocab, order_term, 'cosine')

    # new_tf_sorted_indexs already carries the 'w' weights (same list object).
    retrieval_scores = information_retrieval(valueTFIDF,new_idf_vocab, new_tf_sorted_indexs)
    sorted_results = sorted(retrieval_scores.items(), key=lambda x: x[1], reverse=True)

    retrievalResult[key] = sorted_results

# Calculate total execution time
total_timePSStopQuery = time.time() - start_time

# {query_id: ranked list of doc ids} - the scores are dropped for evaluation.
predResult = {}

for key, value in retrievalResult.items():
    values_first_column = [tup[0] for tup in value]
    predResult[key] = values_first_column

predResult = dict(sorted(predResult.items()))

# Interpolated MAP (with per-query 11-point data), non-interpolated MAP,
# and set-based precision / recall against the relevance judgements.
APTrecList_PorterStem_Stop, listOfAP_PorterStem_Stop_Inter, MAP_PorterStem_Stop_Inter = MAP(corpus_result_que, predResult)
listOfAP_PorterStem_Stop_NonInter, MAP_PorterStem_Stop_NonInter = MAP(corpus_result_que, predResult, False)
l_p1, l_r1, p1, r1 = count_P_R(corpus_result_que, predResult)
# Print total execution time
print("Total execution time:", total_timePSStopQuery)
print(MAP_PorterStem_Stop_NonInter)
print(MAP_PorterStem_Stop_Inter)
print(p1, r1)

Total execution time: 125.09028959274292
0.18991752235133255
0.20488963478590122
0.016181867685635024 0.9529044550417354


## Porter Stem Only

In [None]:
import time

# Index construction for the "Porter stemmer only" variant (stop words kept).
# This rebinds the notebook-level docs, terms, indexs, ... built by the
# previous variant. Everything up to total_timePSIndex is timed.
start_time = time.time()

# Preprocess the raw corpus, then build the flat posting list and term tables.
docs, terms = preprocessPorterStemmerOnly(CorpusDocs)
indexs, order_term, inverse_order_term = indexing(docs, terms)

# Number of posting entries in the index.
print(len(indexs))

## Sorting the index list in alphabetical order
sorted_indexs = sorted(indexs, key=lambda x: x['word'])
## Adding the frequencies to the terms in the index table
# adding_frequency annotates the entry dicts in place, so the entries of
# sorted_indexs carry 'frequency' afterwards as well.
new_sorted_indexs = adding_frequency(docs, terms, sorted_indexs)
# Build the vocab table using the docs, terms and the sorted indexes
vocab = build_vocab(docs, terms, sorted_indexs)

# Calculate total execution time
total_timePSIndex = time.time() - start_time

# Print total execution time
print("Total execution time:", total_timePSIndex)

528389
Total execution time: 11.539645671844482


In [None]:
# Size summary of the index that was just built: vocabulary, postings, documents.
print(
    f'number of word in terms: {len(terms)}',
    f'number of word in indexs: {len(indexs)}',
    f'number of document: {len(docs)}',
    sep='\n',
)

number of word in terms: 21001
number of word in indexs: 528389
number of document: 3612


In [None]:
# Weighting for this run: 'binary' tf, 'inverse' idf, 'cosine' normalisation.
# The three helpers annotate their inputs in place and return the object they
# were given, so new_tf_sorted_indexs and new_w_sorted_indexs both refer to
# the sorted_indexs list and new_idf_vocab refers to vocab.
new_tf_sorted_indexs = calculate_tf(docs, terms, vocab, sorted_indexs, 'binary')
new_idf_vocab = calculate_idf(docs, terms, vocab, sorted_indexs, 'inverse')
new_w_sorted_indexs = calculate_w(docs, terms, new_idf_vocab, new_tf_sorted_indexs, 'cosine')

# {query_id: [(doc_id, score), ...]} sorted by decreasing score.
retrievalResult = dict()

# Only the query loop below is timed.
start_time = time.time()

for key, value in CorpusQueries.items():
    # Query vector built with the same preprocessing as the documents.
    valueTFIDF = calculate_tf_idf_queryPorterStemmerOnly(value, new_tf_sorted_indexs, new_idf_vocab, order_term, 'cosine')

    # new_tf_sorted_indexs already carries the 'w' weights (same list object).
    retrieval_scores = information_retrieval(valueTFIDF,new_idf_vocab, new_tf_sorted_indexs)
    sorted_results = sorted(retrieval_scores.items(), key=lambda x: x[1], reverse=True)

    retrievalResult[key] = sorted_results

total_timePSQuery = time.time() - start_time

# {query_id: ranked list of doc ids} - the scores are dropped for evaluation.
predResult = {}

for key, value in retrievalResult.items():
    values_first_column = [tup[0] for tup in value]
    predResult[key] = values_first_column

predResult = dict(sorted(predResult.items()))

# Interpolated MAP (with per-query 11-point data), non-interpolated MAP,
# and set-based precision / recall against the relevance judgements.
APTrecList_PorterStem, listOfAP_PorterStem_Inter, MAP_PorterStem_Inter = MAP(corpus_result_que, predResult)
listOfAP_PorterStem_NonInter, MAP_PorterStem_NonInter = MAP(corpus_result_que, predResult, False)
l_p2, l_r2, p2, r2 = count_P_R(corpus_result_que, predResult)

# Print total execution time
print("Total execution time:", total_timePSQuery)
print(MAP_PorterStem_NonInter)
print(MAP_PorterStem_Inter)
print(p2, r2)

Total execution time: 138.7916443347931
0.15510611102460825
0.1661026659587462
0.016170790810072097 0.9534496018264813


# Lemmatizer

## Lemmatizer + Remove Stopwords

In [None]:
import time

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()

# Preprocess the corpus with the lemmatizer pipeline, then build the index
# structures returned by indexing().
docs, terms = preprocessLematizer(CorpusDocs)
indexs, order_term, inverse_order_term = indexing(docs, terms)

print(len(indexs))

## Sorting the index list in alphabetical order of each entry's 'word' field
sorted_indexs = sorted(indexs, key=lambda entry: entry['word'])
## Adding the frequencies to the terms in the index table
new_sorted_indexs = adding_frequency(docs, terms, sorted_indexs)
# Build the vocab table using the docs, terms and the sorted indexes
vocab = build_vocab(docs, terms, sorted_indexs)

# Elapsed seconds for preprocessing + indexing
total_timeLemmaStopIndex = time.perf_counter() - start_time

# Print total execution time
print("Total execution time:", total_timeLemmaStopIndex)

527828
Total execution time: 7.775843858718872


In [None]:
# Report the vocabulary size, the number of index entries and the number of
# documents produced by the indexing cell above (one line each).
print(
    f'number of word in terms: {len(terms)}',
    f'number of word in indexs: {len(indexs)}',
    f'number of document: {len(docs)}',
    sep='\n',
)

number of word in terms: 25925
number of word in indexs: 527828
number of document: 3612


In [None]:
# Build the TF, IDF and weight tables using the 'binary' / 'inverse' /
# 'cosine' scheme options of the helper functions.
new_tf_sorted_indexs = calculate_tf(docs, terms, vocab, sorted_indexs, 'binary')
new_idf_vocab = calculate_idf(docs, terms, vocab, sorted_indexs, 'inverse')
new_w_sorted_indexs = calculate_w(docs, terms, new_idf_vocab, new_tf_sorted_indexs, 'cosine')

retrievalResult = dict()

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()

for key, value in CorpusQueries.items():
    valueTFIDF = calculate_tf_idf_queryLemmatizer(value, new_tf_sorted_indexs, new_idf_vocab, order_term, 'cosine')

    retrieval_scores = information_retrieval(valueTFIDF, new_idf_vocab, new_tf_sorted_indexs)
    # Rank the scored items from highest to lowest score.
    sorted_results = sorted(retrieval_scores.items(), key=lambda x: x[1], reverse=True)

    retrievalResult[key] = sorted_results

# Calculate total execution time (only the scoring/ranking loop is timed)
total_timeLemmaStopQuery = time.perf_counter() - start_time

# For every query keep just the first element of each ranked (item, score)
# pair -- presumably the document id. The queries are ordered by key, exactly
# as the other pipeline cells do, so every experiment hands MAP/count_P_R a
# dictionary built the same way.
predResult = {
    query_key: [pair[0] for pair in retrievalResult[query_key]]
    for query_key in sorted(retrievalResult)
}

# MAP(...) is called in its default (three-value) form and again with the flag
# set to False (two-value form); count_P_R returns per-query lists plus the
# aggregate precision and recall.
APTrecList_Lemma_Stop, listOfAP_Lemma_Stop_Inter, MAP_Lemma_Stop_Inter = MAP(corpus_result_que, predResult)
listOfAP_Lemma_Stop_NonInter, MAP_Lemma_Stop_NonInter = MAP(corpus_result_que, predResult, False)
l_p3, l_r3, p3, r3 = count_P_R(corpus_result_que, predResult)

# Print total execution time and the evaluation metrics
print("Total execution time:", total_timeLemmaStopQuery)
print(MAP_Lemma_Stop_NonInter)
print(MAP_Lemma_Stop_Inter)
print(p3, r3)

Total execution time: 85.0072271823883
0.19084241355532475
0.20524251449026903
0.016337289895641895 0.9486093506624861


## Lemmatizer Only

In [None]:
import time

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()

# Preprocess the corpus with the lemmatizer-only pipeline, then build the
# index structures returned by indexing().
docs, terms = preprocessLemmatizerOnly(CorpusDocs)
indexs, order_term, inverse_order_term = indexing(docs, terms)

print(len(indexs))

## Sorting the index list in alphabetical order of each entry's 'word' field
sorted_indexs = sorted(indexs, key=lambda entry: entry['word'])

## Adding the frequencies to the terms in the index table
new_sorted_indexs = adding_frequency(docs, terms, sorted_indexs)

# Build the vocab table using the docs, terms and the sorted indexes
vocab = build_vocab(docs, terms, sorted_indexs)

# Elapsed seconds for preprocessing + indexing
total_timeLemmaIndex = time.perf_counter() - start_time

# Print total execution time
print("Total execution time:", total_timeLemmaIndex)

528389
Total execution time: 5.844847917556763


In [None]:
# Report the vocabulary size, the number of index entries and the number of
# documents produced by the indexing cell above (one line each).
print(
    f'number of word in terms: {len(terms)}',
    f'number of word in indexs: {len(indexs)}',
    f'number of document: {len(docs)}',
    sep='\n',
)

number of word in terms: 25946
number of word in indexs: 528389
number of document: 3612


In [None]:
# Build the TF, IDF and weight tables using the 'binary' / 'inverse' /
# 'cosine' scheme options of the helper functions.
new_tf_sorted_indexs = calculate_tf(docs, terms, vocab, sorted_indexs, 'binary')
new_idf_vocab = calculate_idf(docs, terms, vocab, sorted_indexs, 'inverse')
new_w_sorted_indexs = calculate_w(docs, terms, new_idf_vocab, new_tf_sorted_indexs, 'cosine')

retrievalResult = dict()

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()

for key, value in CorpusQueries.items():
    valueTFIDF = calculate_tf_idf_queryLemmatizerOnly(value, new_tf_sorted_indexs, new_idf_vocab, order_term, 'cosine')

    retrieval_scores = information_retrieval(valueTFIDF, new_idf_vocab, new_tf_sorted_indexs)
    # Rank the scored items from highest to lowest score.
    sorted_results = sorted(retrieval_scores.items(), key=lambda x: x[1], reverse=True)

    retrievalResult[key] = sorted_results

# Only the per-query scoring/ranking loop is timed, not the evaluation below.
total_timeLemmaQuery = time.perf_counter() - start_time

# For every query (in sorted key order) keep just the first element of each
# ranked (item, score) pair -- presumably the document id.
predResult = {
    query_key: [pair[0] for pair in retrievalResult[query_key]]
    for query_key in sorted(retrievalResult)
}

# MAP(...) is called in its default (three-value) form and again with the flag
# set to False (two-value form); count_P_R returns per-query lists plus the
# aggregate precision and recall.
APTrecList_Lemma, listOfAP_Lemma_Inter, MAP_Lemma_Inter = MAP(corpus_result_que, predResult)
l_p4, l_r4, p4, r4 = count_P_R(corpus_result_que, predResult)
listOfAP_Lemma_NonInter, MAP_Lemma_NonInter = MAP(corpus_result_que, predResult, False)

# Print total execution time and the evaluation metrics
print("Total execution time:", total_timeLemmaQuery)
print(MAP_Lemma_NonInter)
print(MAP_Lemma_Inter)
print(p4, r4)

Total execution time: 87.28621697425842
0.15695809779980346
0.16757651783652658
0.01632231156402499 0.9492199469340202


# No Preprocessing

In [None]:
def preprocess(docs):
    """Normalise and tokenize every document without stemming or stop-word removal.

    Each document is lower-cased, every character other than ``a-z``, ``0-9``
    and whitespace is deleted, and the remainder is tokenized with NLTK's
    ``word_tokenize``.

    Args:
        docs: dict mapping a document id to its raw text, ``{id: doc}``.

    Returns:
        A ``(modified_docs, terms)`` tuple where ``modified_docs`` maps each
        document id to its tokens re-joined into one string (every token is
        preceded by a single space, so non-empty results start with a space)
        and ``terms`` is the set of all distinct tokens seen in the corpus.
    """
    terms = set()
    modified_docs = {}

    for doc_id, text in docs.items():
        # Keep only lower-case ASCII letters, digits and whitespace.
        cleaned = re.sub(r'[^a-z0-9\s]', '', text.lower())

        # The regex above already strips punctuation characters; the membership
        # check is retained as a guard so the filtering rule stays unchanged.
        tokens = [tok for tok in word_tokenize(cleaned) if tok not in string.punctuation]

        terms.update(tokens)
        # Build the string in one pass instead of repeated `+=` in a loop; the
        # " " prefix per token reproduces the original output format exactly.
        modified_docs[doc_id] = "".join(" " + tok for tok in tokens)

    return modified_docs, terms

In [None]:
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()

# Preprocess the corpus with the baseline preprocess() (no stemming, no
# stop-word removal), then build the index structures returned by indexing().
docs, terms = preprocess(CorpusDocs)
indexs, order_term, inverse_order_term = indexing(docs, terms)

print(len(indexs))

## Sorting the index list in alphabetical order of each entry's 'word' field
sorted_indexs = sorted(indexs, key=lambda entry: entry['word'])

## Adding the frequencies to the terms in the index table
new_sorted_indexs = adding_frequency(docs, terms, sorted_indexs)

# Build the vocab table using the docs, terms and the sorted indexes
vocab = build_vocab(docs, terms, sorted_indexs)

# Elapsed seconds for preprocessing + indexing
total_timeBaseIndex = time.perf_counter() - start_time

# Print total execution time
print("Total execution time:", total_timeBaseIndex)

528389
Total execution time: 4.33671498298645


In [None]:
# Report the vocabulary size, the number of index entries and the number of
# documents produced by the indexing cell above (one line each).
print(
    f'number of word in terms: {len(terms)}',
    f'number of word in indexs: {len(indexs)}',
    f'number of document: {len(docs)}',
    sep='\n',
)

number of word in terms: 28286
number of word in indexs: 528389
number of document: 3612


In [None]:
# Build the TF, IDF and weight tables using the 'binary' / 'inverse' /
# 'cosine' scheme options of the helper functions.
new_tf_sorted_indexs = calculate_tf(docs, terms, vocab, sorted_indexs, 'binary')
new_idf_vocab = calculate_idf(docs, terms, vocab, sorted_indexs, 'inverse')
new_w_sorted_indexs = calculate_w(docs, terms, new_idf_vocab, new_tf_sorted_indexs, 'cosine')

retrievalResult = dict()

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()

for key, value in CorpusQueries.items():
    valueTFIDF = calculate_tf_idf_query(value, new_tf_sorted_indexs, new_idf_vocab, order_term, 'cosine')

    retrieval_scores = information_retrieval(valueTFIDF, new_idf_vocab, new_tf_sorted_indexs)
    # Rank the scored items from highest to lowest score.
    sorted_results = sorted(retrieval_scores.items(), key=lambda x: x[1], reverse=True)

    retrievalResult[key] = sorted_results

# Calculate total execution time (only the scoring/ranking loop is timed)
total_timeBaseQuery = time.perf_counter() - start_time

# For every query (in sorted key order) keep just the first element of each
# ranked (item, score) pair -- presumably the document id.
predResult = {
    query_key: [pair[0] for pair in retrievalResult[query_key]]
    for query_key in sorted(retrievalResult)
}

# NOTE(review): these result names say "Stop", yet this cell evaluates the
# baseline preprocess() pipeline; the summary cell reads them under these
# names, so they are kept as-is.
APTrecList_Stop, listOfAP_Stop_Inter, MAP_Stop_Inter = MAP(corpus_result_que, predResult)
l_p5, l_r5, p5, r5 = count_P_R(corpus_result_que, predResult)
listOfAP_Stop_NonInter, MAP_Stop_NonInter = MAP(corpus_result_que, predResult, False)

# Print total execution time and the evaluation metrics
print("Total execution time:", total_timeBaseQuery)
print(MAP_Stop_NonInter)
print(MAP_Stop_Inter)
print(p5, r5)

Total execution time: 72.34215545654297
0.15482782185089589
0.16287137745323652
0.016677748943465517 0.9304974169585449




---

# Thống kê (Statistics)

---



In [None]:
import pandas as pd

# Table layout: a pipeline label followed by six numeric measurements.
col = ["Type", "Inter", "NonInter", "P", "R", "Index Time", "Query Time"]
numeric_cols = ["Inter", "NonInter", "P", "R", "Index Time", "Query Time"]

# One row per preprocessing pipeline, in the order given by `col`:
# label, the two MAP values, aggregate precision and recall, then the
# indexing and querying durations recorded by the cells above.
q1 = ["PorterStem + Stop", MAP_PorterStem_Stop_Inter, MAP_PorterStem_Stop_NonInter, p1, r1, total_timePSStopIndex, total_timePSStopQuery]
q2 = ["PorterStem", MAP_PorterStem_Inter, MAP_PorterStem_NonInter, p2, r2, total_timePSIndex, total_timePSQuery]
q3 = ["Lemma + Stop", MAP_Lemma_Stop_Inter, MAP_Lemma_Stop_NonInter, p3, r3, total_timeLemmaStopIndex, total_timeLemmaStopQuery]
q4 = ["Lemma", MAP_Lemma_Inter, MAP_Lemma_NonInter, p4, r4, total_timeLemmaIndex, total_timeLemmaQuery]
q5 = ["No Preprocess", MAP_Stop_Inter, MAP_Stop_NonInter, p5, r5, total_timeBaseIndex, total_timeBaseQuery]

# Assemble the frame and round only the numeric columns to 4 decimal places;
# the "Type" label column is not listed in the mapping and is left untouched.
df = pd.DataFrame(data=[q1, q2, q3, q4, q5], columns=col)
df = df.round(dict.fromkeys(numeric_cols, 4))

# Display the rounded DataFrame as the cell output
df


Unnamed: 0,Type,Inter,NonInter,P,R,Index Time,Query Time
0,PorterStem + Stop,0.2049,0.1899,0.0162,0.9529,14.1561,125.0903
1,PorterStem,0.1661,0.1551,0.0162,0.9534,11.5396,138.7916
2,Lemma + Stop,0.2052,0.1908,0.0163,0.9486,7.7758,85.0072
3,Lemma,0.1676,0.157,0.0163,0.9492,5.8448,87.2862
4,No Preprocess,0.1629,0.1548,0.0167,0.9305,4.3367,72.3422
