In [3]:
import os
import nltk
import math

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from collections import Counter, defaultdict

In [4]:
tf_dict = {}  #Term Frequency
df_dict = defaultdict(int)
normalized_tf_idf = {}
N=0 # Total number of documents

In [6]:
corpusroot = './US_Inaugural_Addresses'
for filename in os.listdir(corpusroot):
    if filename.endswith('.txt'):
        file = open(os.path.join(corpusroot, filename), "r", encoding='windows-1252')
        doc = file.read()
        file.close() 
        doc = doc.lower()
        
        # Tokenize the document
        tokenize = RegexpTokenizer(r'[a-zA-Z]+')
        tokens = tokenize.tokenize(doc)
        
        # print(f"before: {tokens}")
        
        #remvoving the stop words
        stop_words_list = stopwords.words('english')
        tokens_without_stopwords = [t for t in tokens if t not in stop_words_list]
        
        # print(f"after: {tokens_without_stopwords}")
        
        # Stemming on obtained tokens
        stemmer = PorterStemmer()
        final_tokens = [stemmer.stem(token) for token in tokens_without_stopwords]
        
        # Term frequency for document
        tf_dict[filename] = Counter(final_tokens)
        
        # Document Frequency for unique tokens
        unique_tokens = set(final_tokens)  
        for token in unique_tokens:
            df_dict[token] += 1
        
        N+=1

In [7]:
def calculate_idf(token):
    return math.log10(N / df_dict[token]) if df_dict[token] > 0 else -1

In [8]:
def getidf(token):
    stemmed_token = stemmer.stem(token)
    return calculate_idf(stemmed_token)


In [9]:
# Get tf-idf value 
def get_tf_idf_weight(filename,token):
    if(tf_dict[filename][token])!=0:
        return (1+math.log10(tf_dict[filename][token]))*calculate_idf(token)
    else:
        return 0

In [10]:
# Normalize document vector
def normalize(weights):
    length = math.sqrt(sum(weight ** 2 for weight in weights.values()))
    return {token: weight / length for token, weight in weights.items()}


In [11]:
# Build normalized TF-IDF vectors
for filename in tf_dict:
    tf_idf_weight = {token: get_tf_idf_weight(filename, token) for token in tf_dict[filename]}
    normalized_tf_idf[filename] = normalize(tf_idf_weight)
print(normalized_tf_idf)        

{'13_van_buren_1837.txt': {'martin': 0.06328754875050986, 'van': 0.06328754875050986, 'buren': 0.06328754875050986, 'fellow': 0.005868594857583407, 'citizen': 0.0016236927522765446, 'practic': 0.007961295107691953, 'predecessor': 0.026873720623674435, 'impos': 0.023220590121227132, 'oblig': 0.007961295107691953, 'cheer': 0.0330917140825348, 'fulfil': 0.020655727164814078, 'accompani': 0.039503856967335056, 'first': 0.009960463989428372, 'solemn': 0.015150352907592882, 'act': 0.008063526599014565, 'public': 0.004466903179720059, 'trust': 0.010880907356168153, 'avow': 0.05781701704121182, 'principl': 0.010499704120501262, 'guid': 0.01801107076282406, 'perform': 0.020461268942217628, 'express': 0.007961295107691953, 'feel': 0.013141682869170404, 'assum': 0.013699443253880499, 'charg': 0.015720165184160244, 'respons': 0.008063526599014565, 'vast': 0.028482575662307313, 'imit': 0.04443941894798888, 'exampl': 0.014382609289981313, 'tread': 0.04443941894798888, 'footstep': 0.05139570285892246

In [12]:
# Retrieve normalized TF-IDF weight for a term in a document
def return_weight(filename, token):
    return normalized_tf_idf[filename].get(token, 0)   

In [None]:
# Construction of postings list
list_sorted_tf_idf = {}

# list for each token (sorted by TF-IDF weights)
for filename, tfidf_vector in normalized_tf_idf.items():
    for token, weight in tfidf_vector.items():
        if token not in list_sorted_tf_idf:
            list_sorted_tf_idf[token] = []
        list_sorted_tf_idf[token].append((filename, weight))
# print(list_sorted_tf_idf)       

In [None]:
for token in list_sorted_tf_idf:
    list_sorted_tf_idf[token].sort(key=lambda x: x[1], reverse=True)
# print(list_sorted_tf_idf)   

In [53]:
def query(qstring):
    query_tokens = qstring.lower().split()
    stemmed_query_tokens = [stemmer.stem(token) for token in query_tokens]
    
    query_tf = {}
    length = 0
    
    # Calculate query term frequency and magnitute
    for token in stemmed_query_tokens:
        if token not in query_tf:
            query_tf[token] = 1 + math.log10(stemmed_query_tokens.count(token))
        length += query_tf[token] ** 2
    
    query_magnitude = math.sqrt(length)
    

    cosine_scores = {}  # cosine
    upper_bound_weight = {}
    top_10_postings = {}
    
    # Cosine similarity
    for token in stemmed_query_tokens:
        if token in list_sorted_tf_idf:
            top_10_postings[token] = {doc: weight for doc, weight in list_sorted_tf_idf[token][:10]}  # top-10 elements
            # print(f"Token={token} Posting= {top_10_postings}")
            
            
            if not top_10_postings[token]:  # Check if the posting list is empty
                return ("None", 0)

            for doc, weight in top_10_postings[token].items():
                if doc not in cosine_scores:
                    cosine_scores[doc] = 0
                cosine_scores[doc] += query_tf[token] * weight / query_magnitude
            
        else:
            return ("None", 0)
        
    # print("top 10", top_10_postings)
    for token in stemmed_query_tokens:
        if token in list_sorted_tf_idf:
            
            upper_bound_weight[token] = min(weight for dx, weight in top_10_postings[token].items()) if top_10_postings[token] else 0

            for doc in cosine_scores:
                if not any(d == doc for d in top_10_postings[token]):
                    cosine_scores[doc] += query_tf[token] * upper_bound_weight[token] / query_magnitude


    max_value = -1
    best_doc = None
    
    
    for doc, cosweight in cosine_scores.items():
        in_all_token = True
        for token in stemmed_query_tokens:
            if doc not in top_10_postings[token]:
                in_all_token = False
                break
            
        if in_all_token and cosweight > max_value:
            best_doc = doc
            max_value = cosweight
            
    if best_doc == None:
        return ("fetch more", 0)
    else:
        return(best_doc, max_value)
     



In [54]:
def getweight(filename, token):
    stemmed_token = stemmer.stem(token)
    return return_weight(filename, stemmed_token)

In [55]:
print("%.12f" % getidf('democracy'))
print("%.12f" % getidf('foreign'))
print("%.12f" % getidf('states'))
print("%.12f" % getidf('honor'))
print("%.12f" % getidf('great'))
print("--------------")
print("%.12f" % getweight('19_lincoln_1861.txt','constitution'))
print("%.12f" % getweight('23_hayes_1877.txt','public'))
print("%.12f" % getweight('25_cleveland_1885.txt','citizen'))
print("%.12f" % getweight('09_monroe_1821.txt','revenue'))
print("%.12f" % getweight('37_roosevelt_franklin_1933.txt','leadership'))
print("--------------")
print("(%s, %.12f)" % query("states laws"))
print("(%s, %.12f)" % query("war offenses"))
print("(%s, %.12f)" % query("british war"))
print("(%s, %.12f)" % query("texas government"))
print("(%s, %.12f)" % query("world civilization"))

0.698970004336
0.187086643357
0.057991946978
0.139661993429
0.033858267261
--------------
0.006537500538
0.008278952636
0.002917313392
0.028125012259
0.077484582245
--------------
(21_grant_1869.txt, 0.016899649589)
(20_lincoln_1865.txt, 0.126707853305)
(07_madison_1813.txt, 0.087561769740)
(fetch more, 0.000000000000)
(22_grant_1873.txt, 0.010700041427)


0.698970004336
