In [1]:
import json,re,glob,os
from collections import defaultdict
import time,math
# Start-up timestamp used when reporting execution time.
start_time = time.time()

# Inverted Index

In [2]:
def main(list_of_files,dict,stop_words):
    """Build an inverted index mapping each term to the documents containing it.

    Args:
        list_of_files: Paths of JSON files; each must decode to an object with
            a ``'text'`` key. A document's number is its position in this list.
        dict: Mutable mapping (term -> list of document numbers) that is filled
            in place. The name shadows the builtin but is kept because it is
            part of the existing signature.
        stop_words: Words to leave out of the index. Tokens are lower-cased
            before the comparison, so entries should be lower-case.

    Returns:
        The same mapping object that was passed in as ``dict``.
    """
    excluded = set(stop_words)
    for document_no, file_name in enumerate(list_of_files):
        # Context manager closes the handle even if the JSON is malformed.
        with open(file_name, encoding="utf-8") as file_open:
            data = json.load(file_open)
        # Record each document at most once per term so that the length of a
        # posting list is the term's document frequency.
        seen = set()
        for token in re.findall(r'\b[a-z0-9A-Z]{2,30}\b', data['text']):
            word = token.lower()
            if word in excluded or word in seen:
                continue
            seen.add(word)
            # setdefault works for plain dicts as well as defaultdict(list).
            dict.setdefault(word, []).append(document_no)
    return dict

# TF.IDF Score

In [3]:
def TF_IDF_score(queryList,term_frq,N,InvertedIndex):
    """Sum the TF-IDF weights of the query terms for one document.

    Args:
        queryList: Query terms to score.
        term_frq: Mapping term -> number of occurrences in the document.
        N: Total number of documents in the collection.
        InvertedIndex: Mapping term -> list of document numbers.

    Returns:
        The accumulated score; 0 when no query term contributes.
    """
    score=0
    for term in queryList:
        frequency = term_frq.get(term, 0)
        if frequency <= 0:
            continue
        # Count distinct documents: a posting list may repeat a document
        # number, which would otherwise inflate the document frequency.
        # .get() also avoids a KeyError (or inserting an empty posting into a
        # defaultdict) for a term the index does not know.
        doc_frq = len(set(InvertedIndex.get(term, ())))
        if doc_frq == 0:
            continue
        # NOTE: tf uses log base 10 while idf uses the natural log; the mix
        # only scales every score by the same constant, so ranking is
        # unaffected and the original scale is kept.
        score += (1+math.log(frequency,10))*(math.log(N/doc_frq))
    return score

# Retrieving Top 10 documents based on TF.IDF Score

In [4]:
def top10rankedDocuments(sortedListScore,list_of_files,scoreList,top_n=10):
    """Print the text and score of the best-ranked documents.

    Args:
        sortedListScore: Document numbers ordered from best to worst.
        list_of_files: Paths of the JSON documents, indexed by document number.
        scoreList: Mapping document number -> TF-IDF score.
        top_n: Maximum number of documents to print (defaults to 10).
    """
    for rank, doc_no in enumerate(sortedListScore):
        # rank is zero-based, so this admits exactly top_n documents.
        if rank >= top_n:
            break
        with open(list_of_files[doc_no],encoding="utf-8") as file_open:
            data = json.load(file_open)
        print("----------------------Document No------------------------",doc_no)
        print(data['text'],"\n---TF-IDF:Score-->",scoreList[doc_no])
        print("--------------------------\n\n-----------------------------------")

# Considering only the terms in document which are present in input query

In [5]:
def modifiedqueryList(queryList,InvertedIndex,document_no):
    """Select the query terms whose posting list includes ``document_no``.

    Returns an empty list when no query term occurs in the document;
    otherwise a list whose first element is ``document_no`` followed by the
    matching terms in query order.
    """
    matching_terms = [
        term
        for term in queryList
        if term in InvertedIndex and document_no in InvertedIndex[term]
    ]
    if not matching_terms:
        return matching_terms
    # The document number leads the list so callers can test for a match.
    return [document_no] + matching_terms

In [6]:
def relevance(InvertedIndex,queryList,list_of_files,stop_words):
    """Score every document that contains a query term and print the best ones.

    Args:
        InvertedIndex: Mapping term -> list of document numbers.
        queryList: Query terms.
        list_of_files: Paths of the JSON documents; a document's number is its
            position in this list.
        stop_words: Words that are not counted when computing term
            frequencies (compared against lower-cased tokens).
    """
    excluded = set(stop_words)
    total_docs = len(list_of_files)
    scoreList={}
    for document_no, file_name in enumerate(list_of_files):
        tempqueryList=modifiedqueryList(queryList,InvertedIndex,document_no)
        # modifiedqueryList puts the document number first when at least one
        # query term occurs in the document; otherwise the list is empty.
        if document_no not in tempqueryList:
            continue
        with open(file_name,encoding="utf-8") as file_open:
            data = json.load(file_open)
        term_frq={}
        for token in re.findall(r'\b[a-z0-9A-Z]{2,30}\b',data['text']):
            word=token.lower()
            # Stop words are skipped outright instead of being reset to 1.
            if word in excluded:
                continue
            term_frq[word]=term_frq.get(word,0)+1
        # Drop the leading document number before scoring the terms.
        scoreList[document_no]=TF_IDF_score(tempqueryList[1:],term_frq,total_docs,InvertedIndex)
    fin=sorted(scoreList,key=scoreList.get,reverse=True)
    top10rankedDocuments(fin,list_of_files,scoreList)

# Saving the Inverted Index in result json file

In [7]:
def initialising(list_of_files,dict,stop_words,index_path='result.json'):
    """Load the inverted index from disk, building and saving it if absent.

    Args:
        list_of_files: Paths of the JSON documents to index on a cache miss.
        dict: Mutable mapping handed to ``main`` to be filled on a cache miss.
        stop_words: Stop words handed to ``main`` on a cache miss.
        index_path: Location of the cached index (defaults to 'result.json').

    Returns:
        The inverted index: the decoded JSON when the cache file exists,
        otherwise the mapping returned by ``main``.
    """
    if os.path.exists(index_path):
        # Context manager so the cache file handle is not leaked.
        with open(index_path,encoding="utf-8") as file_open:
            return json.load(file_open)
    InvertedIndex=main(list_of_files,dict,stop_words)
    print("-------------------------------Result--------------------------------")
    # Write with the same encoding that the cache is later read with.
    with open(index_path,'w',encoding="utf-8") as f:
        json.dump(InvertedIndex,f)
    return InvertedIndex

# Calculating the dot product between the input query vector and the previous query vectors

In [8]:
def similarityWords(queryList,querymatch):
    """Collect, for each earlier query, the words it shares with ``queryList``.

    Returns a dict keyed by the 1-based position of the earlier query. Each
    value is the list of shared words followed by one trailing element: the
    length of that earlier query.
    """
    overlaps = {}
    for position, past_query in enumerate(querymatch, start=1):
        shared_words = list(set(past_query).intersection(queryList))
        # The earlier query's length rides along as the last element.
        overlaps[position] = shared_words + [len(past_query)]
    return overlaps

In [9]:
def recommendatioins(sortedSimilarityScore,similarityScore,querymatch):
    """Print earlier queries that overlap with the current one.

    Args:
        sortedSimilarityScore: 1-based query numbers, ordered by score.
        similarityScore: Mapping query number -> similarity score.
        querymatch: Earlier queries, each a list of words; query number ``i``
            refers to ``querymatch[i-1]``.

    Returns:
        The number of queries printed.
    """
    print("Recommendations\n")
    suggestions=0
    for query_no in sortedSimilarityScore:
        # Strictly positive: a score of 0 means no shared words, and scores
        # are never negative, so ">= 0" would recommend every earlier query.
        if similarityScore[query_no]>0:
            suggestions+=1
            print(' '.join(querymatch[query_no-1]))
    return suggestions

# Calculating the cosine similarity score

In [10]:
def cosineSimilarityScore(similarity,queryList):
    """Compute the cosine similarity between the query and each earlier query.

    Queries are treated as binary term vectors, so the dot product is the
    number of shared words and each vector's norm is the square root of its
    word count (this assumes the words within a query are distinct).

    Args:
        similarity: Mapping query number -> list of shared words followed by
            one trailing element holding the earlier query's length.
        queryList: Words of the current query.

    Returns:
        Mapping query number -> similarity; 0.0 when either query is empty.
    """
    query_len=len(queryList)
    similarityScore={}
    for query_no, entry in similarity.items():
        # Every element except the trailing length is a shared word.
        dotproduct=len(entry)-1
        # Product of the two norms: sqrt(|A|) * sqrt(|B|) == sqrt(|A| * |B|).
        norm_product=math.sqrt(entry[-1]*query_len)
        similarityScore[query_no]=dotproduct/norm_product if norm_product else 0.0
    return similarityScore

In [11]:
if __name__=='__main__':
    # Interactive driver: repeatedly asks for a task number, then either
    # ranks documents for a query (task 1), recommends earlier queries that
    # overlap with the new one (task 2), or exits (task 3).
    stop_words = ['your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she',
    "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them',
    'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this',
    'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be',
    'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing',
    'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until',
    'while', 'of', 'at']
    list_of_files=glob.glob('sports_articles/*.json')
    # NOTE: this shadows the builtin ``dict`` at module level; the name is
    # left unchanged because it is an existing module-level name.
    dict=defaultdict(list)
    # Queries searched so far via task 1; the history task 2 compares against.
    querymatch=[]
    while True:
        print("---------------Tasks------------------\n")
        print("1.Searching and retrieving the top 10 ranked documents\n")
        print("2.Recommendations on Query Expansion based on previous searches\n")
        print("3.Exit")
        print("-----------------------------------------\n")
        try:
            task_no=int(input("Enter the task number to perform\n"))
        except ValueError:
            # Re-prompt instead of crashing on non-numeric input.
            print("Invalid input: please enter 1, 2 or 3\n")
            continue
        if task_no==3:
            break
        if task_no not in (1,2):
            continue
        query=input('Enter a phrasal query  ')
        queryList=[]
        for token in re.findall(r'\b[a-z0-9A-Z]{2,30}\b',query):
            # Lower-case first so capitalised stop words ("The") are removed too.
            word=token.lower()
            if word not in stop_words:
                queryList.append(word)
        # Per-task timer kept in its own variable so the overall figure
        # printed at exit is not reset by every task.
        task_start = time.time()
        if task_no==1:
            querymatch.append(queryList)
            InvertedIndex=initialising(list_of_files,dict,stop_words)
            relevance(InvertedIndex,queryList,list_of_files,stop_words)
            print("--- %s For Task-1 Execution Time in seconds ---" % (time.time() - task_start))
        else:
            similarity=similarityWords(queryList,querymatch)
            similarityScore=cosineSimilarityScore(similarity,queryList)
            sortedSimilarityScore=sorted(similarityScore,key=similarityScore.get,reverse=True)
            suggestions=recommendatioins(sortedSimilarityScore,similarityScore,querymatch)
            # Nothing to recommend: fall back to a plain document search.
            if suggestions==0:
                InvertedIndex=initialising(list_of_files,dict,stop_words)
                relevance(InvertedIndex,queryList,list_of_files,stop_words)
            print("\n--- %s For Task-2 Execution Time in seconds ---" % (time.time() - task_start))
    print("--- %s Overall Execution time in seconds ---" % (time.time() - start_time))


---------------Tasks------------------

1.Searching and retrieving the top 10 ranked documents

2.Recommendations on Query Expansion based on previous searches

3.Exit
-----------------------------------------

Enter the task number to perform
1
Enter a phrasal query  football
----------------------Document No------------------------ 40327
One football team fell out, another made a big move in, but the top four remain rock solid among the state’s best at any division.
A look at azcentral sports’ Super 10 after Week 6:
FIND A GAME: Arizona high school football schedule
Super 10
No. 1 Chandler (1) 6-1
There is so much speed and so many athletes on this team that it can cover up any mistakes they make during a game.
Updated Arizona high school football rankings:
azcentral sports' Richard Obert ranks the Top 10 Arizona high school football teams in each division after Week 6. The rankings work their way up from Division VI to Division I. Cheryl Evans/azcentral sports Fullscreen Division VI

Enter the task number to perform
3
--- 395.5531916618347 Overall Execution time in seconds ---
