In [1]:
#Import future to add compatiability between python2 and python3
from __future__ import absolute_import, division, print_function

In [2]:
#import os to deal with os tasks like opening files etc
import os
#import logging to enable logging
import logging
#import math libraries
import math
#import json library
import json
#import operator library for sorting dicts
import operator

In [3]:
import spacy
nlp = spacy.load('en')

In [4]:
#import porter stemmer from nltk
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()

In [5]:
#import required functions from textblob library
from textblob import TextBlob as tb

In [6]:
#def fuctions that will be useful for creating the tf-idf calculations

#function to compute term frequency
def tf(word, blob):
    return blob.words.count(word) / len(blob.words)

#function to compute number of documents containg the word
def n_containing(word, bloblist):
    return sum(1 for blob in bloblist if word in blob.words)

#function to compute inverse document frequency
def idf(word, bloblist):
    return math.log(len(bloblist) / (1 + n_containing(word, bloblist)))

#function to compute tf-idf by multiplying term frequency with inverse document frequency
def tfidf(word, blob, bloblist):
    return tf(word, blob) * idf(word, bloblist)

In [13]:
blobs    = {}
bloblist = []

with open('data.json','r') as f:
    data = json.load(f)

for doc in data:
    data_val = tb((doc + " " + "".join(data[doc])).lower())
    
    blobs[doc] = data_val
    bloblist.append(data_val)

In [14]:
results = {}


for blob in blobs:
    scores = {word: tfidf(word, blobs[blob], bloblist) for word in blobs[blob].words}
    sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        
    #store the result
    results[blob] = sorted_words

print(results)

{'Anti-lock Braking System (ABS)': [('braking', 0.1627222757294336), ('driver', 0.11828784805675244), ('emergency', 0.11286014093856547), ('work', 0.09367080623963028), ('systems', 0.07915463491900315), ('brakes', 0.07915463491900315), ('pumping', 0.056430070469282737), ('prevention', 0.056430070469282737), ('lockup', 0.056430070469282737), ('retain', 0.056430070469282737), ('capabilities', 0.056430070469282737), ('fool', 0.056430070469282737), ('purchasing', 0.056430070469282737), ('advised', 0.056430070469282737), ('insist', 0.056430070469282737), ('demonstrating', 0.056430070469282737), ('applies', 0.056430070469282737), ('pumps', 0.056430070469282737), ('he', 0.056430070469282737), ('previously', 0.056430070469282737), ('sense', 0.05218436253097735), ('allow', 0.05218436253097735), ('heavy', 0.05218436253097735), ('constant', 0.05218436253097735), ('wheel', 0.051754994716905166), ('dealership', 0.04917198480896917), ('proper', 0.04917198480896917), ('properly', 0.04917198480896917)

In [15]:
#Store the results in a JSON file
with open("tfidf_new.json","w") as outFile:
    json.dump(results,outFile)

In [16]:
#build inverted index
inv_index = {}

for key in results:
    for value in results[key]:
        word, score = value
        
        if word in inv_index:
            inv_index[word].append((key,score))
        else:
            inv_index[word] = [(key,score)]
print(inv_index)

{'braking': [('Anti-lock Braking System (ABS)', 0.1627222757294336), ('Brake Pad', 0.045705815682826195), ('Drum Brakes', 0.18951191868488912), ('Hydraulic', 0.17659065150182848), ('Master Cylinder', 0.11260853139247035), ('Road test', 0.1363155906329904), ('Traction Control', 0.08354826522667155), ('Zero-offset steering', 0.13169472315390598)], 'driver': [('Anti-lock Braking System (ABS)', 0.11828784805675244), ('Brake Booster', 0.13135452894674252), ('Bucket Seats', 0.11527030091244751), ('Chassis', 0.07844784367652678), ('Coupe de Ville', 0.18827482482366428), ('Cruise Control', 0.17115893165787663), ('Dashboard', 0.04092930974427485), ('Feed-through', 0.21724018248884341), ('Four-Wheel Steering', 0.06206862356824098), ('Fuel Cell', 0.12551654988244285), ('Lap-and-Shoulder Belt', 0.05483732761854299), ('Master Cylinder', 0.0818586194885497), ('Oversteer', 0.08557946582893831), ('Rally', 0.07431900979881484), ('Rev counter', 0.11074989695509664), ('Rollcage', 0.25673839748681493), ('

In [18]:
#Store the inverted index in a JSON file
with open("invindex_new.json","w") as outFile:
    json.dump(inv_index,outFile)

In [17]:
#Calculate score for a query

query = "how to use brakes"

#remove ? symbol
query = query.replace("?","")

#turn to lowercase
query = query.lower()

#result set as a dictionary of key-value
res_set = {}

#get list of words to query over
words = query.split(" ")
words = list(set(words))

#get top 10 documents for each word
for word in words:
    if not word in inv_index:
        continue
    else:
        responses = inv_index[word][:10]

        for response in responses:
            file,score = response

            if file in res_set:
                res_set[file] += score
            else:
                res_set[file] = score

            
sorted_results = sorted(res_set.items(), key=operator.itemgetter(1), reverse=True)
print(sorted_results[:5])

score1 = sorted_results[0][1]
score2 = sorted_results[1][1]
score3 = sorted_results[2][1]
score4 = sorted_results[3][1]
score5 = sorted_results[4][1]

print(score2-score1, score3-score2, score4-score3, score5-score4)

[('Air Brakes', 0.2527538881165171), ('Camber', 0.2236390498971173), ('Automatic Transmission', 0.22020157322534567), ('Rev counter', 0.17540317638989592), ('Carbon Brakes', 0.17337769804506425)]
-0.029114838219399808 -0.003437476671771633 -0.044798396835449755 -0.0020254783448316604
