#Creating a search engine

In [1]:
import nltk
from nltk import sent_tokenize
from nltk import word_tokenize

In [2]:
nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords = stopwords.words('english')
stopwords[0:10]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [3]:
from nltk.stem import PorterStemmer
porter = PorterStemmer()

In [4]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [5]:
def preprocess(text):
  # removing punctuations
    import string
    punc = string.punctuation
    for c in text:
      if c in punc:
        text = text.replace(c,"")
    # print(text)

    # tokenization
    words = word_tokenize(text)
    # print("After tokenization: {}".format(words))

    # stop words removal
    clean_words=[]
    for word in words:
      if word not in stopwords:
        clean_words.append(word)

    words = clean_words
    # print("After stop words removal: {}".format(words))

    # stemming
    stemmed_words = []
    for word in words:
      stemmed_words.append(porter.stem(word))

    words = stemmed_words
    # print("After stemming: {}".format(words))

    # converting to lowercase
    lowercase_words = []
    for word in words:
      lowercase_words.append(word.lower())

    words = lowercase_words
    # print("After converting to lowecase: {}".format(words))

    #adding to vocabulary
    vocabulary.extend(words)

    #adding to processed docs
    processed_text = ' '.join(map(str, words))

    return processed_text

In [7]:
docs = []
for i in range(15):
  docs.append('/content/p3_d{}.txt'.format(i+1))

processed_docs = []
vocabulary = []

for doc in docs:
  #scanning docs
  # print("\t\t\t\t\t\t\t\t\t" +doc)
  with open(doc,'r',encoding='utf-8') as file:
    text = file.read()
    text = str(text)
    # print(text)

  preprocessed_text = preprocess(text);
  # print(preprocessed_text)
  processed_docs.append(preprocessed_text)
    # print("\n\n")
    
processed_docs

['the desol neighborhood west side chicago bull play home game quiet day their gleam new arena unit center set moonscap all twelv prechristma home game cancel labor disput owner player initi owner lockout describ struggl short millionair tall millionair billionair millionair the nation basketbal associ would enter fiftysecond season fall seem fallen victim dizzi success one seen player payrol increas estim two thousand five hundr per cent last twenti year the incid probabl trigger lockout occur year ago minnesota timberwolv extend contract gift young player name kevin garnett pay hundr twentysix million dollar seven year the timberwolv ’ gener manag former boston celtic kevin mchale complet deal unhappi direct leagu part later note “ we hand neck golden goos ’ squeez hard ” in chicago much last decad best basketbal team countri play silenc particularli pain the last time game play bull led michael jordan contest sixth nba championship play favor team utah jazz it indel seri memori serv

In [8]:
vocabulary = list(set(vocabulary))
vocabulary[0:10]

['disadvantag',
 'slackjaw',
 'musk',
 'morn',
 'forward',
 'righthand',
 'horrif',
 'get',
 'yield',
 'complain']

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(vocabulary=vocabulary)

tfidf.fit(processed_docs)
X = tfidf.transform(processed_docs)

In [10]:
import pandas as pd
df = pd.DataFrame(X.toarray(),columns=tfidf.get_feature_names())
df

Unnamed: 0,disadvantag,slackjaw,musk,morn,forward,righthand,horrif,get,yield,complain,know,nba,watersh,these,pulsat,memphi,fulllength,synch,quickli,jaw,sick—appar,life,sent,superior,wrought,unguard,phrase,luck,dog,connect,topic,took,though,gone,energi,base,manag,bird,in,valu,...,mind,reserv,studio,confer,francisco,board,briarcrest,nfl,maximum,near,nick,fall,next,musician,basket,yet,they,shoot,wednesday,marv,entir,bottl,freak,statist,men,thigh,eve,middl,cleveland,sure,1995,champagn,locker,five,for,massiv,stole,50000,linda,wrong
0,0.0,0.0,0.0,0.010754,0.0,0.0,0.0,0.0478,0.0,0.012385,0.0087,0.037156,0.0,0.0,0.0,0.0,0.0,0.012385,0.012385,0.0,0.0,0.0,0.0,0.0,0.012385,0.012385,0.0,0.0,0.0,0.0,0.0,0.0,0.021509,0.021509,0.037156,0.0,0.019195,0.012385,0.035473,0.0,...,0.0,0.0,0.0,0.037156,0.0,0.0,0.0,0.0,0.012385,0.012385,0.012385,0.010754,0.0,0.0,0.02477,0.010754,0.028792,0.0,0.0,0.012385,0.019195,0.012385,0.0,0.0,0.037156,0.0,0.0,0.0,0.0,0.037156,0.010754,0.012385,0.010754,0.0261,0.009597,0.0,0.02477,0.0,0.0,0.0
1,0.020973,0.020973,0.0,0.0,0.0,0.0,0.0,0.013491,0.0,0.0,0.014732,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.020973,0.0,0.0,0.0,0.020973,0.0,0.020973,0.0,0.016252,0.0,0.0,0.0,0.072845,0.130016,0.0,0.010012,0.041945,...,0.0,0.0,0.0,0.0,0.020973,0.091056,0.0,0.0,0.0,0.0,0.0,0.0,0.016252,0.0,0.0,0.018211,0.016252,0.0,0.0,0.0,0.016252,0.0,0.0,0.104863,0.0,0.0,0.020973,0.0,0.125836,0.0,0.0,0.0,0.018211,0.014732,0.0,0.0,0.0,0.0,0.0,0.041945
2,0.0,0.0,0.0,0.037357,0.0,0.037357,0.0,0.027674,0.0,0.0,0.0,0.0,0.0,0.0,0.043022,0.0,0.0,0.0,0.0,0.043022,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033338,0.0,0.0,0.0,...,0.0,0.043022,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.027511,0.0,0.020379,0.027511,0.0,0.022255,0.0,0.0,0.027511,0.0,0.126728,0.0,0.0,0.0,0.0,0.0,0.0,0.027511,0.0,0.0,0.0,0.031682,0.0,0.0,0.0,0.0,0.0,0.0,0.027511,0.0,0.0,0.0,0.0,0.015124,0.0,...,0.031682,0.0,0.0,0.0,0.0,0.0,0.063364,0.095046,0.0,0.0,0.0,0.0,0.024551,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.031682,0.0,0.0,0.031682,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.066765,0.0,0.027511,0.0,0.031682,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033509,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.121908,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.189224,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.073316,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.073316,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.094612,0.0,0.0,0.073316,0.0,0.0,0.0,0.073316,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.177323,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.088661,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.068705,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.076988,0.0,0.0,0.0,0.0,...,0.0,0.0,0.088661,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.179285,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.042792,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.069465,0.0,0.0,0.0,0.0,0.0,0.089642,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04226,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.065699,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.065699,0.0,0.0,0.0,0.057048,0.0,0.0,0.0,0.0,0.0,0.062724,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.114096,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.078639,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.090563,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.070179,0.0,0.0,0.0,0.0,0.0


In [11]:
len(vocabulary)

1976

In [112]:
from sklearn.metrics.pairwise import cosine_similarity

def search(k, query):
  processed_query = preprocess(query)
  query_vec = tfidf.transform([processed_query])

  cosine_values = cosine_similarity(X,query_vec)
  cosine_values = cosine_values.reshape((-1,))

  results = []
  cosine_sorted_index = cosine_values.argsort()[-k:][::-1]
  print(cosine_sorted_index)
  for i in cosine_sorted_index:
    results.append(('/content/p3_d{}.txt'.format(i+1),cosine_values[i]))

  return results

In [116]:
query = "the simpsons american sitcom"
results = search(5,query)
results

[ 8 12 13  0 14]


[('/content/p3_d9.txt', 0.5397861890380344),
 ('/content/p3_d13.txt', 0.03436666319545569),
 ('/content/p3_d14.txt', 0.015905595269496132),
 ('/content/p3_d1.txt', 0.0038702100328060584),
 ('/content/p3_d15.txt', 0.0)]

In [120]:
for result in results:
  if (result[1] != 0):
    with open(result[0],'r',encoding='utf-8') as file:
        text = file.read()
        text = str(text)

        print("\t\t\t------------------------ From {} ------------------------".format(result[0]))
        print(text)
        print("\n")

			------------------------ From /content/p3_d9.txt ------------------------
The Simpsons is an American sitcom that shows middle class lifestyle in cartoon form. The half-hour episodes take place in and around the fictional town of Springfield and make fun out of American culture and society.

Since the show started in 1989 the Simpsons have been broadcast over 500 times. This makes it the longest running sitcom in American television history. In 2007 a full-length movie, The Simpsons Movie, made over half a billion dollars.

The Simpsons has won many prizes, concluding the Emmy Awards. In the year 2000 Time magazine named it the best television series of the century and the cartoon characters of The Simpsons received their own star on Hollywood’s Walk of Fame.

The Simpson family consists of 5 main characters. Homer is the rather clumsy, beer-drinking father. He works at a nuclear power plant in Springfield and is married to Marge Simpson, a typical American middle class housewife. T