# Pure Python implementation of a Search Engine

In [13]:
import sys
import os
import glob
from time import time
import numpy as np

### Input files

- input_PR
  - output from Spark PageRank
- input_TFIDF
  - output from Spark TF_IDF
- input_nowiki
  - output from Python preprocessor of nowiki dataset

In [21]:
input_PR = 'nowiki_PR/part*'
input_TFIDF = 'nowiki_TFIDF/part*'
input_nowiki = 'clean-data-no/nowiki.csv'

## Helper functions to format input files

In [5]:
def getTFIDF(line):
    tf_idf = line.split(',')
    if len(tf_idf) == 3:
        return tf_idf[0][2:-1], (tf_idf[1].strip()[2:-1], float(tf_idf[2][2:-1].strip()[:-2]))
    else:
        return ','.join(tf_idf[:-2])[2:-1], (tf_idf[-2].strip()[2:-1], float(tf_idf[-1][2:-1].strip()[:-2]))

In [6]:
def getPageRank(line):
    pair = line.split(',')
    if len(pair) > 2:
        return ','.join(pair[:-1])[2:], float(pair[-1].strip()[:-1])
    else:
        return pair[0][2:-1], float(pair[1].strip()[:-1])

## Read PageRanks into a dictionary

In [None]:
PR = dict()
for file in glob.glob(input_PR):
    with open(file, 'r', encoding='utf8') as f:
        for line in f:
            pair = getPageRank(line)
            PR[pair[0]] = pair[1]

## Read TF-IDF into a dictionary

Note: TF-IDF input is a large file and requires 10 GB of RAM

In [None]:
TFIDF = dict()
for file in glob.glob(input_TFIDF):
    with open(file, 'r', encoding='utf8') as f:
        for line in f:
            t = getTFIDF(line)
            if t[0] not in TFIDF:
                TFIDF[t[0]] = list()
            TFIDF[t[0]].append(t[1])

## Create a dictionary of pages

- Key: ID
- Value: Title

In [None]:
pages = dict()
with open(input_nowiki, 'r', encoding='utf8') as file:
    for line in file:
        entry = line.strip().split('\t')
        pages[entry[0]] = entry[1]

## Search function

In [49]:
def search(query):
    start = time()
    search_res = []
    # Title search
    if query in PR:
        search_res.append((query, np.inf))
    # Search through TF-IDF word by word
    # And get the PageRank per result from TF-IDF search
    for word in query.split():
        docs = dict()
        if word in TFIDF:
            for doc, score in TFIDF[word]:
                if doc not in docs:
                    docs[doc] = dict()
                    docs[doc]['TF-IDF'] = score
                    page = pages[doc]
                    if page in PR:
                        docs[doc]['title'] = page
                        docs[doc]['rank'] = PR[page]
                    else:
                        docs[doc]['title'] = None
                        docs[doc]['rank'] = 0
                else:
                    docs[doc]['TF-IDF'] += score
        for _, doc in docs.items():
            search_res.append((doc['title'], doc['TF-IDF']*doc['rank']))
    search_res.sort(key=lambda tup: tup[1], reverse=True)
    end = time()
    print('Found', len(docs), 'matches in', end-start, 'seconds')
    return search_res[:10]

In [50]:
search('hund')

Found 1294 matches in 0.003947019577026367 seconds


[('Chihuahua', 5.881837827315857e-07),
 ('Store hund', 2.719250828363443e-07),
 ('Den lille hund', 2.6500670427313347e-07),
 ('Kategori:Store hund', 2.3006890281200476e-07),
 ('Kategori:Den lille hund', 1.995185378535444e-07),
 ('Hofteleddsdysplasi', 1.7453830317081887e-07),
 ('Vesle hund', 1.5356277661287365e-07),
 ('Akita', 1.4500650767209032e-07),
 ('Den lille hunden', 1.382064989515863e-07),
 ('Kina', 1.3799534316859692e-07)]

In [51]:
search('Akita')

Found 53 matches in 0.0 seconds


[('Akita', inf),
 ('Japan', 9.911614979482184e-07),
 ('Akita (Akita)', 3.6962904468494174e-07),
 ('Akita', 3.5409990935597255e-07),
 ('Akita (prefektur)', 3.4223551796330046e-07),
 ('Akita (hund)', 2.3098011011964316e-07),
 ('Kategori:Personer fra prefekturet Akita', 2.217113344190102e-07),
 ('Omono', 2.0072452026452653e-07),
 ('Blaublitz Akita', 1.7180549303047218e-07),
 ('Kazuno', 8.330484878096286e-08)]

In [52]:
search('Blaublitz Akita')

Found 53 matches in 0.0 seconds


[('Blaublitz Akita', inf),
 ('Japan', 9.911614979482184e-07),
 ('Akita (Akita)', 3.6962904468494174e-07),
 ('Akita', 3.5409990935597255e-07),
 ('Akita (prefektur)', 3.4223551796330046e-07),
 ('Akita (hund)', 2.3098011011964316e-07),
 ('Kategori:Personer fra prefekturet Akita', 2.217113344190102e-07),
 ('Omono', 2.0072452026452653e-07),
 ('Blaublitz Akita', 1.7180549303047218e-07),
 ('Blaublitz Akita', 1.1566327382812662e-07)]

In [53]:
search('Vesle hund')

Found 1294 matches in 0.0029883384704589844 seconds


[('Vesle hund', inf),
 ('Chihuahua', 5.881837827315857e-07),
 ('Middelhavet', 3.675909049178139e-07),
 ('Store hund', 2.719250828363443e-07),
 ('Den lille hund', 2.6500670427313347e-07),
 ('Kategori:Store hund', 2.3006890281200476e-07),
 ('Kategori:Den lille hund', 1.995185378535444e-07),
 ('Blefjell', 1.9786135287993915e-07),
 ('Hofteleddsdysplasi', 1.7453830317081887e-07),
 ('Vesle hund', 1.5356277661287365e-07)]

In [54]:
search('Den lille hund')

Found 1294 matches in 0.2873058319091797 seconds


[('Den lille hund', inf),
 ('Kategori:Øverste forvaltningsenhet etter land', 1.606122590106071e-05),
 ('Kategori:Norske kirker etter prosti', 5.536431270393689e-06),
 ('Storbritannia', 3.6963229769205944e-06),
 ('Sverige', 3.468454309176283e-06),
 ('Frankrike', 3.066117966556517e-06),
 ('Kategori:Norske kirker etter bispedømme', 2.8593106648980036e-06),
 ('USA', 2.845493581383952e-06),
 ('Tyskland', 2.4990089828028465e-06),
 ('Norge', 2.212330303289179e-06)]

In [55]:
search('Kina')

Found 19081 matches in 0.03494119644165039 seconds


[('Kina', inf),
 ('Kategori:Provinser i Kina', 5.1378359401160886e-05),
 ('Kategori:Kina', 3.70476639501291e-05),
 ('Kategori:Personer etter føde- eller oppvekststed i Kina',
  1.8730082824141414e-05),
 ('Kategori:Distrikter i Kina', 1.6913449412822155e-05),
 ('Kina', 1.1020158144097284e-05),
 ('Kategori:Sport i Kina', 1.0826531720260328e-05),
 ('Kategori:Kinesere', 9.623589063621626e-06),
 ('Kategori:Fylker i Kina', 9.516384378200693e-06),
 ('Kategori:Byer i Kina', 9.007174677390198e-06)]

In [57]:
search('La oss prøve med en lang setning som inneholder ord som, Kina hund Norge Sverige USA')

Found 60170 matches in 2.5192596912384033 seconds


[('Kategori:Norges fylker', 0.0003432768079233506),
 ('Kategori:Personer etter nasjonalitet og beskjeftigelse',
  0.00010843667956040066),
 ('Kategori:Byggverk i Norge', 7.30186237745788e-05),
 ('Kategori:Veier i Norge', 6.797478879730423e-05),
 ('Kategori:USAs delstater', 6.744832943078649e-05),
 ('Kategori:Dyr', 6.137819908233194e-05),
 ('Kategori:Provinser i Kina', 5.1378359401160886e-05),
 ('Kategori:Undernasjonale områder i Norge', 4.986158354424146e-05),
 ('Kategori:Kommuner i Norge', 4.8202622007215405e-05),
 ('Kategori:Landformer i Norge', 4.5900360057882826e-05)]

In [59]:
from pprint import pprint

In [63]:
queries = [
    'hund',
    'katt',
    'Norge',
    'USA',
    'skole',
    'Microsoft',
    'Apple',
    'potet'
]
for query in queries:
    print("By searching for '{}' we found:".format(query))
    pprint(search(query))
    print()

By searching for 'hund' we found:
Found 1294 matches in 0.002991199493408203 seconds
[('Chihuahua', 5.881837827315857e-07),
 ('Store hund', 2.719250828363443e-07),
 ('Den lille hund', 2.6500670427313347e-07),
 ('Kategori:Store hund', 2.3006890281200476e-07),
 ('Kategori:Den lille hund', 1.995185378535444e-07),
 ('Hofteleddsdysplasi', 1.7453830317081887e-07),
 ('Vesle hund', 1.5356277661287365e-07),
 ('Akita', 1.4500650767209032e-07),
 ('Den lille hunden', 1.382064989515863e-07),
 ('Kina', 1.3799534316859692e-07)]

By searching for 'katt' we found:
Found 473 matches in 0.000997781753540039 seconds
[('2000', 1.2810789555860478e-06),
 ('NRK1', 2.712285768867986e-07),
 ('NRK', 1.7805317342117787e-07),
 ('Schrödingers katt (andre betydninger)', 1.6594210010248255e-07),
 ('Tyholt', 1.1411938082817706e-07),
 ('Burmeser', 1.0192750489192413e-07),
 ('Prolog (programmeringsspråk)', 9.47751837348958e-08),
 ('Lars Saabye Christensen', 9.309212682308292e-08),
 ('Figaro (Disney)', 8.375456179045604e