# Spark implementation of search engine

In [1]:
from __future__ import print_function

import sys
from operator import add
import pyspark
import os
import glob
from time import time
from pyspark.sql import SQLContext, Row

import numpy as np
import pandas as pd

## Helper functions

In [2]:
def getPageRank(line):
    pair = line.split(',')
    if len(pair) > 2:
        return ','.join(pair[:-1])[2:], float(pair[-1].strip()[:-1])
    else:
        return pair[0][2:-1], float(pair[1].strip()[:-1])

In [3]:
def getTFIDF(line):
    tf_idf = line.split(',')
    if len(tf_idf) == 3:
        return tf_idf[0][2:-1], (tf_idf[1].strip()[2:-1], float(tf_idf[2][2:-1].strip()[:-2]))
    else:
        return ','.join(tf_idf[:-2])[2:-1], (tf_idf[-2].strip()[2:-1], float(tf_idf[-1][2:-1].strip()[:-2]))

In [4]:
def procRDD(rdd, cache=True, part=True, hashp=True, npart=16):
    """
    Helper to handle caching/partioning
    
    Function taken from:
    https://stackoverflow.com/questions/31659404/spark-iteration-time-increasing-exponentially-when-using-join
    
    :param rdd: pyspark RDD
    :param cache: boolean (default=True)
    :param part: boolean (default=True)
    :param hashp: boolean (default=True)
    :param npart: number of partitions (default=12 suggested to be 2*(number of cores))
    :return: rdd or rdd.cache()
    """
    rdd = rdd if not part else rdd.repartition(npart)
    rdd = rdd if not hashp else rdd.partitionBy(npart)
    return rdd if not cache else rdd.cache()

## Initialize Spark Context

In [5]:
pyspark.SparkContext.setSystemProperty('spark.executor.memory', '7g')
pyspark.SparkContext.setSystemProperty('spark.driver.cores', '2')
pyspark.SparkContext.setSystemProperty('spark.driver.memory', '7g')
# pyspark.SparkContext.setSystemProperty('spark.cleaner.ttl', '600')
sc = pyspark.SparkContext(appName='search engine')

In [6]:
sqlContext = SQLContext(sc)

## Get necessary data

In [7]:
pages_ranks = sc.textFile('file:///mnt/wiktorskit-danielb-ns0000k/home/notebook/group04/PR/page_ranks_nowiki_final/part*')
tf_idf = sc.textFile('file:///mnt/wiktorskit-danielb-ns0000k/home/notebook/group04/TF-IDF/tf_idf_nowiki_final/part*')
nowiki = sc.textFile('file:///mnt/wiktorskit-danielb-ns0000k/home/notebook/group04/clean_data/nowiki*.csv')

In [8]:
pages = (nowiki
         .filter(lambda line: len(line.strip()) > 0 and
                 not (line.strip().split('\t')[1].find('File:') == 0) and
                 not (line.strip().split('\t')[1].find('Wikipedia:') == 0))
         .map(lambda line: (line.strip().split('\t')[1],
                            line.strip().split('\t')[0])))

In [9]:
pages.take(5)

[('Akershus', '2'),
 ('Brus med ananassmak', '4'),
 ('Adjø solidaritet', '5'),
 ('Atonal musikk', '6'),
 ('Arne Dybfest', '8')]

In [10]:
page_ranks = pages_ranks.map(lambda line: getPageRank(line))

In [11]:
page_ranks.count()

679871

In [12]:
page_ranks.take(5)

[('Knole House', 2.714370680994686e-07),
 ('Kapitel', 2.2063036596032346e-07),
 ('Vorspiel', 2.637562177457272e-07),
 ('Nyhetsbyrå', 3.6482482045256115e-07),
 ('Rostrevor', 2.7067263659980717e-07)]

In [13]:
PR = (pages
      .join(page_ranks, 16)
      .map(lambda page: Row(id=page[1][0], title=page[0], rank=page[1][1])))

In [14]:
TF_IDF = tf_idf.map(lambda line: getTFIDF(line))
grouped = (TF_IDF.groupByKey()
           .mapValues(list)
           .map(lambda word: Row(id=word[0], list=word[1])))

In [15]:
grouped.saveAsTextFile('/mnt/wiktorskit-danielb-ns0000k/home/notebook/group04/Search Engine/TF_IDF_grouped_01')

In [15]:
schemaPR = sqlContext.createDataFrame(PR)
schemaPR.registerTempTable('PR')
sqlContext.cacheTable('PR')
schemaTF_IDF = sqlContext.createDataFrame(grouped)
schemaTF_IDF.registerTempTable('TF_IDF')
sqlContext.cacheTable('TF_IDF')

In [21]:
query = 'Kina'
# query = query.lower()
title_search = sqlContext.sql("SELECT title, rank FROM PR WHERE title='{}'".format(query)).collect()
title_search, query

([Row(title='Kina', rank=0.0001928447378994561)], 'Kina')

In [23]:
a = 'dette er en test'
b = 'test'
a.split(), b.split()

(['dette', 'er', 'en', 'test'], ['test'])

In [16]:
def search(query):
    start = time()
    search_res = []
    title_search = sqlContext.sql("SELECT title, rank FROM PR WHERE title='{}'".format(query)).collect()
    if len(title_search) > 0:
        search_res.append((title_search[0]['title'], np.inf))
    for word in query.split():
        results = sqlContext.sql("SELECT list from TF_IDF WHERE id='{}'".format(word)).collect()
        docs = {}
        for result in results:
            for row in result:
                for item in row:
                    if item[0] not in docs:
                        docs[item[0]] = dict()
                        docs[item[0]]['TF_IDF'] = item[1]
                        res = sqlContext.sql('SELECT title, rank FROM PR WHERE id={}'.format(item[0])).collect()
                        if len(res) > 0:
                            docs[item[0]]['title'] = res[0]['title']
                            docs[item[0]]['rank'] = res[0]['rank']
                        else:
                            docs[item[0]]['title'] = None
                            docs[item[0]]['rank'] = 0
                    else:
                        docs[item[0]]['TF_IDF'] += item[1]
        for _, doc in docs.items():
            search_res.append((doc['title'], doc['TF_IDF']*doc['rank']))
    search_res.sort(key=lambda tup: tup[1], reverse=True)
    end = time()
    print('Found', len(docs), 'matches in', round(end-start), 'seconds')
    return search_res[:10]

In [25]:
search('hund')

Found 1294 matches in 326 seconds


[('Chihuahua', 5.881837827315853e-07),
 ('Store hund', 2.7192508283634427e-07),
 ('Den lille hund', 2.650067042731334e-07),
 ('Kategori:Store hund', 2.3006890281200452e-07),
 ('Kategori:Den lille hund', 1.9951853785354413e-07),
 ('Hofteleddsdysplasi', 1.745383031708187e-07),
 ('Vesle hund', 1.5356277661287343e-07),
 ('Akita', 1.450065076720903e-07),
 ('Den lille hunden', 1.3820649895158621e-07),
 ('Kina', 1.3799534316859684e-07)]

In [18]:
search('Akita')

Found 53 matches in 8 seconds


[('Akita', inf),
 ('Japan', 9.911614979482169e-07),
 ('Akita (Akita)', 3.6962904468494163e-07),
 ('Akita', 3.5409990935597244e-07),
 ('Akita (prefektur)', 3.4223551796330036e-07),
 ('Akita (hund)', 2.309801101196431e-07),
 ('Kategori:Personer fra prefekturet Akita', 2.2171133441900993e-07),
 ('Omono', 2.007245202645245e-07),
 ('Blaublitz Akita', 1.7180549303047049e-07),
 ('Kazuno', 8.330484878096268e-08)]

In [19]:
search('Blaublitz Akita')

Found 53 matches in 9 seconds


[('Blaublitz Akita', inf),
 ('Japan', 9.911614979482169e-07),
 ('Akita (Akita)', 3.6962904468494163e-07),
 ('Akita', 3.5409990935597244e-07),
 ('Akita (prefektur)', 3.4223551796330036e-07),
 ('Akita (hund)', 2.309801101196431e-07),
 ('Kategori:Personer fra prefekturet Akita', 2.2171133441900993e-07),
 ('Omono', 2.007245202645245e-07),
 ('Blaublitz Akita', 1.7180549303047049e-07),
 ('Blaublitz Akita', 1.156632738281266e-07)]

In [20]:
search('Vesle hund')

Found 1294 matches in 351 seconds


[('Vesle hund', inf),
 ('Chihuahua', 5.881837827315853e-07),
 ('Middelhavet', 3.6759090491781353e-07),
 ('Store hund', 2.7192508283634427e-07),
 ('Den lille hund', 2.650067042731334e-07),
 ('Kategori:Store hund', 2.3006890281200452e-07),
 ('Kategori:Den lille hund', 1.9951853785354413e-07),
 ('Blefjell', 1.9786135287993907e-07),
 ('Hofteleddsdysplasi', 1.745383031708187e-07),
 ('Vesle hund', 1.5356277661287343e-07)]

In [None]:
search('Den lille hund')

In [21]:
sc.stop()