# Exploration into building a search function \

Use a download of wikipedia pages as the data source
Build a function that searches for a target string
Output a CSV file that contains the path, line, character index, and context of the searched string
This will be done using a multithreaded approach, my first attempt at multithreading.



In [1]:
import os
import math
import functools
from multiprocessing import Pool
import pandas as pd
import time

In [2]:
folder_name = 'wiki'
file_names = os.listdir('wiki')
print(len(file_names), 'files')
with open(os.path.join(folder_name, file_names[0])) as f:
    lines = [lines for lines in f.readlines()]
file_names[:5]

FileNotFoundError: [Errno 2] No such file or directory: 'wiki'

Create a function that counts the number of lines
---
First step

Build a simple multithreaded function that accesses the number of lines per file

Intent: simply get the multithreaded functionality working.

In [3]:
def make_chunks(data, num_chunks):
    chunk_size = math.ceil(len(data) / num_chunks)
    return [data[i:i+chunk_size] for i in range(0, len(data), chunk_size)]

def map_reduce(data, num_processes, mapper, reducer):
    chunks = make_chunks(data, num_processes)
    with Pool(num_processes) as pool:
        chunk_results = pool.map(mapper, chunks)
    return functools.reduce(reducer, chunk_results)

In [4]:
def map_num_lines(file_names_chunk):
    num_lines = 0
    for file_name in file_names_chunk:
        with open(os.path.join(folder_name, file_name)) as file:
            num_lines += len(file.readlines())
    return num_lines

def reducer_num_lines(num_lines1, num_lines2):
    return num_lines1 + num_lines2

Total lines : 499797
---

In [5]:
total_lines = map_reduce(file_names, 4, map_num_lines, reducer_num_lines)
total_lines

499797

Modify the Function to give a search result
---

Returns a dict that contains {'filename':[list of rows with the searched term]}

Intent: increase funtion complexity to include the search term

In [6]:
target = 'engineering'

In [7]:
def map_search_lines(file_names_chunk):
    search_result = {}
    for file_name in file_names_chunk:
        with open(os.path.join(folder_name, file_name)) as file:
            indices = [index for index, line in enumerate(file.readlines()) if target in line]
        search_result[file_name] = indices    
    return search_result

def reducer_search_lines(search_result1, search_result2):
    merged = {}
    merged.update(search_result1)
    merged.update(search_result2)
    return merged

In [8]:
total_target_string = map_reduce(file_names, 4, map_search_lines, reducer_search_lines)

{'Bay_of_ConcepciC3B3n.html': [],
 'Bye_My_Boy.html': [],
 'Valentin_Yanin.html': [],
 'Kings_XI_Punjab_in_2014.html': [],
 'William_Harvey_Lillard.html': [],
 'Radial_Road_3.html': [],
 'George_Weldrick.html': [],
 'Zgornji_Otok.html': [],
 'Blue_Heelers_(season_8).html': [],
 'Taggen_Nunatak.html': [],
 'Henri_BraqueniC3A9.html': [],
 'Vrila.html': [],
 'William_Henry_Porter.html': [],
 'Clive_Brown_(footballer).html': [],
 'Blick_nach_Rechts.html': [],
 'Central_District_(Rezvanshahr_County).html': [],
 'Alexios_Aspietes.html': [],
 'Mei_Lanfang.html': [],
 'Wangeroogeclass_tug.html': [],
 'Dowell_Philip_O27Reilly.html': [],
 'Coalville_Town_railway_station.html': [],
 'Gennady_Lesun.html': [],
 'Bartrum_Glacier.html': [],
 'Victor_S._Mamatey.html': [],
 'Gottfried_Keller.html': [],
 'Table_Point_Formation.html': [],
 'Nobuhiko_Ushiba.html': [],
 'Master_of_Space_and_Time.html': [],
 'Early_medieval_states_in_Kazakhstan.html': [],
 'Eressa_aperiens.html': [],
 'Myrtle_(sternwheeler)

Update search to be case insensative
---

Improve search function

Intent: Find additional hits were found when the search was done in a case insensative manner

In [9]:
def map_search_case_insensative(file_names_chunk):
    search_result = {}
    for file_name in file_names_chunk:
        with open(os.path.join(folder_name, file_name)) as file:
            indices = [index for index, line in enumerate(file.readlines()) if target.lower() in line.lower()]
        search_result[file_name] = indices    
    return search_result

def reducer_search_case_insensative(search_result1, search_result2):
    merged = {}
    merged.update(search_result1)
    merged.update(search_result2)
    return merged

In [10]:
total_case_insensative = map_reduce(file_names, 4, map_search_case_insensative, reducer_search_case_insensative)
total_case_insensative


{'Bay_of_ConcepciC3B3n.html': [],
 'Bye_My_Boy.html': [],
 'Valentin_Yanin.html': [],
 'Kings_XI_Punjab_in_2014.html': [],
 'William_Harvey_Lillard.html': [],
 'Radial_Road_3.html': [],
 'George_Weldrick.html': [],
 'Zgornji_Otok.html': [],
 'Blue_Heelers_(season_8).html': [],
 'Taggen_Nunatak.html': [],
 'Henri_BraqueniC3A9.html': [],
 'Vrila.html': [],
 'William_Henry_Porter.html': [],
 'Clive_Brown_(footballer).html': [],
 'Blick_nach_Rechts.html': [],
 'Central_District_(Rezvanshahr_County).html': [],
 'Alexios_Aspietes.html': [],
 'Mei_Lanfang.html': [],
 'Wangeroogeclass_tug.html': [],
 'Dowell_Philip_O27Reilly.html': [],
 'Coalville_Town_railway_station.html': [],
 'Gennady_Lesun.html': [],
 'Bartrum_Glacier.html': [],
 'Victor_S._Mamatey.html': [],
 'Gottfried_Keller.html': [],
 'Table_Point_Formation.html': [],
 'Nobuhiko_Ushiba.html': [],
 'Master_of_Space_and_Time.html': [],
 'Early_medieval_states_in_Kazakhstan.html': [],
 'Eressa_aperiens.html': [],
 'Myrtle_(sternwheeler)

In [11]:
#Additional hits per line that had multiple hits
#A results of 1 means there was 1 additional (2 total) hits in this line

case_inse_additional = {}
for key in total_case_insensative:
    additional = [i for i in total_case_insensative[key] if i not in total_target_string[key]]
    case_inse_additional[key] = len(additional)
    if case_inse_additional[key] != 0:
        print(key, ':', case_inse_additional[key])

Metis_Institute_of_Polytechnic.html : 7
Pictogram.html : 1
Claire_Danes.html : 1
Olive_Dennis.html : 3
AB_v_CD.html : 1
I_Marine_Expeditionary_Force.html : 1
Biorock.html : 2
De_La_Salle_University_E28093_DasmariC3B1as.html : 4
Vijay_Govindarajan.html : 1
Sir_Run_Run_Shaw_Hospital.html : 2
Lydia_VillaKomaroff.html : 3
Hilyard_Robinson.html : 1
Slade_School_of_Fine_Art.html : 1
Flat_roof.html : 2
List_of_largest_chemical_producers.html : 29
Potheri_village_Kanchipuram.html : 1
City_of_Mandaluyong_Science_High_School.html : 2
USS_Impeccable_(AM320).html : 2
Ronald_McCaffer.html : 6
RF_microwave_CAE_CAD.html : 1
Government_Polytechnic_Nagriya_Mod_Etah.html : 5
Senapati_district.html : 1
Brownfield_(software_development).html : 1
List_of_people_from_Bangor_Maine.html : 1
Campus_of_Texas_A26M_University.html : 9
C389cole_des_Mines_de_Douai.html : 20


Improve search result 
---
Return Given additional hits per line, return a list of tuples that provide (line index, string index) per hit per file

This will allow us to use a loop to grab context from the files later

In [12]:
#Added helper function to clean up the list comprehension used for bulding the search result tuples

def get_index_tuples(line, line_num):
    index_tuples = [(line_num, index) for index in range(len(line)) if line.startswith(target.lower(), index)]
    return index_tuples

def map_search_line_tuples(file_names_chunk):
    search_result = {}
    for file_name in file_names_chunk:
        with open(os.path.join(folder_name, file_name)) as file:
            index_tuples = [get_index_tuples(line.lower(), index) for index, line in enumerate(file.readlines()) if target.lower() in line.lower()]
        search_result[file_name] = index_tuples    
    return search_result

def reducer_search_line_tuples(search_result1, search_result2):
    merged = {}
    merged.update(search_result1)
    merged.update(search_result2)
    return merged

In [13]:
total_tuples = map_reduce(file_names, 4, map_search_line_tuples, reducer_search_line_tuples)
total_tuples

{'Bay_of_ConcepciC3B3n.html': [],
 'Bye_My_Boy.html': [],
 'Valentin_Yanin.html': [],
 'Kings_XI_Punjab_in_2014.html': [],
 'William_Harvey_Lillard.html': [],
 'Radial_Road_3.html': [],
 'George_Weldrick.html': [],
 'Zgornji_Otok.html': [],
 'Blue_Heelers_(season_8).html': [],
 'Taggen_Nunatak.html': [],
 'Henri_BraqueniC3A9.html': [],
 'Vrila.html': [],
 'William_Henry_Porter.html': [],
 'Clive_Brown_(footballer).html': [],
 'Blick_nach_Rechts.html': [],
 'Central_District_(Rezvanshahr_County).html': [],
 'Alexios_Aspietes.html': [],
 'Mei_Lanfang.html': [],
 'Wangeroogeclass_tug.html': [],
 'Dowell_Philip_O27Reilly.html': [],
 'Coalville_Town_railway_station.html': [],
 'Gennady_Lesun.html': [],
 'Bartrum_Glacier.html': [],
 'Victor_S._Mamatey.html': [],
 'Gottfried_Keller.html': [],
 'Table_Point_Formation.html': [],
 'Nobuhiko_Ushiba.html': [],
 'Master_of_Space_and_Time.html': [],
 'Early_medieval_states_in_Kazakhstan.html': [],
 'Eressa_aperiens.html': [],
 'Myrtle_(sternwheeler)

Construct into a functional datatype
--
Data structure:|File Path |Line Index | String Index| String Context|

Build a non-mutlithreaded implementation first, then construct into a multithreaded function.

Compare time taken to do the context calls and build the dataframe

In [14]:
#Build non-multithreaded DataFrame constructor
start = time.time()
rows = []
for file_name in total_tuples:
    with open(os.path.join(folder_name, file_name)) as file:
        lines = file.readlines()
        for line_list in total_tuples[file_name]:
            for tup in line_list:
                context = lines[tup[0]][tup[1]-20 : tup[1]+10+len(target)]
                row = [''.join([folder_name, '/', file_name]), tup[0], tup[1], context]
                rows.append(row)
                
df = pd.DataFrame(rows, columns=['File', 'Line', 'Index', 'Context'])
print(time.time()-start, 's')
df

0.23424100875854492 s


Unnamed: 0,File,Line,Index,Context
0,wiki/Metis_Institute_of_Polytechnic.html,6,735,"lleges in Haryana"",""Engineering colleges"
1,wiki/Metis_Institute_of_Polytechnic.html,130,10,ring</li>\n
2,wiki/Metis_Institute_of_Polytechnic.html,131,15,/li>\n
3,wiki/Metis_Institute_of_Polytechnic.html,132,36,&amp; Communication Engineering</li>\n
4,wiki/Metis_Institute_of_Polytechnic.html,133,13,g</li>\n
...,...,...,...,...
302,wiki/C389cole_des_Mines_de_Douai.html,750,51,eration_of_National_Engineering_Associati
303,wiki/C389cole_des_Mines_de_Douai.html,750,116,eration of National Engineering Associati
304,wiki/C389cole_des_Mines_de_Douai.html,750,174,eration of National Engineering Associati
305,wiki/C389cole_des_Mines_de_Douai.html,1124,0,ering Sciences<


In [15]:
#Build multithreaded DataFrame constructor
#Could not pass the dictionary or keys directly as Python dicionary keys are not ordered, and can not be subscripted
def map_to_df(file_names_chunk):
    rows = []
    for file_name in file_names_chunk:
        with open(os.path.join(folder_name, file_name)) as file:
            lines = file.readlines()
            for line_list in total_tuples[file_name]:
                for tup in line_list:
                    context = lines[tup[0]][tup[1]-20 : tup[1]+10+len(target)]
                    row = [''.join([folder_name, '/', file_name]), tup[0], tup[1], context]
                    rows.append(row)
    return rows

def reducer_to_df(rows1, rows2):
    merged = []
    merged.extend(rows1)
    merged.extend(rows2)
    return merged

start = time.time()

to_df = map_reduce(file_names, 4, map_to_df, reducer_to_df)
df = pd.DataFrame(to_df, columns=['File', 'Line', 'Index', 'Context'])

print(time.time()-start, 's')
df

0.12343597412109375 s


Unnamed: 0,File,Line,Index,Context
0,wiki/Metis_Institute_of_Polytechnic.html,6,735,"lleges in Haryana"",""Engineering colleges"
1,wiki/Metis_Institute_of_Polytechnic.html,130,10,ring</li>\n
2,wiki/Metis_Institute_of_Polytechnic.html,131,15,/li>\n
3,wiki/Metis_Institute_of_Polytechnic.html,132,36,&amp; Communication Engineering</li>\n
4,wiki/Metis_Institute_of_Polytechnic.html,133,13,g</li>\n
...,...,...,...,...
302,wiki/C389cole_des_Mines_de_Douai.html,750,51,eration_of_National_Engineering_Associati
303,wiki/C389cole_des_Mines_de_Douai.html,750,116,eration of National Engineering Associati
304,wiki/C389cole_des_Mines_de_Douai.html,750,174,eration of National Engineering Associati
305,wiki/C389cole_des_Mines_de_Douai.html,1124,0,ering Sciences<


In [16]:
df.to_csv()

',File,Line,Index,Context\n0,wiki/Metis_Institute_of_Polytechnic.html,6,735,"lleges in Haryana"",""Engineering colleges "\n1,wiki/Metis_Institute_of_Polytechnic.html,130,10,"ring</li>\n"\n2,wiki/Metis_Institute_of_Polytechnic.html,131,15,"/li>\n"\n3,wiki/Metis_Institute_of_Polytechnic.html,132,36,"&amp; Communication Engineering</li>\n"\n4,wiki/Metis_Institute_of_Polytechnic.html,133,13,"g</li>\n"\n5,wiki/Metis_Institute_of_Polytechnic.html,134,15,"/li>\n"\n6,wiki/Metis_Institute_of_Polytechnic.html,233,381,"ref=""/wiki/Category:Engineering_colleges_"\n7,wiki/Metis_Institute_of_Polytechnic.html,233,430,"na"" title=""Category:Engineering colleges "\n8,wiki/Metis_Institute_of_Polytechnic.html,233,463,"olleges in Haryana"">Engineering colleges "\n9,wiki/Pictogram.html,426,19,\n10,wiki/Pictogram.html,426,47,"ing_drawing"" title=""Engineering drawing"">"\n11,wiki/Pictogram.html,426,68,"ngineering drawing"">Engineering drawing</"\n12,wiki/Claire_Danes.html,111,146,g Biology and Civil Enginee