In [1]:
from __future__ import print_function, division
import search_engine.util as util
import re
import os
import pandas
import re
import sys
import copy

In [2]:
# Directory that holds the Wikipedia dump and every file generated from it.
WIKI_DIR = 'data/wiki'


def _wiki_path(basename):
    """Return the path of `basename` inside WIKI_DIR."""
    return os.path.join(WIKI_DIR, basename)


# Input dumps: the full one and a smaller extract.
WIKI_DUMP_FILENAME = _wiki_path('frwiki-20151226-pages-articles.xml')
WIKI_SMALL_FILENAME = _wiki_path('frwiki-small.xml')

# Total displayed by print_progress().
# NOTE(review): presumably the line count of the full dump - confirm; the
# visible cells read WIKI_SMALL_FILENAME, so the displayed total will not be
# reached with that file.
WIKI_N_LINES = 246871655

# Generated files.
PAGE_TO_ID_FILENAME = _wiki_path('page_to_id.csv')
ID_TO_PAGE_FILENAME = _wiki_path('id_to_page.csv')
# Not referenced by the visible cells; note the 'title_words' basename.
PAGE_WORDS_FILENAME = _wiki_path('title_words.csv')
PAGE_LINKS_FILENAME = _wiki_path('page_links.csv')
WORDS_APPEARANCE_FILENAME = _wiki_path('words_appearance.csv')

# Language resources stored outside WIKI_DIR.
DICTIONNARY_FILENAME = 'data/dictionnary.fr.csv'
STOPWORDS_FILENAME = 'data/stopwords.fr.txt'

In [3]:
# Make sure the output directory exists before any cell writes into it.
# os.makedirs also creates missing parent directories, whereas os.mkdir
# raises OSError when the parent of WIKI_DIR does not exist yet.
if not os.path.isdir(WIKI_DIR):
    os.makedirs(WIKI_DIR)

Helper functions to print progress while reading the dump file

In [4]:
def pretty_number(n):
    """Format an integer with a space between each group of three digits.

    Example: 1234567 -> '1 234 567'.

    A leading minus sign is kept attached to the first group; treating it as
    a digit would render -123 as '- 123'.

    :param n: integer to format (converted with ``str``).
    :return: the grouped string representation.
    """
    text = str(n)
    sign = ''
    if text.startswith('-'):
        sign = '-'
        text = text[1:]
    # Group from the right: reverse, cut into chunks of 3, reverse back.
    reversed_digits = text[::-1]
    groups = [reversed_digits[i:i + 3] for i in range(0, len(reversed_digits), 3)]
    return sign + ' '.join(groups)[::-1]

def print_progress(current_line):
    """Rewrite the 'current / total' progress line on standard output.

    Nothing is printed unless at least 100000 lines have gone by since the
    last update. The globals ``last_line`` and ``last_print`` hold the line
    number and the text of the last update and are refreshed on each print.

    :param current_line: index of the line currently being processed.
    """
    global last_line, last_print
    if current_line - last_line >= 100000:
        out = sys.stdout
        # Erase the previous message and go back to the start of the line.
        out.write('\b' * len(last_print))
        out.write('\r')
        message = '{} / {}'.format(pretty_number(current_line), pretty_number(WIKI_N_LINES))
        out.write(message)
        out.flush()
        last_line, last_print = current_line, message

## Load the word dictionary

In [5]:
# Load the dictionary CSV (columns 'word' and 'id') and build both lookup
# directions. The two columns are walked once with zip() instead of doing a
# label lookup per row and per column, which is much slower on large files.
# NOTE(review): read_csv parses cells such as 'NA', 'nan' or 'null' as
# missing values by default - confirm the dictionary has no such words.
dataframe = pandas.read_csv(DICTIONNARY_FILENAME)
word_to_id = dict(zip(dataframe['word'], dataframe['id']))  # word => id
id_to_word = dict(zip(dataframe['id'], dataframe['word']))  # id => word

## Get the title of each page and assign an id to it

In [6]:
# Both mappings start empty and are filled by the first pass over the dump.
page_to_id = dict()  # page title => [ids]
id_to_page = dict()  # id => page title

In [7]:
# First pass over the dump: give every retained page title a sequential id
# and record it in page_to_id (title => [ids]) and id_to_page (id => title).
re_title = '.*<title>(.+)</title>.*'

# State read and updated by print_progress().
last_print = ''
last_line = 0
line_count = 0

current_title = None
page_id = 1

# `with` closes the dump even if processing raises part-way through.
with open(WIKI_SMALL_FILENAME, 'r') as fd_wiki_dump:
    # enumerate() keeps line_count equal to the real line index, including
    # for the lines skipped with `continue` below.
    for line_count, line in enumerate(fd_wiki_dump):
        print_progress(line_count)
        # strip() already removes the trailing newline; slicing off the last
        # character would eat real data on a final line without a newline.
        line = util.process_word(line.strip().decode('utf-8'))
        if '<title' not in line:
            continue
        match = re.match(re_title, line)
        # Titles containing ':' get no id.
        # NOTE(review): presumably these are namespaced pages - confirm.
        if match is None or ':' in match.group(1):
            continue
        current_title = util.normalize_page_content(match.group(1))
        # A normalized title may already be present: keep every id it received.
        page_to_id.setdefault(current_title, []).append(page_id)
        id_to_page[page_id] = current_title
        page_id += 1

current_title = None

In [8]:
print('nb titles:', pretty_number(len(id_to_page)))

nb titles: 380


## Function: extract links from page content

In [9]:
def extract_links(page_id, words):
    """Return the sorted ids of the known pages linked from `words`.

    Every ``[[...]]`` occurrence is split on '|' and each part that contains
    no ':' is normalized and looked up in the global ``page_to_id``. All ids
    registered for a matching title are kept; `page_id` itself is removed.

    :param page_id: id of the page the text belongs to (excluded from result).
    :param words: raw page text.
    :return: sorted list of distinct linked page ids.
    """
    # NOTE(review): every '|' part is looked up, so the part after the pipe
    # can also produce a link when it matches a known title - confirm that
    # this is intended.
    linked_ids = set()
    # Raw string: '\[' is not a valid escape sequence in a plain literal.
    for link_body in re.findall(r'\[\[(.*?)\]\]', words):
        for part in link_body.split('|'):
            if ':' in part:
                continue
            target = util.normalize_page_content(part)
            if target in page_to_id:
                # Collecting into a set avoids rebuilding a list per match.
                linked_ids.update(page_to_id[target])
    linked_ids.discard(page_id)
    return sorted(linked_ids)

## Function: extract word ids from page content

In [10]:
def extract_words_id(words):
    """Map the text of a page to the ids of its known words.

    The text is normalized with ``util.normalize_page_content`` and split on
    single spaces; tokens missing from the global ``word_to_id`` are dropped.

    :param words: raw page text.
    :return: list of word ids, in order of appearance, repetitions included.
    """
    tokens = util.normalize_page_content(words).split(' ')
    found_ids = []
    for token in tokens:
        if token in word_to_id:
            found_ids.append(word_to_id[token])
    return found_ids

## Function: append page data to words_appearance

In [11]:
def append_to_words_appearance(words_appearance, page_id, title, words_id):
    """Record, for one page, the weight of every word it contains.

    Each occurrence of a word in `words_id` adds ``1 / len(words_id)`` to its
    weight; each distinct known word of `title` gets an extra 1.0. The
    resulting ``(page_id, weight)`` pairs are appended to `words_appearance`.

    :param words_appearance: dict word id => list of (page_id, weight),
        updated in place.
    :param page_id: id of the page being indexed.
    :param title: page title; split on single spaces and looked up in the
        global ``word_to_id``.
    :param words_id: ids of the words of the page body, with repetitions.
        May be empty, in which case only the title words are recorded.
    """
    title_words_id = set(word_to_id[w] for w in title.split(' ') if w in word_to_id)
    # Guard against an empty body: there is nothing to weight, and dividing
    # by zero would raise.
    freq_unique = 1.0 / float(len(words_id)) if words_id else 0.0
    occ = dict()
    for word_id in words_id:
        occ[word_id] = occ.get(word_id, 0.0) + freq_unique
    for word_id in title_words_id:
        occ[word_id] = occ.get(word_id, 0.0) + 1.0
    for word_id, weight in occ.items():
        words_appearance.setdefault(word_id, []).append((page_id, weight))

In [12]:
# Replace each id list by a sorted copy, so that the first element of a list
# is always the smallest id registered for that title.
for k in list(page_to_id):
    page_to_id[k] = sorted(page_to_id[k])

In [13]:
# Parser state for the second pass over the dump.
in_page = False    # True while between <page> and </page>
in_text = False    # True while inside a multi-line <text> element
words = None       # text accumulated for the current page
page_title = None  # normalized title of the current page

# Raw strings: '\d' is not a valid escape sequence in a plain string literal.
re_id = r'.*<id>(\d+)</id>.*'
re_text_start = r'^.*<text.*>(.+)$'
re_text_end = r'^(.*)</text>'

# State read and updated by print_progress().
last_print = ''
last_line = 0
line_count = 0

# Working copy consumed by the second pass, so page_to_id stays intact.
page_to_id_cpy = copy.deepcopy(page_to_id)

# word id => list of (page_id, weight)
words_appearance = dict()

In [14]:
# Second pass over the dump: for every page that received an id in the first
# pass, write its outgoing links to PAGE_LINKS_FILENAME and add its words to
# words_appearance. Relies on the parser state initialised in the previous
# cell (in_page, in_text, words, page_title, page_to_id_cpy, ...).
#
# `with` closes both files even if processing raises part-way through.
with open(WIKI_SMALL_FILENAME, 'r') as fd_wiki_dump, \
        open(PAGE_LINKS_FILENAME, 'w') as fd_links:
    fd_links.write('page_id,links\n')

    # enumerate() keeps line_count equal to the real line index, including
    # for the lines skipped with `continue` below.
    for line_count, line in enumerate(fd_wiki_dump):
        print_progress(line_count)
        # strip() already removes the trailing newline; slicing off the last
        # character would eat real data on a final line without a newline.
        line = util.process_word(line.strip().decode('utf-8'))

        if not in_page:
            if '<page' in line:
                in_page = True
            continue

        if in_text:
            if '</text' in line:
                match = re.match(re_text_end, line)
                if match is None:
                    continue
                words += ' ' + match.group(1)
                in_text = False
            else:
                words += ' ' + line
            continue

        if '</page' in line:
            if page_title:
                remaining_ids = page_to_id_cpy.get(page_title)
                if remaining_ids:
                    # Consume the id even when the page has no text, so that
                    # later pages sharing this normalized title still get the
                    # id the first pass assigned to them.
                    page_id = remaining_ids.pop(0)
                    if words:
                        links = extract_links(page_id, words)
                        fd_links.write('{},{}\n'.format(page_id, ' '.join([str(k) for k in links])))

                        words_id = extract_words_id(words)
                        if len(words_id) > 0:
                            append_to_words_appearance(
                                words_appearance,
                                page_id,
                                page_title,
                                words_id
                            )
            words = None
            page_title = None
            in_page = False
        elif '<title' in line:
            match = re.match(re_title, line)
            # Same filter as the first pass: titles containing ':' were never
            # given an id, so page_title stays None and the page is skipped.
            if match is None or ':' in match.group(1):
                continue
            page_title = util.normalize_page_content(match.group(1))
        elif '<text' in line:
            match = re.match(re_text_start, line)
            if match is None:
                # Nothing follows the opening tag on this line: the page is
                # left without text.
                continue
            words = match.group(1)
            if '</text' in words:
                # The element opens and closes on the same line: keep only
                # its content and do NOT enter multi-line mode, otherwise the
                # following </page>, <page> and <title> lines would be
                # swallowed into this page's text.
                words = words[:words.rindex('</text')]
            else:
                in_text = True

## Sort both page_to_id and id_to_page

In [15]:
# From here on both names are sorted lists of (key, value) pairs, no longer
# dicts: cells that expect the dict form must not be re-run after this one.
page_to_id = list(page_to_id.items())
page_to_id.sort()
id_to_page = list(id_to_page.items())
id_to_page.sort()

## Sort words_appearance

In [16]:
# Turn the dict into a list of (word_id, [(page_id, weight), ...]) pairs
# ordered by word id.
words_appearance = list(words_appearance.items())
words_appearance.sort()

## Save page_to_id in csv file

In [17]:
# One line per title: the title, '@', then its ids separated by spaces.
# NOTE(review): a title containing '@' would be ambiguous with the column
# separator - confirm that normalized titles cannot contain it.
with open(PAGE_TO_ID_FILENAME, 'w') as fd:
    fd.write('page@ids\n')
    for page, list_id in page_to_id:
        joined_ids = ' '.join(map(str, list_id))
        fd.write('{}@{}\n'.format(page, joined_ids))
    

## Save id_to_page in csv file

In [18]:
# One line per page id: the id, '@', then the normalized title.
with open(ID_TO_PAGE_FILENAME, 'w') as fd:
    fd.write('id@page\n')
    for page_id, page in id_to_page:
        row = '{}@{}\n'.format(page_id, page)
        fd.write(row)
    

## Save words_appearance in csv file

In [19]:
# One line per word id: the id, ',', then space-separated 'page_id:weight'
# entries for every page the word was recorded for.
with open(WORDS_APPEARANCE_FILENAME, 'w') as fd:
    fd.write('word_id,frequencies\n')
    for word_id, freqs in words_appearance:
        entries = []
        for a, b in freqs:
            entries.append(':'.join([str(a), str(b)]))
        fd.write('{},{}\n'.format(word_id, ' '.join(entries)))