# Collect French words

https://fr.wiktionary.org/wiki/Wiktionnaire:10000-wp-fr

In [17]:
import os
import urllib2
import unidecode

from scrapy.selector import Selector
from scrapy.http import HtmlResponse

import search_engine.structure.suffix_tree as suffix_tree

In [2]:
# Make sure the local output directory exists before any file is written to it.
if not os.path.exists('data'):
    os.mkdir('data')

# Page URL template; `{num}` is substituted with str.format when downloading.
dict_url = 'https://fr.wiktionary.org/wiki/Wiktionnaire:10000-wp-fr-{num}000'

# Data file paths, relative to the notebook's working directory.
dict_filename = 'data/dictionnary.fr.txt'
stopwords_filename = 'data/stopwords.fr.txt'

In [3]:
# Accumulator for the normalised words; a set, so duplicates collapse automatically.
dictionnary = set()

In [4]:
def process_word(word):
    """Normalise a word: transliterate it with unidecode, then lowercase it.

    `word` is handed to `unidecode.unidecode` and the transliterated result
    is lowercased, so accented and unaccented spellings share one form.
    """
    ascii_word = unidecode.unidecode(word)
    return ascii_word.lower()

In [5]:
# Sanity check of process_word on French accented letters and ligatures.
# The literal is decoded from UTF-8 so that a unicode string is passed in.
w = 'ùûüÿàâ æ çéèêëïîô œ ÙÛÜŸÀÂÇÉÈÊËÏÎÔ Æ Œ'.decode('utf-8')
print(process_word(w))

uuuyaa ae ceeeeiio oe uuuyaaceeeeiio ae oe


Get words from the Wiktionnaire (the French Wiktionary)

In [6]:
# Download the frequency-list pages numbered 1 to 10 and add every normalised
# link text found in their content lists to the dictionary set.
for i in range(1, 11):
    url = dict_url.format(num=i)

    # urllib2 responses are not context managers, so close the connection
    # explicitly (even if read() raises) instead of leaking the socket.
    connection = urllib2.urlopen(url)
    try:
        response = connection.read()
    finally:
        connection.close()

    selector = Selector(text=response)

    # Text of every link located in a list item of the main content area.
    words = selector.css('#mw-content-text ul li a').xpath('./text()').extract()
    dictionnary.update(process_word(word) for word in words)

Sort the dictionary

In [7]:
# Turn the collected words into a list and order it in place.
dictionnary = list(dictionnary)
dictionnary.sort()

In [8]:
# Number of distinct normalised words collected so far.
print(len(dictionnary))

8486


# Remove stopwords

In [9]:
# Load the stopword file: each line is treated as one UTF-8 encoded stopword
# and normalised with process_word, the same way as the dictionary words.
stopwords = []
# `with` guarantees the file is closed, even if decoding a line raises.
with open(stopwords_filename, 'r') as stopwords_fd:
    for line in stopwords_fd:
        # Strip only the line terminator: slicing with [:-1] would eat the last
        # character of a final line that has no trailing newline.
        stopwords.append(process_word(line.rstrip('\r\n').decode('utf-8')))

Remove the stopwords from the dictionary

In [10]:
# Keep only the words that are not stopwords; the result is a set.
dictionnary = set(dictionnary).difference(stopwords)

In [11]:
# Number of words left once the stopwords have been removed.
print(len(dictionnary))

8303


# Transform the dictionary into a suffix tree

In [12]:
# `dictionnary` is an unordered set after the stopword removal, so sort it
# before handing it to a builder whose name asks for sorted input.
# NOTE(review): from_sorted_dictionnary is defined outside this notebook;
# confirm its ordering requirement against its implementation.
tree = suffix_tree.from_sorted_dictionnary(sorted(dictionnary))
print(len(tree))

8303


# Save Suffix Tree

In [13]:
# Save the tree under the `data` directory created earlier in the notebook.
tree.save('data/dictionnary.fr.tree')

# Load Suffix Tree from file

In [14]:
# Reload from the same path that was passed to tree.save, to check the round trip.
tree_copy = suffix_tree.from_file('data/dictionnary.fr.tree')

Check that both suffix trees contain the same words

In [16]:
# Print the reloaded tree's size, the number of words it converts back to,
# and whether that word set equals the in-memory dictionary.
print(len(tree_copy))
dictionnary_copy = suffix_tree.to_list(tree_copy)
print(len(dictionnary_copy))
print(set(dictionnary) == set(dictionnary_copy))

8303
8303
True
