# Collect French words

https://fr.wiktionary.org/wiki/Wiktionnaire:10000-wp-fr

In [70]:
import os
import urllib2
import unidecode

import numpy as np

from scrapy.selector import Selector
from scrapy.http import HtmlResponse

import search_engine.structure.suffix_tree as suffix_tree

In [71]:
# Local files used by this notebook.
stopwords_filename = 'data/stopwords.fr.txt'
dict_filename = 'data/dictionnary.fr.txt'

# URL template of the Wiktionnaire word-list pages; `{num}` is filled in per page.
dict_url = 'https://fr.wiktionary.org/wiki/Wiktionnaire:10000-wp-fr-{num}000'

# Create the data directory on first run so later cells can write into it.
if not os.path.exists('data'):
    os.mkdir('data')

In [72]:
# Accumulator for the collected words; a set so that duplicates collapse.
dictionnary = set()

In [73]:
def process_word(word):
    """Return `word` transliterated with unidecode and converted to lower case."""
    transliterated = unidecode.unidecode(word)
    return transliterated.lower()

In [74]:
# Sanity check: run process_word on accented letters and ligatures.
w = 'ùûüÿàâ æ çéèêëïîô œ ÙÛÜŸÀÂÇÉÈÊËÏÎÔ Æ Œ'.decode('utf-8')
print(process_word(w))

uuuyaa ae ceeeeiio oe uuuyaaceeeeiio ae oe


Get words from the Wiktionnaire (French Wiktionary) word-list pages

In [75]:
# Download the ten word-list pages (num = 1..10) and add every linked word,
# normalized with process_word, to the dictionary set.
for i in range(1, 11):
    url = dict_url.format(num=i)
    connection = urllib2.urlopen(url)
    try:
        response = connection.read()
    finally:
        # Release the HTTP connection even if reading the body fails.
        connection.close()
    selector = Selector(text=response)

    # Candidate words are the text nodes of links inside list items of the
    # page content.
    words = selector.css('#mw-content-text ul li a').xpath('./text()').extract()
    dictionnary.update(process_word(word) for word in words)

Count the unique words collected so far

In [76]:
# Number of distinct normalized words gathered from the pages.
print(len(dictionnary))

8486


# Remove stopwords

In [77]:
# Load the stopword file (one entry per line) and normalize each entry with
# process_word so it can be compared against the dictionary words.
stopwords = []
with open(stopwords_filename, 'r') as stopwords_fd:
    for line in stopwords_fd:
        # Strip only the line terminator; slicing off the last character would
        # truncate a final line that has no trailing newline and would leave
        # a stray '\r' on files with Windows line endings.
        stopwords.append(process_word(line.rstrip('\r\n').decode('utf-8')))

Remove stopwords from the dictionary

In [78]:
# Drop every stopword, then turn the remaining words into a sorted list.
dictionnary = sorted(set(dictionnary).difference(stopwords))

# Remove words containing bad characters

In [79]:
# Characters whose presence anywhere in a word disqualifies that word.
bad = ['~', '|', '\\', '$', '+', '*', '%', '"', '_', '^', '.']
# Keep only the words that contain none of them.
dictionnary = [word for word in dictionnary if all(b not in word for b in bad)]

In [80]:
# Trim leading/trailing hyphens and apostrophes from every word and drop the
# words that become empty.
d = []
for word in dictionnary:
    # Strip both characters in a single pass: stripping '-' and then '\''
    # separately leaves residue when they are interleaved ("'-mot" -> "-mot").
    word = word.strip("-'")
    if not word:
        continue
    d.append(word)

In [81]:
# Trimming can produce duplicates; deduplicate and restore sorted order.
dictionnary = sorted(set(d))

In [82]:
# Number of words left after the cleaning steps.
print(len(dictionnary))

8256


# Save CSV of id,word

In [83]:
# Write the word list as CSV: a header row, then one "<index>,<word>" row per
# word, where the index is the word's position in the sorted list.
with open('data/dictionnary.fr.csv', 'w') as fd:
    fd.write('id,word\n')
    fd.writelines('{},{}\n'.format(i, word) for i, word in enumerate(dictionnary))

# Transform the dictionary into a suffix tree

In [12]:
# Build the tree from the cleaned word list and print its length.
tree = suffix_tree.from_dictionnary(dictionnary)
print(len(tree))

8303


# Save Suffix Tree

In [13]:
# Persist the tree next to the other generated data files.
tree.save('data/dictionnary.fr.tree')

# Load Suffix Tree from file

In [14]:
# Reload the tree from the file written by tree.save() above.
tree_copy = suffix_tree.from_file('data/dictionnary.fr.tree')

Check that the reloaded suffix tree contains the same words as the original dictionary

In [16]:
# Round-trip check: convert the reloaded tree back to a word list and compare
# it, as a set, with the dictionary it was built from.
print(len(tree_copy))
dictionnary_copy = suffix_tree.to_list(tree_copy)
print(len(dictionnary_copy))
print(set(dictionnary_copy) == set(dictionnary))

8303
8303
True
