# Collect French words

https://fr.wiktionary.org/wiki/Wiktionnaire:10000-wp-fr

In [1]:
import csv
import os
from urllib.request import urlopen

import numpy as np

from scrapy.selector import Selector
from scrapy.http import HtmlResponse

import search_engine.structure.suffix_tree as suffix_tree
import search_engine.util as util

In [2]:
# URL template for the word-list pages; `{num}` is filled with the page number,
# which the template turns into the "-1000", "-2000", ... page suffix.
dict_url = 'https://fr.wiktionary.org/wiki/Wiktionnaire:10000-wp-fr-{num}000'
# Output CSV written at the end of the notebook.
dict_filename = 'data/dictionary.fr.csv'
# Input file with one stopword per line.
stopwords_filename = 'data/stopwords.fr.txt'

# Create the data directory if needed. `exist_ok=True` avoids the race between
# checking for the directory and creating it.
os.makedirs('data', exist_ok=True)

In [3]:
# Accumulator for the normalized words scraped below; a set discards duplicates.
dictionary = set()

In [4]:
# Sanity check of the normalizer: feed it accented French letters and ligatures
# in both cases and inspect the printed result.
w = 'ùûüÿàâ æ çéèêëïîô œ ÙÛÜŸÀÂÇÉÈÊËÏÎÔ Æ Œ'
print(type(w))
normalized = util.lower_and_no_accent(w)
print(normalized)

<class 'str'>
uuuyaa ae ceeeeiio oe uuuyaaceeeeiio ae oe


Get words from the Wiktionnaire word-list pages

In [5]:
# Page numbers 1..10 map to the URL suffixes 1000, 2000, ..., 10000.
for i in range(1, 11):
    url = dict_url.format(num=i)
    # Close the HTTP connection as soon as the body has been read, and bound the
    # wait so a stalled request cannot hang the notebook indefinitely.
    with urlopen(url, timeout=30) as page:
        response = page.read()
    selector = Selector(text=response)

    # Text of every link found in a list item inside the main content element.
    words = selector.css('#mw-content-text ul li a').xpath('./text()').extract()
    # Normalize before inserting so that case/accent variants share one entry.
    dictionary.update(util.lower_and_no_accent(word) for word in words)

Number of distinct words collected

In [6]:
# Number of distinct normalized words gathered from the word-list pages.
print(len(dictionary))

8040


# Remove stopwords

In [7]:
# Load the stopwords (one per line) and normalize them the same way as the
# dictionary words so that the two collections are directly comparable.
# NOTE(review): assumes the stopword file is UTF-8 encoded — confirm.
with open(stopwords_filename, 'r', encoding='utf-8') as stopwords_fd:
    stopwords = [
        # Strip only the line terminator: slicing off the last character would
        # truncate the final word when the file has no trailing newline.
        util.lower_and_no_accent(line.rstrip('\n'))
        for line in stopwords_fd
    ]

Remove stopwords from dictionary

In [8]:
# Drop every stopword from the collected words and keep the remainder as a
# sorted list.
dictionary = sorted(set(dictionary).difference(stopwords))

# Remove words containing bad characters

In [9]:
# Characters that disqualify a word: any word containing one of them is dropped.
bad = ['~', '|', '\\', '$', '+', '*', '%', '"', '_', '^', '.']
dictionary = [
    word
    for word in dictionary
    if all(b not in word for b in bad)
]

In [10]:
# Trim leading/trailing hyphens first, then leading/trailing apostrophes, and
# discard any word that ends up empty.
stripped_words = (word.strip('-').strip("'") for word in dictionary)
d = [word for word in stripped_words if word]

In [11]:
# Stripping can make distinct entries identical, so de-duplicate before sorting.
dictionary = sorted(set(d))

In [12]:
# Number of words left after stopword removal and character clean-up.
print(len(dictionary))

7818


# Save CSV of id,word

In [13]:
# Write the final word list as a two-column CSV; the id is the word's position
# in the sorted dictionary.
# Explicit encoding keeps the file independent of the platform default, and
# newline='' leaves line endings entirely to the csv writer.
with open(dict_filename, 'w', encoding='utf-8', newline='') as fd:
    # csv.writer quotes any field containing a delimiter, so an unusual word
    # cannot produce a malformed row; '\n' matches the line ending used before.
    writer = csv.writer(fd, lineterminator='\n')
    writer.writerow(['id', 'word'])
    writer.writerows(enumerate(dictionary))