# Imports

In [None]:
import pandas as pd
import numpy as np
import sys

from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.snowball import DutchStemmer, FrenchStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

import re
from collections import Counter

from googletrans import Translator

from spellchecker import SpellChecker

# Config

In [None]:
class Config:
    """Container for the notebook's settings.

    Attributes:
        data_path: Directory prefix (with trailing separator) that data file
            names are appended to.
        url: Optional URL; None unless one is supplied.
    """

    def __init__(self, data_path='/Users/guillaumecorda/Desktop/UvA/Information Retrieval/project/data/', url=None):
        # Both settings are stored as given, without validation.
        self.data_path, self.url = data_path, url

In [None]:
# Shared settings instance, built with the default data path and no URL.
cfg = Config()

# Load data

In [None]:
# Load 'final_data.csv' from the configured data directory into a DataFrame,
# decoding it as UTF-8 and parsing it with pandas' Python engine.
df = pd.read_csv(cfg.data_path + 'final_data.csv', encoding='utf-8', engine='python')

# Processing

## Helper functions

In [None]:
def words(text):
    """Return every run of word characters in `text`, lower-cased, in order."""
    lowered = text.lower()
    return re.findall(r'\w+', lowered)

# Word-frequency table built from the 'big.txt' corpus in the data directory.
# The file is read inside a `with` block so the handle is closed as soon as
# the text has been loaded instead of being left open.
with open(cfg.data_path+'big.txt') as corpus_file:
    WORDS = Counter(words(corpus_file.read()))

def P(word, N=sum(WORDS.values())):
    """Relative frequency of `word`: its count in WORDS divided by `N`.

    `N` defaults to the total of all counts in WORDS, computed once when this
    function is defined.
    """
    occurrences = WORDS[word]
    return occurrences / N

def correction(word):
    """Return the candidate spelling of `word` that scores highest under `P`."""
    options = candidates(word)
    return max(options, key=P)

def candidates(word):
    """Return the possible spelling corrections for `word`.

    Tries, in order, the word itself, words one edit away, and words two edits
    away, and returns the first group that contains dictionary words. Falls
    back to a one-element list holding `word` unchanged.
    """
    exact = known([word])
    if exact:
        return exact

    one_edit = known(edits1(word))
    if one_edit:
        return one_edit

    # Two-edit candidates are only generated when the cheaper lookups fail.
    two_edits = known(edits2(word))
    if two_edits:
        return two_edits

    return [word]

def known(words):
    """Return the set of entries from `words` that are present in WORDS."""
    return {entry for entry in words if entry in WORDS}

def edits1(word):
    """Return the set of strings exactly one simple edit away from `word`.

    A simple edit is deleting one character, swapping two adjacent characters,
    replacing one character with a lower-case ASCII letter, or inserting one
    lower-case ASCII letter at any position.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    variants = set()
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]

        # Insertions are possible at every split point, including the end.
        for ch in alphabet:
            variants.add(head + ch + tail)

        if not tail:
            continue

        rest = tail[1:]
        variants.add(head + rest)                              # deletion
        for ch in alphabet:
            variants.add(head + ch + rest)                     # replacement
        if len(tail) > 1:
            variants.add(head + tail[1] + tail[0] + tail[2:])  # transposition
    return variants

def edits2(word):
    """Lazily yield the strings two simple edits away from `word`.

    Applies `edits1` to every result of `edits1(word)`. The returned generator
    may produce the same string more than once.
    """
    first_pass = edits1(word)
    return (twice for once in first_pass for twice in edits1(once))

In [None]:
def get_language(text):
    """Best-effort language detection for `text`.

    Returns:
        Whatever `detect(text)` returns, or None when detection raises an
        ordinary exception.

    NOTE(review): `detect` is not imported anywhere in the visible part of
    this file -- presumably it is `langdetect.detect`; confirm and add the
    import. Until it is in scope, the resulting NameError is caught below and
    this function always returns None.
    """
    try:
        return detect(text)
    except Exception:
        # Detection stays best-effort, but only ordinary errors are absorbed:
        # KeyboardInterrupt and SystemExit now propagate to the caller.
        return None

In [None]:
def make_query():
    """Prompt for a search query and return its normalised, stemmed tokens.

    The query is read from standard input. When `get_language` does not report
    'en' (including when it returns None), the query is translated with
    googletrans and the user is asked to confirm the translation. The text is
    then tokenised, lower-cased, stripped of English stop words,
    spell-corrected with `correction` and stemmed with the Porter stemmer.

    Returns:
        list[str]: The processed query terms, in their original order.

    Raises:
        SystemExit: If the user answers 'no' to the translation prompt.
    """
    query = input('What are you looking for ? \n' )
    print()
    lang = get_language(query)

    if lang != 'en':
        # No destination is passed to translate(); presumably it defaults to
        # English -- confirm against the installed googletrans version.
        translator = Translator()
        query = translator.translate(query).text
        print(query)
        answer = input('Is it what you meant ? \nyes/no \n')

        # Ignore surrounding whitespace so that e.g. "no " is still a refusal.
        if answer.strip().lower() == 'no':
            print('Please use english only')
            sys.exit()

    # Tokenisation. Tokens are lower-cased up front: the spelling dictionary
    # is built from lower-cased text and the NLTK English stop-word list is
    # lower-case, so tokens such as "I" or "The" would otherwise slip past
    # the stop-word filter and be needlessly "corrected".
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = [token.lower() for token in tokenizer.tokenize(query)]

    stop_words = set(stopwords.words("english"))
    content_words = [token for token in tokens if token not in stop_words]

    # Spell checker
    corrected = [correction(token) for token in content_words]

    # Stemming
    ps = PorterStemmer()
    stemmed_words = []
    for token in corrected:
        stem = ps.stem(token)
        # Restore the full word whenever stemming yields exactly 'apart'.
        if stem == 'apart':
            stem = 'apartment'
        stemmed_words.append(stem)

    return stemmed_words

## Example

In [None]:
# Example non-English queries:
# example_dutch ---> Ik ben op zoek naar een appartement in Amsterdam met een keuken en twee slaapkamers
# example_brazil ---> Eu estou procurando um apartamento em Amsterdam com uma cozinha e dois quartos

# Run this line without any parameter, then enter one of the example sentences above in the text field to test the function.
query = make_query()