# Imports

In [None]:
import numpy as np
import pandas as pd
import re
import pickle
import ast
from multiprocessing import Pool, Process

from langdetect import detect
from googletrans import Translator

from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.snowball import DutchStemmer, FrenchStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer

# Config

In [None]:
class Config:
    """Settings shared by the notebook cells.

    Attributes:
        data_path: Directory prefix that the cells concatenate with file
            names (``cfg.data_path + 'file.csv'``), hence the trailing slash
            in the default.
        url: Optional URL; ``None`` unless supplied.
    """

    def __init__(
        self,
        data_path='/Users/guillaumecorda/Desktop/UvA/Information Retrieval/project/data/',
        url=None,
    ):
        self.url = url
        self.data_path = data_path

In [None]:
# Notebook-wide settings instance, built with the default data path.
cfg = Config()

# Load data

In [None]:
# One CSV per crawl, read from a `data/` directory relative to the working
# directory (the other loads in this notebook go through cfg.data_path).
# NOTE(review): the prefixes look like city abbreviations — confirm.
df_ams = pd.read_csv('data/ams_data.csv', encoding='utf-8', engine='python')
df_rot = pd.read_csv('data/rot_data.csv', encoding='utf-8', engine='python')
df_haag = pd.read_csv('data/haag_data.csv', encoding='utf-8', engine='python')
df_gro = pd.read_csv('data/gro_data.csv', encoding='utf-8', engine='python')
df_utr = pd.read_csv('data/utr_data.csv', encoding='utf-8', engine='python')
df_ein = pd.read_csv('data/ein_data.csv', encoding='utf-8', engine='python')

In [None]:
# Crawled listings; this is the frame cleaned and analysed in the cells below.
df = pd.read_csv(cfg.data_path + 'crawled_data.csv', encoding='utf-8', engine='python')

In [None]:
# Merged dataset.
# NOTE(review): data_merged.csv is not produced in this notebook — confirm its origin.
df_all = pd.read_csv(cfg.data_path + 'data_merged.csv', encoding='utf-8', engine='python')

In [None]:
# Remove the zipcode and url columns from the merged frame, in place.
df_all.drop(['zipcode', 'url'], axis=1, inplace=True)

In [None]:
# Display the merged frame after the column drop.
df_all

# Store all urls

In [None]:
# Dump the url column of the crawled data to urls.txt (index not written).
df['url'].to_csv(cfg.data_path + 'urls.txt', sep=' ', index=False)

# Data cleansing

## Clean Location 

In [None]:
def get_correct_address(df):
    """Trim boilerplate from ``df['Location']`` in place.

    Every location string containing ``'renting'`` is replaced by its last
    four whitespace-separated tokens. All other cells, including missing
    values, are left untouched.

    Args:
        df: DataFrame with a ``'Location'`` column; modified in place.

    Returns:
        None.
    """
    locations = df['Location']
    # Non-string cells (NaN/None) cannot contain the marker; treat them as
    # "nothing to fix" instead of raising on the `in` test.
    needs_trim = locations.map(
        lambda value: isinstance(value, str) and 'renting' in value
    ).astype(bool).to_numpy()
    if not needs_trim.any():
        return

    trimmed = [' '.join(text.split()[-4:]) for text in locations[needs_trim]]
    # One .loc write on the frame itself: a chained `df[col].iloc[i] = ...`
    # may assign into a temporary and never reach `df`.
    df.loc[needs_trim, 'Location'] = trimmed

## Create City column

In [None]:
def get_city(row):
    """Map a free-text location to a canonical city name.

    Args:
        row: One cell of the ``Location`` column.

    Returns:
        The city name, or ``None`` when no known city occurs in ``row`` or
        ``row`` does not support substring tests (e.g. a NaN float).
    """
    # Checked in this order; the Dutch spelling "Den Haag" maps to the same
    # label as "The Hague".
    city_markers = (
        ('Amsterdam', 'Amsterdam'),
        ('The Hague', 'The Hague'),
        ('Rotterdam', 'Rotterdam'),
        ('Utrecht', 'Utrecht'),
        ('Groningen', 'Groningen'),
        ('Eindhoven', 'Eindhoven'),
        ('Den Haag', 'The Hague'),
    )
    try:
        for marker, city in city_markers:
            if marker in row:
                return city
    except TypeError:
        # Missing values (NaN/None) do not support `in`.
        return None
    return None

In [None]:
# Derive the City column from each Location value (None when no city is recognised).
df['City'] = df['Location'].map(get_city)

In [None]:
# Number of listings per recognised city.
df['City'].value_counts()

# Text Analysis

## Helper functions

### Preprocessing - Translate Dutch to English

In [None]:
def txt_translator(row):
    """Translate one text cell with googletrans, best effort.

    Uses the library's default source/target languages.

    Args:
        row: Text to translate.

    Returns:
        The translated text, or ``None`` when translation fails.
    """
    try:
        return Translator().translate(row).text
    except Exception:
        # Best effort: a failing cell becomes None so a column-wide map keeps
        # going. `Exception` (not a bare except) lets Ctrl-C through.
        return None

### Analysis

In [None]:
def get_language(text):
    """Detect the language of ``text`` with langdetect, best effort.

    Args:
        text: Text to classify.

    Returns:
        The language code returned by ``detect``, or ``None`` when detection
        fails (for instance on missing or empty text).
    """
    try:
        return detect(text)
    except Exception:
        # Best effort: undetectable cells become None. `Exception` (not a
        # bare except) lets Ctrl-C through.
        return None

In [None]:
def text_analysis(row):
    """Tokenize, stop-word filter and stem one listing description.

    Resources are chosen from ``row['Language']``: ``'nl'`` and ``'fr'`` use
    the NLTK Dutch/French stop words and Snowball stemmers; any other value
    falls back to English stop words and the Porter stemmer.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with ``'Description'``
            and ``'Language'`` entries.

    Returns:
        List of stemmed tokens, or ``None`` when the row has no usable
        description (missing key or non-string value). Errors from NLTK
        itself (e.g. a missing stop-word corpus) are no longer swallowed.
    """
    try:
        text = row['Description']
        language = row['Language']
    except (KeyError, TypeError):
        return None
    if not isinstance(text, str):
        # Missing descriptions arrive as NaN/None; nothing to tokenize.
        return None

    if language == 'nl':
        stopword_corpus, stemmer = 'dutch', DutchStemmer()
    elif language == 'fr':
        stopword_corpus, stemmer = 'french', FrenchStemmer()
    else:
        stopword_corpus, stemmer = 'english', PorterStemmer()

    stop_words = set(stopwords.words(stopword_corpus))
    tokens = RegexpTokenizer(r'\w+').tokenize(text)
    # The stop-word test is case-sensitive, as before.
    stemmed_words = [stemmer.stem(tok) for tok in tokens if tok not in stop_words]

    if stopword_corpus == 'english':
        # Presumably undoes the stemmer shortening "apartment" to "apart".
        # NOTE(review): this is a substring replace, so any stem containing
        # "apart" is rewritten too.
        stemmed_words = [w.replace('apart', 'apartment') for w in stemmed_words]

    return stemmed_words

## Process text

### Guillaume's data only

In [None]:
# Detect the language of every description in the merged data (None where detection fails).
df_all['Language'] = df_all['Description'].map(get_language)

In [None]:
# Same language detection for the crawled data.
df['Language'] = df['Description'].map(get_language)

In [None]:
# Relabel rows detected as 'af' to 'nl' (presumably Dutch text misdetected as
# Afrikaans). Written as a single .loc on the frame: the chained form
# `df['Language'].loc[mask] = ...` can assign into a temporary copy.
df.loc[df['Language'] == 'af', 'Language'] = 'nl'

In [None]:
# Distribution of detected languages after the relabelling.
df['Language'].value_counts()

In [None]:
# axis=1 hands text_analysis the whole row, since it reads both Description and Language.
df['Stem'] = df.apply(text_analysis, axis=1)

In [None]:
# Remove the zipcode and url columns from the crawled frame, in place.
df.drop(['zipcode', 'url'], axis=1, inplace=True)

In [None]:
# Persist the processed crawled data (index not written).
df.to_csv(cfg.data_path + 'data_processed.csv', encoding='utf-8', index=False)

### Merged data

In [None]:
# Display the merged frame before it is written out.
df_all

In [None]:
# Persist the merged data as final_data.csv (index not written).
df_all.to_csv(cfg.data_path + 'final_data.csv', encoding='utf-8', index=False)

# Distributed Index without ElasticSearch

## Load data

In [None]:
# Reload the processed crawled data; `df` is re-bound here.
df = pd.read_csv(cfg.data_path + 'data_processed.csv', encoding='utf-8', engine='python')

In [None]:
# Reload the merged data; this is the frame the index is built from.
df_final = pd.read_csv(cfg.data_path + 'final_data.csv', encoding='utf-8', engine='python')

In [None]:
# Distribution of detected languages in the merged data.
df_final['Language'].value_counts()

In [None]:
# Fold rare detections into the main languages: 'af' -> 'nl' and 'cy' -> 'en'
# (presumably misdetections — confirm). Written as single .loc calls on the
# frame: the chained form `df_final['Language'].loc[mask] = ...` can assign
# into a temporary copy.
df_final.loc[df_final['Language'] == 'af', 'Language'] = 'nl'
df_final.loc[df_final['Language'] == 'cy', 'Language'] = 'en'

## Create vocabularies

### Helper functions

In [None]:
def format_description(row):
    """Parse the textual form of a Python literal back into an object.

    Args:
        row: String holding a Python literal (e.g. a list written to CSV).

    Returns:
        The evaluated literal. If evaluation fails, the exception is printed
        and ``None`` is returned.
    """
    try:
        parsed = ast.literal_eval(row)
    except Exception as err:
        print(err)
    else:
        return parsed

In [None]:
def get_vocabulary(df, lang):
    """Concatenate the stem lists of every document written in ``lang``.

    Args:
        df: DataFrame with a ``'Stem'`` column holding token lists (other
            values such as None/NaN are skipped) and a ``'Language'`` column.
        lang: Language code to select, e.g. ``'en'``.

    Returns:
        A new list with the stems of all matching rows in row order,
        duplicates included; empty when no row matches.
    """
    # Build a fresh list: starting from the first row's own list and using
    # `+=` would extend that list in place and corrupt df['Stem'].
    vocabulary = []
    for stems in df.loc[df['Language'] == lang, 'Stem']:
        if isinstance(stems, (list, tuple)):
            vocabulary.extend(stems)
    return vocabulary

### Vocabularies

In [None]:
# After the CSV round trip the Stem column holds strings; parse each back into
# a Python object (None where parsing fails).
# NOTE(review): assumes final_data.csv already carries a Stem column — confirm.
df_final['Stem'] = df_final['Stem'].map(format_description)

In [None]:
# Display the frame with the parsed Stem column.
df_final

In [None]:
# Unique stems per language; set() removes the duplicates that get_vocabulary keeps.
vocab_en = set(get_vocabulary(df_final, 'en'))
vocab_nl = set(get_vocabulary(df_final, 'nl'))
vocab_fr = set(get_vocabulary(df_final, 'fr'))

In [None]:
# Size of the English vocabulary.
len(vocab_en)

## Create Index

### Helper functions

In [None]:
def is_word_in_description(df, vocab, lang):
    """Build an inverted index over the documents of one language.

    Args:
        df: DataFrame with a ``'Stem'`` column (token lists) and a
            ``'Language'`` column.
        vocab: Iterable of words to index.
        lang: Language code selecting the rows to index.

    Returns:
        Dict mapping every word of ``vocab`` to a list of
        ``[row_label, first_position]`` pairs in row order, or to ``None``
        when the word occurs in no selected document.
    """
    distributed_index = dict.fromkeys(vocab)
    stems_by_row = df.loc[df['Language'] == lang, 'Stem']

    for row_label, stems in stems_by_row.items():
        if not isinstance(stems, (list, tuple)):
            # Skip only this row. A blanket try/except around the whole loop
            # would stop indexing at the first unparsed row and silently
            # return a truncated index.
            continue
        already_indexed = set()
        # Walk the document once instead of probing it for every vocabulary
        # word; only the first position of each word is recorded.
        for position, word in enumerate(stems):
            if word in already_indexed or word not in distributed_index:
                continue
            already_indexed.add(word)
            posting = [row_label, position]
            if distributed_index[word] is None:
                distributed_index[word] = [posting]
            else:
                distributed_index[word].append(posting)

    return distributed_index

### Index with multiprocessing

In [None]:
# Run the English indexing in five worker processes.
# NOTE(review): every worker repeats the same computation and a Process
# target's return value is discarded, so this cell yields no index; the
# indexes actually used are built sequentially in the next cell.
jobs = []
for i in range(5):
    # is_word_in_description takes (df, vocab, lang); without the DataFrame
    # as first argument each worker fails with a TypeError.
    p = Process(target=is_word_in_description, args=(df_final, vocab_en, 'en'))
    jobs.append(p)
    p.start()

# Wait for all workers before reporting.
for proc in jobs:
    proc.join()
print(jobs)

In [None]:
# One inverted index per language, each built from that language's vocabulary.
distributed_index_en = is_word_in_description(df_final, vocab_en, 'en')
distributed_index_nl = is_word_in_description(df_final, vocab_nl, 'nl')
distributed_index_fr = is_word_in_description(df_final, vocab_fr, 'fr')

In [None]:
# Display the English index.
distributed_index_en

In [None]:
# Bundle the three per-language indexes under language-name keys.
distributed_index = {'english' : distributed_index_en, 'dutch' : distributed_index_nl,
                     'french' : distributed_index_fr}

### Store index

In [None]:
# Write each index dict to a .npy file in the working directory; they are
# read back with np.load(...).item() in the next cell.
np.save('distributed_index_en.npy', distributed_index_en)
np.save('distributed_index_nl.npy', distributed_index_nl)
np.save('distributed_index_fr.npy', distributed_index_fr)
np.save('distributed_index.npy', distributed_index)

In [None]:
# The files hold pickled dicts, which np.load refuses to read unless
# allow_pickle=True is passed; .item() unwraps the dict again.
# Unpickling executes arbitrary code: only load files this notebook wrote.
distributed_index_en = np.load('distributed_index_en.npy', allow_pickle=True).item()
distributed_index_nl = np.load('distributed_index_nl.npy', allow_pickle=True).item()
distributed_index_fr = np.load('distributed_index_fr.npy', allow_pickle=True).item()
distributed_index = np.load('distributed_index.npy', allow_pickle=True).item()

# Query Processing

## Load data

In [None]:
# `df` is re-bound to the merged final data for the query-processing cells.
df = pd.read_csv(cfg.data_path + 'final_data.csv', encoding='utf-8', engine='python')

In [None]:
# Display the data the queries run against.
df

## Apply text analysis on query

In [None]:
def create_query(query=None):
    """Read a search query and normalise it like the indexed descriptions.

    The query language is detected with ``get_language``; ``'nl'`` and
    ``'fr'`` use the Dutch/French stop words and Snowball stemmers, anything
    else (including a failed detection) is handled as English with the
    Porter stemmer.

    Args:
        query: Optional query text. When omitted, the user is prompted for
            it, which is the original interactive behaviour.

    Returns:
        List of stemmed query terms.
    """
    if query is None:
        query = input('What are you looking for ? \n' )
    lang = get_language(query)

    if lang == 'nl':
        stopword_corpus, stemmer = 'dutch', DutchStemmer()
    elif lang == 'fr':
        stopword_corpus, stemmer = 'french', FrenchStemmer()
    else:
        stopword_corpus, stemmer = 'english', PorterStemmer()

    stop_words = set(stopwords.words(stopword_corpus))
    tokens = RegexpTokenizer(r'\w+').tokenize(query)
    stemmed_words = [stemmer.stem(w) for w in tokens if w not in stop_words]

    if stopword_corpus == 'english':
        # Apply the same 'apart' -> 'apartment' rewrite that text_analysis
        # applies to indexed descriptions, so query terms match the index.
        # The earlier code computed the rewrite but appended the raw stem.
        stemmed_words = [w.replace('apart', 'apartment') for w in stemmed_words]

    return stemmed_words

In [None]:
# Prompt for a query; `query` becomes the list of stemmed terms.
query = create_query()

In [None]:
# Display the processed query terms.
query

# Filter data on query

In [None]:
def filter_data_before_matching(df, query):
    """Collect the listings that match the structured fields of a query.

    Args:
        df: Listings DataFrame containing the filter columns listed below.
        query: Single-row DataFrame with the same columns. In every column
            except the first, ``-1`` means "no constraint".

    Returns:
        DataFrame with the rows matching on the first filter column followed
        by the rows matching each further constrained column. A row matching
        several columns appears once per match.
    """
    # The column list only names what to filter on. It must not replace
    # `df`: filtering an empty frame always returns nothing.
    filter_columns = ['Available From', 'City', 'Interior', 'Location',
                      'Number of bedrooms', 'Price', 'Rooms', 'Surface min',
                      'Surface max']

    first_col = filter_columns[0]
    matches = [df.loc[df[first_col] == query[first_col].iloc[0]]]
    for col in filter_columns[1:]:
        wanted = query[col].iloc[0]
        if wanted != -1:
            matches.append(df.loc[df[col] == wanted])

    # pd.concat takes a list of frames (two positional frames raise).
    # NOTE(review): concatenation gives OR semantics with duplicates; confirm
    # whether an AND filter across columns was intended.
    return pd.concat(matches)

In [None]:
# NOTE(review): `query` is the list of stems returned by create_query, while
# filter_data_before_matching indexes it as query[col].iloc[0] — confirm
# which query object is meant here.
filter_data_before_matching(df, query)

In [None]:
# Keep only the listings in the queried city. `value in series` tests the
# index labels, not the values, and yields a single bool that .loc cannot
# use; compare element-wise instead.
if query['City'] is not None:
    df_tmp = df.loc[df['City'] == query['City'].iloc[0]]

In [None]:
# Example: all listings whose City is exactly 'Amsterdam'.
df.loc[df['City'] == 'Amsterdam' ]

# Select best row using index

In [None]:
def find_matching_data(query, index):
    """Look up each query term in an inverted index and gather row labels.

    Args:
        query: Iterable of query terms.
        index: Mapping from term to a list of ``[row_label, position]``
            postings.

    Returns:
        List with the row label of every posting found, in query order.
        Each term's postings are printed; when a lookup or iteration fails
        (unknown term, ``None`` postings) the exception is printed and the
        term is skipped.
    """
    matched_rows = []
    for term in query:
        try:
            postings = index[term]
            print(postings)
            for posting in postings:
                matched_rows += [posting[0]]
        except Exception as err:
            print(err)
    return matched_rows

In [None]:
# Try the lookup with two hand-written terms against the English index.
find_matching_data(['now', 'Amsterdam'], distributed_index_en)

In [None]:
# Inspect the first listing.
df.iloc[0]

## Tf-idf

In [None]:
# Sample free-text query for the tf-idf comparison, wrapped in a one-element list.
query_test = ["Parkinggarage with private parking spot includedSplendid 3- room apartment of approx. 117m2 with large adjacent terrace facing South of approx. 60m2, overlooking the Sloterpark.This luxurious and practical apartment has 2 good size bedrooms, a walk-in closet, a luxurious bathroom with whirlpool, separate rainshower, and sink. The openplan kitchen is equipped with everyday modern appliances, such as a 5 spots induction hob, large oven, microwave combi, two door American refrigerator. There is a separate toilet with fontain.Furthermore the apartment has a spacious laundryroom with washer and dryer, large storage in the basement, and a private parkingspot.Above all this apartment offers a grand and bright livingroom with acces to the huge roofterrace facing South.The terrace has a dining and lounge area, from which you can enjoy"]

In [None]:
# Two descriptions picked by position (rows 0 and 282), each wrapped in a
# one-element list so it can be passed to the vectorizer as a corpus.
text_1 = [df['Description'].iloc[0]]
text_2 = [df['Description'].iloc[282]]

In [None]:
# Display the sample query.
query_test

In [None]:
# Fit ONE vectorizer on the two descriptions plus the query so all three
# vectors share the same vocabulary (columns). Separately fitted vectorizers
# produce vectors of different widths that cannot be compared with cosine
# similarity, and the `txt_fitted` name used for transform was never defined.
tf = TfidfVectorizer(smooth_idf=False, sublinear_tf=False, norm=None, analyzer='word')
txt_fitted = tf.fit(text_1 + text_2 + query_test)

# Kept so the cells that inspect the per-text fitted objects still work;
# all three names refer to the shared fitted vectorizer.
txt_fitted_1 = txt_fitted_2 = txt_fitted_query = txt_fitted

txt_transformed_1 = txt_fitted.transform(text_1)
txt_transformed_2 = txt_fitted.transform(text_2)
txt_transformed_query = txt_fitted.transform(query_test)

In [None]:
# Term-to-column mapping learned by the fitted vectorizer.
txt_fitted_1.vocabulary_

In [None]:
# Convert the sparse tf-idf results to dense matrices.
out_1 = txt_transformed_1.todense()
out_2 = txt_transformed_2.todense()
out_query = txt_transformed_query.todense()

In [None]:
# Display the dense tf-idf vector of the first description.
out_1

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Similarity of each description vector to the query vector.
output_1 = cosine_similarity(out_1, out_query)
output_2 = cosine_similarity(out_2, out_query)

In [None]:
# Similarity between the first description and the query.
output_1

In [None]:
# Similarity between the second description and the query.
output_2

In [None]:
# Display the first description for manual comparison with the query.
text_1