## Imports 

In [1]:
import re
import spacy
import numpy as np
import pandas as pd
from tqdm import tqdm
from langdetect import detect
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from nltk import download, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, RSLPStemmer
from nltk.stem.wordnet import WordNetLemmatizer

## Data Import

In [2]:
airbnb_df_train = pd.read_excel('./1. data_raw/train.xlsx')
airbnb_df_test = pd.read_excel('./1. data_raw/test.xlsx')
airbnb_df_train_reviews = pd.read_excel('./1. data_raw/train_reviews.xlsx')
airbnb_df_test_reviews = pd.read_excel('./1. data_raw/test_reviews.xlsx')
pd.options.display.max_colwidth = 50

## First Data Exploration

In [3]:
airbnb_df_train.shape

(12496, 4)

In [4]:
airbnb_df_train_reviews.shape

(721402, 2)

## Language Detection

In [5]:
def detect_language(text):
    try:
        return detect(text)
    except:
        return 'Unknown'

### Language detection for reviews

In [14]:
airbnb_df_train_reviews["lang_comments"] = airbnb_df_train_reviews["comments"].apply(detect_language)

In [15]:
airbnb_df_train_reviews

Unnamed: 0,index,comments,lang_comments
0,1,this is a very cozy and comfortable house to s...,en
1,1,good<br/>,cy
2,1,"My first hostel experience, and all I have to ...",en
3,1,Das Hostel war neu und deshalb funktionierte a...,de
4,1,"It was fine for a dorm, but I think for the pe...",en
...,...,...,...
721397,12494,"We had a good time, the apartment has a great ...",en
721398,12494,Great apartment in very central location. The ...,en
721399,12494,"We are Airbnb Super Hosts too, so trust me, Li...",en
721400,12494,We had a lovely stay at this apartment. Sofia ...,en


### Language detection for desc/host_about

In [16]:
airbnb_df_train["lang_desc"] = airbnb_df_train["description"].apply(detect_language)

In [17]:
airbnb_df_train["lang_host"] = airbnb_df_train["host_about"].apply(detect_language)

In [18]:
airbnb_df_train

Unnamed: 0,index,description,host_about,unlisted,lang_desc,lang_host
0,1,"This is a shared mixed room in our hostel, wit...",Alojamento Local Registro: 20835/AL,0,en,pt
1,2,"O meu espaço fica perto de Parque Eduardo VII,...","I am friendly host, and I will try to always b...",1,pt,en
2,3,Trafaria’s House is a cozy and familiar villa ...,"I am a social person liking to communicate, re...",1,en,en
3,4,"Apartamento Charmoso no Chiado, Entre o Largo ...",Hello!_x000D_\nI m Portuguese and i love to me...,0,pt,en
4,5,Joli appartement en bordure de mer.<br /> 2 m...,Nous sommes une famille avec deux enfants de 1...,0,fr,fr
...,...,...,...,...,...,...
12491,12492,CAT’S BY BAIRRO ALTO<br /><br />This cozy apar...,Travelling is one of my favorite hobbies. I've...,0,en,en
12492,12493,Beautifully located in the heart of Lisbon's h...,Founded by travel enthusiasts (just like you) ...,0,en,en
12493,12494,Enjoy breakfast in the sleek kitchen with its ...,I´m from Portugal and I love to dance and to t...,0,en,en
12494,12495,A Terra da Eira é uma casa de campo rodeada de...,Somos uma familia de 5. Gostamos de viajar e d...,1,pt,pt


## Import detected data

In [6]:
df_train_detected = pd.read_csv("./2. data_detected/airbnb_df_train_detected.csv", index_col="index",).drop("Unnamed: 0",axis=1)
df_train_reviews_detected = pd.read_csv("./2. data_detected/airbnb_df_train__reviews_detected.csv", index_col="index").drop("Unnamed: 0",axis=1)

### Count number of different Languages

In [155]:
#airbnb_df_train["lang_desc"].value_counts()

In [156]:
#airbnb_df_train["lang_host"].value_counts()

In [157]:
#airbnb_df_train_reviews["lang_comments"].value_counts()

In [7]:
def create_df(df, df_review, language):

    columns_to_drop = ['lang_desc', 'lang_host']
    
    df = df[(df['lang_desc'] == language) & (df['lang_host'] == language)]
    df_review = df_review[df_review['lang_comments'] == language]
    grouped_reviews = df_review.groupby('index')['comments'].apply(lambda x: ''.join(str(x))).reset_index()
    merged_df = pd.merge(df, grouped_reviews, on='index', how='left')
    merged_df = merged_df.drop(columns=columns_to_drop)
    merged_df = merged_df[["index",	"description", "host_about", "comments", "unlisted"]]

    return merged_df

### join all English host_about/desc with English comments

In [8]:
merged_df_english = create_df(df_train_detected, df_train_reviews_detected, "en")

In [9]:
merged_df_english

Unnamed: 0,index,description,host_about,comments,unlisted
0,3,Trafaria’s House is a cozy and familiar villa ...,"I am a social person liking to communicate, re...",,1
1,6,"IMPORTANT: In response to COVID-19, this prope...","Hi, we are Homing - a company that develops it...",,0
2,7,This is my home that I rent out when I'm trave...,Globe trotter. I'm of Portuguese nationality w...,,1
3,8,Find tranquility in this meticulously curated ...,I travel a lot and I love it. _x000D_\nOrigina...,index\n8 Shani was very helpful throughout ...,0
4,9,Charming apartment with one bedroom with doubl...,"Isabel & Helder, portugueses, parents of three...",index\n9 Great little space; lovely hosts -...,0
...,...,...,...,...,...
8201,12489,Feel at home wherever you choose to live with ...,"We’re Blueground, a global proptech company wi...",,1
8202,12492,CAT’S BY BAIRRO ALTO<br /><br />This cozy apar...,Travelling is one of my favorite hobbies. I've...,index\n12492 We were very happy to find thi...,0
8203,12493,Beautifully located in the heart of Lisbon's h...,Founded by travel enthusiasts (just like you) ...,"index\n12493 Nice place, nice location, ...",0
8204,12494,Enjoy breakfast in the sleek kitchen with its ...,I´m from Portugal and I love to dance and to t...,index\n12494 We had a very lovely stay in L...,0


### Seems like NaN values is a good indication if a flat is getting listed again or not. Makes sense in real life context, because flats that dont have visitors are more likly to be unlisted in the future.

## Preprocessing English

In [10]:
def preprocessing_eng(row, tokenize, stop, lemmatize, stemmertize):
    updates = []
    
    for j in tqdm(row):
        
        text = j
        
        #LOWERCASE TEXT
        text = str(text).lower()
        
        #REMOVE NUMERICAL DATA and PUNCTUATION
        text = re.sub("[^a-zA-Z]"," ", text )
        text = re.sub("br", "", text)

        if tokenize:
            tokens = word_tokenize(text)
            text = " ".join(tokens)
        
        #REMOVE STOPWORDS
        if stop:
            stop_eng = set(stopwords.words('english'))
            text = " ".join([word for word in text.split() if word not in stop_eng])
        
        #Lemmatize
        if lemmatize:
            lemma_eng = WordNetLemmatizer()
            text = " ".join(lemma_eng.lemmatize(word) for word in text.split())
            
        
        #Stemming
        if stemmertize:
            stemmer_eng = SnowballStemmer('english')
            text = " ".join(stemmer_eng.stem(word) for word in text.split())
            
        updates.append(text)
        
    return updates

In [11]:
download('wordnet')
download('stopwords')
download('punkt')
columns_to_apply = ['description', 'host_about', 'comments']
merged_df_english[columns_to_apply] = merged_df_english[columns_to_apply].astype(str).apply(lambda row: preprocessing_eng(row=row,
                                                                                                                        tokenize=True,
                                                                                                                        stop=True,
                                                                                                                        lemmatize = True, 
                                                                                                                        stemmertize = False
                                                                                                                        )
                                                                                            )

[nltk_data] Downloading package wordnet to /Users/huberto/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/huberto/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/huberto/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
100%|██████████| 8206/8206 [00:05<00:00, 1563.12it/s]
100%|██████████| 8206/8206 [00:02<00:00, 2901.00it/s]
100%|██████████| 8206/8206 [00:03<00:00, 2189.38it/s]


In [27]:
merged_df_english

Unnamed: 0,index,description,host_about,comments,unlisted
0,3,trafaria house cozy familiar villa facility ne...,social person liking communicate reading trave...,,1
1,6,important response covid property extended cle...,hi homing company develops activity tourism pr...,,0
2,7,home rent traveling perfect vacation without c...,globe trotter portuguese nationality german fa...,,1
3,8,find tranquility meticulously curated lifestyl...,travel lot love x originally israel currently ...,index shani helpful throughout process accommo...,0
4,9,charming apartment one bedroom double bed doub...,isabel helder portuguese parent three wonderfu...,index great little space lovely host clean w c...,0
...,...,...,...,...,...
8201,12489,feel home wherever choose live blueground love...,blueground global proptech company several tho...,,1
8202,12492,cat bairro alto cozy apartment lisbon city cen...,travelling one favorite hobby already visited ...,index happy find place com great stay lisbon a...,0
8203,12493,beautifully located heart lisbon historic cent...,founded travel enthusiast like bnbird want con...,index nice place nice location easy communicat...,0
8204,12494,enjoy eakfast sleek kitchen freestanding knott...,portugal love dance travel x pleasure welcome ...,index lovely stay lisbon apartm liliana apartm...,0


### join all French host_about/desc with French comments

In [28]:
merged_df_french = create_df(df_train_detected, df_train_reviews_detected, "fr")

In [29]:
merged_df_french

Unnamed: 0,index,description,host_about,comments,unlisted
0,5,Joli appartement en bordure de mer.<br /> 2 m...,Nous sommes une famille avec deux enfants de 1...,index\n5 Un très bel appartement avec une m...,0
1,189,"Hostel dans maison de ville , location possibl...",Maison typique avec trois chambres individuell...,,0
2,192,"Appartement très chaleureux, avec beaucoup de ...","Je suis teresa, novice dans le domaines de la ...","index\n192 Appartement spacieux et propre, ...",0
3,301,Chambre double tout confort dans grande maison...,Couple de français installé au Portugal où nou...,index\n301 Tout d’abord une très belle prop...,0
4,329,Situé au pied du Panthéon dans le quartier de ...,Romain et moi sommes un couple qui sommes tomb...,index\n329 Appartement très bien situé dans...,1
...,...,...,...,...,...
100,11799,Appartement de 2 pièces entièrement rénové se ...,"Bonjour, je m’appelle Leo, un français d’origi...",index\n11799 Accueil très chaleureux de Ped...,0
101,11878,Chambre sexy in-love. Chambre avec de nombreux...,"Avec mon mari Dominique, nous vous proposons à...",,0
102,11997,Ce logement affiche un style résolument unique...,j'aime les paisagem la montagne et tout qui c'...,,1
103,12210,Jolie maison typique portugaise et entièrement...,Christophe & Lila marié . Trois enfants,index\n12210 Nous avons passé un très bon d...,0


## Preprocessing French

In [25]:
#!pip3 -m spacy download fr_core_news_md

In [30]:
def preprocessing_fr(row, tokenize, stop, lemmatize, stemmertize):
    updates = []
    
    for j in tqdm(row):
        
        text = j
        
        #LOWERCASE TEXT
        text = str(text).lower()
        
        #REMOVE NUMERICAL DATA and PUNCTUATION
        text = re.sub("[^a-zA-Z]"," ", text )
        text = re.sub("br", "", text)

        if tokenize:
            tokens = word_tokenize(text, language="french")
            text = " ".join(tokens)
 
        #REMOVE STOPWORDS
        if stop:
            stop_fr = set(stopwords.words('french'))
            text = " ".join([word for word in text.split() if word not in stop_fr])
        

        #Lemmatize
        if lemmatize:
            lemma_fr = spacy.load("fr_core_news_md")
            doc = lemma_fr(text)
            #for word in doc:
            text = " ".join(i.lemma_ for i in doc)
        
        #Stemming
        if stemmertize:
            stemmer_fr = SnowballStemmer('french')
            stem_doc = stemmer_fr(text)
            for word in stem_doc:
                text = " ".join(stemmer_fr.stem(word) for word in text.split())
            
        updates.append(text)
        
    return updates

In [31]:
download('wordnet')
download('stopwords')
columns_to_apply = ['description', 'host_about', 'comments']
merged_df_french[columns_to_apply] = merged_df_french[columns_to_apply].astype(str).apply(lambda x: preprocessing_fr(row=x,
                                                                                                                     tokenize=True,
                                                                                                                     stop=True,
                                                                                                                     lemmatize = True, 
                                                                                                                     stemmertize = False
                                                                                                                    )
                                                                                         )

[nltk_data] Downloading package wordnet to /Users/huberto/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/huberto/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
100%|██████████| 105/105 [01:49<00:00,  1.04s/it]
100%|██████████| 105/105 [01:45<00:00,  1.00s/it]
100%|██████████| 105/105 [01:47<00:00,  1.03s/it]


In [34]:
merged_df_french.head()

Unnamed: 0,index,description,host_about,comments,unlisted
0,5,joli appartement bordure mer min avoir pied pl...,famille deux enfant an habit pendant plusieurs...,index tr bel appartement magnifique voir appar...,0
1,189,hostel maison ville location possible chame in...,maison typique trois chame individuel r servab...,,0
2,192,appartement tr chaleureux beaucoup luminosit b...,teresa novice domaine location esp r apparteme...,index appartement spacieux propre situ quelqu ...,0
3,301,chame double tout confort grand maison terrain...,couple fran ais install portugal o cr maison h...,index tout abord tr bel propri name dtyp object,0
4,329,situ pied panth quartier alfamer venir journer...,romain couple tomb sou charm colmar lisbonn e ...,index appartement tr bien situ quartier appart...,1


### join all Portuguese host_about/desc with Portuguese comments'

In [17]:
merged_df_pt = create_df(df_train_detected, df_train_reviews_detected, "pt")

In [18]:
merged_df_pt

Unnamed: 0,index,description,host_about,comments,unlisted
0,13,O proprietário recebe os hospedes pessoalmente...,"Faço questão de receber os hospedes, para que ...",,0
1,34,Simplifique neste espaço tranquilo e de locali...,"Ola sou Carla, tenho prazer em recebê-los em m...",,1
2,64,"Apartamento composto por uma suite, sala de es...",Apaixonada por Portugal!,,1
3,67,O apartamento está decorado com extremo bom go...,Sou uma pessoa otimista que vive a vida com a ...,,0
4,87,"Óptimo espaço exterior, ideal para quem gosta ...","O meu nome é Cátia, nasci e cresci em Sintra, ...",,1
...,...,...,...,...,...
1084,12457,Casa aconchegante familiar <br />Com ótimas co...,Boa pessoa,"index\n12457 Excelente localização, boas ár...",0
1085,12461,Bem vindo a Lisboa!<br />Este charmoso apartam...,Tenho 38 anos e sou Assistente social de forma...,index\n12461 Excelente apartamento. Um agra...,0
1086,12470,"Apartamento para 6 pessoas com 2 quartos, sala...","Chamo-me Margarida. Gosto do mar, praia, do so...",index\n12470 quem quiser passar uns dias nu...,0
1087,12476,"Apartamento numa zona muito calma de Lisboa, m...","O apartamento é muito, muito simples, mas tem ...",,0


## Preprocessing Portuguese

In [20]:
#!python -m spacy download pt_core_news_sm

zsh:1: command not found: python


In [19]:
def preprocessing_pt(row, tokenize, stop, lemmatize, stemmertize):
    updates = []
    
    for j in tqdm(row):
        
        text = j
        
        #LOWERCASE TEXT
        text = str(text).lower()
        
        #REMOVE NUMERICAL DATA and PUNCTUATION
        text = re.sub("[^a-zA-Z]"," ", text )
        text = re.sub("br", "", text)

        if tokenize:
            tokens = word_tokenize(text, language="portuguese")
            text = " ".join(tokens)
            
        #REMOVE STOPWORDS
        if stop:
            stop_pt = set(stopwords.words('portuguese'))
            text = " ".join([word for word in text.split() if word not in stop_pt])
            
        #Lemmatize
        if lemmatize:
            lemma_pt = spacy.load("pt_core_news_sm")
            doc = lemma_pt(text)
            for word in doc:
                text = " ".join(word.lemma_).split()
        
        #Stemming
        if stemmertize:
            stemmer_pt = RSLPStemmer()
            stem_doc = stemmer_pt(text)
            for word in stem_doc:
                text = " ".join(stemmer_pt.stem(word) for word in text.split())
            
        updates.append(text)
        
    return updates

In [20]:
download('wordnet')
download('stopwords')
columns_to_apply = ['description', 'host_about', 'comments']
merged_df_pt[columns_to_apply] = merged_df_pt[columns_to_apply].astype(str).apply(lambda row: preprocessing_pt(row=row,
                                                                                                             tokenize=True,
                                                                                                             stop=True,
                                                                                                             lemmatize = False, 
                                                                                                             stemmertize = False
                                                                                                            )
                                                                                )

[nltk_data] Downloading package wordnet to /Users/huberto/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/huberto/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
100%|██████████| 1089/1089 [00:00<00:00, 2774.37it/s]
100%|██████████| 1089/1089 [00:00<00:00, 5286.69it/s]
100%|██████████| 1089/1089 [00:00<00:00, 5416.60it/s]


In [38]:
merged_df_pt

Unnamed: 0,index,description,host_about,comments,unlisted
0,13,propriet rio recebe hospedes pessoalmente hosp...,fa quest receber hospedes sintam casa,,0
1,34,simplifique neste espa tranquilo localiza cent...,ola carla prazer receb los alojamento bem vind...,,1
2,64,apartamento composto suite sala tv kitchenette...,apaixonada portugal,,1
3,67,apartamento est decorado extremo bom gosto com...,pessoa otimista vive vida alegria entusiasmo l...,,0
4,87,ptimo espa exterior ideal gosta desfrutar ar l...,nome c tia nasci cresci sintra vila cora apaix...,,1
...,...,...,...,...,...
1084,12457,casa aconchegante familiar timas condi es pert...,boa pessoa,index excelente localiza boas reas excelente i...,0
1085,12461,bem vindo lisboa charmoso apartamento fica bai...,anos assistente social forma h anos decidi ded...,index excelente apartamento agradecimento espe...,0
1086,12470,apartamento pessoas quartos sala terra mesa ca...,chamo margarida gosto mar praia sol portugal n...,index quiser passar uns dias sitio clamo casa ...,0
1087,12476,apartamento zona calma lisboa pr ximo centro c...,apartamento simples charme sala sof s wi fi gr...,,0


## TF - IDF for EN / FR / PT

In [39]:
# Concatenate text columns into a single column

#def tf_idf_RandomForest():
merged_df_pt['Concatenated_Text'] = merged_df_pt['description'] + ' ' + merged_df_pt['host_about'] + ' ' + merged_df_pt['comments']

#separate features and taget
X = merged_df_pt['Concatenated_Text']
y = merged_df_pt['unlisted']

X_train, X_val , y_train, y_val = train_test_split(X, y,test_size=0.2,random_state=42)

In [40]:
#create and fit the TF-IDF vectorizer
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train.values.ravel())

X_test_tfidf = vectorizer.transform(X_val.values.ravel())

In [41]:
rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_train_tfidf, y_train)

# Make predictions on the test set
y_pred_rf = rf_classifier.predict(X_test_tfidf)

# Evaluate the model
accuracy_rf = rf_classifier.score(X_test_tfidf, y_val)
print("Accuracy:", accuracy_rf)

Accuracy: 0.8211009174311926


## Train Classifier

In [69]:
modelknn_word = KNeighborsClassifier(n_neighbors = 10, metric = 'cosine', weights = 'distance')

In [70]:
modelknn_word.fit(X_train_tfidf, y_train)

In [71]:
y_pred_knn = modelknn_word.predict(X_test_tfidf)

In [72]:
accuracy_knn = modelknn_word.score(X_test_tfidf, y_val)
print("Accuracy:", accuracy_knn)

Accuracy: 0.8488726386349786


In [None]:
def prediction(row):
    lang = detect(row)
    if lang == "en":
        preprocessing_eng(row)
        return model_eng.predict(row)
    elif lang == "fr":
        preprocessing_fr(row)
        return model_fr.predict(row)
    elif lang == "pt":
        preprocessing_pt(row)
        return model_pt.predict(row)
    else:
        translate_to_english
        preprocessing_eng(row)
        return model_eng.predict(row)


df_test_["prediction"] = df.apply(lambda row: prediction(row))