## Imports 

In [97]:
import re
import spacy
import numpy as np
import pandas as pd
from tqdm import tqdm
from langdetect import detect
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from nltk import download, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, RSLPStemmer
from nltk.stem.wordnet import WordNetLemmatizer

## Data Import

In [10]:
airbnb_df_train = pd.read_excel('./1. data_raw/train.xlsx')
airbnb_df_test = pd.read_excel('./1. data_raw/test.xlsx')
airbnb_df_train_reviews = pd.read_excel('./1. data_raw/train_reviews.xlsx')
airbnb_df_test_reviews = pd.read_excel('./1. data_raw/test_reviews.xlsx')
pd.options.display.max_colwidth = 50

## First Data Exploration

In [11]:
airbnb_df_train.shape

(12496, 4)

In [12]:
airbnb_df_train_reviews.shape

(721402, 2)

## Language Detection

In [13]:
def detect_language(text):
    try:
        return detect(text)
    except:
        return 'Unknown'

### Language detection for reviews

In [14]:
airbnb_df_train_reviews["lang_comments"] = airbnb_df_train_reviews["comments"].apply(detect_language)

In [15]:
airbnb_df_train_reviews

Unnamed: 0,index,comments,lang_comments
0,1,this is a very cozy and comfortable house to s...,en
1,1,good<br/>,cy
2,1,"My first hostel experience, and all I have to ...",en
3,1,Das Hostel war neu und deshalb funktionierte a...,de
4,1,"It was fine for a dorm, but I think for the pe...",en
...,...,...,...
721397,12494,"We had a good time, the apartment has a great ...",en
721398,12494,Great apartment in very central location. The ...,en
721399,12494,"We are Airbnb Super Hosts too, so trust me, Li...",en
721400,12494,We had a lovely stay at this apartment. Sofia ...,en


### Language detection for desc/host_about

In [16]:
airbnb_df_train["lang_desc"] = airbnb_df_train["description"].apply(detect_language)

In [17]:
airbnb_df_train["lang_host"] = airbnb_df_train["host_about"].apply(detect_language)

In [18]:
airbnb_df_train

Unnamed: 0,index,description,host_about,unlisted,lang_desc,lang_host
0,1,"This is a shared mixed room in our hostel, wit...",Alojamento Local Registro: 20835/AL,0,en,pt
1,2,"O meu espaço fica perto de Parque Eduardo VII,...","I am friendly host, and I will try to always b...",1,pt,en
2,3,Trafaria’s House is a cozy and familiar villa ...,"I am a social person liking to communicate, re...",1,en,en
3,4,"Apartamento Charmoso no Chiado, Entre o Largo ...",Hello!_x000D_\nI m Portuguese and i love to me...,0,pt,en
4,5,Joli appartement en bordure de mer.<br /> 2 m...,Nous sommes une famille avec deux enfants de 1...,0,fr,fr
...,...,...,...,...,...,...
12491,12492,CAT’S BY BAIRRO ALTO<br /><br />This cozy apar...,Travelling is one of my favorite hobbies. I've...,0,en,en
12492,12493,Beautifully located in the heart of Lisbon's h...,Founded by travel enthusiasts (just like you) ...,0,en,en
12493,12494,Enjoy breakfast in the sleek kitchen with its ...,I´m from Portugal and I love to dance and to t...,0,en,en
12494,12495,A Terra da Eira é uma casa de campo rodeada de...,Somos uma familia de 5. Gostamos de viajar e d...,1,pt,pt


## Import detected data

In [154]:
df_train_detected = pd.read_csv("./2. data_detected/airbnb_df_train_detected.csv", index_col="index",).drop("Unnamed: 0",axis=1)
df_train_reviews_detected = pd.read_csv("./2. data_detected/airbnb_df_train__reviews_detected.csv", index_col="index").drop("Unnamed: 0",axis=1)

### Count number of different Languages

In [155]:
#airbnb_df_train["lang_desc"].value_counts()

In [156]:
#airbnb_df_train["lang_host"].value_counts()

In [157]:
#airbnb_df_train_reviews["lang_comments"].value_counts()

In [158]:
def create_df(df, df_review, language):

    columns_to_drop = ['lang_desc', 'lang_host']
    
    df = df[(df['lang_desc'] == language) & (df['lang_host'] == language)]
    df_review = df_review[df_review['lang_comments'] == language]
    grouped_reviews = df_review.groupby('index')['comments'].apply(lambda x: ''.join(str(x))).reset_index()
    merged_df = pd.merge(df, grouped_reviews, on='index', how='left')
    merged_df = merged_df.drop(columns=columns_to_drop)
    merged_df = merged_df[["index",	"description", "host_about", "comments", "unlisted"]]

    return merged_df

### join all English host_about/desc with English comments

In [159]:
merged_df_english = create_df(df_train_detected, df_train_reviews_detected, "en")

In [168]:
merged_df_english

Unnamed: 0,index,description,host_about,comments,unlisted
0,3,trafaria house cozy familiar villa facility ne...,social person liking communicate reading trave...,,1
1,6,important response covid property extended cle...,hi homing company develops activity tourism pr...,,0
2,7,home rent traveling perfect vacation without c...,globe trotter portuguese nationality german fa...,,1
3,8,find tranquility meticulously curated lifestyl...,travel lot love x originally israel currently ...,index shani helpful throughout process accommo...,0
4,9,charming apartment one bedroom double bed doub...,isabel helder portuguese parent three wonderfu...,index great little space lovely host clean w c...,0
...,...,...,...,...,...
8201,12489,feel home wherever choose live blueground love...,blueground global proptech company several tho...,,1
8202,12492,cat bairro alto cozy apartment lisbon city cen...,travelling one favorite hobby already visited ...,index happy find place com great stay lisbon a...,0
8203,12493,beautifully located heart lisbon historic cent...,founded travel enthusiast like bnbird want con...,index nice place nice location easy communicat...,0
8204,12494,enjoy eakfast sleek kitchen freestanding knott...,portugal love dance travel x pleasure welcome ...,index lovely stay lisbon apartm liliana apartm...,0


### Seems like NaN values is a good indication if a flat is getting listed again or not. Makes sense in real life context, because flats that dont have visitors are more likly to be unlisted in the future.

## Preprocessing English

In [161]:
def preprocessing_eng(row, tokenize, stop, lemmatize, stemmertize):
    updates = []
    
    for j in tqdm(row):
        
        text = j
        
        #LOWERCASE TEXT
        text = str(text).lower()
        
        #REMOVE NUMERICAL DATA and PUNCTUATION
        text = re.sub("[^a-zA-Z]"," ", text )
        text = re.sub("br", "", text)

        if tokenize:
            tokens = word_tokenize(text)
            text = " ".join(tokens)
        
        #REMOVE STOPWORDS
        if stop:
            stop_eng = set(stopwords.words('english'))
            text = " ".join([word for word in text.split() if word not in stop_eng])
        
        #Lemmatize
        if lemmatize:
            lemma_eng = WordNetLemmatizer()
            text = " ".join(lemma_eng.lemmatize(word) for word in text.split())
        
        #Stemming
        if stemmertize:
            stemmer_eng = SnowballStemmer('english')
            text = " ".join(stemmer_eng.stem(word) for word in text.split())
            
        updates.append(text)
        
    return updates

In [163]:
download('wordnet')
download('stopwords')
download('punkt')
columns_to_apply = ['description', 'host_about', 'comments']
merged_df_english[columns_to_apply] = merged_df_english[columns_to_apply].astype(str).apply(lambda row: preprocessing_eng(row=row,
                                                                                                                        tokenize=True,
                                                                                                                        stop=True,
                                                                                                                        lemmatize = True, 
                                                                                                                        stemmertize = False
                                                                                                                        )
                                                                                            )

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\leoal\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\leoal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\leoal\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
100%|██████████| 8206/8206 [00:05<00:00, 1459.60it/s]
100%|██████████| 8206/8206 [00:03<00:00, 2143.77it/s]
100%|██████████| 8206/8206 [00:04<00:00, 1779.09it/s]


In [164]:
merged_df_english

Unnamed: 0,index,description,host_about,comments,unlisted
0,3,trafaria house cozy familiar villa facility ne...,social person liking communicate reading trave...,,1
1,6,important response covid property extended cle...,hi homing company develops activity tourism pr...,,0
2,7,home rent traveling perfect vacation without c...,globe trotter portuguese nationality german fa...,,1
3,8,find tranquility meticulously curated lifestyl...,travel lot love x originally israel currently ...,index shani helpful throughout process accommo...,0
4,9,charming apartment one bedroom double bed doub...,isabel helder portuguese parent three wonderfu...,index great little space lovely host clean w c...,0
...,...,...,...,...,...
8201,12489,feel home wherever choose live blueground love...,blueground global proptech company several tho...,,1
8202,12492,cat bairro alto cozy apartment lisbon city cen...,travelling one favorite hobby already visited ...,index happy find place com great stay lisbon a...,0
8203,12493,beautifully located heart lisbon historic cent...,founded travel enthusiast like bnbird want con...,index nice place nice location easy communicat...,0
8204,12494,enjoy eakfast sleek kitchen freestanding knott...,portugal love dance travel x pleasure welcome ...,index lovely stay lisbon apartm liliana apartm...,0


### join all French host_about/desc with French comments

In [165]:
merged_df_french = create_df(df_train_detected, df_train_reviews_detected, "fr")

In [166]:
merged_df_french

Unnamed: 0,index,description,host_about,comments,unlisted
0,5,Joli appartement en bordure de mer.<br /> 2 m...,Nous sommes une famille avec deux enfants de 1...,index\n5 Un très bel appartement avec une m...,0
1,189,"Hostel dans maison de ville , location possibl...",Maison typique avec trois chambres individuell...,,0
2,192,"Appartement très chaleureux, avec beaucoup de ...","Je suis teresa, novice dans le domaines de la ...","index\n192 Appartement spacieux et propre, ...",0
3,301,Chambre double tout confort dans grande maison...,Couple de français installé au Portugal où nou...,index\n301 Tout d’abord une très belle prop...,0
4,329,Situé au pied du Panthéon dans le quartier de ...,Romain et moi sommes un couple qui sommes tomb...,index\n329 Appartement très bien situé dans...,1
...,...,...,...,...,...
100,11799,Appartement de 2 pièces entièrement rénové se ...,"Bonjour, je m’appelle Leo, un français d’origi...",index\n11799 Accueil très chaleureux de Ped...,0
101,11878,Chambre sexy in-love. Chambre avec de nombreux...,"Avec mon mari Dominique, nous vous proposons à...",,0
102,11997,Ce logement affiche un style résolument unique...,j'aime les paisagem la montagne et tout qui c'...,,1
103,12210,Jolie maison typique portugaise et entièrement...,Christophe & Lila marié . Trois enfants,index\n12210 Nous avons passé un très bon d...,0


## Preprocessing French

In [169]:
#!python -m spacy download fr_core_news_md

In [170]:
def preprocessing_fr(row, tokenize, stop, lemmatize, stemmertize):
    updates = []
    
    for j in tqdm(row):
        
        text = j
        
        #LOWERCASE TEXT
        text = str(text).lower()
        
        #REMOVE NUMERICAL DATA and PUNCTUATION
        text = re.sub("[^a-zA-Z]"," ", text )
        text = re.sub("br", "", text)

        if tokenize:
            tokens = word_tokenize(text, language="french")
            text = " ".join(tokens)
            
        #REMOVE STOPWORDS
        if stop:
            stop_fr = set(stopwords.words('french'))
            text = " ".join([word for word in text.split() if word not in stop_fr])
        
        print(text)

        #Lemmatize
        if lemmatize:
            lemma_fr = spacy.load("fr_core_news_md")
            doc = lemma_fr(text)
            for word in doc:
                text = " ".join(word.lemma_).split()
        
        #Stemming
        if stemmertize:
            stemmer_fr = SnowballStemmer('french')
            stem_doc = stemmer_fr(text)
            for word in stem_doc:
                text = " ".join(stemmer_fr.stem(word) for word in text.split())
            
        updates.append(text)
        
    return updates

In [171]:
download('wordnet')
download('stopwords')
columns_to_apply = ['description', 'host_about', 'comments']
merged_df_french[columns_to_apply] = merged_df_french[columns_to_apply].astype(str).apply(lambda x: preprocessing_fr(row=x,
                                                                                                                     tokenize=True,
                                                                                                                     stop=True,
                                                                                                                     lemmatize = True, 
                                                                                                                     stemmertize = False
                                                                                                                    )
                                                                                         )

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\leoal\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\leoal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  0%|          | 0/105 [00:00<?, ?it/s]

joli appartement bordure mer min a pied plage vue magnifique oc an e tage petit balcon chames lit double deux lits separ salle bains douche italienne cuisine enti rement quip e salon canap lit parking gratuit place mini golf restaurants caf supermarch a disposition bas immeuble a km lisbonne proche zones touristiques parc a th dinopark obidos budhha eden jardin oriental ile berlengas etc b the space b logement totalement r nov vue magnifique oc an b license number b al


  1%|          | 1/105 [00:01<02:27,  1.42s/it]

hostel maison ville location possible chame individuellement a jardinet jaccuzzi relaxer retours commun tous r sidents salle bain baignoire lave linge salle eau douche salon tv wifi gratuit coin tente flipper cuisine tte quip e salle manger verri re vue jardinet jaccuzzi proche lisbonne centre sintra plages casinos estoril cascais b the space b tranquillit endroit malgr ville coin calme proches toutes commodit at acc facile rendre diff rents endroits typiques visiter tel lisbonne sintra cascais estoril mafra cabo da roca mafra tant autres agr able entre amis famille quipements b guest access b tous endroits puisque maison individuelle uniquement chames priv b license number


  2%|▏         | 2/105 [00:02<02:18,  1.35s/it]

appartement tr chaleureux beaucoup luminosit b license number b al


  3%|▎         | 3/105 [00:04<02:17,  1.35s/it]

chame double tout confort grande maison terrain piscine jaccuzzi nomeuses activit situ e petit village quelques commerces bars ur sites touristiques km plages oc anes km lisbonne pied montejunto cette chame a salle bain wc priv wi fi gratuit disposez activit ext rieures int rieures ainsi certains espaces maison accueil salons bar salle manger b license number b


  4%|▍         | 4/105 [00:05<02:15,  1.34s/it]

situ pied panth quartier alfama venez journer cet appartement ambiance cosy r cemment r nov quartier typiquement portugais emplacement id al deux restaurants commerces magnifiques vues panoramiques permettra profiter jour proche tram couvrirez mani re agr able gastronomie portugaise fado magnifiques paysages ville collines calme quip accueillir personnes b the space b logement b fice emplacement parfait minute pied panth nacional monuments plus visit lisbonne situ immeuble typiquement portugais duira gr confort quipements modernes charme afin offrir jour inoubliable logement offre situation calme donnant rue pi tonne o pourrez galement profiter patio privatif appartement quip


  5%|▍         | 5/105 [00:06<02:17,  1.37s/it]

ambiance boheme cet appartement rez jardin pouvez relaxer apr avoir visiter lisbonne mardi samedi bas rue trouverez march puces ainsi lieux embl matiques visiter o miradouro eglise da graca o panteon national wifi fie perfect b the space b literie neuve qualit espace ext rieur jardin possibilit faire barbeuk calme rare lisbonne b guest access b jardin terrasse veranda b other things to note b appartement rdc escalier b license number b al


  6%|▌         | 6/105 [00:08<02:21,  1.43s/it]

situ rue pi tonne deux alfama cet appartement permettra vivre ur quartier typique lisbonne acc der pied nomeux lieux touristiques b the space b appartement situ cot quartier historique alfama imm diatement derri re panth ur plus connu march puces ville feira da ladra trouve rue calme paisible ferm e circulation rez chauss e immeuble compos appartements totalement meubl tout dont besoin profiter jour peut accueillir jusqu personnes chames coucher quip grand lit grands placards canap lit places salon dot sommier latte matelas confortable disposerez cuisine enti rement quip e r frig rateur cong lateur four cuisini re gaz feux hotte lave vaissel


  7%|▋         | 7/105 [00:09<02:21,  1.45s/it]

faites plein aventures travers routes incroyables portugal pleine for long littoral itin raire b license number b exempt


  8%|▊         | 8/105 [00:11<02:18,  1.43s/it]

logement proche miradouro santa catarina logement parfait couples voyageurs solo voyageurs affaires familles enfants charmant duplex enti rement quip b ficiant chauffage central dispose toutes commodit possibles b guest access b ensemble appartement r serv locataires b license number b


  9%|▊         | 9/105 [00:12<02:17,  1.44s/it]

apr avoir r nov cor mat riaux nobles anciens cet appartement famille lisbo devenu petite merveille ravie pouvoir partager joies procure lorsqu habite quel bonheur pouvoir enfin profiter apr an mois travaux terrasses prendre petit jeuner doux soleil matin tout admirant paysages typiques lisbonne toits tage christ fameux pont suspendu b the space b situ quartier historique donnant grande cour arbor e appartement pr serv uit agr able jour nuit terrasse intime roof top panoramique permettent choisir chaque heure jour soir endroit pr dilection tendre lire admirer toits prendre petit jeuner bain soleil ambiance si particuli re cette ville moindre b tisse ruine belle visiteur


 10%|▉         | 10/105 [00:14<02:21,  1.48s/it]

profitez logement enti rement refait neuf lumineux gant central coeur quartiers historiques lisbonne gra a castelo sao jorge alfama b the space b appartement lumineux disposant jour cuisine enti rement quip e coration soign e profiterez galement salle bains douche italienne belle chame disposant coin bureau appartement climatis b other things to note b check in font entre h h apr h check in possibles suppl ment payable arriv e b license number b al


 10%|█         | 11/105 [00:15<02:17,  1.47s/it]

tendez logement calme gant b license number b exempt


 11%|█▏        | 12/105 [00:17<02:12,  1.43s/it]

chame mme pompadour revivez faste versailles temps nuit deux glissez r favorite roi chame romantique nuit e charme deux chame tr calme belle salle bains grande douche mare parking place linge toilette produits toilette serviette piscine inclus b the space b petits jeuners inclus deux personnes servis petit salon parc acc piscine chauff e bain minuit autoris h h matin bar cocktails salon commun chemin e place enfants non accept animaux non autoris parking ferm quinta motos b license number b al


 12%|█▏        | 13/105 [00:18<02:11,  1.43s/it]

studio donnant piscine jardin entr e ind pendante equip lave linge r frig rateur machine caf bouilloire grille pain micro onde vaisselle petite salle eau douche lavabo toilette coin bureau chaise parking quinta petits jeuners compris prix linge produits toilette serviette piscine b the space b acc piscine chauff e nage contre courant terrasse transats coin repos parc logement id al auteurs crivains tudiant etc calme reposant wifi bbq churasquero accessible clients cuisine autoris e studio seul r chauffement plats autoris petits jeuners servis petit salon biblioth salle commune chemin e partager autres clients b license number b al


 13%|█▎        | 14/105 [00:19<02:07,  1.40s/it]

logement paisible offre jour tente toute famille cette maison famille nich e petit village typique proche plage mn voiture proximit sites touristiques peniche obidos mn bombaral mn nazar mn lisbonne mn village petite sup rette disponible courses h maguy jacques pr sents conseiller visites b license number b


 14%|█▍        | 15/105 [00:21<02:03,  1.38s/it]

logement tr confortable fonctionnel a petit balcon vue remplie charme lisbonne autre poque situ eme tage immeuble tages logement contient deux chames dont vue jardin cuisine tr moderne toute quip acc terrasse escaliers arri re immeuble pourrais couvrir belle vue taje b the space b tr bel appartement confortable chaleureux plein centre historique belle vue raconte presque but histoire architecture contrast hauts plafonds logement donnent fra cheur naturel impression espace jardin terrasse priv partager entre deux logements accessible escaliers marquise logement tres confortable vue jardim taje b guest access b voyageur acc toutes pi b other things to note


 15%|█▌        | 16/105 [00:22<02:01,  1.37s/it]

chames glamour sexy jours couples forfait sp cial copains quinta r serv e pr sence propri taires place enfants non accept petits jeuners compris prix servis petit salon parc note chame suite harem compos e deux chames communiquent entre elles moucharabieh salle bain partag e deux chames b the space b grand parc deux hectares jardins fontaine cascade aucun vis vis piscine chauff e nage contre courant transats lits piscine salon ext rieur espace terrasse ave table chaises bain minuit autoris bbq disposition espace ext rieur si besoin cuisine chames espace service frigo machine caf micro ondes etc b guest access b a deux lisbonne campagne quinta tant peu isol e recommando


 16%|█▌        | 17/105 [00:23<01:59,  1.36s/it]

appartement mouraria palacio libelula refuge exceptionnel ur lisbonne b ficie vue couper souffle permet profiter chaque jour coucher soleil vue cristo rei tage pont avril colline ch teau ascenseur couvent santa justa a arri re appartement cour priv e appartement situ tage petit immeuble typique fa ade azulejos portugais b license number b al


 17%|█▋        | 18/105 [00:25<01:56,  1.34s/it]

venez entrez cet endroit plein charme enti rement restaur alliant bois vieilles pierres azulejos aussi tout confort moderne plein coeur centre historique alfama calme proche toutes commodit restaurants typiques proposant soir fado id al payser passer jourromantique b the space b appartement ancien enti rement r nov typique alfama bien situ spacieux meubl gout amateurs charme romantisme b guest access b logement tres calme frais enti rement di jolie chame salon salle manger cuisine ind pendante enti rement quip e appartement comporte double vitrage manquez vers entr e immeuble magnifique oranger borde escalier menant miradouro tramway ch teau saint georges place commerce minutes pieds splendide vue rio tejo


 18%|█▊        | 19/105 [00:26<02:04,  1.45s/it]

must terrasse panoramique appartement charme chames wifi quipement moderne superbe terrasse toit vue lisbonne bordure quartier historique gra a transports mn pied tram tro anjos couvrez lisbonne pied depuis lieu jour quartier authentique typique lisbo b the space b appartement charme vue panoramique splendide bordure quartier gra a spacieux terrasse lumineux traversant exposition ouest appartement contemporain pouvant accueillir entre personnes chames lit autre possibilit dormir canap salon panorama ville verdure id al couple groupe amis famille enfants cuisine totalement quip e moderne prenez petit jeuner profitant vue depuis salle manger superbe terrasse a tres tram


 18%|█▊        | 19/105 [00:27<02:03,  1.44s/it]


KeyboardInterrupt: 

In [146]:
merged_df_french.head()

Unnamed: 0,index,description,host_about,comments,unlisted
0,5,"[a, v, o, i, r]",[e],[e],0
1,189,[r],[o],"[a, v, o, i, r]",0
2,192,"[a, v, o, i, r]",[x],[e],0
3,301,[b],[e],[e],0
4,329,[p],[r],[e],1


### join all Portuguese host_about/desc with Portuguese comments'

In [147]:
merged_df_pt = create_df(df_train_detected, df_train_reviews_detected, "pt")

In [148]:
merged_df_pt

Unnamed: 0,index,description,host_about,comments,unlisted
0,13,O proprietário recebe os hospedes pessoalmente...,"Faço questão de receber os hospedes, para que ...",,0
1,34,Simplifique neste espaço tranquilo e de locali...,"Ola sou Carla, tenho prazer em recebê-los em m...",,1
2,64,"Apartamento composto por uma suite, sala de es...",Apaixonada por Portugal!,,1
3,67,O apartamento está decorado com extremo bom go...,Sou uma pessoa otimista que vive a vida com a ...,,0
4,87,"Óptimo espaço exterior, ideal para quem gosta ...","O meu nome é Cátia, nasci e cresci em Sintra, ...",,1
...,...,...,...,...,...
1084,12457,Casa aconchegante familiar <br />Com ótimas co...,Boa pessoa,"index\n12457 Excelente localização, boas ár...",0
1085,12461,Bem vindo a Lisboa!<br />Este charmoso apartam...,Tenho 38 anos e sou Assistente social de forma...,index\n12461 Excelente apartamento. Um agra...,0
1086,12470,"Apartamento para 6 pessoas com 2 quartos, sala...","Chamo-me Margarida. Gosto do mar, praia, do so...",index\n12470 quem quiser passar uns dias nu...,0
1087,12476,"Apartamento numa zona muito calma de Lisboa, m...","O apartamento é muito, muito simples, mas tem ...",,0


## Preprocessing Portuguese

In [151]:
!python -m spacy download pt_core_news_sm

Collecting pt-core-news-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/pt_core_news_sm-3.5.0/pt_core_news_sm-3.5.0-py3-none-any.whl (13.0 MB)
                                              0.0/13.0 MB ? eta -:--:--
     -                                        0.4/13.0 MB 13.9 MB/s eta 0:00:01
     --                                       0.9/13.0 MB 11.3 MB/s eta 0:00:02
     ----                                     1.4/13.0 MB 10.9 MB/s eta 0:00:02
     ------                                   2.1/13.0 MB 12.1 MB/s eta 0:00:01
     --------                                 2.8/13.0 MB 12.7 MB/s eta 0:00:01
     ----------                               3.4/13.0 MB 12.9 MB/s eta 0:00:01
     ------------                             4.1/13.0 MB 13.0 MB/s eta 0:00:01
     --------------                           4.8/13.0 MB 13.3 MB/s eta 0:00:01
     ----------------                         5.4/13.0 MB 13.4 MB/s eta 0:00:01
     -------------------     

In [152]:
def preprocessing_pt(row, tokenize, stop, lemmatize, stemmertize):
    updates = []
    
    for j in tqdm(row):
        
        text = j
        
        #LOWERCASE TEXT
        text = str(text).lower()
        
        #REMOVE NUMERICAL DATA and PUNCTUATION
        text = re.sub("[^a-zA-Z]"," ", text )
        text = re.sub("br", "", text)

        if tokenize:
            tokens = word_tokenize(text, language="portuguese")
            text = " ".join(tokens)
            
        #REMOVE STOPWORDS
        if stop:
            stop_pt = set(stopwords.words('portuguese'))
            text = " ".join([word for word in text.split() if word not in stop_pt])
            
        #Lemmatize
        if lemmatize:
            lemma_pt = spacy.load("pt_core_news_sm")
            doc = lemma_pt(text)
            for word in doc:
                text = " ".join(word.lemma_).split()
        
        #Stemming
        if stemmertize:
            stemmer_pt = RSLPStemmer()
            stem_doc = stemmer_pt(text)
            for word in stem_doc:
                text = " ".join(stemmer_pt.stem(word) for word in text.split())
            
        updates.append(text)
        
    return updates

In [153]:
download('wordnet')
download('stopwords')
columns_to_apply = ['description', 'host_about', 'comments']
merged_df_pt[columns_to_apply] = merged_df_pt[columns_to_apply].astype(str).apply(lambda row: preprocessing_pt(row=row,
                                                                                                             tokenize=True,
                                                                                                             stop=True,
                                                                                                             lemmatize = True, 
                                                                                                             stemmertize = False
                                                                                                            )
                                                                                )

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\leoal\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\leoal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
 19%|█▊        | 203/1089 [00:52<03:47,  3.90it/s]


KeyboardInterrupt: 

## TF - IDF

In [58]:
# Concatenate text columns into a single column
merged_df_english['Concatenated_Text'] = merged_df_english['description'] + ' ' + merged_df_english['host_about'] + ' ' + merged_df_english['comments']

#separate features and taget
X = merged_df_english['Concatenated_Text']
y = merged_df_english['unlisted']

X_train, X_val , y_train, y_val = train_test_split(X, y,test_size=0.2,random_state=42)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df_english['Concatenated_Text'] = merged_df_english['description'] + ' ' + merged_df_english['host_about'] + ' ' + merged_df_english['comments']


In [60]:
#create and fit the TF-IDF vectorizer
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train.values.ravel())

X_test_tfidf = vectorizer.transform(X_val.values.ravel())

In [68]:
rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_train_tfidf, y_train)

# Make predictions on the test set
y_pred_rf = rf_classifier.predict(X_test_tfidf)

# Evaluate the model
accuracy_rf = rf_classifier.score(X_test_tfidf, y_val)
print("Accuracy:", accuracy_rf)

Accuracy: 0.9037172455819622


## Train Classifier

In [69]:
modelknn_word = KNeighborsClassifier(n_neighbors = 10, metric = 'cosine', weights = 'distance')

In [70]:
modelknn_word.fit(X_train_tfidf, y_train)

In [71]:
y_pred_knn = modelknn_word.predict(X_test_tfidf)

In [72]:
accuracy_knn = modelknn_word.score(X_test_tfidf, y_val)
print("Accuracy:", accuracy_knn)

Accuracy: 0.8488726386349786


In [None]:
def prediction(row):
    lang = detect(row)
    if lang == "en":
        preprocessing_eng(row)
        return model_eng.predict(row)
    elif lang == "fr":
        preprocessing_fr(row)
        return model_fr.predict(row)
    elif lang == "pt":
        preprocessing_pt(row)
        return model_pt.predict(row)
    else:
        translate_to_english
        preprocessing_eng(row)
        return model_eng.predict(row)


df_test_["prediction"] = df.apply(lambda row: prediction(row))