*Project 1 - Machine Learning for NLP*

*Michele Pulvirenti, Marco Riva*

# Project 1 - TripAdvisor reccomandation system

## 1. Data preparation

### Download dataset

In [61]:
import kagglehub
import zipfile
import os

dataset_filename = 'reviews.csv'
cache_dataset_zipped = kagglehub.dataset_download("joebeachcapital/hotel-reviews", path=dataset_filename)
# the file is actually downloaded as zipped, even if there is not '.zip' extensions
# so we need to extract it
extract_path = './'
# Check if the file already exists
if not os.path.exists(extract_path + dataset_filename):
    with zipfile.ZipFile(cache_dataset_zipped, 'r') as zip_ref:
        # Iterate through each file in the zip file
        for file in zip_ref.namelist():
            # Check if the file already exists
            if not os.path.exists(os.path.join(extract_path, file)):
                # Extract the file
                zip_ref.extract(file, extract_path)
        
dataset_path = extract_path + dataset_filename

### Import dataset

In [62]:
import pandas as pd

df = pd.read_csv(dataset_path)
df.head()

Unnamed: 0,ratings,title,text,author,date_stayed,offering_id,num_helpful_votes,date,id,via_mobile
0,"{'service': 5.0, 'cleanliness': 5.0, 'overall'...","“Truly is ""Jewel of the Upper Wets Side""”",Stayed in a king suite for 11 nights and yes i...,"{'username': 'Papa_Panda', 'num_cities': 22, '...",December 2012,93338,0,2012-12-17,147643103,False
1,"{'service': 5.0, 'cleanliness': 5.0, 'overall'...",“My home away from home!”,"On every visit to NYC, the Hotel Beacon is the...","{'username': 'Maureen V', 'num_reviews': 2, 'n...",December 2012,93338,0,2012-12-17,147639004,False
2,"{'service': 4.0, 'cleanliness': 5.0, 'overall'...",“Great Stay”,This is a great property in Midtown. We two di...,"{'username': 'vuguru', 'num_cities': 12, 'num_...",December 2012,1762573,0,2012-12-18,147697954,False
3,"{'service': 5.0, 'cleanliness': 5.0, 'overall'...",“Modern Convenience”,The Andaz is a nice hotel in a central locatio...,"{'username': 'Hotel-Designer', 'num_cities': 5...",August 2012,1762573,0,2012-12-17,147625723,False
4,"{'service': 4.0, 'cleanliness': 5.0, 'overall'...",“Its the best of the Andaz Brand in the US....”,I have stayed at each of the US Andaz properti...,"{'username': 'JamesE339', 'num_cities': 34, 'n...",December 2012,1762573,0,2012-12-17,147612823,False


## 2. Data formatting

### Flatten the json 'ratings' column

In [63]:
from pandas import json_normalize
# convert JSON column to DataFrame
json_df = json_normalize(df.ratings.apply(eval))

# merge the two dataframes
df = df.drop(columns=['ratings']).join(json_df)
df

Unnamed: 0,title,text,author,date_stayed,offering_id,num_helpful_votes,date,id,via_mobile,service,cleanliness,overall,value,location,sleep_quality,rooms,check_in_front_desk,business_service_(e_g_internet_access)
0,"“Truly is ""Jewel of the Upper Wets Side""”",Stayed in a king suite for 11 nights and yes i...,"{'username': 'Papa_Panda', 'num_cities': 22, '...",December 2012,93338,0,2012-12-17,147643103,False,5.0,5.0,5.0,5.0,5.0,5.0,5.0,,
1,“My home away from home!”,"On every visit to NYC, the Hotel Beacon is the...","{'username': 'Maureen V', 'num_reviews': 2, 'n...",December 2012,93338,0,2012-12-17,147639004,False,5.0,5.0,5.0,5.0,5.0,5.0,5.0,,
2,“Great Stay”,This is a great property in Midtown. We two di...,"{'username': 'vuguru', 'num_cities': 12, 'num_...",December 2012,1762573,0,2012-12-18,147697954,False,4.0,5.0,4.0,4.0,5.0,4.0,4.0,,
3,“Modern Convenience”,The Andaz is a nice hotel in a central locatio...,"{'username': 'Hotel-Designer', 'num_cities': 5...",August 2012,1762573,0,2012-12-17,147625723,False,5.0,5.0,4.0,5.0,5.0,5.0,5.0,,
4,“Its the best of the Andaz Brand in the US....”,I have stayed at each of the US Andaz properti...,"{'username': 'JamesE339', 'num_cities': 34, 'n...",December 2012,1762573,0,2012-12-17,147612823,False,4.0,5.0,4.0,3.0,5.0,5.0,5.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
878556,“vil komme igen”,"Pænt hotel i et livligt miljø. Store værelser,...","{'username': '', 'id': '', 'location': ''}",,84093,0,2008-08-31,51441576,False,,,4.0,,,,,,
878557,“excellent”,"un excellent hotel 4 *, pour un prix raisonnab...","{'username': 'dan016', 'num_reviews': 2, 'num_...",July 2008,84093,0,2008-07-18,18003332,False,4.0,5.0,4.0,4.0,4.0,,5.0,5.0,
878558,“Un hotel eccezionale”,"L'hotel, situato in una zona tranquilla e a du...","{'username': '', 'id': '', 'location': ''}",July 2008,84093,0,2008-04-18,15564515,False,5.0,5.0,5.0,,,,5.0,,
878559,“Gerne wieder”,Sehr schön ausgestattetes Hotel in bester Lage...,"{'username': '', 'id': '', 'location': ''}",July 2008,84093,0,2008-04-01,15564508,False,5.0,5.0,5.0,,,,5.0,,


### Remove unused columns and rows

In [64]:
# unused columns
df.drop(columns=['title', 'author', 'date_stayed', 'num_helpful_votes', 'date', 'id', 'via_mobile'], inplace=True, errors='ignore')
# reviews that have null values on relevant rating criteria
relevant_ratings = ['service', 'cleanliness', 'overall', 'value', 'location', 'sleep_quality', 'rooms']
df.dropna(subset=relevant_ratings, inplace=True)
# reviews that have more rating criteria than the relevant ones
df = df[~df['check_in_front_desk'].notna()]
df = df[~df['business_service_(e_g_internet_access)'].notna()]
df.drop(columns=['check_in_front_desk', 'business_service_(e_g_internet_access)'], inplace=True, errors='ignore')
df

Unnamed: 0,text,offering_id,service,cleanliness,overall,value,location,sleep_quality,rooms
0,Stayed in a king suite for 11 nights and yes i...,93338,5.0,5.0,5.0,5.0,5.0,5.0,5.0
1,"On every visit to NYC, the Hotel Beacon is the...",93338,5.0,5.0,5.0,5.0,5.0,5.0,5.0
2,This is a great property in Midtown. We two di...,1762573,4.0,5.0,4.0,4.0,5.0,4.0,4.0
3,The Andaz is a nice hotel in a central locatio...,1762573,5.0,5.0,4.0,5.0,5.0,5.0,5.0
4,I have stayed at each of the US Andaz properti...,1762573,4.0,5.0,4.0,3.0,5.0,5.0,5.0
...,...,...,...,...,...,...,...,...,...
878548,"Das Hotel Palomar gehoert zur Kimpton Gruppe, ...",84093,4.0,5.0,4.0,3.0,4.0,4.0,4.0
878549,Anlässlich einer Geschäftsreise waren wir das ...,84093,5.0,5.0,5.0,5.0,5.0,5.0,5.0
878550,"Das Hotel liegt in Arlington, eine etwas gehob...",84093,5.0,5.0,5.0,4.0,5.0,5.0,5.0
878551,Mon mari et moi sommes restés à l'hôtel du jeu...,84093,5.0,5.0,5.0,5.0,5.0,5.0,5.0


## 3. Data processing
Language detection, tokenization, lowercasing, removing puntualization, speling correction, stopwords...

### Language detection

#### Inspect

In [65]:
# ensures no empty strings
print('empty reviews count:', df.text.apply(lambda x: x.strip() == '').sum())
df.isnull().sum()

empty reviews count: 0


text             0
offering_id      0
service          0
cleanliness      0
overall          0
value            0
location         0
sleep_quality    0
rooms            0
dtype: int64

In [66]:
# assess the reviews are mostly long, therefore helping in language detection
short_len_def = 70
short_reviews_percentage = round((len(df[df['text'].str.len() < short_len_def]) / len(df)) * 100, 2)
print(f'short (less than {short_len_def} character long) reviews: {short_reviews_percentage}%')

short (less than 70 character long) reviews: 0.31%


#### Detect

For detecting reviews in other languages than English we tried two libraries:
- `langdetect`
- `fast-langdetect`

We chose to use the second one because it gave the results 8x faster than the first.

As is written in the docs:

*" fast-langdetect provides ultra-fast and highly accurate language detection based on FastText, a library developed by Facebook. This package is 80x faster than traditional methods and offers 95% accuracy "*

Here is the version using `langdetect`

```python
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

# ensure consistent results
DetectorFactory.seed = 0

def detect_language(text):
    try:
        return detect(text)
    except LangDetectException:
        return 'unknown'
```

In [67]:
from fast_langdetect import detect

def detect_language(text):
    try:
        return detect(text.replace("\n", ""), low_memory=False)["lang"]
    except Exception:
        print(Exception.message)
        return 'unknown'

if 'language' not in df.columns:
    df['language'] = df['text'].apply(detect_language)

df

Unnamed: 0,text,offering_id,service,cleanliness,overall,value,location,sleep_quality,rooms,language
0,Stayed in a king suite for 11 nights and yes i...,93338,5.0,5.0,5.0,5.0,5.0,5.0,5.0,en
1,"On every visit to NYC, the Hotel Beacon is the...",93338,5.0,5.0,5.0,5.0,5.0,5.0,5.0,en
2,This is a great property in Midtown. We two di...,1762573,4.0,5.0,4.0,4.0,5.0,4.0,4.0,en
3,The Andaz is a nice hotel in a central locatio...,1762573,5.0,5.0,4.0,5.0,5.0,5.0,5.0,en
4,I have stayed at each of the US Andaz properti...,1762573,4.0,5.0,4.0,3.0,5.0,5.0,5.0,en
...,...,...,...,...,...,...,...,...,...,...
878548,"Das Hotel Palomar gehoert zur Kimpton Gruppe, ...",84093,4.0,5.0,4.0,3.0,4.0,4.0,4.0,de
878549,Anlässlich einer Geschäftsreise waren wir das ...,84093,5.0,5.0,5.0,5.0,5.0,5.0,5.0,de
878550,"Das Hotel liegt in Arlington, eine etwas gehob...",84093,5.0,5.0,5.0,4.0,5.0,5.0,5.0,de
878551,Mon mari et moi sommes restés à l'hôtel du jeu...,84093,5.0,5.0,5.0,5.0,5.0,5.0,5.0,fr


In [68]:
# Calculate the percentage of each language
language_counts = df['language'].value_counts(normalize=True) * 100
# language_counts.rename('percentage', inplace=True).to_dict()
language_counts.to_dict()

{'en': 90.37849829038674,
 'de': 2.4154589371980677,
 'it': 2.0749113109479413,
 'fr': 1.8663659947382412,
 'es': 1.6433829258678694,
 'ja': 0.6909495916178533,
 'pt': 0.5225091439100185,
 'nl': 0.11298114383668381,
 'zh': 0.11091860774230217,
 'sv': 0.08043890768088442,
 'da': 0.03391726021872049,
 'ru': 0.026812969226961473,
 'no': 0.02497960380973334,
 'tr': 0.007104290991759022,
 'pl': 0.004125072188763303,
 'el': 0.0022917067715351683,
 'ko': 0.0013750240629211011,
 'th': 0.0009166827086140674,
 'id': 0.0006875120314605506,
 'hr': 0.00022917067715351686,
 'ar': 0.00022917067715351686,
 'ca': 0.00022917067715351686,
 'hu': 0.00022917067715351686,
 'fa': 0.00022917067715351686,
 'nn': 0.00022917067715351686}

In [69]:
# visual quality check (second most common language)
second_common = list(language_counts.keys())[1]
df[df['language'] == second_common].head()

Unnamed: 0,text,offering_id,service,cleanliness,overall,value,location,sleep_quality,rooms,language
1066,Optimaler Aufenthalt für 3 Tage! Traumhaftes B...,239853,5.0,5.0,5.0,4.0,5.0,5.0,5.0,de
1107,Das Hotel liegt optimal nach Downtown (incl. Z...,239853,1.0,4.0,4.0,3.0,4.0,4.0,4.0,de
1110,Wir hatten nur eine Übernachtung in Houston bi...,239853,4.0,5.0,4.0,4.0,5.0,4.0,4.0,de
1151,Wir waren im März 2010 für ein paar Tage in Ho...,239853,4.0,5.0,4.0,4.0,5.0,5.0,5.0,de
1494,Es handelt sich um ein neues tolles Haus mit K...,1966350,4.0,4.0,4.0,4.0,5.0,5.0,5.0,de


#### Filter
Since the dataset is mostly in English, we can filter out the non-English reviews.

In [70]:
# keep only english reviews
df = df[df['language'] == 'en']
df.drop(columns=['language'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=['language'], inplace=True)


#### Different approach

In [71]:
# Approccio diverso per rilevarer la lingua, ma non è molto efficiente
# per parole
to_filter = ['bonne','de','le','les','une','des','du','ce','cet','cette','ces','mon','ma','mes','ton','ta','tes','sa','ses','notre','nos','votre','vos','leur','leurs','quel','quelle','quels','quelles','ceci','cela','celui','celle','ceux','celles']
pattern = r'\b(?:' + '|'.join(to_filter) + r')\b'
# per caratteri
to_filter = ['â','ä','ç','é','è','ê','ë','î','ï','ô','ö','ù','û','ü','ÿ']
pattern = '|'.join(to_filter)
# Filter the DataFrame
df_filtered = df[df['text'].str.contains(pattern, case=False, na=False, regex=True)]

# Display the resulting DataFrame
df_filtered

Unnamed: 0,text,offering_id,service,cleanliness,overall,value,location,sleep_quality,rooms
44,The information about the Sherry-Netherland is...,93559,2.0,3.0,2.0,1.0,5.0,3.0,3.0
77,My wife and I stayed in four hotels in our rec...,1456560,4.0,5.0,5.0,4.0,5.0,5.0,5.0
173,I had booked my stay at the Setai as a 25th we...,1776857,1.0,4.0,2.0,2.0,4.0,3.0,5.0
265,The Sherry-Netherland Hotel was a perfect spot...,93559,5.0,5.0,5.0,4.0,5.0,5.0,5.0
456,Just got back from a 3 night stay at the Beaco...,93338,3.0,5.0,5.0,4.0,4.0,4.0,5.0
...,...,...,...,...,...,...,...,...,...
873896,I recently spent a weekend with my girlfriend ...,258705,5.0,5.0,5.0,5.0,5.0,5.0,5.0
874168,"My fiancé and I, stayed at the Palomar for 3 n...",84093,1.0,5.0,3.0,5.0,5.0,5.0,5.0
877227,I just returned from a two-night stay at the M...,83040,5.0,5.0,4.0,3.0,5.0,3.0,4.0
877677,We liked everything about the Hotel Palomar: t...,84093,5.0,5.0,5.0,5.0,5.0,5.0,5.0


## 4. Tokenization

In [72]:
import nltk
nltk.download('punkt_tab')

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\miche\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


### By sentence

In [73]:
df['text'] = df['text'].apply(lambda x: tokenizer.tokenize(x))
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text'] = df['text'].apply(lambda x: tokenizer.tokenize(x))


Unnamed: 0,text,offering_id,service,cleanliness,overall,value,location,sleep_quality,rooms
0,[Stayed in a king suite for 11 nights and yes ...,93338,5.0,5.0,5.0,5.0,5.0,5.0,5.0
1,"[On every visit to NYC, the Hotel Beacon is th...",93338,5.0,5.0,5.0,5.0,5.0,5.0,5.0
2,"[This is a great property in Midtown., We two ...",1762573,4.0,5.0,4.0,4.0,5.0,4.0,4.0
3,[The Andaz is a nice hotel in a central locati...,1762573,5.0,5.0,4.0,5.0,5.0,5.0,5.0
4,[I have stayed at each of the US Andaz propert...,1762573,4.0,5.0,4.0,3.0,5.0,5.0,5.0


## 5. Removing punctuation

### By word
Ha senso fare prima per frasi e poi per parole?? Ce l'ha fatto fare nel TD

In [74]:
from nltk.tokenize import word_tokenize
df['text'] = df['text'].apply(lambda x: [nltk.word_tokenize(sentence) for sentence in x])
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text'] = df['text'].apply(lambda x: [nltk.word_tokenize(sentence) for sentence in x])


Unnamed: 0,text,offering_id,service,cleanliness,overall,value,location,sleep_quality,rooms
0,"[[Stayed, in, a, king, suite, for, 11, nights,...",93338,5.0,5.0,5.0,5.0,5.0,5.0,5.0
1,"[[On, every, visit, to, NYC, ,, the, Hotel, Be...",93338,5.0,5.0,5.0,5.0,5.0,5.0,5.0
2,"[[This, is, a, great, property, in, Midtown, ....",1762573,4.0,5.0,4.0,4.0,5.0,4.0,4.0
3,"[[The, Andaz, is, a, nice, hotel, in, a, centr...",1762573,5.0,5.0,4.0,5.0,5.0,5.0,5.0
4,"[[I, have, stayed, at, each, of, the, US, Anda...",1762573,4.0,5.0,4.0,3.0,5.0,5.0,5.0


## 6. Lowercasing

## 7. Spelling correction

## 8. Stopwords removal

## 9. Stemming

## 10. Concatenate reviews

In [76]:
df['text'] = df['text'].apply(lambda x: ' '.join([word for sentence in x for word in sentence]))

df = df.groupby('offering_id').agg({
    'text': ' '.join, 
    'service': 'mean',
    'cleanliness': 'mean',
    'overall': 'mean',
    'value': 'mean',
    'location': 'mean',
    'sleep_quality': 'mean',
    'rooms': 'mean'
}).reset_index()

# Display the resulting DataFrame
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text'] = df['text'].apply(lambda x: ' '.join([word for sentence in x for word in sentence]))


Unnamed: 0,offering_id,text,service,cleanliness,overall,value,location,sleep_quality,rooms
0,72572,I had to make fast visit to seattle and I foun...,4.604061,4.634518,4.390863,4.324873,4.568528,4.329949,4.284264
1,72579,"Great service , rooms were clean , could use s...",4.247934,4.264463,3.892562,4.148760,4.206612,3.776860,3.876033
2,72586,Beautiful views of the space needle - especial...,4.261905,4.301587,4.071429,4.087302,4.587302,4.119048,4.015873
3,72598,This hotel is in need of some serious updates ...,3.243243,3.243243,2.918919,3.054054,3.027027,3.270270,3.189189
4,73236,My experience at this days inn was perfect . t...,4.277778,3.111111,3.388889,3.777778,4.111111,3.722222,3.222222
...,...,...,...,...,...,...,...,...,...
3722,3513445,"New rooms , clean beds , bathroom . You have w...",3.666667,4.000000,3.666667,4.000000,3.000000,4.000000,3.666667
3723,3523356,I 've stayed at plenty of Hampton Inns during ...,4.928571,4.928571,4.571429,4.214286,4.500000,4.571429,4.500000
3724,3541823,"Inn staff absolutely wonderful , helpful , kno...",4.750000,4.500000,4.000000,4.500000,5.000000,3.750000,3.250000
3725,3572384,"Crowded , noisy , dirty . Service is poor , fo...",3.000000,2.000000,2.000000,2.000000,4.000000,3.000000,2.000000


## 11. TRAINING

### BM25 documentation example

In [77]:
from rank_bm25 import BM25Okapi

corpus = [
    "Hello there good man!", # documento 1
    "It is quite windy in London", # documento 2
    "How is the weather today?" # documento 3
]

# in input ci deve essere una lista di liste di stringhe
model_input = [doc.split(" ") for doc in corpus]
print(model_input)
tokenized_corpus = model_input

bm25 = BM25Okapi(tokenized_corpus)

[['Hello', 'there', 'good', 'man!'], ['It', 'is', 'quite', 'windy', 'in', 'London'], ['How', 'is', 'the', 'weather', 'today?']]


In [78]:
# prova
query = "windy London"
tokenized_query = query.split(" ")

doc_scores = bm25.get_scores(tokenized_query)
doc_scores

array([0.        , 0.93729472, 0.        ])

In [79]:
# per recuperare i best n documenti
bm25.get_top_n(tokenized_query, corpus, n=1)

['It is quite windy in London']

## 12. VALIDATION