In [34]:
import numpy as np
import pandas as pd
import os
import torch
import torch.nn as nn

from transformers import BertTokenizer, BertModel, AutoTokenizer
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split

import pickle



In [2]:
# Show the current working directory so relative file paths can be checked.
os.getcwd()
# NOTE(review): the commented-out lines below hard-code a user-specific absolute path.
# change the path to C:\Users\Kyriakos\Desktop\MsC AI\NLP\NLP AirBNB Project\AirBnB_score_prediction\dataset
# os.chdir('C:\\Users\\Kyriakos\\Desktop\\MsC AI\\NLP\\NLP AirBNB Project\\AirBnB_score_prediction\\dataset')
# os.getcwd()


'c:\\Users\\Kyriakos\\Desktop\\MsC AI\\NLP\\NLP AirBNB Project\\AirBnB_score_prediction'

In [79]:
# Import LabelEncoder and create the encoder instance `le` used for the categorical features.
# NOTE(review): the encoding cells further down all call `le.fit_transform`, which
# re-fits this one instance each time, so `le` only remembers the classes of the
# column it encoded most recently.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [3]:
# 'listings_comments.csv' is the combined dataset.

# Read the dataset from Google Drive: G:\.shortcut-targets-by-id\1CZZx9Bpz7hB-BhPFsqAL3fRVyxM1XfwF\AirBnB_project
# NOTE(review): this is a hard-coded absolute path on a mounted drive; the cell
# only runs on a machine where that drive letter and shortcut exist.
data = pd.read_csv('G:\\.shortcut-targets-by-id\\1CZZx9Bpz7hB-BhPFsqAL3fRVyxM1XfwF\\AirBnB_project\\listings_comments.csv')



### Check for NaN values


In [4]:
# Report how many values are missing in each text column before imputation.
# Each label names the column it actually reports (the last two previously both said "Comments").
print(f'Description nan values: {data.description.isna().sum()}')
print(f'Neighborhood overview nan values: {data.neighborhood_overview.isna().sum()}')
print(f'Neighbourhood nan values: {data.neighbourhood.isna().sum()}')

Description nan values: 6
Comments nan values: 2492
Comments nan values: 2492


In [5]:
def fill_nan(df, column):
    """Replace missing values in one column with the placeholder string 'UNK'.

    Args:
        df (pandas.DataFrame): Frame to update; the column is replaced on this object.
        column (str): Name of the column whose missing values are filled.

    Returns:
        pandas.DataFrame: The same ``df`` object that was passed in.
    """
    # Assign the filled column back instead of calling
    # ``df[column].fillna(..., inplace=True)``. The in-place call on a selected
    # column is chained assignment: pandas emits a warning for it and, with
    # Copy-on-Write enabled, it leaves ``df`` unchanged.
    df[column] = df[column].fillna('UNK')
    return df

# Put the 'UNK' placeholder into each of the three columns checked above.
for column_name in ('description', 'neighborhood_overview', 'neighbourhood'):
    data = fill_nan(data, column_name)

In [6]:
# Re-check the same three columns after filling; every count should now be 0.
# Each label names the column it actually reports (the last two previously both said "Comments").
print(f'Description nan values: {data.description.isna().sum()}')
print(f'Neighborhood overview nan values: {data.neighborhood_overview.isna().sum()}')
print(f'Neighbourhood nan values: {data.neighbourhood.isna().sum()}')

Description nan values: 0
Comments nan values: 0
Comments nan values: 0


In [7]:
# Number of rows without a 'review_scores_rating' value before imputation.
print(data['review_scores_rating'].isnull().sum())

# Fill missing ratings with the column mean.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the selected column: that form is chained
# in-place assignment, which pandas warns about and which does not update
# `data` once Copy-on-Write is enabled.
# NOTE(review): the commented-out split cell further down uses this column as
# the target `y`; mean-imputing a target is unusual, so confirm this is intended.
data['review_scores_rating'] = data['review_scores_rating'].fillna(data['review_scores_rating'].mean())

# Should print 0 after the fill.
print(data['review_scores_rating'].isnull().sum())

682
0


In [9]:
# List every column (feature) currently present in the dataset.
data.columns

Index(['listing_id', 'listing_url', 'scrape_id', 'last_scraped', 'source',
       'name', 'description', 'neighborhood_overview', 'picture_url',
       'host_id', 'host_url', 'host_name', 'host_since', 'host_location',
       'host_about', 'host_response_time', 'host_response_rate',
       'host_acceptance_rate', 'host_is_superhost', 'host_thumbnail_url',
       'host_picture_url', 'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'neighbourhood',
       'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude',
       'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms',
       'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price',
       'minimum_nights', 'maximum_nights', 'minimum_minimum_nights',
       'maximum_minimum_nights', 'minimum_maximum_nights',
       'maximum_maximum_nights', 'minimum_nights_avg_ntm',
       'maximum_nights_avg_n

In [11]:
# Names of the columns that are dropped from the dataset in the next cell.
columns_to_remove = [
    # listing / scrape columns
    'listing_url',
    'scrape_id',
    'last_scraped',
    'source',
    'picture_url',
    # host_* columns
    'host_url',
    'host_name',
    'host_location',
    'host_thumbnail_url',
    'host_picture_url',
    'host_verifications',
    'host_has_profile_pic',
    'host_identity_verified',
    'neighbourhood_group_cleansed',
    # minimum/maximum nights variants
    'minimum_minimum_nights',
    'maximum_minimum_nights',
    'minimum_maximum_nights',
    'maximum_maximum_nights',
    'minimum_nights_avg_ntm',
    'maximum_nights_avg_ntm',
    # calendar / availability columns
    'calendar_updated',
    'has_availability',
    'availability_30',
    'availability_60',
    'availability_90',
    'availability_365',
    'calendar_last_scraped',
    # review count / date columns
    'number_of_reviews',
    'number_of_reviews_ltm',
    'number_of_reviews_l30d',
    'first_review',
    'last_review',
    'license',
    'instant_bookable',
    'reviews_per_month',
    # review_scores_* columns other than review_scores_rating
    'review_scores_accuracy',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_communication',
    'review_scores_location',
    'review_scores_value',
]


In [12]:
# Remove the columns listed in `columns_to_remove` from the dataset.
# `data` is rebound to the result (no inplace=True) and errors='ignore' skips
# names that are already gone, so re-running this cell no longer raises a
# KeyError after the first execution.
data = data.drop(columns=columns_to_remove, errors='ignore')

In [80]:
# data.drop(columns_to_remove, axis=1, inplace=True)

# # Remove rows with missing review_scores_rating values
# # data.dropna(subset=['review_scores_rating'], inplace=True)

# # we can do it later when we combine all embeddings in one dataframe
# # Split into training and validation sets
# X = data.drop(['review_scores_rating'], axis=1)
# y = data['review_scores_rating']
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


## Function that takes a column and returns an embeddings matrix

In [36]:
# Count the distinct values in every column.
data.nunique()

70

In [None]:
# Load the sentence-embedding model used for all free-text columns.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Device: {device}')

# Move the model to the selected device.
model = model.to(device)

# The model is intentionally NOT wrapped in nn.DataParallel: the wrapper only
# forwards `forward()` and keeps the original object under `.module`, so the
# `model.encode(...)` calls used throughout this notebook would fail with
# AttributeError. For multi-GPU encoding, see the multi-process helpers in the
# sentence-transformers documentation instead.

In [81]:
# function to get the embeddings for a column in a dataframe and return them as a numpy array
def get_embeddings(dataframe, column_name, batch_size=100, encoder=None):
    """
    Obtain sentence embeddings for a dataframe column, encoding it batch by batch.

    Args:
        dataframe (pandas.DataFrame): The dataframe containing the column to obtain embeddings for.
        column_name (str): The name of the column to obtain embeddings for.
        batch_size (int, optional): Number of rows passed to the encoder per call. Default is 100.
        encoder (optional): Object with an ``encode(list_of_texts)`` method that returns a
            numpy array with one row per text. Defaults to the notebook-level ``model``.

    Returns:
        numpy.ndarray: The per-batch results concatenated along the first axis, in row order.
        An empty column yields an empty array of shape (0, 0), because the embedding
        width cannot be known without encoding at least one text.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f'batch_size must be a positive integer, got {batch_size}')

    # Fall back to the notebook-level model when no encoder is supplied.
    if encoder is None:
        encoder = model

    # Materialise the column once; slicing the list is purely positional and
    # therefore independent of the dataframe's index labels.
    texts = dataframe[column_name].tolist()
    if not texts:
        # np.concatenate raises on an empty list of batches.
        return np.empty((0, 0), dtype=np.float32)

    batch_embeddings = [
        encoder.encode(texts[start:start + batch_size])
        for start in range(0, len(texts), batch_size)
    ]
    return np.concatenate(batch_embeddings)

## Description Column

In [32]:
# Encode every value of the description column with the sentence-embedding
# model; the resulting matrix is kept in `description_embeddings`.
description_embeddings = model.encode(data['description'].values)


In [33]:
# Shape of the embedding matrix: (rows encoded, embedding width).
description_embeddings.shape


(6998, 384)

In [35]:
# Save the description embeddings to a local pickle file (to be pushed to GitHub afterwards).
with open('description_embeddings.pkl', 'wb') as f:
    pickle.dump(description_embeddings, f)

## Neighbourhood Column

In [78]:
# Number of distinct values in the neighbourhood column.
data.neighbourhood.nunique()

51

In [82]:
# Treat the neighbourhood column as a categorical variable and
# convert it to integer codes with label encoding.
# NOTE(review): `le` is the shared encoder created earlier; fit_transform re-fits it here.

data['neighbourhood'] = le.fit_transform(data['neighbourhood'])


## neighborhood_overview column

In [91]:
# Number of distinct neighborhood_overview texts, followed by the first five values.
print(data.neighborhood_overview.nunique())
print(data.neighborhood_overview.values[0:5])

4136
['Indische Buurt ("Indies Neighborhood") is a neighbourhood in the eastern portion of the city of Amsterdam, in the Dutch province of Noord-Holland. The name dates from the early 20th century and is derived from the fact that the neighbourhood\'s streets are named after islands and other geographical concepts in the former Dutch colony of the Dutch East Indies. The first street was named in 1902. In 2003, there were around 23,357 inhabitants. The neighbourhood is bounded on the west by the railroad Amsterdam - Hilversum (with the Muiderpoort Station), on the east side by Flevopark, on the north side by Zeeburgerdijk and on the south side by the Ringvaart Watergraafsmeer. Indische Buurt is the oldest part of the Zeeburg district and is very ethnically diverse, and a high percentage of the population is of immigrant origin (for Zeeburg this is already high at 55%, but higher in the Indische Buurt) and there are an estimated 100 languages spoken.'
 'Perfect location in the lively cen

In [26]:
# neighborhood_overview holds free text, so it is encoded with the
# sentence-embedding model rather than label-encoded.
neighbourhood_overview_embeddings = model.encode(data['neighborhood_overview'].values)

In [29]:
# Save the neighborhood_overview embeddings to a local pickle file.
# `pickle` is already imported in the notebook's first cell, so it is not re-imported here.
with open('neighbourhood_emmbeddings.pickle', 'wb') as f:
    pickle.dump(neighbourhood_overview_embeddings, f)

## neighbourhood_cleansed column

In [70]:

# Number of rows per neighbourhood_cleansed value.
data.neighbourhood_cleansed.value_counts()

De Baarsjes - Oud-West                    1139
Centrum-West                               922
De Pijp - Rivierenbuurt                    802
Centrum-Oost                               681
Westerpark                                 481
Zuid                                       464
Oud-Oost                                   408
Bos en Lommer                              344
Oud-Noord                                  322
Oostelijk Havengebied - Indische Buurt     260
Watergraafsmeer                            208
Noord-West                                 179
IJburg - Zeeburgereiland                   157
Slotervaart                                143
Noord-Oost                                 113
Geuzenveld - Slotermeer                     87
Buitenveldert - Zuidas                      80
De Aker - Nieuw Sloten                      49
Bijlmer-Centrum                             44
Gaasperdam - Driemond                       44
Osdorp                                      43
Bijlmer-Oost 

In [87]:
# Treat this as a categorical variable:
# label encode the neighbourhood_cleansed column.
# NOTE(review): `le` is the shared encoder created earlier; fit_transform re-fits it here.
data['neighbourhood_cleansed'] = le.fit_transform(data['neighbourhood_cleansed'])

In [88]:
# Same counts as before, now keyed by the integer codes assigned by the label encoder.
data.neighbourhood_cleansed.value_counts()

7     1139
5      922
8      802
4      681
20     481
21     464
17     408
2      344
16     322
14     260
19     208
13     179
11     157
18     143
12     113
10      87
3       80
6       49
0       44
9       44
15      43
1       28
Name: neighbourhood_cleansed, dtype: int64

## Column property_type

In [None]:
# Used a label encoder here, but we can try embeddings for this column as well

In [90]:
# Number of rows per property_type value.
data.property_type.value_counts()

Entire rental unit                    2540
Entire condo                          1304
Private room in rental unit            490
Entire home                            471
Private room in bed and breakfast      290
Entire townhouse                       213
Entire loft                            187
Private room in condo                  148
Houseboat                              130
Private room in home                   128
Room in hotel                          115
Room in boutique hotel                 111
Private room in houseboat               97
Boat                                    93
Private room in guest suite             92
Private room in townhouse               86
Entire serviced apartment               82
Private room in boat                    56
Private room in loft                    43
Room in bed and breakfast               26
Entire villa                            25
Shared room in hostel                   23
Entire guest suite                      23
Room in apa

In [91]:
# Property types that occur 10 times or fewer are replaced with 'other'.
# The counts are computed once up front; the previous lambda recomputed
# value_counts() twice for every row.
property_type_counts = data['property_type'].value_counts()
rare_property_types = property_type_counts[property_type_counts <= 10].index
data['property_type'] = data['property_type'].where(
    ~data['property_type'].isin(rare_property_types), 'other'
)

In [92]:
# Counts per property_type after the rare values were merged into 'other'.
data.property_type.value_counts()

Entire rental unit                    2540
Entire condo                          1304
Private room in rental unit            490
Entire home                            471
Private room in bed and breakfast      290
Entire townhouse                       213
Entire loft                            187
Private room in condo                  148
Houseboat                              130
Private room in home                   128
Room in hotel                          115
Room in boutique hotel                 111
Private room in houseboat               97
other                                   94
Boat                                    93
Private room in guest suite             92
Private room in townhouse               86
Entire serviced apartment               82
Private room in boat                    56
Private room in loft                    43
Room in bed and breakfast               26
Entire villa                            25
Entire guest suite                      23
Shared room

In [93]:
# Label encode the property_type column.
# NOTE(review): `le` is the shared encoder created earlier; fit_transform re-fits it here.
data['property_type'] = le.fit_transform(data['property_type'])


## room_type column

In [42]:
# Number of rows per room_type value.
data.room_type.value_counts()

Entire home/apt    5154
Private room       1743
Hotel room           62
Shared room          39
Name: room_type, dtype: int64

In [43]:
# Label encode the room_type column.
# NOTE(review): `le` is the shared encoder created earlier; fit_transform re-fits it here.
data['room_type'] = le.fit_transform(data['room_type'])

## Column host_neighbourhood

In [56]:
# Number of rows per host_neighbourhood value.
# NOTE(review): the recorded output already shows integer codes although the
# label-encoding cell for this column comes later; the output looks like it was
# produced after an earlier run of that cell. Confirm with Restart & Run All.
data.host_neighbourhood.value_counts()

45    4022
27     553
11     244
7      234
15     189
24     157
2      126
14     116
22      84
21      79
30      78
12      69
40      64
8       62
23      57
42      57
0       56
41      55
9       53
43      52
28      50
34      45
18      45
35      45
33      41
13      37
39      34
38      30
26      26
3       25
25      22
1       20
4       19
29      19
6       19
16      15
36      14
44      13
31      13
19      13
32      11
17      11
37       8
5        6
10       6
20       4
Name: host_neighbourhood, dtype: int64

In [57]:
# Rename every host_neighbourhood value that occurs exactly once to 'other'.
# NOTE(review): in the recorded output the smallest count is 4, so in that run
# no value matched `vc == 1` and the column was left unchanged.
vc = data.host_neighbourhood.value_counts()
single_neighborhoods = vc[vc == 1].index.tolist()
data.loc[data.host_neighbourhood.isin(single_neighborhoods), 'host_neighbourhood'] = 'other'



In [58]:
# Re-check the counts after the single-occurrence values were renamed to 'other'.
data.host_neighbourhood.value_counts()

45    4022
27     553
11     244
7      234
15     189
24     157
2      126
14     116
22      84
21      79
30      78
12      69
40      64
8       62
23      57
42      57
0       56
41      55
9       53
43      52
28      50
34      45
18      45
35      45
33      41
13      37
39      34
38      30
26      26
3       25
25      22
1       20
4       19
29      19
6       19
16      15
36      14
44      13
31      13
19      13
32      11
17      11
37       8
5        6
10       6
20       4
Name: host_neighbourhood, dtype: int64

In [59]:
# Apply the label encoder to host_neighbourhood.
# NOTE(review): `le` is the shared encoder created earlier; fit_transform re-fits it here.


data['host_neighbourhood'] = le.fit_transform(data['host_neighbourhood'])

## Column host_response_time

In [93]:
# Number of distinct host_response_time values, followed by the row count for each.
print(f'Number of unique values in host_response_time: {data.host_response_time.nunique()}')
print(data.host_response_time.value_counts())

Number of unique values in host_response_time: 5
3    2645
4    2039
2    1123
1    1083
0     108
Name: host_response_time, dtype: int64


In [50]:
# Convert host_response_time to integer category codes with label encoding.
# NOTE(review): `le` is the shared encoder created earlier; fit_transform re-fits it here.
data['host_response_time'] = le.fit_transform(data['host_response_time'])



## Column host_about

In [60]:
# Number of distinct host_about texts.
len(data.host_about.value_counts())

3378

In [61]:
# Preview the first few host_about texts.
data.host_about.head()

0    Upon arriving in Amsterdam, one can imagine as...
1    We love cycling, travelling, healthy food and ...
2    I like to welcome guests who take good care of...
3    We are the parents of 4 beautiful children, tw...
4    Welcome guests, I'm Sebastiaan and it would be...
Name: host_about, dtype: object

In [62]:
# Count the missing values in host_about.
data.host_about.isnull().sum()

0

In [63]:
# Fill missing host_about entries with the 'UNK' placeholder.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the selected column: that form is chained
# in-place assignment, which pandas warns about and which does not update
# `data` once Copy-on-Write is enabled.
data['host_about'] = data['host_about'].fillna('UNK')

In [64]:
# Encode the host_about texts in batches with get_embeddings.
# NOTE(review): the variable name is misspelled ("abput"); it is kept because the
# next cell reads it under this name.
host_abput_embeddings = get_embeddings(data, 'host_about')

In [65]:
# Save the host_about embeddings to a local pickle file.
with open('host_about_embeddings.pkl', 'wb') as f:
    pickle.dump(host_abput_embeddings, f)