In [1]:
import numpy as np
import pandas as pd
import os
import torch
import torch.nn as nn

from transformers import BertTokenizer, BertModel, AutoTokenizer
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split

import pickle



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
os.getcwd()
os.chdir('C:/Users/frlan/Documents/NLP/dataset')

# change the path to C:\Users\Kyriakos\Desktop\MsC AI\NLP\NLP AirBNB Project\AirBnB_score_prediction\dataset
# os.chdir('C:\\Users\\Kyriakos\\Desktop\\MsC AI\\NLP\\NLP AirBNB Project\\AirBnB_score_prediction\\dataset')
# os.getcwd()


In [3]:
# import label_encoder to use it for the categorical features
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [4]:
# 'listings_comments.csv is the datset compined 

# reaf the datset from Google Drive G:\.shortcut-targets-by-id\1CZZx9Bpz7hB-BhPFsqAL3fRVyxM1XfwF\AirBnB_project
# data = pd.read_csv('G:\\.shortcut-targets-by-id\\1CZZx9Bpz7hB-BhPFsqAL3fRVyxM1XfwF\\AirBnB_project\\listings_comments.csv')

from dataloader import Dataloader
data = Dataloader()
data = data.getListings()



In [5]:
data.columns

Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'source', 'name',
       'description', 'neighborhood_overview', 'picture_url', 'host_id',
       'host_url', 'host_name', 'host_since', 'host_location', 'host_about',
       'host_response_time', 'host_response_rate', 'host_acceptance_rate',
       'host_is_superhost', 'host_thumbnail_url', 'host_picture_url',
       'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'neighbourhood',
       'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude',
       'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms',
       'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price',
       'minimum_nights', 'maximum_nights', 'minimum_minimum_nights',
       'maximum_minimum_nights', 'minimum_maximum_nights',
       'maximum_maximum_nights', 'minimum_nights_avg_ntm',
       'maximum_nights_avg_ntm', 'ca

# Base Preperation

## Remove unnecessary columns

In [6]:
len(data.columns)

75

In [7]:
# Crate a list for removing unnecessary columns from the dataset
columns_to_remove = ['id', 'listing_url', 'scrape_id', 'last_scraped', 'source','picture_url', 'host_id',
       'host_url','host_name','host_location','host_thumbnail_url', 'host_picture_url','host_verifications','neighbourhood','neighbourhood_group_cleansed','latitude',
       'longitude','bathrooms','calendar_updated','has_availability','availability_30', 'availability_60', 'availability_90',
       'availability_365','calendar_last_scraped','review_scores_rating', 'review_scores_accuracy',
       'review_scores_cleanliness', 'review_scores_checkin',
       'review_scores_communication', 'review_scores_location',
       'review_scores_value','instant_bookable',
       'calculated_host_listings_count','name','description']


In [8]:
# Remove the unnecessary columns from the dataset.
data.drop(columns_to_remove, axis=1, inplace=True)
len(data.columns)

39

## Check and replace nan values


In [9]:
def fill_nan(df, column):
    if(df[column].astype(str).str.isnumeric().all()==True):
        #Numeric values
        df[column].fillna(df[column].mean(), inplace=True)
    else:
        #string balue
        df[column].fillna('UNK', inplace=True)

    return df

for columnName in data.columns:
    data = fill_nan(data, columnName)

# Column processing

## Function that takes a column and returns an embeddings matrix

In [10]:
# Check the number of unique values in each column
data.nunique()


neighborhood_overview                           4039
host_since                                      2666
host_about                                      3302
host_response_time                                 5
host_response_rate                                50
host_acceptance_rate                              98
host_is_superhost                                  3
host_neighbourhood                                59
host_listings_count                               28
host_total_listings_count                         46
host_has_profile_pic                               2
host_identity_verified                             2
neighbourhood_cleansed                            22
property_type                                     59
room_type                                          4
accommodates                                      15
bathrooms_text                                    23
bedrooms                                          11
beds                                          

In [11]:
# load the model that is responsible for the text embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')

# create a device object
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Device: {device}')
# move the model to the device
model.to(device)

# parallelize the model across multiple GPUs(if available)
model = nn.DataParallel(model)

underlying_model = model.module

Device: cuda


In [12]:
# function to get the embeddings for a column in a dataframe and return them as a numpy array
def get_embeddings(dataframe, column_name, batch_size=100):
    """
    Obtain sentence embeddings for a dataframe column using SentenceTransformer with batch processing.

    Args:
        dataframe (pandas.DataFrame): The dataframe containing the column to obtain embeddings for.
        column_name (str): The name of the column to obtain embeddings for.
        batch_size (int, optional): The size of each batch to process. Default is 100.

    Returns:
        numpy.ndarray: A numpy array containing the embeddings for the specified column.
    """

    # Obtain the embeddings for the specified column in batches
    num_batches = int(np.ceil(len(dataframe) / batch_size))
    embeddings = []
    for i in range(num_batches):
        start_index = i * batch_size
        end_index = min(start_index + batch_size, len(dataframe))
        batch_embeddings = model.encode(dataframe[column_name][start_index:end_index].tolist())
        embeddings.append(batch_embeddings)
    embeddings = np.concatenate(embeddings)

    return embeddings

In [13]:
data.columns
processedColumns = pd.DataFrame()


## Text Columns - word embedings

### neighborhood_overview

In [14]:
neighborhood_overview_embeddings = underlying_model.encode(data.neighborhood_overview.values)


In [15]:
neighborhood_overview_embeddings.shape

(6809, 384)

In [16]:
processedColumns['neighborhood_overview']= pd.Series(neighborhood_overview_embeddings.tolist())
processedColumns.neighborhood_overview

0       [-0.11637205630540848, 0.09759077429771423, -0...
1       [0.136016383767128, -0.015675336122512817, 0.0...
2       [0.06785199791193008, 0.060150280594825745, 0....
3       [0.11301513016223907, -0.03934653848409653, 0....
4       [0.054658882319927216, -0.059907153248786926, ...
                              ...                        
6804    [0.055526409298181534, -0.09739918261766434, 0...
6805    [-0.11637205630540848, 0.09759077429771423, -0...
6806    [-0.11637205630540848, 0.09759077429771423, -0...
6807    [0.11814238876104355, 0.06271287053823471, -0....
6808    [-0.11637203395366669, 0.09759080410003662, -0...
Name: neighborhood_overview, Length: 6809, dtype: object

### host_about

In [17]:
host_about_embeddings = underlying_model.encode(data.host_about)

In [18]:
processedColumns['host_about']= pd.Series(host_about_embeddings.tolist())
processedColumns.host_about

0       [-0.11637205630540848, 0.09759077429771423, -0...
1       [0.1730751097202301, 0.013356110081076622, 0.0...
2       [-0.07525812089443207, 0.04363350197672844, 0....
3       [-0.019661905243992805, -0.026079228147864342,...
4       [-0.09047619998455048, 0.04043954238295555, 0....
                              ...                        
6804    [0.06065135821700096, -0.09856018424034119, 0....
6805    [0.1282920241355896, 0.042223624885082245, 0.0...
6806    [-0.13382287323474884, 0.014150858856737614, -...
6807    [0.062177881598472595, 0.08505070209503174, 0....
6808    [-0.04520072415471077, -0.05978447198867798, 0...
Name: host_about, Length: 6809, dtype: object

### bathroom_text

In [19]:
data.bathrooms_text

0              1.5 baths
1       1.5 shared baths
2         1 private bath
3              1.5 baths
4          1 shared bath
              ...       
6804      1 private bath
6805              1 bath
6806              1 bath
6807      1 private bath
6808           1.5 baths
Name: bathrooms_text, Length: 6809, dtype: object

In [20]:
import re

def extract_number(string):
    pattern = r'^(\d+(?:\.\d+)?)' # regular expression pattern to match the number at the start of the string
    match = re.match(pattern, string)
    if match:
        return float(match.group(1))
    else:
        return None

numberOfBathrooms = data.bathrooms_text.apply(extract_number)
numberOfBathrooms

0       1.5
1       1.5
2       1.0
3       1.5
4       1.0
       ... 
6804    1.0
6805    1.0
6806    1.0
6807    1.0
6808    1.5
Name: bathrooms_text, Length: 6809, dtype: float64

In [21]:
processedColumns['bathrooms_text']=numberOfBathrooms
processedColumns.bathrooms_text

0       1.5
1       1.5
2       1.0
3       1.5
4       1.0
       ... 
6804    1.0
6805    1.0
6806    1.0
6807    1.0
6808    1.5
Name: bathrooms_text, Length: 6809, dtype: float64

### price

In [22]:
data.price

0       $100.00
1        $59.00
2       $106.00
3       $140.00
4        $75.00
         ...   
6804    $165.00
6805    $150.00
6806    $160.00
6807    $125.00
6808    $249.00
Name: price, Length: 6809, dtype: object

In [23]:
price_embeddings = underlying_model.encode( data.price)

In [24]:
processedColumns['price']= pd.Series(price_embeddings.tolist())
processedColumns.price

0       [-0.04563172161579132, 0.11141323298215866, -0...
1       [-0.0039479671977460384, 0.07245095074176788, ...
2       [-0.05546378716826439, 0.04769304022192955, -0...
3       [-0.05199503153562546, 0.07414033263921738, -0...
4       [0.0076210517436265945, 0.10229810327291489, -...
                              ...                        
6804    [-0.02837580256164074, 0.06916150450706482, -0...
6805    [-0.06527556478977203, 0.07389151304960251, -0...
6806    [-0.035829972475767136, 0.08255057036876678, -...
6807    [-0.07781103998422623, 0.05170097574591637, 0....
6808    [-0.028242487460374832, 0.020148305222392082, ...
Name: price, Length: 6809, dtype: object

## Categorical Columns - label encoding

### host_response_time

In [25]:
encoding = le.fit_transform(data.host_response_time)


In [26]:
processedColumns['host_response_time']=encoding
processedColumns

Unnamed: 0,neighborhood_overview,host_about,bathrooms_text,price,host_response_time
0,"[-0.11637205630540848, 0.09759077429771423, -0...","[-0.11637205630540848, 0.09759077429771423, -0...",1.5,"[-0.04563172161579132, 0.11141323298215866, -0...",2
1,"[0.136016383767128, -0.015675336122512817, 0.0...","[0.1730751097202301, 0.013356110081076622, 0.0...",1.5,"[-0.0039479671977460384, 0.07245095074176788, ...",4
2,"[0.06785199791193008, 0.060150280594825745, 0....","[-0.07525812089443207, 0.04363350197672844, 0....",1.0,"[-0.05546378716826439, 0.04769304022192955, -0...",4
3,"[0.11301513016223907, -0.03934653848409653, 0....","[-0.019661905243992805, -0.026079228147864342,...",1.5,"[-0.05199503153562546, 0.07414033263921738, -0...",4
4,"[0.054658882319927216, -0.059907153248786926, ...","[-0.09047619998455048, 0.04043954238295555, 0....",1.0,"[0.0076210517436265945, 0.10229810327291489, -...",3
...,...,...,...,...,...
6804,"[0.055526409298181534, -0.09739918261766434, 0...","[0.06065135821700096, -0.09856018424034119, 0....",1.0,"[-0.02837580256164074, 0.06916150450706482, -0...",2
6805,"[-0.11637205630540848, 0.09759077429771423, -0...","[0.1282920241355896, 0.042223624885082245, 0.0...",1.0,"[-0.06527556478977203, 0.07389151304960251, -0...",2
6806,"[-0.11637205630540848, 0.09759077429771423, -0...","[-0.13382287323474884, 0.014150858856737614, -...",1.0,"[-0.035829972475767136, 0.08255057036876678, -...",4
6807,"[0.11814238876104355, 0.06271287053823471, -0....","[0.062177881598472595, 0.08505070209503174, 0....",1.0,"[-0.07781103998422623, 0.05170097574591637, 0....",4


### host_response_rate

In [27]:
encoding = le.fit_transform(data.host_response_rate)


In [28]:
processedColumns['host_response_rate']=encoding
processedColumns

Unnamed: 0,neighborhood_overview,host_about,bathrooms_text,price,host_response_time,host_response_rate
0,"[-0.11637205630540848, 0.09759077429771423, -0...","[-0.11637205630540848, 0.09759077429771423, -0...",1.5,"[-0.04563172161579132, 0.11141323298215866, -0...",2,36
1,"[0.136016383767128, -0.015675336122512817, 0.0...","[0.1730751097202301, 0.013356110081076622, 0.0...",1.5,"[-0.0039479671977460384, 0.07245095074176788, ...",4,1
2,"[0.06785199791193008, 0.060150280594825745, 0....","[-0.07525812089443207, 0.04363350197672844, 0....",1.0,"[-0.05546378716826439, 0.04769304022192955, -0...",4,1
3,"[0.11301513016223907, -0.03934653848409653, 0....","[-0.019661905243992805, -0.026079228147864342,...",1.5,"[-0.05199503153562546, 0.07414033263921738, -0...",4,1
4,"[0.054658882319927216, -0.059907153248786926, ...","[-0.09047619998455048, 0.04043954238295555, 0....",1.0,"[0.0076210517436265945, 0.10229810327291489, -...",3,1
...,...,...,...,...,...,...
6804,"[0.055526409298181534, -0.09739918261766434, 0...","[0.06065135821700096, -0.09856018424034119, 0....",1.0,"[-0.02837580256164074, 0.06916150450706482, -0...",2,29
6805,"[-0.11637205630540848, 0.09759077429771423, -0...","[0.1282920241355896, 0.042223624885082245, 0.0...",1.0,"[-0.06527556478977203, 0.07389151304960251, -0...",2,1
6806,"[-0.11637205630540848, 0.09759077429771423, -0...","[-0.13382287323474884, 0.014150858856737614, -...",1.0,"[-0.035829972475767136, 0.08255057036876678, -...",4,1
6807,"[0.11814238876104355, 0.06271287053823471, -0....","[0.062177881598472595, 0.08505070209503174, 0....",1.0,"[-0.07781103998422623, 0.05170097574591637, 0....",4,1


### host_acceptance_rate

In [29]:
encoding = le.fit_transform(data.host_acceptance_rate)

In [30]:
processedColumns['host_acceptance_rate']=encoding
processedColumns

Unnamed: 0,neighborhood_overview,host_about,bathrooms_text,price,host_response_time,host_response_rate,host_acceptance_rate
0,"[-0.11637205630540848, 0.09759077429771423, -0...","[-0.11637205630540848, 0.09759077429771423, -0...",1.5,"[-0.04563172161579132, 0.11141323298215866, -0...",2,36,93
1,"[0.136016383767128, -0.015675336122512817, 0.0...","[0.1730751097202301, 0.013356110081076622, 0.0...",1.5,"[-0.0039479671977460384, 0.07245095074176788, ...",4,1,2
2,"[0.06785199791193008, 0.060150280594825745, 0....","[-0.07525812089443207, 0.04363350197672844, 0....",1.0,"[-0.05546378716826439, 0.04769304022192955, -0...",4,1,95
3,"[0.11301513016223907, -0.03934653848409653, 0....","[-0.019661905243992805, -0.026079228147864342,...",1.5,"[-0.05199503153562546, 0.07414033263921738, -0...",4,1,2
4,"[0.054658882319927216, -0.059907153248786926, ...","[-0.09047619998455048, 0.04043954238295555, 0....",1.0,"[0.0076210517436265945, 0.10229810327291489, -...",3,1,2
...,...,...,...,...,...,...,...
6804,"[0.055526409298181534, -0.09739918261766434, 0...","[0.06065135821700096, -0.09856018424034119, 0....",1.0,"[-0.02837580256164074, 0.06916150450706482, -0...",2,29,84
6805,"[-0.11637205630540848, 0.09759077429771423, -0...","[0.1282920241355896, 0.042223624885082245, 0.0...",1.0,"[-0.06527556478977203, 0.07389151304960251, -0...",2,1,0
6806,"[-0.11637205630540848, 0.09759077429771423, -0...","[-0.13382287323474884, 0.014150858856737614, -...",1.0,"[-0.035829972475767136, 0.08255057036876678, -...",4,1,79
6807,"[0.11814238876104355, 0.06271287053823471, -0....","[0.062177881598472595, 0.08505070209503174, 0....",1.0,"[-0.07781103998422623, 0.05170097574591637, 0....",4,1,96


### host_is_superhost

In [31]:
encoding = le.fit_transform(data.host_is_superhost)

In [32]:
processedColumns['host_is_superhost']=encoding
processedColumns.host_is_superhost

0       1
1       2
2       1
3       2
4       2
       ..
6804    2
6805    1
6806    1
6807    2
6808    1
Name: host_is_superhost, Length: 6809, dtype: int32

### host_neighbourhood

In [33]:
data.host_neighbourhood.value_counts()

UNK                                    3771
Oud-West                                555
Grachtengordel                          248
De Pijp                                 229
Jordaan                                 194
Oosterparkbuurt                         152
Bos en Lommer                           128
Indische Buurt                          118
Oost                                     88
Nieuwmarkt en Lastage                    83
Rivierenbuurt                            79
Hoofddorppleinbuurt                      72
Westelijke Eilanden                      70
De Wallen                                69
Weesperbuurt en Plantage                 66
Amsterdam Centrum                        61
Watergraafsmeer                          61
Oostelijke Eilanden en Kadijken          58
Zeeburg                                  56
Oud-Zuid                                 51
Frederik Hendrikbuurt                    51
Museumkwartier                           46
Spaarndammer en Zeeheldenbuurt  

In [34]:
vc = data.host_neighbourhood.value_counts()
single_neighborhoods = vc[vc == 1].index.tolist()

data.loc[data['host_neighbourhood'].isin(single_neighborhoods), 'host_neighbourhood'] = 'other'

single_neighborhoods

['Passy',
 'LB of Islington',
 'Grand Place',
 'Valdeacederas',
 'Belváros',
 'Stockwell',
 'Sant Antoni',
 'La Combe',
 'Nieuwendammerham',
 'La Sagrada Família',
 'Palermo Hollywood',
 'Cannaregio',
 'El Raval']

In [35]:
data.host_neighbourhood.value_counts().tail(5)

Tuindorp Nieuwendam                    7
Glòries - El Parc                      6
Buitenveldert-Oost                     4
Nieuwendammerdijk en Buiksloterdijk    4
Koreatown                              2
Name: host_neighbourhood, dtype: int64

In [36]:
encoding = le.fit_transform(data.host_neighbourhood)
processedColumns['host_neighbourhood']=encoding
processedColumns.host_neighbourhood

0       35
1       14
2       11
3       44
4        0
        ..
6804    22
6805    28
6806     7
6807    43
6808    31
Name: host_neighbourhood, Length: 6809, dtype: int32

### host_has_profile_pic

In [37]:
encoding = le.fit_transform(data.host_has_profile_pic)
processedColumns['host_has_profile_pic']=encoding
processedColumns.host_has_profile_pic

0       1
1       1
2       1
3       1
4       1
       ..
6804    1
6805    1
6806    1
6807    1
6808    1
Name: host_has_profile_pic, Length: 6809, dtype: int32

### host_identity_verified

In [38]:
encoding = le.fit_transform(data.host_identity_verified)
processedColumns['host_identity_verified']=encoding
processedColumns.host_identity_verified

0       1
1       1
2       1
3       1
4       1
       ..
6804    1
6805    1
6806    1
6807    1
6808    1
Name: host_identity_verified, Length: 6809, dtype: int32

### neighbourhood_cleansed

In [39]:
data.neighbourhood_cleansed.value_counts().tail(5)

De Aker - Nieuw Sloten    54
Bijlmer-Centrum           45
Gaasperdam - Driemond     44
Osdorp                    43
Bijlmer-Oost              25
Name: neighbourhood_cleansed, dtype: int64

In [40]:
encoding = le.fit_transform(data.neighbourhood_cleansed)
processedColumns['neighbourhood_cleansed']=encoding
processedColumns.neighbourhood_cleansed

0       20
1       14
2        4
3        5
4        5
        ..
6804     4
6805     7
6806     8
6807     4
6808     8
Name: neighbourhood_cleansed, Length: 6809, dtype: int32

### property_type

In [41]:
encoding = le.fit_transform(data.property_type)
processedColumns['property_type']=encoding
processedColumns.property_type

0       22
1       36
2       39
3       33
4       36
        ..
6804    22
6805    14
6806    14
6807    22
6808    14
Name: property_type, Length: 6809, dtype: int32

### room_type

In [42]:
encoding = le.fit_transform(data.room_type)
processedColumns['room_type']=encoding
processedColumns.room_type

0       2
1       2
2       2
3       2
4       2
       ..
6804    2
6805    0
6806    0
6807    2
6808    0
Name: room_type, Length: 6809, dtype: int32

## Numerical/date Columns - converted to regular numbers

### host_since

In [43]:
from datetime import date
from datetime import datetime

datetest =data.host_since.apply(lambda x: datetime.strptime(x, '%Y-%m-%d').date())
time = datetest.apply(lambda x: x.toordinal())

processedColumns['host_since']= pd.Series(time.tolist())
processedColumns.host_since

0       734664
1       733309
2       733743
3       733854
4       733905
         ...  
6804    734825
6805    734829
6806    734528
6807    734830
6808    734832
Name: host_since, Length: 6809, dtype: int64

### host_listings_count

In [44]:
processedColumns['host_listings_count']= data.host_listings_count
processedColumns.host_listings_count

0       1
1       1
2       3
3       1
4       2
       ..
6804    1
6805    1
6806    1
6807    1
6808    1
Name: host_listings_count, Length: 6809, dtype: int64

### bedrooms

In [45]:
withoutNa = data.bedrooms.replace("UNK",0)
processedColumns["bedrooms"]=withoutNa.astype(float)
processedColumns.bedrooms

0       1.0
1       1.0
2       1.0
3       1.0
4       1.0
       ... 
6804    1.0
6805    2.0
6806    2.0
6807    1.0
6808    2.0
Name: bedrooms, Length: 6809, dtype: float64

### beds

In [46]:
withoutNa=data.beds.replace('UNK', 0)
processedColumns["beds"]=withoutNa.astype(float)
processedColumns.beds

0       1.0
1       2.0
2       1.0
3       1.0
4       1.0
       ... 
6804    1.0
6805    2.0
6806    2.0
6807    1.0
6808    2.0
Name: beds, Length: 6809, dtype: float64

### host_total_listings_count

In [47]:

processedColumns['host_total_listings_count']= data.host_total_listings_count
processedColumns.host_total_listings_count

0       1
1       1
2       5
3       1
4       2
       ..
6804    1
6805    1
6806    2
6807    1
6808    5
Name: host_total_listings_count, Length: 6809, dtype: int64

### accommodates

In [48]:
processedColumns['accommodates']= data.accommodates
processedColumns.accommodates

0       2
1       2
2       2
3       2
4       2
       ..
6804    2
6805    4
6806    3
6807    2
6808    6
Name: accommodates, Length: 6809, dtype: int64

### minimum_nights

In [49]:
processedColumns['minimum_nights']= data.minimum_nights
processedColumns.minimum_nights

0       2
1       3
2       1
3       2
4       2
       ..
6804    1
6805    4
6806    3
6807    1
6808    4
Name: minimum_nights, Length: 6809, dtype: int64

### maximum_nights

In [50]:
processedColumns['maximum_nights']= data.maximum_nights
processedColumns.maximum_nights

0         30
1         28
2        365
3        356
4       1825
        ... 
6804    1125
6805      14
6806    1124
6807      30
6808      21
Name: maximum_nights, Length: 6809, dtype: int64

### minimum_minimum_nights

In [51]:
processedColumns['minimum_minimum_nights']= data.minimum_minimum_nights
processedColumns.minimum_minimum_nights

0       2
1       3
2       1
3       2
4       2
       ..
6804    1
6805    4
6806    3
6807    1
6808    3
Name: minimum_minimum_nights, Length: 6809, dtype: int64

### maximum_minimum_nights

In [52]:
processedColumns['maximum_minimum_nights']= data.maximum_minimum_nights
processedColumns.maximum_minimum_nights

0       2
1       3
2       1
3       2
4       2
       ..
6804    1
6805    4
6806    3
6807    1
6808    4
Name: maximum_minimum_nights, Length: 6809, dtype: int64

### minimum_maximum_nights

In [53]:
processedColumns['minimum_maximum_nights']= data.minimum_maximum_nights
processedColumns.minimum_maximum_nights

0         30
1       1125
2       1125
3       1125
4       1825
        ... 
6804    1125
6805      14
6806    1124
6807      30
6808       4
Name: minimum_maximum_nights, Length: 6809, dtype: int64

### maximum_maximum_nights

In [54]:
processedColumns['maximum_maximum_nights']= data.maximum_maximum_nights
processedColumns.maximum_maximum_nights

0         30
1       1125
2       1125
3       1125
4       1825
        ... 
6804    1125
6805      14
6806    1124
6807      30
6808      21
Name: maximum_maximum_nights, Length: 6809, dtype: int64

### minimum_nights_avg_ntm

In [55]:
processedColumns['minimum_nights_avg_ntm']= data.minimum_nights_avg_ntm
processedColumns.minimum_nights_avg_ntm

0       2.0
1       3.0
2       1.0
3       2.0
4       2.0
       ... 
6804    1.0
6805    4.0
6806    3.0
6807    1.0
6808    4.0
Name: minimum_nights_avg_ntm, Length: 6809, dtype: float64

### maximum_nights_avg_ntm

In [56]:
processedColumns['maximum_nights_avg_ntm']= data.maximum_nights_avg_ntm
processedColumns.maximum_nights_avg_ntm

0         30.0
1       1125.0
2       1125.0
3       1125.0
4       1825.0
         ...  
6804    1125.0
6805      14.0
6806    1124.0
6807      30.0
6808      20.8
Name: maximum_nights_avg_ntm, Length: 6809, dtype: float64

### number_of_reviews

In [57]:
processedColumns['number_of_reviews']= data.number_of_reviews
processedColumns.number_of_reviews

0        248
1        314
2        339
3        243
4        454
        ... 
6804     343
6805      18
6806      97
6807    1060
6808      17
Name: number_of_reviews, Length: 6809, dtype: int64

### number_of_reviews_l30d

In [58]:
processedColumns['number_of_reviews_l30d']= data.number_of_reviews_l30d
processedColumns.number_of_reviews_l30d

0        3
1        1
2        0
3        5
4        7
        ..
6804     0
6805     0
6806     0
6807    10
6808     0
Name: number_of_reviews_l30d, Length: 6809, dtype: int64

### first_review

In [59]:
data.first_review

0       2016-10-06
1       2009-03-30
2       2010-03-02
3       2012-01-09
4       2010-08-22
           ...    
6804    2013-02-19
6805    2013-01-04
6806    2013-05-14
6807    2012-11-29
6808    2013-02-26
Name: first_review, Length: 6809, dtype: object

In [60]:
timeData = data['first_review'].apply(lambda x: pd.Timestamp(x).toordinal() if x != 'UNK' else 0)
processedColumns['first_review']= pd.Series(timeData.tolist())
processedColumns.first_review

0       736243
1       733496
2       733833
3       734511
4       734006
         ...  
6804    734918
6805    734872
6806    735002
6807    734836
6808    734925
Name: first_review, Length: 6809, dtype: int64

### last_review


In [61]:
timeData = data['last_review'].apply(lambda x: pd.Timestamp(x).toordinal() if x != 'UNK' else 0)
processedColumns['last_review']= pd.Series(timeData.tolist())
processedColumns.last_review

0       738487
1       738465
2       737524
3       738486
4       738479
         ...  
6804    738449
6805    737473
6806    738445
6807    738492
6808    738411
Name: last_review, Length: 6809, dtype: int64

### calculated_host_listings_count_entire_homes

In [62]:
processedColumns['calculated_host_listings_count_entire_homes']= data.calculated_host_listings_count_entire_homes
processedColumns.calculated_host_listings_count_entire_homes

0       0
1       0
2       0
3       0
4       0
       ..
6804    0
6805    1
6806    1
6807    0
6808    1
Name: calculated_host_listings_count_entire_homes, Length: 6809, dtype: int64

### calculated_host_listings_count_private_rooms

In [63]:
processedColumns['calculated_host_listings_count_private_rooms']= data.calculated_host_listings_count_private_rooms
processedColumns.calculated_host_listings_count_private_rooms

0       1
1       1
2       2
3       1
4       2
       ..
6804    1
6805    0
6806    0
6807    1
6808    0
Name: calculated_host_listings_count_private_rooms, Length: 6809, dtype: int64

### calculated_host_listings_count_shared_rooms

In [64]:
processedColumns['calculated_host_listings_count_shared_rooms']= data.calculated_host_listings_count_shared_rooms
processedColumns.calculated_host_listings_count_shared_rooms

0       0
1       0
2       0
3       0
4       0
       ..
6804    0
6805    0
6806    0
6807    0
6808    0
Name: calculated_host_listings_count_shared_rooms, Length: 6809, dtype: int64

### reviews_per_month

In [65]:
withoutUNK = data.reviews_per_month.replace('UNK', 0)
processedColumns['reviews_per_month']= withoutUNK.astype(float)
processedColumns.reviews_per_month

0       3.30
1       1.88
2       2.18
3       1.83
4       3.03
        ... 
6804    2.88
6805    0.15
6806    0.83
6807    8.69
6808    0.14
Name: reviews_per_month, Length: 6809, dtype: float64

# Procesed Columns vizualization

In [102]:
processedColumns.select_dtypes(include=['object'])

Unnamed: 0,neighborhood_overview,host_about,price
0,"[-0.11637205630540848, 0.09759077429771423, -0...","[-0.11637205630540848, 0.09759077429771423, -0...","[-0.04563172161579132, 0.11141323298215866, -0..."
1,"[0.136016383767128, -0.015675336122512817, 0.0...","[0.1730751097202301, 0.013356110081076622, 0.0...","[-0.0039479671977460384, 0.07245095074176788, ..."
2,"[0.06785199791193008, 0.060150280594825745, 0....","[-0.07525812089443207, 0.04363350197672844, 0....","[-0.05546378716826439, 0.04769304022192955, -0..."
3,"[0.11301513016223907, -0.03934653848409653, 0....","[-0.019661905243992805, -0.026079228147864342,...","[-0.05199503153562546, 0.07414033263921738, -0..."
4,"[0.054658882319927216, -0.059907153248786926, ...","[-0.09047619998455048, 0.04043954238295555, 0....","[0.0076210517436265945, 0.10229810327291489, -..."
...,...,...,...
6804,"[0.055526409298181534, -0.09739918261766434, 0...","[0.06065135821700096, -0.09856018424034119, 0....","[-0.02837580256164074, 0.06916150450706482, -0..."
6805,"[-0.11637205630540848, 0.09759077429771423, -0...","[0.1282920241355896, 0.042223624885082245, 0.0...","[-0.06527556478977203, 0.07389151304960251, -0..."
6806,"[-0.11637205630540848, 0.09759077429771423, -0...","[-0.13382287323474884, 0.014150858856737614, -...","[-0.035829972475767136, 0.08255057036876678, -..."
6807,"[0.11814238876104355, 0.06271287053823471, -0....","[0.062177881598472595, 0.08505070209503174, 0....","[-0.07781103998422623, 0.05170097574591637, 0...."


# PCA

In [78]:
### save
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# select numeric columns
df_numeric = processedColumns.select_dtypes(include=['int', 'float'])

# drop na values
df_numeric.dropna(inplace=True)

# instantiate a StandardScaler object
scaler = StandardScaler()

# scale the data
scaled_df = scaler.fit_transform(df_numeric)

# instantiate a PCA object
pca = PCA(n_components=2)

# fit and transform the scaled data
pca_df = pca.fit_transform(scaled_df)

# create a new dataframe from the transformed data
pca_df = pd.DataFrame(data=pca_df, columns=['PC1', 'PC2'])

# display the resulting dataframe
print(pca_df)

           PC1       PC2
0     1.862878  0.371151
1     3.081446  0.879625
2     3.279360  0.797928
3     3.355589  0.917594
4     5.011942  1.605773
...        ...       ...
6780  2.924228  0.829386
6781 -0.895900 -0.405265
6782  0.087502 -0.118457
6783  7.090622  2.030041
6784 -2.019852 -0.744735

[6785 rows x 2 columns]


# TEST

In [128]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')


In [137]:
data = pd.DataFrame({'comments':['This is a sentence.', 'This is another sentence.', 'And this is a third sentence.']})

embeddings = model.encode(data.comments.values)



In [135]:
save = embeddings


In [139]:
print(save)
print("___")
print(embeddings)

[[ 0.08211041  0.09573979  0.01097466 ... -0.02961806  0.01849113
  -0.10023987]
 [ 0.07808702  0.05164278  0.03081007 ... -0.06356346  0.036269
  -0.07523914]
 [ 0.07042705 -0.02454304  0.02798672 ... -0.03749176  0.07149752
  -0.07197584]]
___
[[ 0.08211041  0.09573979  0.01097466 ... -0.02961806  0.01849113
  -0.10023987]
 [ 0.07808702  0.05164278  0.03081007 ... -0.06356346  0.036269
  -0.07523914]
 [ 0.07042705 -0.02454304  0.02798672 ... -0.03749176  0.07149752
  -0.07197584]]


In [140]:
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
principal_components = pca.fit_transform(embeddings)
