**Read Data**

In [1]:
import pandas as pd
import numpy as np

# Load the listings export that sits next to this notebook.
raw_data = pd.read_csv(filepath_or_buffer='./listings.csv')

# (rows, columns) of the raw frame
raw_data.shape

(3659, 79)

In [2]:
# List every column present in the raw listings frame.
raw_data.columns

Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'source', 'name',
       'description', 'neighborhood_overview', 'picture_url', 'host_id',
       'host_url', 'host_name', 'host_since', 'host_location', 'host_about',
       'host_response_time', 'host_response_rate', 'host_acceptance_rate',
       'host_is_superhost', 'host_thumbnail_url', 'host_picture_url',
       'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'neighbourhood',
       'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude',
       'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms',
       'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price',
       'minimum_nights', 'maximum_nights', 'minimum_minimum_nights',
       'maximum_minimum_nights', 'minimum_maximum_nights',
       'maximum_maximum_nights', 'minimum_nights_avg_ntm',
       'maximum_nights_avg_ntm', 'ca

**Select relevant features**

In [3]:


# Columns carried forward from the raw listings frame, grouped by theme.
features = [
    # location
    'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude', 'longitude',
    # listing type
    'property_type', 'room_type',
    # capacity
    'accommodates', 'bathrooms', 'bedrooms', 'beds',
    # price and review volume
    'price', 'number_of_reviews',
    # review scores
    'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness',
    'review_scores_checkin', 'review_scores_communication',
    'review_scores_location', 'review_scores_value',
]

# Work on an independent copy so later edits never touch `raw_data`.
data = raw_data.loc[:, features].copy()
print(data.shape)
data.head()

(3659, 19)


Unnamed: 0,neighbourhood_cleansed,neighbourhood_group_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms,bedrooms,beds,price,number_of_reviews,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value
0,Tampines,East Region,1.34537,103.95887,Private room in villa,Private room,2,1.0,2.0,3.0,$143.00,19,4.44,4.37,4.0,4.63,4.78,4.26,4.32
1,Tampines,East Region,1.34754,103.95958,Private room in home,Private room,1,,,,,24,4.16,4.22,4.09,4.43,4.43,4.17,4.04
2,Tampines,East Region,1.34531,103.961,Private room in home,Private room,2,0.5,1.0,2.0,$76.00,46,4.41,4.39,4.52,4.63,4.64,4.5,4.36
3,Bukit Merah,Central Region,1.29015,103.80814,Private room in rental unit,Private room,1,,,,,20,4.4,4.16,4.26,4.47,4.42,4.53,4.63
4,Bukit Merah,Central Region,1.28836,103.81144,Private room in rental unit,Private room,1,,,,,16,4.27,4.44,4.06,4.5,4.5,4.63,4.13


**Data Information**

In [4]:
# Column dtypes and non-null counts for the selected features.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3659 entries, 0 to 3658
Data columns (total 19 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   neighbourhood_cleansed        3659 non-null   object 
 1   neighbourhood_group_cleansed  3659 non-null   object 
 2   latitude                      3659 non-null   float64
 3   longitude                     3659 non-null   float64
 4   property_type                 3659 non-null   object 
 5   room_type                     3659 non-null   object 
 6   accommodates                  3659 non-null   int64  
 7   bathrooms                     2758 non-null   float64
 8   bedrooms                      3385 non-null   float64
 9   beds                          2721 non-null   float64
 10  price                         2665 non-null   object 
 11  number_of_reviews             3659 non-null   int64  
 12  review_scores_rating          1803 non-null   float64
 13  rev

In [5]:
# Per-column missing-value summary for `data`.
# The null mask is computed once and reused for both statistics.
null_mask = data.isnull()

# missing counts
missing_counts = null_mask.sum()

# missing ratio (fraction of rows, rounded to 3 decimals)
missing_ratio = null_mask.mean().round(3)

# Sort by ratio first, then by the exact count, so that columns whose rounded
# ratios tie (e.g. 1857 vs 1858 missing rows both round to 0.508) are ordered
# by their true number of missing rows instead of arbitrarily.
missing_df = pd.DataFrame({
    'missing_count': missing_counts,
    'missing_ratio': missing_ratio
}).sort_values(by=['missing_ratio', 'missing_count'], ascending=False)

print(missing_df)


                              missing_count  missing_ratio
review_scores_value                    1858          0.508
review_scores_communication            1857          0.508
review_scores_checkin                  1857          0.508
review_scores_accuracy                 1857          0.508
review_scores_cleanliness              1857          0.508
review_scores_location                 1858          0.508
review_scores_rating                   1856          0.507
price                                   994          0.272
beds                                    938          0.256
bathrooms                               901          0.246
bedrooms                                274          0.075
latitude                                  0          0.000
neighbourhood_group_cleansed              0          0.000
neighbourhood_cleansed                    0          0.000
accommodates                              0          0.000
longitude                                 0          0.0

**Data preprocessing**

Missing value handling

In [6]:
import numpy as np

# Impute every column of `data` that contains missing values.
# All fills are vectorised (`Series.fillna` with a scalar or an index-aligned
# Series) rather than row-wise `DataFrame.apply(axis=1)`, which builds a
# Python object per row and is far slower for the same result.

# review scores: fill gaps with the mean of each score column
for col in ['review_scores_value','review_scores_communication','review_scores_checkin',
            'review_scores_accuracy','review_scores_cleanliness','review_scores_location',
            'review_scores_rating']:
    data[col] = data[col].fillna(data[col].mean())

# price: strip the '$' sign and ',' separators, convert to float,
# then fill the remaining gaps with the median price
data['price'] = (
    data['price']
    .replace(r'[\$,]', '', regex=True)
    .astype(float)
)

data['price'] = data['price'].fillna(data['price'].median())

# beds: where missing, use the listing's `accommodates` value
data['beds'] = data['beds'].fillna(data['accommodates'])

# bedrooms / bathrooms: where missing, use ceil(accommodates / 2)
half_capacity = np.ceil(data['accommodates'] / 2)

# bedrooms
data['bedrooms'] = data['bedrooms'].fillna(half_capacity)

# bathrooms
data['bathrooms'] = data['bathrooms'].fillna(half_capacity)



In [7]:
# Re-check dtypes and non-null counts after the imputation cell.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3659 entries, 0 to 3658
Data columns (total 19 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   neighbourhood_cleansed        3659 non-null   object 
 1   neighbourhood_group_cleansed  3659 non-null   object 
 2   latitude                      3659 non-null   float64
 3   longitude                     3659 non-null   float64
 4   property_type                 3659 non-null   object 
 5   room_type                     3659 non-null   object 
 6   accommodates                  3659 non-null   int64  
 7   bathrooms                     3659 non-null   float64
 8   bedrooms                      3659 non-null   float64
 9   beds                          3659 non-null   float64
 10  price                         3659 non-null   float64
 11  number_of_reviews             3659 non-null   int64  
 12  review_scores_rating          3659 non-null   float64
 13  rev

In [8]:
# (rows, columns) of the frame after imputation.
data.shape

(3659, 19)

In [9]:
# Preview the first rows of the imputed frame.
data.head()

Unnamed: 0,neighbourhood_cleansed,neighbourhood_group_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms,bedrooms,beds,price,number_of_reviews,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value
0,Tampines,East Region,1.34537,103.95887,Private room in villa,Private room,2,1.0,2.0,3.0,143.0,19,4.44,4.37,4.0,4.63,4.78,4.26,4.32
1,Tampines,East Region,1.34754,103.95958,Private room in home,Private room,1,1.0,1.0,1.0,157.0,24,4.16,4.22,4.09,4.43,4.43,4.17,4.04
2,Tampines,East Region,1.34531,103.961,Private room in home,Private room,2,0.5,1.0,2.0,76.0,46,4.41,4.39,4.52,4.63,4.64,4.5,4.36
3,Bukit Merah,Central Region,1.29015,103.80814,Private room in rental unit,Private room,1,1.0,1.0,1.0,157.0,20,4.4,4.16,4.26,4.47,4.42,4.53,4.63
4,Bukit Merah,Central Region,1.28836,103.81144,Private room in rental unit,Private room,1,1.0,1.0,1.0,157.0,16,4.27,4.44,4.06,4.5,4.5,4.63,4.13


**Categorical data encoding**

In [10]:
# Text-valued columns that need encoding before they can be used numerically.
categorical_cols = [
    'neighbourhood_cleansed',
    'neighbourhood_group_cleansed',
    'property_type',
    'room_type',
]

# Report the number of distinct values in each categorical column.
for col in categorical_cols:
    n_unique = data[col].nunique()
    print(f"{col}: {n_unique} unique values")

neighbourhood_cleansed: 44 unique values
neighbourhood_group_cleansed: 5 unique values
property_type: 49 unique values
room_type: 4 unique values


In [11]:
# Number of listings for each property type, as returned by value_counts().
property_type_values = data['property_type']
property_counts = property_type_values.value_counts()

print(property_counts)

property_type
Private room in rental unit           785
Entire serviced apartment             687
Room in hotel                         499
Entire rental unit                    481
Private room in home                  194
Private room in condo                 189
Entire condo                          185
Private room in serviced apartment    108
Room in boutique hotel                103
Entire home                            55
Room in aparthotel                     51
Private room in townhouse              38
Private room in bungalow               36
Room in hostel                         28
Entire loft                            23
Private room in guesthouse             23
Shared room in bed and breakfast       22
Private room in bed and breakfast      21
Private room in hostel                 18
Shared room in hostel                  14
Room in serviced apartment             13
Private room in villa                  13
Private room                           13
Entire townhouse    

In [12]:
# One-Hot Encoding
#
# Step 1: collapse rarely-seen property types into a single 'Other' bucket.
# Any type with fewer than `threshold` listings counts as rare.
threshold = 50
counts = data['property_type'].value_counts()
rare_types = counts[counts < threshold].index
is_rare = data['property_type'].isin(rare_types)
data['property_type'] = data['property_type'].where(~is_rare, 'Other')

# Step 2: keep the listing ids so the encoded frame can be indexed by them.
ids = raw_data['id']

# Step 3: expand every categorical column into indicator columns
# (all levels are kept, none is dropped).
one_hot_columns = [
    'neighbourhood_cleansed',
    'neighbourhood_group_cleansed',
    'room_type',
    'property_type',
]
encoded_data = pd.get_dummies(data, columns=one_hot_columns, drop_first=False)
encoded_data.index = ids

encoded_data.shape

(3659, 80)

In [13]:
# Preview the one-hot encoded frame, which is indexed by listing id.
encoded_data.head()

Unnamed: 0_level_0,latitude,longitude,accommodates,bathrooms,bedrooms,beds,price,number_of_reviews,review_scores_rating,review_scores_accuracy,...,property_type_Entire rental unit,property_type_Entire serviced apartment,property_type_Other,property_type_Private room in condo,property_type_Private room in home,property_type_Private room in rental unit,property_type_Private room in serviced apartment,property_type_Room in aparthotel,property_type_Room in boutique hotel,property_type_Room in hotel
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
71609,1.34537,103.95887,2,1.0,2.0,3.0,143.0,19,4.44,4.37,...,False,False,True,False,False,False,False,False,False,False
71896,1.34754,103.95958,1,1.0,1.0,1.0,157.0,24,4.16,4.22,...,False,False,False,False,True,False,False,False,False,False
71903,1.34531,103.961,2,0.5,1.0,2.0,76.0,46,4.41,4.39,...,False,False,False,False,True,False,False,False,False,False
275343,1.29015,103.80814,1,1.0,1.0,1.0,157.0,20,4.4,4.16,...,False,False,False,False,False,True,False,False,False,False
275344,1.28836,103.81144,1,1.0,1.0,1.0,157.0,16,4.27,4.44,...,False,False,False,False,False,True,False,False,False,False


**Calculate cosine similarity**

In [14]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler


# Numeric columns, which are rescaled to the [0, 1] range below.
num_features = [
    'latitude', 'longitude',
    'accommodates', 'bathrooms', 'bedrooms', 'beds',
    'price', 'number_of_reviews',
    'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness',
    'review_scores_checkin', 'review_scores_communication',
    'review_scores_location', 'review_scores_value',
]

# Every remaining column is treated as a categorical indicator.
cat_features = []
for col in encoded_data.columns:
    if col not in num_features:
        cat_features.append(col)

# Min-max scale the numeric block.
scaler = MinMaxScaler()
X_num_scaled = scaler.fit_transform(encoded_data[num_features])

# Indicator block cast to integers, then placed to the right of the
# scaled numeric block to form the final feature matrix.
X_cat = encoded_data[cat_features].astype(int).values
X_final = np.hstack([X_num_scaled, X_cat])

print(X_final[0])


[0.52173287 0.91352113 0.06666667 0.07692308 0.25       0.13636364
 0.01092392 0.0152856  0.86       0.8425     0.75       0.9075
 0.945      0.815      0.83       0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         1.         0.
 0.         0.         0.         0.         0.         0.
 1.         0.         0.         0.         0.         0.
 1.         0.         0.         0.         0.         0.
 1.         0.         0.         0.         0.         0.
 0.         0.        ]


In [15]:
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd


# Pairwise cosine similarity between the rows of the feature matrix.
similarity_matrix = cosine_similarity(X_final)

# Label rows and columns with the index of `encoded_data`.
listing_ids = encoded_data.index
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=listing_ids,
    columns=listing_ids,
)

print(similarity_df)

# Write the full similarity matrix to disk.
similarity_df.to_csv("cosine_similarity.csv")

# Disabled alternative: export only the strict upper triangle (k=1).
# upper_tri = np.triu(similarity_df, k=1)
# result = pd.DataFrame(upper_tri, index=encoded_data.index, columns=encoded_data.index)
# result.to_csv("cosine_similarity_upper_triangle.csv")




id                   71609                71896                \
id                                                              
71609                           1.000000             0.897696   
71896                           0.897696             1.000000   
71903                           0.901756             0.998599   
275343                          0.681629             0.671949   
275344                          0.680335             0.670880   
...                                  ...                  ...   
1449345947252383766             0.592514             0.580999   
1449362431787038446             0.592620             0.581112   
1449745682595467553             0.598019             0.585263   
1449768832549400450             0.604553             0.589592   
1450531852548562189             0.593668             0.577839   

id                   71903                275343               \
id                                                              
71609                   