In [43]:
import os
import csv
import pandas as pd
import numpy as np

#load datasets

airbnb_crime = pd.read_csv('cleaned_airnb_crime.csv')


In [44]:
airbnb_crime.shape

(38821, 28)

In [45]:
airbnb_crime.dtypes

id                                                     int64
neighbourhood_group                                   object
latitude                                             float64
longitude                                            float64
room_type                                             object
price                                                  int64
minimum_nights                                         int64
number_of_reviews                                      int64
last_review                                           object
reviews_per_month                                    float64
calculated_host_listings_count                         int64
availability_365                                       int64
crime_count                                            int64
distance_to_statue_of_liberty_km                     float64
distance_to_times_square_km                          float64
distance_to_central_park_km                          float64
distance_to_empire_state

In [46]:
#=========================================== 1ST ML REGRESSION AND LINEAR REGRESSION TEST ===========================================

In [47]:
from sklearn.model_selection import train_test_split

In [48]:
X = airbnb_crime.drop(['id', 'last_review','neighbourhood_group','room_type','price'], axis=1)
y = airbnb_crime['price']

In [49]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [50]:
from sklearn.neighbors import KNeighborsRegressor

knn = KNeighborsRegressor(n_neighbors=10) # n_neighbours is a "hyperparameter", which can be changed to improve performance of the model
knn.fit(X_train, y_train)

In [51]:
knn.score(X_test, y_test)  # R2 = 0.04

0.04351781610417438

In [52]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(X_train, y_train)

In [53]:
lr.score(X_test, y_test) # R2 = 0.08

0.07883072026989235

Obs:Our model improves with linear regression.  R2 = 0.08 < R2 = 0.04

In [54]:
#=========================================== 2ND ML REGRESSION AND LINEAR REGRESSION TEST ===========================================

In [55]:
#Normalization 

from sklearn.preprocessing import MinMaxScaler

minmax = MinMaxScaler() # an exxample of a transformer

minmax.fit(X_train) #Fit to our data training 

X_train_transformed = minmax.transform(X_train) # Transform  our data training
X_test_transformed = minmax.transform(X_test) 

In [56]:
# Put transformed data into a DataFrame
X_train_transformed = pd.DataFrame(minmax.transform(X_train), columns=X_train.columns) 
X_test_transformed = pd.DataFrame(minmax.transform(X_test), columns=X_train.columns)

In [57]:
X_train_transformed

Unnamed: 0,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,crime_count,distance_to_statue_of_liberty_km,distance_to_times_square_km,...,distance_to_rockefeller_center_km,distance_to_one_world_trade_center_km,distance_to_broadway_km,distance_to_grand_central_terminal_km,distance_to_the_metropolitan_museum_of_art_km,distance_to_american_museum_of_natural_history_km,distance_to_9/11_memorial_and_museum_km,distance_to_fifth_avenue_km,distance_to_chrysler_building_km,distance_to_the_high_line_km
0,0.504624,0.559398,0.000801,0.000000,0.000171,0.000000,0.000000,0.624909,0.234107,0.173723,...,0.162230,0.188394,0.173341,0.147953,0.193835,0.211408,0.192503,0.186190,0.141327,0.191626
1,0.538775,0.496517,0.000000,0.092357,0.024449,0.000000,0.841096,0.000000,0.166269,0.102625,...,0.098224,0.106340,0.103042,0.082483,0.152724,0.160893,0.112671,0.142251,0.077945,0.097998
2,0.440378,0.557880,0.000000,0.004777,0.048726,0.006135,0.000000,0.624909,0.216988,0.245899,...,0.236356,0.209620,0.245688,0.221945,0.268748,0.285021,0.211585,0.261502,0.215957,0.253412
3,0.380311,0.573234,0.000801,0.001592,0.034023,0.003067,0.161644,0.624909,0.260213,0.324606,...,0.314891,0.280756,0.324315,0.301230,0.341997,0.358946,0.281694,0.335795,0.295415,0.332809
4,0.635615,0.472318,0.001601,0.001592,0.000684,0.000000,0.000000,0.000000,0.265744,0.024669,...,0.033679,0.203034,0.025701,0.052224,0.067595,0.058704,0.211537,0.058264,0.056360,0.068424
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31051,0.491493,0.440355,0.000801,0.014331,0.007181,0.000000,0.000000,0.000000,0.044586,0.170719,...,0.172651,0.018729,0.171748,0.160944,0.227461,0.229816,0.019412,0.217597,0.158805,0.135708
31052,0.729017,0.535445,0.023219,0.000000,0.002736,0.015337,0.832877,0.000000,0.453361,0.154570,...,0.146224,0.370679,0.153788,0.167170,0.065507,0.073729,0.378866,0.079660,0.167946,0.220497
31053,0.548445,0.482035,0.003203,0.011146,0.014361,0.000000,0.139726,0.000000,0.158170,0.088946,...,0.087575,0.096391,0.089675,0.073035,0.146257,0.151410,0.103589,0.135443,0.069856,0.075364
31054,0.693185,0.555336,0.010408,0.007962,0.004445,0.000000,0.008219,0.000000,0.421059,0.130497,...,0.117987,0.337400,0.129398,0.135167,0.039111,0.062795,0.345279,0.052109,0.134394,0.199983


In [58]:
#Put transformed data into a DataFrame 
X_train_transformed = pd.DataFrame(minmax.transform(X_train))
X_test_transformed = pd.DataFrame(minmax.transform(X_test))

In [59]:

knn.fit(X_train_transformed, y_train)

In [60]:
lr.fit(X_train_transformed, y_train)

In [61]:
knn.score(X_test_transformed, y_test) # R2 = 0.6

0.0590910134840994

In [62]:
lr.score(X_test_transformed, y_test) # R2 = 0.08

0.07883072027129079

Obs: With raw data we obtain a R2 of 0.04, just by normalizing our data, model's perfomance increase to a R2 of 0.06. For linear regression the R2 maintain.

This happens because KNN is a distance based algorithm, so its suffers a lot with data in completely different scales.

In [63]:
#=========================================== 3RD ML TEST ===========================================

In [64]:
# room_type- ordinal -> Label Encoding
airbnb_crime.room_type.value_counts(dropna=False)

room_type
Entire home/apt    20321
Private room       17654
Shared room          846
Name: count, dtype: int64

In [65]:
# neighbourhood_group - nominal -> One Hot Encoding
airbnb_crime.neighbourhood_group.value_counts(dropna=False)

neighbourhood_group
Manhattan        16621
Brooklyn         16439
Queens            4572
Bronx              875
Staten Island      314
Name: count, dtype: int64

In [66]:
from sklearn.preprocessing import OneHotEncoder

oh_encoder = OneHotEncoder(drop='first')
oh_encoder.fit(X_train[['neighbourhood_group']])

X_train_cat_transformed = oh_encoder.transform(X_train[['neighbourhood_group']])
X_train_cat_transformed = pd.DataFrame(X_train_cat_transformed, columns=oh_encoder.get_feature_names_out())
X_train_cat_transformed

KeyError: "None of [Index(['neighbourhood_group'], dtype='object')] are in the [columns]"

In [None]:
from sklearn.preprocessing import OrdinalEncoder

l_encoder = OrdinalEncoder()
l_encoder.fit(X_train[['Experience']]) # input needs to be a dataframe, not a series
X_train_label = l_encoder.transform(X_train[['Experience']]) # need to have the same cols for transformation as we did for training
X_train_label