In [1]:
import pandas as pd
import numpy as np

airbnb_crime = pd.read_csv(r'C:\Madhuri\projects\machine_learning\NYC-Airbnb-Price-Prediction\cleaned_airnb_crime.csv')

Load data

In [2]:
airbnb_crime

Unnamed: 0,id,neighbourhood_group,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,...,distance_to_rockefeller_center,distance_to_one_world_trade_center,distance_to_broadway,distance_to_grand_central_terminal,distance_to_the_metropolitan_museum_of_art,distance_to_american_museum_of_natural_history,distance_to_9/11_memorial_and_museum,distance_to_fifth_avenue,distance_to_chrysler_building,distance_to_the_high_line
0,2539,Brooklyn,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,...,12.381844,8.037748,12.442461,11.708876,14.691985,14.882287,7.913507,14.233274,11.584939,11.504155
1,2595,Manhattan,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,...,0.712997,5.185197,0.602361,0.559866,3.350997,3.188791,5.306275,2.862444,0.745382,1.875920
2,3831,Brooklyn,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,...,8.337723,5.463919,8.473967,7.658067,10.489457,10.762152,5.387559,10.047648,7.511024,7.951725
3,5022,Manhattan,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.10,...,5.299525,11.185555,5.559599,5.809606,2.669340,3.166975,11.303146,3.150993,5.839852,7.599008
4,5099,Manhattan,40.74767,-73.97500,Entire home/apt,200,3,74,2019-06-22,0.59,...,1.269235,5.054090,1.492264,0.592728,3.668465,3.743125,5.160574,3.181790,0.443553,2.507579
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38816,36425863,Manhattan,40.78099,-73.95366,Private room,129,1,1,2019-07-07,1.00,...,3.249367,9.105050,3.564490,3.717130,0.825234,1.711978,9.220080,1.184891,3.737230,5.655697
38817,36427429,Queens,40.75104,-73.81459,Private room,45,1,1,2019-07-07,1.00,...,13.847483,17.283137,14.336755,13.701176,12.911414,13.839918,17.317505,13.005918,13.538611,16.023458
38818,36438336,Staten Island,40.54179,-74.14275,Private room,235,1,1,2019-07-07,1.00,...,27.812877,21.921177,27.599060,27.297368,30.455700,30.200240,21.801995,29.965427,27.278328,25.714890
38819,36442252,Bronx,40.80787,-73.92400,Entire home/apt,100,1,2,2019-07-07,2.00,...,7.143857,12.983129,7.445728,7.595059,4.572722,5.140092,13.095514,5.032590,7.598851,9.517212


Data split

In [3]:
X = airbnb_crime[['latitude', 'longitude', 'minimum_nights', 'number_of_reviews', 'availability_365', 'crime_count',
               'calculated_host_listings_count', 'distance_to_statue_of_liberty', 'distance_to_times_square', 'neighbourhood_group', 'room_type']]
y = airbnb_crime['price']

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((31056, 11), (7765, 11), (31056,), (7765,))

Scaling

In [8]:
from sklearn.preprocessing import MinMaxScaler

# Select only numeric columns
numeric_features = ['latitude', 'longitude', 'minimum_nights', 
                    'number_of_reviews', 'availability_365', 'crime_count',
                    'calculated_host_listings_count', 
                    'distance_to_statue_of_liberty', 'distance_to_times_square']

scaler = MinMaxScaler()
scaler.fit(X_train[numeric_features])

X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

# Replace numeric columns with scaled versions
X_train_scaled[numeric_features] = scaler.transform(X_train[numeric_features])
X_test_scaled[numeric_features] = scaler.transform(X_test[numeric_features])


In [9]:
# Put transformed data into a DataFrame (only numeric features)
X_train_transformed = pd.DataFrame(scaler.transform(X_train[numeric_features]), columns=numeric_features, index=X_train.index)
X_test_transformed = pd.DataFrame(scaler.transform(X_test[numeric_features]), columns=numeric_features, index=X_test.index)

One hot encoding

In [20]:
from sklearn.preprocessing import OneHotEncoder

# Columns you want to one-hot encode
cat_cols = ['room_type', 'neighbourhood_group'] 

# Initialize encoder
ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False) 

# Fit on training data
ohe.fit(airbnb_crime.loc[X_train.index, cat_cols])

# Transform train and test
X_train_ohe = ohe.transform(airbnb_crime.loc[X_train.index, cat_cols])
X_test_ohe  = ohe.transform(airbnb_crime.loc[X_test.index, cat_cols])

# Convert to DataFrame with proper column names
X_train_ohe = pd.DataFrame(
    X_train_ohe,
    columns=ohe.get_feature_names_out(cat_cols),
    index=X_train.index
)

X_test_ohe = pd.DataFrame(
    X_test_ohe,
    columns=ohe.get_feature_names_out(cat_cols),
    index=X_test.index
)

# Store in dictionary (modular)
X_train_dict = {
    "numeric": X_train_encoded,
    "categorical": X_train_ohe
}

X_test_dict = {
    "numeric": X_test_encoded,
    "categorical": X_test_ohe
}

# Combine back into final DataFrame for modeling
X_train_final = pd.concat(X_train_dict.values(), axis=1)
X_test_final  = pd.concat(X_test_dict.values(), axis=1)


In [25]:
X_train_final

Unnamed: 0,neighbourhood_group_Brooklyn,neighbourhood_group_Manhattan,neighbourhood_group_Queens,neighbourhood_group_Staten Island,latitude,longitude,minimum_nights,number_of_reviews,availability_365,crime_count,...,distance_to_statue_of_liberty,distance_to_times_square,room_type_Entire home/apt,room_type_Private room,room_type_Shared room,neighbourhood_group_Bronx,neighbourhood_group_Brooklyn.1,neighbourhood_group_Manhattan.1,neighbourhood_group_Queens.1,neighbourhood_group_Staten Island.1
28907,0,1.0,0,0,0.588826,0.466111,0.022418,0.001592,0.852055,0.000000,...,0.179923,0.054271,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
37419,1.0,0,0,0,0.449966,0.481978,0.000000,0.000000,1.000000,0.624909,...,0.091419,0.226271,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
10999,0,1.0,0,0,0.550905,0.448725,0.000801,0.030255,0.000000,0.000000,...,0.112088,0.107248,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
28196,1.0,0,0,0,0.512263,0.567578,0.000801,0.109873,0.082192,0.624909,...,0.246727,0.181275,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
29403,0,1.0,0,0,0.805816,0.569287,0.000000,0.044586,0.282192,0.000000,...,0.577611,0.249029,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6265,1.0,0,0,0,0.455892,0.540209,0.000801,0.020701,0.000000,0.624909,...,0.187808,0.230138,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
11284,0,1.0,0,0,0.647835,0.469376,0.004804,0.007962,0.000000,0.000000,...,0.267154,0.029286,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
38158,0,0,1.0,0,0.581082,0.877522,0.000000,0.000000,0.487671,0.930002,...,0.772873,0.500619,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
860,0,1.0,0,0,0.671489,0.545125,0.002402,0.160828,0.767123,0.000000,...,0.368925,0.094109,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


making dictionery

In [18]:
from sklearn.preprocessing import OneHotEncoder

# --- Initialize OneHotEncoder ---
ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

# --- Fit on train set ---
ohe.fit(airbnb_crime.loc[X_train.index, ['room_type']])

# --- Transform both train & test ---
X_train_ohe = ohe.transform(airbnb_crime.loc[X_train.index, ['room_type']])
X_test_ohe  = ohe.transform(airbnb_crime.loc[X_test.index, ['room_type']])

# --- Convert back to DataFrame with column names ---
X_train_ohe = pd.DataFrame(
    X_train_ohe,
    columns=ohe.get_feature_names_out(['room_type']),
    index=X_train.index
)

X_test_ohe = pd.DataFrame(
    X_test_ohe,
    columns=ohe.get_feature_names_out(['room_type']),
    index=X_test.index
)

# --- Store everything in a dictionary (easy to debug/modify) ---
X_train_dict = {
    "numeric": X_train_encoded,
    "room_type": X_train_ohe
}

X_test_dict = {
    "numeric": X_test_encoded,
    "room_type": X_test_ohe
}

# --- Combine back into final DataFrame for modeling ---
X_train_final = pd.concat(X_train_dict.values(), axis=1)
X_test_final  = pd.concat(X_test_dict.values(), axis=1)


Model building

Linear regression

In [None]:
from sklearn.linear_model import LinearRegression

y_train = airbnb_crime.loc[X_train.index, 'price']
y_test  = airbnb_crime.loc[X_test.index, 'price']

# Now fit model
lr = LinearRegression()
lr.fit(X_train_final, y_train)



In [28]:
y_pred = lr.predict(X_test_final)



In [30]:
lr.score(X_test_final, y_test)



0.15274448689052622

In [38]:
from sklearn.metrics import mean_squared_error, r2_score
lr_train_mse = mean_squared_error(y_train, lr.predict(X_train_final))
lr_train_r2 = r2_score(y_train, lr.predict(X_train_final))

lr_test_mse = mean_squared_error(y_test, y_pred)
lr_test_r2 = r2_score(y_test, y_pred)

print('LR MSE (Train):', lr_train_mse)
print('LR R2  (Train):', lr_train_r2)
print('LR MSE  (Test):', lr_test_mse)       
print('LR R2   (Test):', lr_test_r2)

LR MSE (Train): 35453.99297539123
LR R2  (Train): 0.11684699744130311
LR MSE  (Test): 33628.446100193694
LR R2   (Test): -0.005299556840179465




In [41]:
lr_results = pd.DataFrame(['Linear Regression', lr_train_mse, lr_train_r2, lr_test_mse, lr_test_r2]).transpose()
lr_results.columns = ['Model', 'Train MSE', 'Train R2', 'Test MSE', 'Test R2']

In [42]:
lr_results

Unnamed: 0,Model,Train MSE,Train R2,Test MSE,Test R2
0,Linear Regression,35453.992975,0.116847,33628.4461,-0.0053


KNN

In [31]:
from sklearn.neighbors import KNeighborsRegressor

knn = KNeighborsRegressor(n_neighbors=10) # n_neighbours is a "hyperparameter", which can be changed to improve performance of the model
knn.fit(X_train_final, y_train)



In [32]:
knn.score(X_test_final, y_test)



0.156067584145277

In [34]:
from sklearn.tree import DecisionTreeRegressor

dt_reg = DecisionTreeRegressor(
    max_depth=10, # 30, 50, 100 - we can have many more splits when we have continuous variables
    min_samples_split=2,
    max_leaf_nodes=30 # basically equivalent to max_depth but still considered
)

In [35]:
dt_reg.fit(X_train_final, y_train)



In [36]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

y_pred = dt_reg.predict(X_test_final)

print("MSE:", mean_squared_error(y_test, y_pred))
print("RMSE:", mean_squared_error(y_test, y_pred, squared=False))
print("MAE:", mean_absolute_error(y_test, y_pred))
print("R²:", r2_score(y_test, y_pred))

MSE: 33628.446100193694
RMSE: 183.3806044820272
MAE: 58.18598071659245
R²: -0.005299556840179465




In [50]:
y_dt_reg_pred = dt_reg.predict(X_train_final)
y_dt_reg_test_pred = dt_reg.predict(X_test_final)



In [51]:
dt_reg_train_mse = mean_squared_error(y_train, dt_reg.predict(X_train_final))
dt_reg_train_r2 = r2_score(y_train, dt_reg.predict(X_train_final))

dt_reg_test_mse = mean_squared_error(y_test, y_pred)
dt_reg_test_r2 = r2_score(y_test, y_pred)

dt_reg_results = pd.DataFrame(['Decision Tree', dt_reg_train_mse,dt_reg_train_r2,dt_reg_test_mse,dt_reg_test_r2]).transpose()
dt_reg_results.columns = ['Model', 'Train MSE', 'Train R2', 'Test MSE', 'Test R2']



In [52]:
dt_reg_results

Unnamed: 0,Model,Train MSE,Train R2,Test MSE,Test R2
0,Decision Tree,22006.653241,0.451818,33628.4461,-0.0053


Random forest

In [44]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(max_depth=2, random_state=100)
rf.fit(X_train_final, y_train)



In [None]:
y_rf_train_pred = rf.predict(X_train_final)
y_rf_test_pred = rf.predict(X_test_final)

In [47]:
rf_train_mse = mean_squared_error(y_train, rf.predict(X_train_final))
rf_train_r2 = r2_score(y_train, rf.predict(X_train_final))

rf_test_mse = mean_squared_error(y_test, y_pred)
rf_test_r2 = r2_score(y_test, y_pred)

rf_results = pd.DataFrame(['Random Forest', rf_train_mse, rf_train_r2, rf_test_mse, rf_test_r2]).transpose()
rf_results.columns = ['Model', 'Train MSE', 'Train R2', 'Test MSE', 'Test R2']



In [49]:
rf_results

Unnamed: 0,Model,Train MSE,Train R2,Test MSE,Test R2
0,Random Forest,35437.656805,0.117254,33628.4461,-0.0053
