In [336]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import folium
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from matplotlib_inline.backend_inline import set_matplotlib_formats
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score
from sklearn.metrics import f1_score
from scipy import stats
from scipy.stats import chi2_contingency

In [337]:
# 한글 폰트
plt.rc("font", family = 'AppleGothic')
plt.rc("axes", unicode_minus = False)

# 글씨 선명하게
set_matplotlib_formats("retina")

In [349]:
df = pd.read_csv('2025_Airbnb_NYC_listings.csv')
print(f"데이터 수 : {df.shape}")

데이터 수 : (22308, 73)


# 컬럼 정리

In [339]:
cols_keep =[
    'host_id',
    'host_response_time',
    'host_response_rate',
    'host_acceptance_rate',
    'host_is_superhost',
    'host_identity_verified',
    'host_listings_count',
    'neighbourhood_group_cleansed',
    'latitude',
    'longitude',
    'property_type',
    'room_type',
    'accommodates',
    'bedrooms',
    'beds',
    'bathrooms',
    'amenities',
    'number_of_reviews',
    'reviews_per_month',
    'review_scores_rating',
    'review_scores_cleanliness',
    'review_scores_communication',
    'first_review',
    'last_review',
    'price',
    'minimum_nights',
    'maximum_nights',
    'availability_365',
    'instant_bookable',
]

df = df[cols_keep]

df

Unnamed: 0,host_id,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_identity_verified,host_listings_count,neighbourhood_group_cleansed,latitude,longitude,...,review_scores_rating,review_scores_cleanliness,review_scores_communication,first_review,last_review,price,minimum_nights,maximum_nights,availability_365,instant_bookable
0,62165,,,,f,t,1.0,Brooklyn,40.673760,-73.966110,...,4.88,4.80,5.00,2010-12-11,2013-05-10,$200.00,90,365,362,f
1,157798,,,100%,f,t,1.0,Manhattan,40.792454,-73.940742,...,4.77,4.76,4.90,2010-10-04,2023-12-09,$82.00,30,999,204,f
2,165789,within a few hours,100%,40%,f,t,1.0,Brooklyn,40.684420,-73.980680,...,4.70,4.52,4.88,2012-07-09,2023-08-30,$765.00,3,60,326,f
3,166532,within an hour,100%,97%,t,t,1.0,Manhattan,40.818058,-73.946671,...,4.85,4.50,4.96,2010-08-28,2025-02-21,$139.00,2,45,25,f
4,168525,within an hour,100%,100%,t,t,2.0,Brooklyn,40.710651,-73.950874,...,4.82,4.61,4.88,2010-08-02,2025-01-03,$130.00,4,45,38,f
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22303,40019013,,,,f,t,1.0,Manhattan,40.813794,-73.952399,...,,,,,,$72.00,30,365,358,f
22304,483056418,within an hour,100%,80%,t,t,24.0,Brooklyn,40.685500,-73.919510,...,,,,,,$58.00,30,365,363,f
22305,30283594,within an hour,94%,99%,f,t,619.0,Manhattan,40.745290,-73.979380,...,,,,,,$299.00,30,365,99,f
22306,407304997,within an hour,89%,100%,f,t,26.0,Manhattan,40.757133,-73.983124,...,,,,,,$200.00,30,365,365,t


# neighbourhood_group_cleansed 전처리

In [340]:
#neighbourhood_group_cleansed 원 핫 인코딩 

# encoding = pd.get_dummies(df, columns = ['neighbourhood_group_cleansed'], drop_first = True)
df = pd.get_dummies(df, columns = ['neighbourhood_group_cleansed'])

df[df.select_dtypes(include = 'bool').columns] = df.select_dtypes(include = 'bool').astype(int)

df

Unnamed: 0,host_id,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_identity_verified,host_listings_count,latitude,longitude,property_type,...,price,minimum_nights,maximum_nights,availability_365,instant_bookable,neighbourhood_group_cleansed_Bronx,neighbourhood_group_cleansed_Brooklyn,neighbourhood_group_cleansed_Manhattan,neighbourhood_group_cleansed_Queens,neighbourhood_group_cleansed_Staten Island
0,62165,,,,f,t,1.0,40.673760,-73.966110,Private room in rental unit,...,$200.00,90,365,362,f,0,1,0,0,0
1,157798,,,100%,f,t,1.0,40.792454,-73.940742,Private room in condo,...,$82.00,30,999,204,f,0,0,1,0,0
2,165789,within a few hours,100%,40%,f,t,1.0,40.684420,-73.980680,Private room in home,...,$765.00,3,60,326,f,0,1,0,0,0
3,166532,within an hour,100%,97%,t,t,1.0,40.818058,-73.946671,Entire home,...,$139.00,2,45,25,f,0,0,1,0,0
4,168525,within an hour,100%,100%,t,t,2.0,40.710651,-73.950874,Private room in rental unit,...,$130.00,4,45,38,f,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22303,40019013,,,,f,t,1.0,40.813794,-73.952399,Private room in rental unit,...,$72.00,30,365,358,f,0,0,1,0,0
22304,483056418,within an hour,100%,80%,t,t,24.0,40.685500,-73.919510,Private room in rental unit,...,$58.00,30,365,363,f,0,1,0,0,0
22305,30283594,within an hour,94%,99%,f,t,619.0,40.745290,-73.979380,Entire rental unit,...,$299.00,30,365,99,f,0,0,1,0,0
22306,407304997,within an hour,89%,100%,f,t,26.0,40.757133,-73.983124,Entire rental unit,...,$200.00,30,365,365,t,0,0,1,0,0


In [341]:
# 컬럼명 변경

df.rename(columns = {
    'neighbourhood_group_cleansed_Bronx' : 'bornx',
    'neighbourhood_group_cleansed_Brooklyn' : 'brooklyn',
    'neighbourhood_group_cleansed_Manhattan' : 'manhattan',
    'neighbourhood_group_cleansed_Queens' : 'queens',
    'neighbourhood_group_cleansed_Staten Island' : 'staten island'
}, inplace = True)

df.head()

Unnamed: 0,host_id,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_identity_verified,host_listings_count,latitude,longitude,property_type,...,price,minimum_nights,maximum_nights,availability_365,instant_bookable,bornx,brooklyn,manhattan,queens,staten island
0,62165,,,,f,t,1.0,40.67376,-73.96611,Private room in rental unit,...,$200.00,90,365,362,f,0,1,0,0,0
1,157798,,,100%,f,t,1.0,40.792454,-73.940742,Private room in condo,...,$82.00,30,999,204,f,0,0,1,0,0
2,165789,within a few hours,100%,40%,f,t,1.0,40.68442,-73.98068,Private room in home,...,$765.00,3,60,326,f,0,1,0,0,0
3,166532,within an hour,100%,97%,t,t,1.0,40.818058,-73.946671,Entire home,...,$139.00,2,45,25,f,0,0,1,0,0
4,168525,within an hour,100%,100%,t,t,2.0,40.710651,-73.950874,Private room in rental unit,...,$130.00,4,45,38,f,0,1,0,0,0


# price 전처리

In [342]:
# $ 제거

df['price'] = df['price'].replace('[\$,]', '', regex = True).astype(float)

df.head()

Unnamed: 0,host_id,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_identity_verified,host_listings_count,latitude,longitude,property_type,...,price,minimum_nights,maximum_nights,availability_365,instant_bookable,bornx,brooklyn,manhattan,queens,staten island
0,62165,,,,f,t,1.0,40.67376,-73.96611,Private room in rental unit,...,200.0,90,365,362,f,0,1,0,0,0
1,157798,,,100%,f,t,1.0,40.792454,-73.940742,Private room in condo,...,82.0,30,999,204,f,0,0,1,0,0
2,165789,within a few hours,100%,40%,f,t,1.0,40.68442,-73.98068,Private room in home,...,765.0,3,60,326,f,0,1,0,0,0
3,166532,within an hour,100%,97%,t,t,1.0,40.818058,-73.946671,Entire home,...,139.0,2,45,25,f,0,0,1,0,0
4,168525,within an hour,100%,100%,t,t,2.0,40.710651,-73.950874,Private room in rental unit,...,130.0,4,45,38,f,0,1,0,0,0


In [343]:
import matplotlib.pyplot as plt
import seaborn as sns

# IQR 계산
Q1 = df['price'].quantile(0.25)
Q3 = df['price'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

df = df[(df['price'] >= lower_bound) & (df['price'] <= upper_bound)]

df


Unnamed: 0,host_id,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_identity_verified,host_listings_count,latitude,longitude,property_type,...,price,minimum_nights,maximum_nights,availability_365,instant_bookable,bornx,brooklyn,manhattan,queens,staten island
0,62165,,,,f,t,1.0,40.673760,-73.966110,Private room in rental unit,...,200.0,90,365,362,f,0,1,0,0,0
1,157798,,,100%,f,t,1.0,40.792454,-73.940742,Private room in condo,...,82.0,30,999,204,f,0,0,1,0,0
3,166532,within an hour,100%,97%,t,t,1.0,40.818058,-73.946671,Entire home,...,139.0,2,45,25,f,0,0,1,0,0
4,168525,within an hour,100%,100%,t,t,2.0,40.710651,-73.950874,Private room in rental unit,...,130.0,4,45,38,f,0,1,0,0,0
5,169927,,,,f,t,2.0,40.762030,-73.988690,Private room in rental unit,...,139.0,30,1125,365,f,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22303,40019013,,,,f,t,1.0,40.813794,-73.952399,Private room in rental unit,...,72.0,30,365,358,f,0,0,1,0,0
22304,483056418,within an hour,100%,80%,t,t,24.0,40.685500,-73.919510,Private room in rental unit,...,58.0,30,365,363,f,0,1,0,0,0
22305,30283594,within an hour,94%,99%,f,t,619.0,40.745290,-73.979380,Entire rental unit,...,299.0,30,365,99,f,0,0,1,0,0
22306,407304997,within an hour,89%,100%,f,t,26.0,40.757133,-73.983124,Entire rental unit,...,200.0,30,365,365,t,0,0,1,0,0


In [348]:
df.loc[:, 'log_price'] = np.log(df['price'])

# instant_bookable 전처리

In [345]:
df['instant_bookable'] = (df['instant_bookable'] == 't').astype(int)

df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['instant_bookable'] = (df['instant_bookable'] == 't').astype(int)


Unnamed: 0,host_id,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_identity_verified,host_listings_count,latitude,longitude,property_type,...,minimum_nights,maximum_nights,availability_365,instant_bookable,bornx,brooklyn,manhattan,queens,staten island,log_price
0,62165,,,,f,t,1.0,40.673760,-73.966110,Private room in rental unit,...,90,365,362,0,0,1,0,0,0,5.298317
1,157798,,,100%,f,t,1.0,40.792454,-73.940742,Private room in condo,...,30,999,204,0,0,0,1,0,0,4.406719
3,166532,within an hour,100%,97%,t,t,1.0,40.818058,-73.946671,Entire home,...,2,45,25,0,0,0,1,0,0,4.934474
4,168525,within an hour,100%,100%,t,t,2.0,40.710651,-73.950874,Private room in rental unit,...,4,45,38,0,0,1,0,0,0,4.867534
5,169927,,,,f,t,2.0,40.762030,-73.988690,Private room in rental unit,...,30,1125,365,0,0,0,1,0,0,4.934474
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22303,40019013,,,,f,t,1.0,40.813794,-73.952399,Private room in rental unit,...,30,365,358,0,0,0,1,0,0,4.276666
22304,483056418,within an hour,100%,80%,t,t,24.0,40.685500,-73.919510,Private room in rental unit,...,30,365,363,0,0,1,0,0,0,4.060443
22305,30283594,within an hour,94%,99%,f,t,619.0,40.745290,-73.979380,Entire rental unit,...,30,365,99,0,0,0,1,0,0,5.700444
22306,407304997,within an hour,89%,100%,f,t,26.0,40.757133,-73.983124,Entire rental unit,...,30,365,365,1,0,0,1,0,0,5.298317


In [346]:
Q1 = df['maximum_nights'].quantile(0.25)
Q3 = df['maximum_nights'].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

df = df[(df['maximum_nights'] >= lower_bound) & (df['maximum_nights'] <= upper_bound)]

In [347]:
df = df[df['minimum_nights'] < 365]