In [363]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import folium
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from matplotlib_inline.backend_inline import set_matplotlib_formats
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score
from sklearn.metrics import f1_score
from scipy import stats
from scipy.stats import chi2_contingency

In [364]:
# Korean font: select the AppleGothic family so Hangul labels can be drawn.
plt.rcParams["font.family"] = 'AppleGothic'
# Keep the minus sign on axes from being rendered with the Unicode minus glyph.
plt.rcParams["axes.unicode_minus"] = False

# Sharper text: ask the inline backend for "retina" figures.
set_matplotlib_formats("retina")

In [365]:
# Read the listings export from the working directory and report its
# (rows, columns) shape.
LISTINGS_CSV = '2025_Airbnb_NYC_listings.csv'

df = pd.read_csv(LISTINGS_CSV)
print(f"데이터 수 : {df.shape}")

데이터 수 : (22308, 73)


# 컬럼 정리

In [366]:
# Columns retained for the analysis, grouped by theme. The groups are
# flattened in insertion order, so cols_keep has the same order as the
# flat list this replaces.
_column_groups = {
    'host': [
        'host_id',
        'host_response_time',
        'host_response_rate',
        'host_acceptance_rate',
        'host_is_superhost',
        'host_identity_verified',
        'host_listings_count',
    ],
    'location': [
        'neighbourhood_group_cleansed',
        'latitude',
        'longitude',
    ],
    'property': [
        'property_type',
        'room_type',
        'accommodates',
        'bedrooms',
        'beds',
        'bathrooms',
        'amenities',
    ],
    'reviews': [
        'number_of_reviews',
        'reviews_per_month',
        'review_scores_rating',
        'review_scores_cleanliness',
        'review_scores_communication',
        'first_review',
        'last_review',
    ],
    'booking': [
        'price',
        'minimum_nights',
        'maximum_nights',
        'availability_365',
        'instant_bookable',
    ],
}

cols_keep = [
    column
    for group_columns in _column_groups.values()
    for column in group_columns
]

# Narrow the frame to the selected columns only.
df = df[cols_keep]

# neighbourhood_group_cleansed 전처리

In [367]:
# One-hot encode neighbourhood_group_cleansed. drop_first is not used, so
# every category keeps its own indicator column.
df = pd.get_dummies(df, columns = ['neighbourhood_group_cleansed'])

# Cast every boolean column currently in the frame to int.
bool_cols = df.select_dtypes(include = 'bool').columns
df[bool_cols] = df[bool_cols].astype(int)

In [368]:
# Rename the generated indicator columns to short names.
# NOTE(review): 'bornx' looks like a misspelling of "bronx" and
# 'staten island' contains a space; both are kept as-is because other cells
# may already refer to these exact names. Confirm before changing.
short_names = {
    'neighbourhood_group_cleansed_Bronx': 'bornx',
    'neighbourhood_group_cleansed_Brooklyn': 'brooklyn',
    'neighbourhood_group_cleansed_Manhattan': 'manhattan',
    'neighbourhood_group_cleansed_Queens': 'queens',
    'neighbourhood_group_cleansed_Staten Island': 'staten island',
}

df = df.rename(columns = short_names)

# price 전처리

In [369]:
# Remove the "$" sign and "," separators from price, then convert to float.
# The pattern is written as a raw string: in a regular string literal "\$"
# is an invalid escape sequence, which Python flags with a warning. The
# regex itself is unchanged.

df['price'] = df['price'].replace(r'[\$,]', '', regex = True).astype(float)

In [370]:
import matplotlib.pyplot as plt
import seaborn as sns

# IQR fences for price: quartiles, spread, then Q1 - 1.5*IQR and Q3 + 1.5*IQR.
Q1, Q3 = df['price'].quantile([0.25, 0.75])
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Keep rows whose price lies inside the fences (both ends inclusive).
# Rows with a missing price fail the range test and are dropped as well.
df = df[df['price'].between(lower_bound, upper_bound)]

In [371]:
# Add the natural log of price as a new column.
# NOTE(review): a price of 0 would produce -inf here; confirm no such rows
# remain after the outlier filter.
df['log_price'] = np.log(df['price'])

# instant_bookable 전처리

In [372]:
# Encode the flag as an integer: values equal to the string 't' become 1,
# anything else (including missing values) becomes 0.
# NOTE: not idempotent. Re-running this cell on the converted column
# yields all zeros, because no value equals 't' any more.
df['instant_bookable'] = df['instant_bookable'].eq('t').astype(int)

In [373]:
# Drop maximum_nights outliers using the 1.5 * IQR fences.
Q1, Q3 = df['maximum_nights'].quantile([0.25, 0.75])
IQR = Q3 - Q1

lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# between() is inclusive on both ends, i.e. >= lower_bound and <= upper_bound.
df = df[df['maximum_nights'].between(lower_bound, upper_bound)]

In [374]:
# Keep only listings whose minimum_nights is strictly below 365.
df = df.loc[df['minimum_nights'].lt(365)]