In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from typing import List, Union
from sklearn.preprocessing import LabelEncoder
import matplotlib.font_manager as fm
import matplotlib as mp
# Use the first installed Korean-capable font so the Hangul plot labels render
# on any OS. 'AppleGothic' stays first, so the result on macOS is unchanged;
# 'Malgun Gothic' and 'NanumGothic' are the usual Windows / Linux choices.
# If none is installed, matplotlib's default font is left in place instead of
# pointing the config at a font that does not exist.
_korean_font_candidates = ('AppleGothic', 'Malgun Gothic', 'NanumGothic')
_installed_font_names = {entry.name for entry in fm.fontManager.ttflist}
_korean_font = next(
    (name for name in _korean_font_candidates if name in _installed_font_names),
    None,
)
if _korean_font is not None:
    mp.rc('font', family=_korean_font)
# Render minus signs with the ASCII hyphen instead of the Unicode minus glyph,
# which the fonts above may not provide.
plt.rcParams['axes.unicode_minus'] = False

: 

In [3]:
# 1. Load the data
# low_memory=False parses the file in a single pass so that each column gets
# one consistently inferred dtype.
raw_csv_path = '../data/Airbnb_Open_Data.csv'
airbnb_df = pd.read_csv(raw_csv_path, low_memory=False)

In [None]:
# 2. Inspect the data structure and basic statistics
# First rows of the raw frame, as loaded from the CSV.
airbnb_df.head()


In [None]:
# Column dtypes and non-null counts of the raw frame.
airbnb_df.info()

In [6]:
def drop_column(df,column_list):
    """Return a new DataFrame without the columns named in ``column_list``.

    All columns are removed with a single ``DataFrame.drop`` call rather than
    one call (and one intermediate copy of the frame) per column.

    Args:
        df: Source DataFrame; it is not modified.
        column_list: Iterable of column labels to remove.

    Returns:
        A new DataFrame holding the remaining columns in their original order.

    Raises:
        KeyError: If any label in ``column_list`` is not a column of ``df``.
    """
    return df.drop(columns=list(column_list))

In [None]:
# Columns removed up front because the rest of this notebook does not use them.
columns = [
    "host name",
    "country",
    "country code",
    "calculated host listings count",
    "license",
    "house_rules",
    "last review",
    "instant_bookable",
    "long",
    "lat",
    # 'neighbourhood' and "neighbourhood group" are deliberately NOT dropped;
    # "neighbourhood group" is label-encoded and grouped on further down.
    "NAME",
    "host id",
    "host_identity_verified"
]

# NOTE: airbnb_df is rebound here, so re-running this cell without reloading
# the CSV raises KeyError (the columns are already gone).
airbnb_df = drop_column(airbnb_df,columns)
# Confirm which columns and dtypes remain.
airbnb_df.info()


In [None]:
# Distinct cancellation_policy values (NaN is listed too if any are missing).
airbnb_df['cancellation_policy'].unique()

In [9]:
# Strip the '$' sign and thousands separators from the money columns, then
# parse what is left as float.
for money_column in ('price', 'service fee'):
    stripped = airbnb_df[money_column].replace(r'[\$,]', '', regex=True)
    airbnb_df[money_column] = stripped.astype(float)

In [None]:
# Summary statistics of the numeric columns; price and service fee are
# included now that they have been parsed to float.
airbnb_df.describe()

In [None]:
# Number of missing values per column.
airbnb_df.isnull().sum()

In [12]:
def fillna(df,columns,default=0):
    """Return a copy of ``df`` with missing values in ``columns`` replaced.

    The input frame is left untouched, matching ``dropna`` and
    ``cleaned_data``, which also return new frames. Working on a copy also
    avoids pandas' SettingWithCopyWarning when ``df`` is itself the result of
    a row filter.

    Args:
        df: Source DataFrame; it is not modified.
        columns: Iterable of column labels whose missing values are filled.
        default: Replacement passed to ``Series.fillna`` for every listed
            column. A scalar fills every gap with the same value; a Series is
            aligned on the index, giving a per-row replacement.

    Returns:
        A new DataFrame with the listed columns filled.

    Raises:
        KeyError: If a label in ``columns`` is not a column of ``df``.
    """
    filled = df.copy()
    for column in columns:
        filled[column] = filled[column].fillna(default)
    return filled

def dropna(df,column_list):
    """Return ``df`` without the rows that miss a value in any listed column.

    Args:
        df: Source DataFrame; it is not modified.
        column_list: Column labels that must all be present for a row to stay.

    Returns:
        A new DataFrame containing only the complete rows.
    """
    return df.dropna(axis=0, subset=column_list)
def change_type(df,columns,type):
    """Return a copy of ``df`` with the listed columns cast to ``type``.

    The cast is done with one ``DataFrame.astype`` mapping, which returns a new
    frame. The input is therefore not mutated, and calling this on a filtered
    frame no longer triggers pandas' SettingWithCopyWarning.

    Args:
        df: Source DataFrame; it is not modified.
        columns: Iterable of column labels to convert.
        type: Target dtype for every listed column (for example ``int``). The
            name shadows the builtin but is kept for keyword-argument callers.

    Returns:
        A new DataFrame in which the listed columns have dtype ``type``.

    Raises:
        KeyError: If a label in ``columns`` is not a column of ``df``.
        ValueError: If a value cannot be converted (for example NaN to int).
    """
    columns = list(columns)
    if not columns:
        # Nothing to convert; still hand back a separate frame for consistency.
        return df.copy()
    return df.astype(dict.fromkeys(columns, type))
def cleaned_data(df: pd.DataFrame, columns: List[str], value: Union[int, float, List[Union[int, float]]], compare_type: str) -> pd.DataFrame:
    """Filter rows by comparing each listed column against a threshold.

    The filter is applied column by column, so a row survives only if it
    passes the test for every column. All comparisons are strict, which means
    rows equal to a bound (and rows where the column is NaN) are removed.

    Args:
        df: Source DataFrame; it is not modified.
        columns: Non-empty list of column labels to test.
        value: A single number for 'over' / 'under', or a two-item list or
            tuple ``(lower, upper)`` for 'between'.
        compare_type: One of
            'over'    - remove values at or over ``value`` (keep ``col < value``),
            'under'   - remove values at or under ``value`` (keep ``col > value``),
            'between' - keep ``lower < col < upper``.

    Returns:
        The filtered DataFrame.

    Raises:
        ValueError: If ``columns`` is empty or ``compare_type`` is unknown.
        TypeError: If ``value`` does not have the shape the mode requires.
    """
    if not columns:
        raise ValueError("컬럼 리스트가 비어있습니다.")

    # Validate ``value`` for the chosen mode and build the row predicate once.
    if compare_type in ("over", "under"):
        if not isinstance(value, (int, float)):
            raise TypeError("value가 숫자가 아닙니다.")
        if compare_type == "over":
            def keep_mask(series):
                return series < value
        else:
            def keep_mask(series):
                return series > value
    elif compare_type == "between":
        if not isinstance(value, (list, tuple)) or len(value) != 2:
            raise TypeError("value가 리스트 또는 튜플이 아니거나 길이가 2가 아닙니다.")
        lower, upper = value

        def keep_mask(series):
            return (series > lower) & (series < upper)
    else:
        raise ValueError(f"Invalid compare_type: {compare_type}. Use 'over', 'under', or 'between'")

    # Narrow the frame one column at a time.
    filtered = df
    for name in columns:
        filtered = filtered[keep_mask(filtered[name])]
    return filtered


In [None]:
# Summary statistics once more, as a baseline right before preprocessing.
airbnb_df.describe()

In [14]:
# Mean price per room type; used just below to impute missing prices.
room_type_avg_price = airbnb_df.groupby('room type')['price'].mean()
# Data preprocessing
# NOTE: this cell rebinds and rescales airbnb_df, so it is not idempotent;
# re-running it applies the filters and the *1400 scaling a second time.
# Missing price -> mean price of the listing's room type.
airbnb_df = fillna(airbnb_df,['price'],airbnb_df['room type'].map(room_type_avg_price))
# Missing minimum nights / availability -> 1.
airbnb_df = fillna(airbnb_df,['minimum nights','availability 365'],1)
# Missing fee, review count and rating -> 0 (fillna's default).
airbnb_df = fillna(airbnb_df,['service fee','number of reviews','review rate number'])
# Missing reviews-per-month -> rating / 12.
# NOTE(review): this divides 'review rate number', not 'number of reviews' - confirm that is intended.
airbnb_df = fillna(airbnb_df,['reviews per month'],airbnb_df['review rate number']/12)
# Rows missing any of these columns are dropped rather than imputed.
airbnb_df = dropna(airbnb_df,['Construction year','neighbourhood group','cancellation_policy'])

# Row filters. cleaned_data compares strictly, so boundary values are removed:
#   50 < price < 1000, 100 < availability 365 < 365,
#   number of reviews < 100, minimum nights > 0.
airbnb_df = cleaned_data(airbnb_df,['price'],[50,1000],'between')
airbnb_df = cleaned_data(airbnb_df,['availability 365'],[100,365],'between')
airbnb_df = cleaned_data(airbnb_df,['number of reviews'],100,'over')
airbnb_df = cleaned_data(airbnb_df,['minimum nights'],0,'under')

# Cast to int; this truncates the fractional part of imputed mean prices.
airbnb_df = change_type(airbnb_df,['availability 365','number of reviews','Construction year','price','service fee'],int)
# Replace days-available with the fraction of the year that is NOT available,
# rounded to 2 decimals. The column keeps its old name here and is renamed to
# 'booking_rate' in the correlation-heatmap cell.
airbnb_df['availability 365'] = round((365 - airbnb_df['availability 365']) / 365, 2)
# Scale the money columns by 1400 - presumably a USD -> KRW conversion; confirm the rate.
airbnb_df['price'] = airbnb_df['price'] * 1400
airbnb_df['service fee'] = airbnb_df['service fee'] *1400

In [None]:
# Largest value in 'availability 365', which at this point holds the derived
# not-available fraction (365 - days available) / 365 rather than a day count.
airbnb_df['availability 365'].max()

In [None]:
# Check dtypes and the remaining missing values after preprocessing.
airbnb_df.info()
airbnb_df.isnull().sum()

In [None]:
# Summary statistics after preprocessing.
airbnb_df.describe()

In [18]:
# One LabelEncoder per categorical column, kept under its own name so the
# fitted classes stay available for decoding later.
le_neighbourhood = LabelEncoder()
le_cancellation = LabelEncoder()
le_room_type = LabelEncoder()

# Replace each categorical column with its integer codes.
for column_name, encoder in (
    ('neighbourhood group', le_neighbourhood),
    ('cancellation_policy', le_cancellation),
    ('room type', le_room_type),
):
    airbnb_df[column_name] = encoder.fit_transform(airbnb_df[column_name])

# Reverse the cancellation_policy codes (0 <-> 2, 1 unchanged).
# Assumes exactly three policy classes - TODO confirm against the unique() output.
airbnb_df['cancellation_policy'] = 2 - airbnb_df['cancellation_policy']




In [None]:
def _show_corr_heatmap(corr_matrix, title):
    """Draw one annotated correlation heatmap on its own 10x6 figure."""
    plt.figure(figsize=(10, 6))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, center=0)
    plt.title(title)
    plt.show()


# 1. Variables related to listing preference
preference_columns = [
    'number of reviews',
    'reviews per month',
    'review rate number',
    'room type',
    'neighbourhood group',
]
df_preference = airbnb_df[preference_columns]

# Correlation matrix and first heatmap: listing preference
pref_corr_matrix = df_preference.corr()
_show_corr_heatmap(pref_corr_matrix, "Heatmap of Airbnb Listing Preference Factors")

# From here on the derived column is called 'booking_rate'.
airbnb_df = airbnb_df.rename(columns={'availability 365': 'booking_rate'})

# 2. Variables related to what influences bookings
booking_columns = [
    'price',
    'service fee',
    'minimum nights',
    'booking_rate',
    'cancellation_policy',
]
df_booking = airbnb_df[booking_columns]

# Correlation matrix and second heatmap: factors affecting bookings
booking_corr_matrix = df_booking.corr()
_show_corr_heatmap(booking_corr_matrix, "Heatmap of Factors Affecting Airbnb Bookings")

In [None]:
# Mean price, booking rate and review count per construction year
yearly_data = airbnb_df.groupby('Construction year').agg({
    'price': 'mean',
    'booking_rate': 'mean',
    'number of reviews': 'mean'
}).reset_index()

# One tick per year across the observed range
min_year = int(yearly_data['Construction year'].min())
max_year = int(yearly_data['Construction year'].max())

# Three stacked panels sharing the year axis:
# price (top), booking rate (middle), review count (bottom)
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(12, 10), sharex=True)

# (axis, column, line colour, title, y-label) for each panel.
# The price panel plots the raw mean price, so its y-label no longer claims
# a log scale.
panels = [
    (ax1, 'price', 'blue', '건축 년도에 따른 평균 가격', '평균 가격'),
    (ax2, 'booking_rate', 'orange', '건축 년도에 예약율', '예약 율'),
    (ax3, 'number of reviews', 'green', '건축 년도에 따른 리뷰수', '리뷰 수'),
]
for ax, column, color, title, ylabel in panels:
    sns.lineplot(x='Construction year', y=column, data=yearly_data, ax=ax, color=color, marker='o')
    ax.set_xticks(range(min_year, max_year + 1))
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    ax.grid(True)

# Only the bottom panel gets the Korean x-axis label, as before.
ax3.set_xlabel('건축 년도')

# Adjust the layout once, after every artist exists
plt.tight_layout()
plt.show()

In [None]:
# Per-group averages by Construction year, neighbourhood group, room type and booking-rate bucket
def calculate_grouped_stats_with_outliers(df, use_median=False, iqr_factor=1.5):
    """Aggregate price and booking rate per year, area, room type and rate bucket.

    Side effects (kept from the original implementation): two columns are
    added to, or overwritten on, the caller's ``df``:
    ``'log_price'`` (``log1p`` of ``'price'``) and ``'booking_rate_bucket'``
    (the 10%-wide bucket of ``'booking_rate'``).

    Bucketing: the edges are built as ``i / 10`` so they are the same floats
    that rounding to two decimals produces. Buckets are left-closed
    (``[0.3, 0.4)`` is "30-40%"), so a rate of exactly 0.3, 0.6 or 0.7 lands in
    the bucket that starts there. The top edge sits one ULP above 1.0 so a rate
    of exactly 1.0 is counted in "90-100%" instead of being discarded. Rates
    outside ``[0, 1]`` get no bucket and are excluded from the result.

    Args:
        df: DataFrame with the columns 'price', 'booking_rate',
            'Construction year', 'neighbourhood group' and 'room type'.
        use_median: If True, the 'price' column of the result is the group
            median; otherwise it is the group mean.
        iqr_factor: Accepted for backward compatibility. It is currently
            unused; no outlier filtering is performed.

    Returns:
        A DataFrame with one row per non-empty group and the columns
        'Construction year', 'neighbourhood group', 'room type',
        'booking_rate_bucket', 'price' and 'booking_rate' (group mean).
    """
    df['log_price'] = np.log1p(df['price'])  # log transform, stored on the caller's frame

    # Ten 10%-wide buckets over [0, 1]
    bins = np.arange(11) / 10                   # exactly 0.0, 0.1, ..., 1.0
    bins[-1] = np.nextafter(1.0, np.inf)        # let 1.0 fall inside the last bucket
    labels = [f"{low}-{low + 10}%" for low in range(0, 100, 10)]
    df['booking_rate_bucket'] = pd.cut(df['booking_rate'], bins=bins, labels=labels, right=False)

    group_keys = ['Construction year', 'neighbourhood group', 'room type', 'booking_rate_bucket']
    result_columns = group_keys + ['price', 'booking_rate']

    grouped_stats = []
    # observed=True: iterate only bucket combinations that actually occur.
    for (year, nb_group, room_type, booking_rate_bucket), group in df.groupby(group_keys, observed=True):
        price_stat = group['price'].median() if use_median else group['price'].mean()
        grouped_stats.append({
            'Construction year': year,
            'neighbourhood group': nb_group,
            'room type': room_type,
            'booking_rate_bucket': booking_rate_bucket,
            'price': price_stat,
            'booking_rate': group['booking_rate'].mean(),
        })

    # Passing the columns keeps the schema stable even when no group exists.
    return pd.DataFrame(grouped_stats, columns=result_columns)


# Apply the aggregation to the cleaned listings
grouped_data = calculate_grouped_stats_with_outliers(airbnb_df, use_median=False, iqr_factor=2.0)


# Heatmap: mean price by neighbourhood group and room type.
# Note: this averages the per-group means in grouped_data, so every group
# counts equally regardless of how many listings it contains.
pivot_table = grouped_data.pivot_table(
    index='neighbourhood group',
    columns='room type',
    values='price',
    aggfunc='mean',
)
heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 6))
sns.heatmap(pivot_table, annot=True, fmt='.0f', cmap='YlGnBu', ax=heatmap_ax)
heatmap_ax.set_xlabel('숙소 타입')
heatmap_ax.set_ylabel('지역구')
heatmap_ax.set_title('Mean Price by Neighbourhood Group and Room Type')
plt.show()

In [None]:
# Heatmap: mean price by room type (rows) and booking-rate bucket (columns).
# The title and y-label now describe these axes; they previously repeated the
# neighbourhood-group labels of the preceding heatmap.
pivot_table = grouped_data.pivot_table(values='price', index='room type', 
                                       columns='booking_rate_bucket', aggfunc='mean')
plt.figure(figsize=(12,6))
sns.heatmap(pivot_table, annot=True, fmt='.0f', cmap='YlGnBu')
plt.xlabel('예약율')
plt.ylabel('숙소 타입')
plt.title('Mean Price by Room Type and Booking Rate')
plt.show()