## 데이터 불러오기 및 기본 정보 확인

In [10]:
# import usage libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import os

# 시각화 OS별 한글폰트 설정
if os.name == 'nt':
    plt.rcParams['font.family'] = 'Malgun Gothic'  # Windows 폰트 설정
elif os.name == 'posix':
    plt.rcParams['font.family'] = 'AppleGothic'  # Mac 폰트 설정

pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 200)


# 1. 데이터 불러오기
bnb_raw = pd.read_csv('2025_Airbnb_NYC_listings.csv')


# 2. 복사본 생성(전처리는 복사본으로 진행)
bnb = bnb_raw.copy()

## 데이터 형변환 처리

|번호|컬럼명|변환 전|변환 후|
|---|---|---|---|
|1|price|object|float|
|2|host_response_rate|object|float|
|3|host_acceptance_rate|object|float|
|4|last_review|object|Datetime|
|5|host_since|object|Datetime|
|6|host_response_time|object|category|
|7|host_is_superhost|object|boolean|
|8|host_has_profile_pic|object|boolean|
|9|host_identity_verified|object|boolean|
|10|has_availability|object|boolean|
|11|instant_bookable|object|boolean|
|12|license|object|boolean|

In [11]:
# 1. 'price' -> float형 변환
bnb['price'] = (
    bnb['price']
    .astype(str)
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(float)
)
# 2. 'host_response_rate' -> float형 변환
# 문자열에 '%'를 제거하고 100으로 나눠서 실수화함
bnb['host_response_rate'] = pd.to_numeric(
    bnb['host_response_rate'].replace('%', '', regex=True),
    errors='coerce'
) / 100

# 3. 'host_acceptance_rate' -> float형 변환
# 문자열에 '%'를 제거하고 100으로 나눠서 실수화함
bnb['host_acceptance_rate'] = pd.to_numeric(
    bnb['host_acceptance_rate'].replace('%', '', regex=True),
    errors='coerce'
) / 100

In [12]:
# 4. 'last_review' -> datetime형 변환
bnb['last_review'] = pd.to_datetime(bnb['last_review'], errors='coerce')

# 5. 'host_since' -> datetime형 변환
bnb['host_since'] = pd.to_datetime(bnb['host_since'], errors='coerce')

In [13]:
# 6. 'host_response_time' -> 범주형으로 변환
bnb['host_response_time'] = bnb['host_response_time'].astype('category')

### 시간 단위(int)로 변환 해도 될듯
# ex)
# within an hour = 1
# within a few hours = 6
# within a day = 24

In [14]:
# 7~12  . 결과값이 't / f'인 5개의 컬럼을 Boolean으로 형변환
bool_cols = ['host_is_superhost', 'host_has_profile_pic', 'host_identity_verified',
              'has_availability', 'instant_bookable']

for col in bool_cols:
    bnb[col] = bnb[col].map({'t': True, 'f': False})


# 13. 'license' -> Boolean으로 형변환.
bnb['license'] = bnb['license'].notna()

## 데이터 결측치 처리

In [15]:
bnb.head().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 73 columns):
 #   Column                                        Non-Null Count  Dtype         
---  ------                                        --------------  -----         
 0   Unnamed: 0                                    5 non-null      int64         
 1   id                                            5 non-null      int64         
 2   source                                        5 non-null      object        
 3   name                                          5 non-null      object        
 4   description                                   5 non-null      object        
 5   neighborhood_overview                         5 non-null      object        
 6   host_id                                       5 non-null      int64         
 7   host_name                                     5 non-null      object        
 8   host_since                                    5 non-null      datetime64[n

In [16]:
bnb['host_neighbourhood'].value_counts()


host_neighbourhood
Bedford-Stuyvesant       1138
Cambridge                1095
Jamaica                   834
Williamsburg              756
Flushing                  745
                         ... 
Saint-Jean-Cap-Ferrat       1
Marina District             1
West Dover                  1
Playa Negra                 1
alcaldia cuauhtemoc         1
Name: count, Length: 446, dtype: int64

In [None]:
# host_location 구분자로 분할
host_loc = bnb['host_location'].str.split(',', expand=True) 

# 비교군 county1, county2 컬럼 지정
county1 = host_loc.iloc[:, 0].str.strip() 
county2 = host_loc.iloc[:, 1].str.strip() 

# DataFrame 화
county = pd.DataFrame({'city':county2,'local':county1}) 

# 대문자 변환으로 값 통일
county['local'] = county['local'].str.upper() 
county['city'] = county['city'].str.upper() 

county['host_neighbourhood'] = bnb['host_neighbourhood'] 

# 도시 이름 기준 location 변환
county.loc[county['local'] == 'NEW YORK', 'city'] = 'NY' 
county.loc[county['local'] == 'DUBAI', 'city'] = 'UAE' 
county.loc[county['local'] == 'LONDON', 'city'] = 'UK' 
county.loc[county['local'] == 'SANTO DOMINGO', 'city'] = 'DOMINICA' 
county.loc[county['local'] == 'WILLEMSTAD', 'city'] = 'CURACAO' 
county.loc[county['local'] == 'PORT OF SPAIN', 'city'] = 'TRINIDAD TOBAGO' 
county.loc[county['local'] == 'PIARCO', 'city'] = 'TRINIDAD TOBAGO' 
county.loc[county['local'] == 'BRAZIL', 'city'] = 'BRAZIL' 
county.loc[county['local'] == 'JAMAICA', 'city'] = 'JAMAICA' 
county.loc[county['local'] == 'HONG KONG', 'city'] = 'HONG KONG' 
county.loc[county['local'] == 'ISRAEL', 'city'] = 'ISRAEL' 
county.loc[county['local'] == 'MEXICO', 'city'] = 'MEXICO' 
county.loc[county['local'] == 'COLOMBIA', 'city'] = 'COLOMBIA' 
county.loc[county['local'] == 'DOMINICAN REPUBLIC', 'city'] = 'DOMINICA' 
county.loc[county['local'] == 'GERMANY', 'city'] = 'GERMANY' 
county.loc[county['local'] == 'SINGAPORE', 'city'] = 'SINGAPORE' 
county.loc[county['local'] == 'NEW JERSEY', 'city'] = 'NJ' 
county.loc[county['local'] == 'MAINE', 'city'] = 'ME' 
county.loc[county['local'] == 'MASSACHUSETTS', 'city'] = 'MA' 
county.loc[county['local'] == 'FLORIDA', 'city'] = 'FL' 
county.loc[county['local'] == 'CONNECTICUT', 'city'] = 'CT' 
county.loc[county['local'] == 'VERMONT', 'city'] = 'VT' 
county.loc[county['local'] == 'UTAH', 'city'] = 'UT' 
county.loc[county['local'] == 'OREGON', 'city'] = 'OT' 
county.loc[county['local'] == 'HAWAII', 'city'] = 'HI' 
county.loc[county['local'] == 'CALIFORNIA', 'city'] = 'CA' 
county.loc[county['local'] == 'MISSOURI', 'city'] = 'MO' 
county.loc[county['local'] == 'GEORGIA', 'city'] = 'GA' 

# 구글링을 통한 지역 매칭 기준 
county.loc[county['host_neighbourhood'] == 'Elmhurst', 'city'] = 'NY'
county.loc[county['host_neighbourhood'] == 'Harlem', 'city'] = 'NY'
county.loc[county['host_neighbourhood'] == 'West Bronx', 'city'] = 'NY'
county.loc[county['host_neighbourhood'] == 'Bushwick', 'city'] = 'NY'
county.loc[county['host_neighbourhood'] == 'St Kilda East', 'city'] = 'NY'
county.loc[county['host_neighbourhood'] == "Hell's Kitchen", 'city'] = 'NY'
county.loc[county['host_neighbourhood'] == 'Bedford-Stuyvesant', 'city'] = 'NY'
county.loc[county['host_neighbourhood'] == 'East New York', 'city'] = 'NY'
county.loc[county['host_neighbourhood'] == 'Chelsea', 'city'] = 'NY'
county.loc[county['host_neighbourhood'] == 'Park Slope', 'city'] = 'NY'
county.loc[county['host_neighbourhood'] == 'Gowanus', 'city'] = 'NY'
county.loc[county['host_neighbourhood'] == 'Sheepshead Bay', 'city'] = 'NY'
county.loc[county['host_neighbourhood'] == 'Lefferts Garden', 'city'] = 'NY'
county.loc[county['host_neighbourhood'] == 'Flatiron District', 'city'] = 'NY'
county.loc[county['host_neighbourhood'] == "Prince's Bay", 'city'] = 'NY'
county.loc[county['host_neighbourhood'] == "West Village", 'city'] = 'NY'
county.loc[county['host_neighbourhood'] == "Lefferts Manor Historic District", 'city'] = 'NY'
county.loc[county['host_neighbourhood'] == "Flushing", 'city'] = 'NY'
county.loc[county['host_neighbourhood'] == "East Flatbush", 'city'] = 'NY'
county.loc[county['host_neighbourhood'] == 'Williamsburg', 'city'] = 'VA'
county.loc[county['host_neighbourhood'] == 'Little Haiti', 'city'] = 'FL'
county.loc[county['host_neighbourhood'] == 'Waikiki', 'city'] = 'HI'
county.loc[county['host_neighbourhood'] == 'Astoria', 'city'] = 'OR'
county.loc[county['host_neighbourhood'] == 'Civic Center', 'city'] = 'CA'
county.loc[county['host_neighbourhood'] == 'East Rock', 'city'] = 'CT'
county.loc[county['host_neighbourhood'] == 'Pine Hills', 'city'] = 'FL'

# 동네 기준 매칭 딕셔너리 생성
region_state_map = {
    "Jamaica": "NY",
    "Gateway District": "FL",
    "Crown Heights": "NY",
    "Woodside": "NY",
    "Ocean Hill": "NY",
    "Long Island City": "NY",
    "Brickell": "FL",
    "Canarsie": "NY",
    "Clearwater Beach": "FL",
    "East Bronx": "NY",
    "Flatbush": "NY",
    "Fort Hamilton": "NY",
    "Arverne": "NY",
    "Greenpoint": "NY",
    "Cypress Hills": "NY",
    "Far Rockaway": "NY",
    "Brooklyn Heights": "NY",
    "Little Caribbean": "NY",
    "Midtown": "NY",
    "North Beach": "NY",
    "Hollywood South Central Beach": "CA",
    "Southside": "DC",
    "Mott Haven": "NY",
    "Flatlands": "NY",
    "Sunset Park": "NY",
    "Central Business District": "NY",
    "Upper East Side": "NY",
    "Bay Ridge": "NY",
    "Hamilton Heights": "NY",
    "Clinton Hill": "NY",
    "Midtown East": "NY",
    "Boerum Hill": "NY",
    "Concourse Village": "NY",
    "North City": "MO",
    "South Beach": "FL",
    "Fort Greene": "NY",
    "Sunnyside": "NY",
    "Upper West Side": "NY",
    "Midwood": "NY",
    "Brighton Beach": "NY",
    "Gramercy Park": "NY",
    "Corona": "NY",
    "Van Nest": "NY",
    "East Harlem": "NY",
    "Mid Island": "NY",
    "Chinatown": "NY",
    "Queens Village": "NY",
    "Sturbridge": "MA",
    "Clason Point": "NY",
    "Williamsbridge": "NY",
    "Stuyvesant Heights": "NY",
    "Brownsville": "NY",
    "Guilford Hills": "NC",
    "Woodstock": "NY",
    "Rockaway Beach": "NY",
    "Edenwald": "NY",
    "Hunts Point": "NY",
    "Prospect Heights": "NY",
    "Washington Heights": "NY",
    "Bergen Beach": "NY",
    "Ditmars Steinway": "NY",
    "Mill Basin": "NY",
    "Murray Hill": "NY",
    "East Elmhurst": "NY",
    "Union Square": "NY",
    "Ridgewood": "NY",
    "Laconia": "NH",
    "Pelham Gardens": "NY",
    "Kensington": "NY",
    "Mariners Harbor": "NY",
    "Port Richmond": "NY",
    "West Brighton": "NY",
    "Lindenwood": "NY",
    "The Rockaways": "NY",
    "Midland Beach": "NY",
    "Center City West": "PA",
    "Prospect Lefferts Gardens": "NY",
    "Morris Park": "NY",
    "Morrisania": "NY",
    "Maspeth": "NY",
    "Cobble Hill": "NY",
    "Gravesend": "NY",
    "Claremont": "NY",
    "Bridge Plaza": "NY",
    "Downtown Los Angeles": "CA",
    "Bath Beach": "NY",
    "Forest Hills": "NY",
    "Lower East Side": "NY",
    "St. Albans": "NY",
    "Rego Park": "NY",
    "Norwood": "NY",
    "Baychester": "NY",
    "Highbridge": "NY",
    "Pelham Bay": "NY",
    "East Village": "NY",
    "Rosebank": "NY",
    "Bensonhurst": "NY",
    "Alphabet City": "NY",
    "Concourse": "NY",
    "Eltingville": "NY",
    "Nolita": "NY",
    "Greenwich Village": "NY",
    "Richmond Hill": "NY",
    "Jackson Heights": "NY",
    "Wakefield": "NY",
    "Carroll Gardens": "NY",
    "Little Italy": "NY",
    "Dongan Hills": "NY",
    "Central City": "CO",
    "Elm Park": "MA",
    "Belmont": "NY",
    "Coney Island": "NY",
    "Red Hook": "NY",
    "Allerton": "NY",
    "Marine Park": "NY",
    "Manhattan Beach": "NY",
    "Fordham Heights": "NY",
    "Fordham Manor": "NY",
    "Antioch": "CA",
    "Morris Heights": "NY",
    "Glendale": "NY",
    "Inwood": "NY",
    "New Springville": "NY",
    "Kips Bay": "NY",
    "Morningside Heights": "NY",
    "Sylvan Heights": "NC",
    "Highlands": "NJ",
    "Shore Acres": "OR",
    "Randall Manor": "NY",
    "South Ozone Park": "NY",
    "Westchester Square": "NY",
    "El Mamey": "Dominica",
    "Southpoint": "NC",
    "Rossville": "NY",
    "Bulls Head": "NY",
    "Gerritsen Beach": "NY",
    "Spring Creek": "NY",
    "Downtown Brooklyn": "NY",
    "Journal Square": "NJ",
    "City Island": "NY",
    "Todt Hill": "NY",
    "Homecrest": "NY",
    "Parkchester": "NY",
    "Hollis": "NY",
    "Manor Heights": "NY",
    "Throgs Neck": "NY",
    "Utopia": "NY",
    "Borough Park": "NY",
    "Wilshire Montana": "CA",
    "College Park, MD": "MD",
    "Ozone Park": "NY",
    "Soho": "NY",
    "Times Square/Theatre District": "NY",
    "Tribeca": "NY",
    "Pacific Heights": "CA",
    "West Farms": "NY",
    "Rockaway Park": "NY",
    "Garrison": "NY",
    "Whitestone": "NY",
    "Briarwood": "NY",
    "Silver Lake": "CA",
    "Echo Park": "CA",
    "Mount Eden": "NY",
    "Miami Beach": "FL",
    "Tottenville": "NY",
    "University Heights": "NY",
    "Springfield Gardens": "NY",
    "Fresh Meadows": "NY",
    "Central LA": "CA",
    "South Jamesport": "NY",
    "French Quarter": "LA",
    "Downtown Miami": "FL",
    "Travis - Chelsea": "NY",
    "North Riverdale": "NY",
    "Concord": "CA",
    "La Jolla": "CA",
    "Windsor Terrace": "NY",
    "Oakwood": "NY",
    "New Brighton": "NY",
    "Bayside": "NY",
    "Arrochar": "NY",
    "Dutch Kills": "NY",
    "Cabo Branco": "Brazil",
    "East Williamsburg": "NY",
    "Downtown Jersey City": "NJ",
    "College Point": "NY",
    "Kew Gardens": "NY",
    "Edgewater": "NJ",
    "Oakland Gardens": "NY",
    "Brighton": "MA",
    "South Philadelphia": "PA",
    "Washington Square West": "PA",
    "Spuyten Duyvil": "NY",
    "Highland Lakes": "NJ",
    "Central Ward": "NJ",
    "Lazybrook / Timbergrove": "TX",
    "Mile Square": "CT",
    "Riverdale": "NY",
    "South Richmond Hill": "NY",
    "Downtown": "NY",
    "West Dover": "VT",
    "Elizabethport": "NJ",
    "South Slope": "NY",
    "Colgate Center": "NY",
    "South Side": "IL"
}

# 동네 기준 매핑 적용
county['region_code'] = county['host_neighbourhood'].map(region_state_map) 

# 동네 기준 매칭 적용
county['city'] = county['city'].fillna(county['region_code']) 

# bnb 데이터에 가공 데이터 변경
bnb['host_location'] = county['city']

In [None]:
bnb.head() 

Unnamed: 0.1,Unnamed: 0,id,source,name,description,neighborhood_overview,host_id,host_name,host_since,host_location,host_about,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_neighbourhood,host_listings_count,host_total_listings_count,host_verifications,host_has_profile_pic,host_identity_verified,neighbourhood,neighbourhood_cleansed,neighbourhood_group_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,minimum_nights,maximum_nights,minimum_minimum_nights,maximum_minimum_nights,minimum_maximum_nights,maximum_maximum_nights,minimum_nights_avg_ntm,maximum_nights_avg_ntm,calendar_updated,has_availability,availability_30,availability_60,availability_90,availability_365,calendar_last_scraped,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,availability_eoy,number_of_reviews_ly,estimated_occupancy_l365d,estimated_revenue_l365d,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,0,36121,city scrape,Lg Rm in Historic Prospect Heights,Cozy space share in the heart of a great neigh...,Full of tree-lined streets and beautiful brown...,62165,Michael,2009-12-11,NY,I’m an urban planner working for an internatio...,,,,False,Prospect Heights,1.0,3.0,"['email', 'phone', 'work_email']",True,True,Neighborhood highlights,Prospect Heights,Brooklyn,40.67376,-73.96611,Private room in rental unit,Private room,1,1.0,1 shared bath,1.0,1.0,"[""Refrigerator"", ""Dishes and silverware"", ""Wif...",200.0,90,365,90.0,90.0,365.0,365.0,90.0,365.0,,True,27,57,87,362,2025-03-03,9,0,0,301,0,0,0.0,2010-12-11,2013-05-10,4.88,5.0,4.8,5.0,5.0,5.0,5.0,False,False,1,0,1,0,0.05
1,1,36647,city scrape,"1 Bedroom & your own Bathroom, Elevator Apartment",Private bedroom with your own bathroom in a 2 ...,"Manhattan, SE corner of 2nd Ave/ E. 110th street",157798,Irene,2010-07-04,NY,,,,1.0,False,East Harlem,1.0,1.0,"['email', 'phone']",True,True,Neighborhood highlights,East Harlem,Manhattan,40.792454,-73.940742,Private room in condo,Private room,2,1.0,1 private bath,1.0,1.0,"[""Oven"", ""Blender"", ""Luggage dropoff allowed"",...",82.0,30,999,30.0,30.0,999.0,999.0,30.0,999.0,,True,0,0,0,204,2025-03-03,102,0,0,143,0,0,0.0,2010-10-04,2023-12-09,4.77,4.82,4.76,4.88,4.9,4.38,4.71,False,False,1,0,1,0,0.58
2,2,38663,city scrape,Luxury Brownstone in Boerum Hill,"Beautiful, large home in great hipster neighbo...","diverse, lively, hip, cool: loaded with restau...",165789,Sarah,2010-07-13,NY,I am a lawyer and work as an executive at an a...,within a few hours,1.0,0.4,False,Boerum Hill,1.0,3.0,"['email', 'phone', 'work_email']",True,True,Neighborhood highlights,Boerum Hill,Brooklyn,40.68442,-73.98068,Private room in home,Private room,2,2.5,2.5 baths,5.0,5.0,"[""Portable fans"", ""Oven"", ""Baking sheet"", ""Fir...",765.0,3,60,3.0,3.0,60.0,60.0,3.0,60.0,,True,30,49,66,326,2025-03-02,43,0,0,267,0,0,0.0,2012-07-09,2023-08-30,4.7,4.83,4.52,4.88,4.88,4.86,4.62,True,False,1,0,1,0,0.28
3,3,38833,city scrape,Spectacular West Harlem Garden Apt,This is a very large and unique space. An inc...,West Harlem is now packed with great restauran...,166532,Matthew,2010-07-14,NY,I have been a New Yorker for a long time\n and...,within an hour,1.0,0.97,True,Harlem,1.0,1.0,"['email', 'phone']",True,True,Neighborhood highlights,Harlem,Manhattan,40.818058,-73.946671,Entire home,Entire home/apt,2,1.0,1 bath,1.0,1.0,"[""Fire extinguisher"", ""Clothing storage: close...",139.0,2,45,2.0,2.0,1125.0,1125.0,2.0,1125.0,,True,7,18,25,25,2025-03-03,241,42,3,25,43,255,35445.0,2010-08-28,2025-02-21,4.85,4.87,4.5,4.96,4.96,4.79,4.82,True,False,1,1,0,0,1.36
4,4,39282,city scrape,“Work-from-home” from OUR home.,*Monthly Discount will automatically apply <br...,THE NEIGHBORHOOD:<br />Our apartment is locate...,168525,Gustavo,2010-07-16,VA,I am a music producer. And my wife is a hair s...,within an hour,1.0,1.0,True,Williamsburg,2.0,2.0,"['email', 'phone']",True,True,Neighborhood highlights,Williamsburg,Brooklyn,40.710651,-73.950874,Private room in rental unit,Private room,2,1.0,1 shared bath,1.0,1.0,"[""Oven"", ""Rice maker"", ""Laundromat nearby"", ""L...",130.0,4,45,4.0,4.0,45.0,45.0,4.0,45.0,,True,14,22,38,38,2025-03-02,274,12,0,38,12,154,20020.0,2010-08-02,2025-01-03,4.82,4.83,4.61,4.94,4.88,4.85,4.78,True,False,2,0,2,0,1.54
