## 객실의 사용 여부 관련 데이터
1. 데이터 로드 (hotel_bookings.csv)
2. 데이터에 대한 정보를 확인
3. 해당 데이터에서 문제가 있는 부분을 확인하는 수정

In [1]:
import pandas as pd
import numpy as np

In [5]:
# data 폴더 안에 hotel_bookings.csv 파일을 로드
hotel = pd.read_csv("../data/hotel_bookings.csv")

hotel.head()

Unnamed: 0,is_canceled,deposit_type,lead_time,stays_in_weekend_nights,stays_in_week_nights,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,days_in_waiting_list,adr
0,0,No Deposit,105.0,2,5,,0,0,1,0,131.5
1,0,No Deposit,303.0,2,2,,0,0,0,0,73.95
2,0,No Deposit,33.0,2,3,0.0,0,0,0,0,
3,0,No Deposit,48.0,0,1,0.0,0,0,1,0,80.3
4,0,No Deposit,216.0,4,7,0.0,0,0,2,0,60.9


In [6]:
hotel.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 11 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   is_canceled                     20000 non-null  int64  
 1   deposit_type                    20000 non-null  object 
 2   lead_time                       19995 non-null  float64
 3   stays_in_weekend_nights         20000 non-null  int64  
 4   stays_in_week_nights            20000 non-null  int64  
 5   is_repeated_guest               19642 non-null  float64
 6   previous_cancellations          20000 non-null  int64  
 7   previous_bookings_not_canceled  20000 non-null  int64  
 8   booking_changes                 20000 non-null  int64  
 9   days_in_waiting_list            20000 non-null  int64  
 10  adr                             18937 non-null  float64
dtypes: float64(3), int64(7), object(1)
memory usage: 1.7+ MB


In [7]:
hotel.isna().sum()

is_canceled                          0
deposit_type                         0
lead_time                            5
stays_in_weekend_nights              0
stays_in_week_nights                 0
is_repeated_guest                  358
previous_cancellations               0
previous_bookings_not_canceled       0
booking_changes                      0
days_in_waiting_list                 0
adr                               1063
dtype: int64

In [None]:
hotel.describe()

Unnamed: 0,is_canceled,lead_time,stays_in_weekend_nights,stays_in_week_nights,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,days_in_waiting_list,adr
count,20000.0,19995.0,20000.0,20000.0,19642.0,20000.0,20000.0,20000.0,20000.0,18937.0
mean,0.12,85.978345,0.89255,2.3804,0.038133,0.0329,0.16905,0.2694,1.98395,101.410239
std,0.32497,96.42724,0.952077,1.777345,0.191521,0.455552,1.502426,0.687566,15.927212,49.245097
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-6.38
25%,0.0,11.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,68.8
50%,0.0,51.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,94.5
75%,0.0,132.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0,126.0
max,1.0,629.0,13.0,30.0,1.0,26.0,66.0,17.0,379.0,451.5


### 해당 데이터의 컬럼의 의미
- is_canceled : 예약 취소 여부 (0=취소 안됨, 1=취소)
- deposit_type : 보증금 유형 (No Deposit, Not Refund, Refundable)
- lead_time : 예약일과 실제 도착일 사이의 일수
- stays_in_weekend_nights: 주말의 숙박 일수
- stays_in_week_nights  : 주중의 숙박 일수
- is_repeated_guest : 재방문 고객 여부 (0=신규, 1재방문)
- previous_cancellations : 과거 예약 취소 횟수
- previos_bookings_not_caneled : 과거 예약 중 취소되지 않은 건수
- booking_changes : 예약 후 변경 회수
- days_in_watiting_list : 대기자가 있었던 일수
- adr : 평균 일일 객실 요금

In [None]:
# 결측치가 존재하는 것은 확인 -> 실제 결측치의 개수를 확인
# 결측치가 존재 여부 함수 -> bool 타입의 데이터프레임 생성
# bool의 데이터들을 합산하여 컬럼별로 확인
hotel.isna().sum()

is_canceled                          0
deposit_type                         0
lead_time                            5
stays_in_weekend_nights              0
stays_in_week_nights                 0
is_repeated_guest                  358
previous_cancellations               0
previous_bookings_not_canceled       0
booking_changes                      0
days_in_waiting_list                 0
adr                               1063
dtype: int64

In [16]:
# 결측치의 비율 -> 결측치의 개수 / 데이터프레임의 길이 * 100
print("lead_time 컬럼의 결측치의 비율", round(5 / len(hotel) * 100 , 2))
print("is_repeated_guest 컬럼의 결측치의 비율", round(358 / len(hotel) * 100 , 2))
print("adr 컬럼의 결측치의 비율", round(1063 / len(hotel) * 100 , 2))

lead_time 컬럼의 결측치의 비율 0.03
is_repeated_guest 컬럼의 결측치의 비율 1.79
adr 컬럼의 결측치의 비율 5.32


- lead_time 컬럼의 결측치의 비율은 매우 작기 때문에 제거 -> 결측치인 인덱스를 제외
- is_pepeated_guest는 해당 데이터에서 개수가 많은 데이터로 결측치를 채워준다.
- adr 컬럼의 결측치는 해당 데이터들을 확인하고 특정한 조건에 맞춰서 데이터를 채워준다.


In [20]:
# lead_time의 결측치가 존재하는 인덱스를 제외
# 제거한다(drop) + 결측치(na) -> 결측치가 존재하는 행이나 열을 제거하는 함수
hotel.dropna(subset=['lead_time'], axis=0, inplace=True)

In [24]:
# 값들의 빈도수를 체크하는 함수를 이용
hotel['is_repeated_guest'].value_counts()

is_repeated_guest
0.0    18888
1.0      749
Name: count, dtype: int64

In [27]:
# is_repeated_guest 컬럼의 결측치는 0으로 채워준다.
hotel['is_repeated_guest'].fillna(0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  hotel['is_repeated_guest'].fillna(0, inplace=True)


In [28]:
hotel.isna().sum()

is_canceled                          0
deposit_type                         0
lead_time                            0
stays_in_weekend_nights              0
stays_in_week_nights                 0
is_repeated_guest                    0
previous_cancellations               0
previous_bookings_not_canceled       0
booking_changes                      0
days_in_waiting_list                 0
adr                               1063
dtype: int64

In [30]:
hotel['adr'].describe()

count    18932.000000
mean       101.410702
std         49.241204
min         -6.380000
25%         68.822500
50%         94.500000
75%        126.000000
max        451.500000
Name: adr, dtype: float64

In [34]:
# 통계 정보를 확인하니 객실 요금 평균에 음수가 존재한다. -> 이상한 데이터가 발견
# 이상치 데이터는 제거
# 인덱스의 조건식을 생성 -> 객실 요금 데이터에서 0보다 작은
flag = hotel['adr'] < 0


hotel = hotel.loc[~flag]

In [None]:
# adr의 결측치들을 deposit_type의 값에 따라 그룹화를 하고 평균의 adr의 값을 채워준다.

# deposit_type에 따른 adr의 평균값을 확인
despoit_adr_mean = hotel.groupby(['deposit_type'])['adr'].mean()

In [53]:
# hotel 복사 본 생성
test_hotel = hotel.copy()

In [56]:
null_flag = test_hotel['adr'].isna()
for key in despoit_adr_mean.to_dict():
    # print(key)
    # print(despoit_adr_mean[key])
    # 인덱스 조건식 -> deposit_type이 key와 같다
    flag = test_hotel['deposit_type'] == key
    # flag와 null_flag 두 조건식이 모두 만족하는 인덱스 필터
    # 첫 번째 반복 시에는 deposit_type이 No Deposit이고 adr이 결측치인 데이터를 선택
    test_hotel.loc[flag & null_flag, 'adr'] = despoit_adr_mean[key]
    

In [57]:
test_hotel.isna().sum()

is_canceled                       0
deposit_type                      0
lead_time                         0
stays_in_weekend_nights           0
stays_in_week_nights              0
is_repeated_guest                 0
previous_cancellations            0
previous_bookings_not_canceled    0
booking_changes                   0
days_in_waiting_list              0
adr                               0
dtype: int64

In [59]:
# map(), apply()

hotel['adr'].map(
    lambda x : print(x)
)

131.5
73.95
nan
80.3
60.9
88.4
105.9
76.67
84.0
42.0
79.84
nan
88.0
126.0
nan
55.0
158.77
56.0
124.1
126.0
77.5
67.0
58.0
72.25
145.1
157.27
nan
100.0
89.0
170.0
67.0
nan
87.0
nan
90.0
107.1
123.0
85.85
56.1
30.24
62.0
110.88
0.0
nan
nan
42.3
0.0
107.1
283.67
55.0
69.0
105.0
62.0
88.0
77.18
134.14
25.11
150.3
168.3
115.2
108.15
130.0
96.0
48.0
75.0
104.5
111.0
130.0
118.13
98.1
168.3
98.0
74.13
82.24
196.0
58.0
87.0
60.0
144.0
118.8
0.0
nan
97.71
118.0
79.2
90.0
245.0
45.0
65.0
88.0
91.0
120.6
106.0
100.0
91.0
88.2
148.0
106.25
89.0
152.0
30.0
90.0
80.0
176.8
100.0
102.8
144.63
27.0
0.0
80.1
47.0
101.15
130.0
60.98
62.0
159.84
161.1
162.0
91.33
0.0
121.14
97.0
62.0
115.0
62.5
75.0
121.0
94.5
44.5
176.64
173.0
130.0
112.0
141.9
36.0
146.11
nan
90.0
126.0
62.0
100.8
54.6
42.0
82.0
169.0
77.85
50.0
35.0
95.0
177.0
89.4
149.4
73.8
0.0
152.0
90.95
48.0
87.0
119.7
nan
60.0
75.0
35.0
162.33
110.0
122.5
35.46
45.0
96.0
0.0
115.0
89.1
30.0
85.67
100.5
nan
82.35
153.62
164.0
120.0
38.0
170.57
75

0        None
1        None
2        None
3        None
4        None
         ... 
19995    None
19996    None
19997    None
19998    None
19999    None
Name: adr, Length: 19994, dtype: object

In [58]:
hotel['adr'].apply(
    lambda x : print(x)
)

131.5
73.95
nan
80.3
60.9
88.4
105.9
76.67
84.0
42.0
79.84
nan
88.0
126.0
nan
55.0
158.77
56.0
124.1
126.0
77.5
67.0
58.0
72.25
145.1
157.27
nan
100.0
89.0
170.0
67.0
nan
87.0
nan
90.0
107.1
123.0
85.85
56.1
30.24
62.0
110.88
0.0
nan
nan
42.3
0.0
107.1
283.67
55.0
69.0
105.0
62.0
88.0
77.18
134.14
25.11
150.3
168.3
115.2
108.15
130.0
96.0
48.0
75.0
104.5
111.0
130.0
118.13
98.1
168.3
98.0
74.13
82.24
196.0
58.0
87.0
60.0
144.0
118.8
0.0
nan
97.71
118.0
79.2
90.0
245.0
45.0
65.0
88.0
91.0
120.6
106.0
100.0
91.0
88.2
148.0
106.25
89.0
152.0
30.0
90.0
80.0
176.8
100.0
102.8
144.63
27.0
0.0
80.1
47.0
101.15
130.0
60.98
62.0
159.84
161.1
162.0
91.33
0.0
121.14
97.0
62.0
115.0
62.5
75.0
121.0
94.5
44.5
176.64
173.0
130.0
112.0
141.9
36.0
146.11
nan
90.0
126.0
62.0
100.8
54.6
42.0
82.0
169.0
77.85
50.0
35.0
95.0
177.0
89.4
149.4
73.8
0.0
152.0
90.95
48.0
87.0
119.7
nan
60.0
75.0
35.0
162.33
110.0
122.5
35.46
45.0
96.0
0.0
115.0
89.1
30.0
85.67
100.5
nan
82.35
153.62
164.0
120.0
38.0
170.57
75

0        None
1        None
2        None
3        None
4        None
         ... 
19995    None
19996    None
19997    None
19998    None
19999    None
Name: adr, Length: 19994, dtype: object

* 1차원 시리즈 데이터에서 map(), apply() 함수는 같은 행동은 한다.

In [60]:
hotel.map(
    lambda x : print(x)
)
# DataFrame에서 map() 함수는 첫 번재 시리즈의 values들을 모두 탐색하고 다음 스리즈의 vlaue로 넘어간다.

0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0


Unnamed: 0,is_canceled,deposit_type,lead_time,stays_in_weekend_nights,stays_in_week_nights,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,days_in_waiting_list,adr
0,,,,,,,,,,,
1,,,,,,,,,,,
2,,,,,,,,,,,
3,,,,,,,,,,,
4,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
19995,,,,,,,,,,,
19996,,,,,,,,,,,
19997,,,,,,,,,,,
19998,,,,,,,,,,,


In [None]:
hotel.apply(
    lambda x : print(x)
)
# DataFrame에서 apply() 함수는 스리즈 별로 탐색을 하는 함수
# 차원을 하개 축소하여 데이터를 확인

0        0
1        0
2        0
3        0
4        0
        ..
19995    1
19996    1
19997    1
19998    1
19999    1
Name: is_canceled, Length: 19994, dtype: int64
0        No Deposit
1        No Deposit
2        No Deposit
3        No Deposit
4        No Deposit
            ...    
19995    Non Refund
19996    Non Refund
19997    Non Refund
19998    No Deposit
19999    Non Refund
Name: deposit_type, Length: 19994, dtype: object
0        105.0
1        303.0
2         33.0
3         48.0
4        216.0
         ...  
19995     89.0
19996    101.0
19997    277.0
19998      0.0
19999     40.0
Name: lead_time, Length: 19994, dtype: float64
0        2
1        2
2        2
3        0
4        4
        ..
19995    2
19996    0
19997    1
19998    0
19999    0
Name: stays_in_weekend_nights, Length: 19994, dtype: int64
0        5
1        2
2        3
3        1
4        7
        ..
19995    2
19996    3
19997    2
19998    1
19999    2
Name: stays_in_week_nights, Length: 19994, dtype: 

is_canceled                       None
deposit_type                      None
lead_time                         None
stays_in_weekend_nights           None
stays_in_week_nights              None
is_repeated_guest                 None
previous_cancellations            None
previous_bookings_not_canceled    None
booking_changes                   None
days_in_waiting_list              None
adr                               None
dtype: object

In [64]:
# groupby() 함수와 apply()를 조합하여 deposit_type에 따라서 adr의 결측치를 평균값으로 대체
hotel.groupby('deposit_type').apply(
    lambda x : x.fillna(x.mean())
).isna()

Unnamed: 0_level_0,Unnamed: 1_level_0,is_canceled,lead_time,stays_in_weekend_nights,stays_in_week_nights,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,days_in_waiting_list,adr
deposit_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
No Deposit,0,False,False,False,False,False,False,False,False,False,False
No Deposit,1,False,False,False,False,False,False,False,False,False,False
No Deposit,2,False,False,False,False,False,False,False,False,False,False
No Deposit,3,False,False,False,False,False,False,False,False,False,False
No Deposit,4,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...
Refundable,15249,False,False,False,False,False,False,False,False,False,False
Refundable,15428,False,False,False,False,False,False,False,False,False,False
Refundable,15661,False,False,False,False,False,False,False,False,False,False
Refundable,16644,False,False,False,False,False,False,False,False,False,False
