# 1. Import Data & Libraries

In [1]:
import pandas as pd
import numpy as np

# from ydata_profiling import ProfileReport

# viz
import missingno as msno
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px


#%matplotlib inline

from sklearn.preprocessing import OneHotEncoder

# Show every column when a DataFrame is rendered (no column truncation).
pd.set_option('display.max_columns', None)
# Suppress Warnings
# NOTE(review): "ignore" with no category silences every warning for the whole
# session, including pandas deprecation/chained-assignment warnings that could
# point at real problems in the cleaning steps; consider narrowing the filter.
import warnings
warnings.filterwarnings("ignore")

import functions as f

**Data**

In [2]:
# Load the raw customer table: the file is ';'-separated and the 'ID' column
# becomes the index of the frame.
raw_csv_path = './Case1_HotelCustomerSegmentation.csv'
df = pd.read_csv(raw_csv_path, sep=';', index_col='ID')
df

Unnamed: 0_level_0,Nationality,Age,DaysSinceCreation,NameHash,DocIDHash,AverageLeadTime,LodgingRevenue,OtherRevenue,BookingsCanceled,BookingsNoShowed,BookingsCheckedIn,PersonsNights,RoomNights,DistributionChannel,MarketSegment,SRHighFloor,SRLowFloor,SRAccessibleRoom,SRMediumFloor,SRBathtub,SRShower,SRCrib,SRKingSizeBed,SRTwinBed,SRNearElevator,SRAwayFromElevator,SRNoAlcoholInMiniBar,SRQuietRoom
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
1,PRT,52.0,440,0x2C371FD6CE12936774A139FD7430C624F1C4D5109CE6...,0x434FD3D59469C73AFEA087017FAF8CA2296493AEABDE...,59,292.00,82.3,1,0,2,6,4,Corporate,Corporate,0,0,0,0,0,0,0,0,0,0,0,0,0
2,PRT,,1385,0x198CDB98BF37B6E23F9548C56A88B00912D65A9AA0D6...,0xE3B0C44298FC1C149AFBF4C8996FB92427AE41E4649B...,61,280.00,53.0,0,0,1,10,5,Travel Agent/Operator,Travel Agent/Operator,0,0,0,0,0,0,0,0,0,0,0,0,0
3,DEU,32.0,1385,0xDA46E62F66936284DF2844EC4FC542D0DAD780C0EE0C...,0x27F5DF762CCDA622C752CCDA45794923BED9F1B66300...,0,0.00,0.0,0,0,0,0,0,Travel Agent/Operator,Travel Agent/Operator,0,0,0,0,0,0,0,0,0,0,0,0,0
4,FRA,61.0,1385,0xC45D4CD22C58FDC5FD0F95315F6EFA5A6E7149187D49...,0x8E59572913BB9B1E6CAA12FA2C8B7BF387B1D1F3432E...,93,240.00,60.0,0,0,1,10,5,Travel Agent/Operator,Travel Agent/Operator,0,0,0,0,0,0,0,0,0,0,0,0,0
5,FRA,52.0,1385,0xD2E3D5BFCA141865669F98D64CDA85AD04DEFF47F8A0...,0x42BDEE0E05A9441C94147076EDDCC47E604DA5447DD4...,0,0.00,0.0,0,0,0,0,0,Travel Agent/Operator,Travel Agent/Operator,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111729,DEU,31.0,36,0x2F59F0E86596B861C3303585FDB7F090B1B2D70442C5...,0x86AC02B9393E7F58354BA4193D792C07DB91D4BE27C7...,0,0.00,0.0,0,0,0,0,0,Travel Agent/Operator,Travel Agent/Operator,0,0,0,0,0,0,0,1,0,0,0,0,0
111730,BRA,43.0,36,0x2708B9F11C95F384129152CDF0830B566F02D42B87AC...,0xE87DEB08B0D7D0BDC590949FF04AAA893018BD8EB714...,170,755.25,20.0,0,0,1,10,5,Travel Agent/Operator,Other,0,0,0,0,0,0,0,1,0,0,0,0,0
111731,BRA,37.0,36,0xEF5F6AAA6120F2AC49179E16FD12477C243A8F4FC504...,0x74B9B5A554F5F660371E56D4CE4C1D4C3DC80AE71D68...,0,0.00,0.0,0,0,0,0,0,Travel Agent/Operator,Other,0,0,0,0,0,0,0,1,0,0,0,0,0
111732,DEU,48.0,36,0xC70AE03F615CB96B0C84CF29419C2DDFE8EC64B77104...,0xF589C016E6988AECD3E3BE793E7D606BBDB6B6D4795C...,66,708.00,185.0,0,0,1,8,4,Travel Agent/Operator,Other,0,0,0,0,0,0,0,0,0,0,0,0,0


# 3. Data Cleaning and Preprocessing

## 3.1 Duplicates

As previously identified, 111 duplicate records were found; therefore, we proceed to remove them from the dataset.

In [3]:
# Keep the first occurrence of each fully duplicated row (the 'ID' index is not
# part of the comparison) and rebind the name instead of mutating in place.
df = df.drop_duplicates(keep='first')

<br>

## 3.2 Fixing inconsistencies

**Customers' age**

To ensure data consistency, we will treat ages below 16 or above 90 as inconsistent and mark them as missing. This will help improve the quality of the booking data.

In [4]:
# Ages strictly below 16 or strictly above 90 are treated as data-entry errors
# and replaced with NaN; rows whose age is already missing are left untouched.
age_out_of_range = df['Age'].lt(16) | df['Age'].gt(90)
df.loc[age_out_of_range, 'Age'] = np.nan

<br>

**Hash of customer identification number**

For this feature, we merge records based on `NameHash`, `DocIDHash` and `DistributionChannel`, which allows us to keep a single record for the same customer. This approach ensures that each customer is represented by a single record while still accounting for repeated `DocIDHash` values. These repeated values may arise from bookings made through different channels or from corporate reservations where multiple names are linked to the same document. (TODO: depending on the professor's feedback, we may also mention here that this can happen because the guests are children, which is why different names appear with the same document ID.)

In [5]:
# Keep only the rows that carry a document-ID hash; the result is a new frame,
# `df` itself is not modified.
# NOTE(review): the most frequent DocIDHash in the aggregated data
# (0xE3B0C442...7852B855, 2834 rows) equals the SHA-256 digest of an empty
# string, so it presumably encodes a missing document ID that this NaN filter
# cannot catch — confirm and decide whether those rows should be excluded too.
has_doc_id = df['DocIDHash'].notna()
df2 = df[has_doc_id]
df2

Unnamed: 0_level_0,Nationality,Age,DaysSinceCreation,NameHash,DocIDHash,AverageLeadTime,LodgingRevenue,OtherRevenue,BookingsCanceled,BookingsNoShowed,BookingsCheckedIn,PersonsNights,RoomNights,DistributionChannel,MarketSegment,SRHighFloor,SRLowFloor,SRAccessibleRoom,SRMediumFloor,SRBathtub,SRShower,SRCrib,SRKingSizeBed,SRTwinBed,SRNearElevator,SRAwayFromElevator,SRNoAlcoholInMiniBar,SRQuietRoom
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
1,PRT,52.0,440,0x2C371FD6CE12936774A139FD7430C624F1C4D5109CE6...,0x434FD3D59469C73AFEA087017FAF8CA2296493AEABDE...,59,292.00,82.3,1,0,2,6,4,Corporate,Corporate,0,0,0,0,0,0,0,0,0,0,0,0,0
2,PRT,,1385,0x198CDB98BF37B6E23F9548C56A88B00912D65A9AA0D6...,0xE3B0C44298FC1C149AFBF4C8996FB92427AE41E4649B...,61,280.00,53.0,0,0,1,10,5,Travel Agent/Operator,Travel Agent/Operator,0,0,0,0,0,0,0,0,0,0,0,0,0
3,DEU,32.0,1385,0xDA46E62F66936284DF2844EC4FC542D0DAD780C0EE0C...,0x27F5DF762CCDA622C752CCDA45794923BED9F1B66300...,0,0.00,0.0,0,0,0,0,0,Travel Agent/Operator,Travel Agent/Operator,0,0,0,0,0,0,0,0,0,0,0,0,0
4,FRA,61.0,1385,0xC45D4CD22C58FDC5FD0F95315F6EFA5A6E7149187D49...,0x8E59572913BB9B1E6CAA12FA2C8B7BF387B1D1F3432E...,93,240.00,60.0,0,0,1,10,5,Travel Agent/Operator,Travel Agent/Operator,0,0,0,0,0,0,0,0,0,0,0,0,0
5,FRA,52.0,1385,0xD2E3D5BFCA141865669F98D64CDA85AD04DEFF47F8A0...,0x42BDEE0E05A9441C94147076EDDCC47E604DA5447DD4...,0,0.00,0.0,0,0,0,0,0,Travel Agent/Operator,Travel Agent/Operator,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111729,DEU,31.0,36,0x2F59F0E86596B861C3303585FDB7F090B1B2D70442C5...,0x86AC02B9393E7F58354BA4193D792C07DB91D4BE27C7...,0,0.00,0.0,0,0,0,0,0,Travel Agent/Operator,Travel Agent/Operator,0,0,0,0,0,0,0,1,0,0,0,0,0
111730,BRA,43.0,36,0x2708B9F11C95F384129152CDF0830B566F02D42B87AC...,0xE87DEB08B0D7D0BDC590949FF04AAA893018BD8EB714...,170,755.25,20.0,0,0,1,10,5,Travel Agent/Operator,Other,0,0,0,0,0,0,0,1,0,0,0,0,0
111731,BRA,37.0,36,0xEF5F6AAA6120F2AC49179E16FD12477C243A8F4FC504...,0x74B9B5A554F5F660371E56D4CE4C1D4C3DC80AE71D68...,0,0.00,0.0,0,0,0,0,0,Travel Agent/Operator,Other,0,0,0,0,0,0,0,1,0,0,0,0,0
111732,DEU,48.0,36,0xC70AE03F615CB96B0C84CF29419C2DDFE8EC64B77104...,0xF589C016E6988AECD3E3BE793E7D606BBDB6B6D4795C...,66,708.00,185.0,0,0,1,8,4,Travel Agent/Operator,Other,0,0,0,0,0,0,0,0,0,0,0,0,0


In [6]:
# Collapse the filtered records with the project helper `f.aggregation`
# (defined in the local `functions` module, not shown in this notebook).
# Presumably it groups on NameHash, DocIDHash and DistributionChannel, as the
# text above describes — TODO confirm against functions.py.
bookings = f.aggregation(df2)

In [7]:
# Display the aggregated table to inspect the result of the merge.
bookings

Unnamed: 0,DocIDHash,NameHash,DistributionChannel,Nationality,Age,DaysSinceCreation,AverageLeadTime,LodgingRevenue,OtherRevenue,BookingsCanceled,BookingsNoShowed,BookingsCheckedIn,PersonsNights,RoomNights,MarketSegment,SRHighFloor,SRLowFloor,SRAccessibleRoom,SRMediumFloor,SRBathtub,SRShower,SRCrib,SRKingSizeBed,SRTwinBed,SRNearElevator,SRAwayFromElevator,SRNoAlcoholInMiniBar,SRQuietRoom
0,0x00006E606D590F336C289D2CC995A0C726878BCAC001...,0x0D54EA6B14A0574CAA2D3802565441048C5C831EB035...,Direct,FRA,60.0,479,1.0,177.0,14.0,0,0,1,2,1,Direct,0,0,0,0,0,0,0,1,0,0,0,0,1
1,0x00006E606D590F336C289D2CC995A0C726878BCAC001...,0x0D54EA6B14A0574CAA2D3802565441048C5C831EB035...,Travel Agent/Operator,FRA,60.0,480,127.0,318.0,148.0,0,0,1,4,2,Other,0,0,0,0,0,0,0,1,0,0,0,0,1
2,0x0000E25BC06E730F6DDD6BD53CBFE05BABD2B0182019...,0xF33C33AE994E1AE82D574B2BF23945DC93A9D9CC89FE...,Travel Agent/Operator,DEU,54.0,1197,14.0,695.0,100.0,0,0,1,10,5,Other,1,0,0,0,0,0,0,1,0,0,0,0,0
3,0x0001425DE03D204F30757BA2E50DBD0486ED254E1E6F...,0x42FC523622627DB3F09A6A544335CDAF2EDA9C57BA61...,Direct,USA,34.0,120,0.0,0.0,0.0,0,0,0,0,0,Direct,1,0,0,0,0,0,1,1,0,0,0,0,0
4,0x0002AE77553DBEBD734F0C644E6BE142EA0B4C5D5D83...,0xF47414CC56472838252EE0BB9C0639AEA24312985ECA...,Travel Agent/Operator,IND,,569,0.0,0.0,0.0,0,0,0,0,0,Other,0,0,0,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108202,0xFFFB1DBFAB9D8FFDC8602C6D365578FBFF637070F3EB...,0xC96442DFC501BAF021CB0FE9CC15C1C22A98B8672FAF...,Direct,USA,49.0,123,0.0,0.0,0.0,0,0,0,0,0,Direct,0,0,0,0,0,0,0,0,0,0,0,0,0
108203,0xFFFB9FE60E0D35B8426F2FB90646D0BA799B803A79C1...,0x342C21783C3293511503091DCBCD4EB112AB13C2CD46...,Direct,CAN,41.0,783,58.0,218.0,14.0,0,0,1,2,2,Direct,0,0,0,0,0,0,0,0,0,0,0,0,0
108204,0xFFFE14C4D0D6520C97125A115CDC705B8A2F0E5133D5...,0xFD4A86C21CE0E27CFDF3ED6BFAF83F6B288657F972A2...,Travel Agent/Operator,FRA,19.0,46,89.0,911.7,31.0,0,0,1,14,7,Other,0,0,0,0,0,0,0,1,0,0,0,0,0
108205,0xFFFED0F2BC7D4A410C579CEE04804A70F09C1B21B340...,0x5CF6B8387455E82BC6A2BCD111C0117527AEA2B37BDC...,Travel Agent/Operator,IRL,26.0,507,0.0,0.0,0.0,0,0,0,0,0,Other,1,0,0,0,0,0,0,0,1,0,0,0,1


In [8]:
# Count how many aggregated records share each document-ID hash, most frequent
# first, to see how much duplication remains after the merge.
doc_id_counts = bookings['DocIDHash'].value_counts()
doc_id_counts

DocIDHash
0xE3B0C44298FC1C149AFBF4C8996FB92427AE41E4649B934CA495991B7852B855    2834
0x1B16B1DF538BA12DC3F97EDBB85CAA7050D46C148134290FEBA80F8236C83DB9      10
0xA486FBACF4B4E5537B026743E3FDFE571D716839E758236F42950A61FE6B922B       9
0x3856085146F7BC27BD07BFC4CA1991ED4E65E179D7BDB7DBBA7E32620809C799       7
0xE76926D097F612D921A7494C6B18247BF7DBA28D0679960312DF2AD5DB7F37BD       7
                                                                      ... 
0x55A9187065EFE381F6211B1B98952620B8A784A29DE449FEDA41712064A4AA42       1
0x55A79C0BD3226CA1E1723B0C794192042D85B772D2A667DE2C59F9D25A0B648F       1
0x55A769E36CA3B9C45FF16B6A761BF1CBE8A486882B3AB208395A4686548EC8E4       1
0x55A720611D39B0072081BF4F259DA05CCCF26D4D3C12A17F9EA1DB6E241415A7       1
0xFFFF697C455DCDFA906747736B03F3A61D20A92C8F741ED432C03B2E7E5DD6AD       1
Name: count, Length: 103480, dtype: int64

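From the output above, it is clear that we were able to reduce the number of records that were displaying duplicated information about the same customer. Note, however, that the most frequent value (2834 records) is the SHA-256 hash of an empty string, which suggests that these records have no document ID at all rather than belonging to a single customer.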

<br>

**Number of days in advance that the customer makes the booking before the arrival date**

We identified a negative value in the `AverageLeadTime` feature, which we consider to be an error. As a result, we will treat these values as missing and proceed with imputation. Given that most customers are not from Portugal, imputing values like 0 or 1 wouldn't be realistic, as it is unlikely that customers would arrive without sufficient travel time. Instead, we will use K-nearest neighbors for imputation, which provides a more reasonable approach for estimating these missing values.

In [15]:
# Preview the customers whose average lead time is negative. The column holds
# non-integer values after aggregation (e.g. 34.066667), so any negative value
# is checked for, not only the exact sentinel -1.
bookings[bookings['AverageLeadTime'] < 0].head()

Unnamed: 0,DocIDHash,NameHash,DistributionChannel,Nationality,Age,DaysSinceCreation,AverageLeadTime,LodgingRevenue,OtherRevenue,BookingsCanceled,BookingsNoShowed,BookingsCheckedIn,PersonsNights,RoomNights,MarketSegment,SRHighFloor,SRLowFloor,SRAccessibleRoom,SRMediumFloor,SRBathtub,SRShower,SRCrib,SRKingSizeBed,SRTwinBed,SRNearElevator,SRAwayFromElevator,SRNoAlcoholInMiniBar,SRQuietRoom
17604,0x2A1EA7292AF2D9BB083726E0961A38A54950480886BA...,0x4567E92118939CF20A432369E32BA8959502682AD473...,Travel Agent/Operator,USA,70.0,468,-1.0,95.2,2.0,0,0,1,2,1,Other,0,0,0,0,0,0,0,0,0,0,0,0,0
27883,0x43126A8B69B8B1E3C1D4513EE586ABE2910947964929...,0xCD4216685825A5A65B100A4F8F19366A23B2BDDE8E29...,Travel Agent/Operator,BRA,31.0,394,-1.0,209.0,2.0,0,0,1,2,1,Other,0,0,0,0,0,0,0,0,0,0,0,0,0
31948,0x4CDEF046A3A97AAC6BD603F2250C8ACCB068C39995C4...,0x61B41490861314B7148F76EE7D7E17A240A86B45259D...,Travel Agent/Operator,CHN,33.0,558,-1.0,129.0,2.0,0,0,1,2,1,Other,0,0,0,0,0,0,0,0,0,0,0,0,0
35137,0x54D0B32490EA6DAB16F26AFBDAD5037A89E1BDA62B54...,0xBBBA5623819CD060BB49D9D700348FFEF885F825D019...,Travel Agent/Operator,ESP,46.0,485,-1.0,159.0,2.0,0,0,1,2,1,Other,0,0,0,0,0,0,0,0,0,0,0,0,0
53275,0x814B89362FCA5770AEDC4485136D7F673F78793CE218...,0x7DD53BA3E94C8D8B171B9D50041DC3A23E6F4752A3A8...,Travel Agent/Operator,FRA,61.0,499,-1.0,149.0,2.0,0,0,1,2,1,Other,0,0,0,0,0,0,0,0,0,0,0,0,0


In [16]:
# A negative lead time is impossible (a booking cannot be made after arrival),
# so every negative value — not just the exact sentinel -1 — is marked as
# missing and left for the later imputation step.
bookings.loc[bookings['AverageLeadTime'] < 0, 'AverageLeadTime'] = np.nan

<br>

**Total amount of lodging revenue paid by the customer so far**

From previous insights, we identified customers who have not spent anything at the hotel, on either room charges or other expenses such as the bar. Since these customers do not contribute to the company's revenue, we have decided to exclude them from the dataset.

In [27]:
# Customers with neither lodging revenue nor other revenue. The mask is built
# outside the f-string because reusing the same quote character inside a
# replacement field is a SyntaxError on Python versions before 3.12.
no_revenue = (bookings['LodgingRevenue'] == 0) & (bookings['OtherRevenue'] == 0)
print(f'{no_revenue.sum()} customers did not generate revenue to the hotel.')

31898 customers did not generate revenue to the hotel.


In [28]:
# Remove, in place, every customer whose lodging revenue and other revenue are
# both exactly zero; the remaining rows keep their original index labels.
zero_revenue = bookings['LodgingRevenue'].eq(0) & bookings['OtherRevenue'].eq(0)
bookings.drop(index=bookings.index[zero_revenue], inplace=True)

31898 records were removed from the dataset.

<br>

**Booking check-ins, revenues and persons/rooms/nights**

We have identified some incoherences, namely:

- there were customers with a `BookingsCheckedIn` value equal to 0, meaning that these are not actual customers;
- there was one customer who made purchases for both rooms and other amenities but reported no cancellation, no-show, or check-in activity. This is inconsistent, as it is unlikely that a customer who made such charges did not at least check in to the hotel. Therefore, this data anomaly suggests an error in the record.

In [30]:
# List the customers that never checked in; an empty result means the
# inconsistency is no longer present in the data.
never_checked_in = bookings['BookingsCheckedIn'].eq(0)
bookings[never_checked_in]

Unnamed: 0,DocIDHash,NameHash,DistributionChannel,Nationality,Age,DaysSinceCreation,AverageLeadTime,LodgingRevenue,OtherRevenue,BookingsCanceled,BookingsNoShowed,BookingsCheckedIn,PersonsNights,RoomNights,MarketSegment,SRHighFloor,SRLowFloor,SRAccessibleRoom,SRMediumFloor,SRBathtub,SRShower,SRCrib,SRKingSizeBed,SRTwinBed,SRNearElevator,SRAwayFromElevator,SRNoAlcoholInMiniBar,SRQuietRoom


Based on the empty output, we can confirm that this issue no longer occurs, since the affected records were already removed from the dataset in the previous steps.

### This happens because the individual's DocIDHash is missing

In [33]:
# Customers that generated some revenue (lodging or other) without a single
# check-in on record.
no_checkin = bookings['BookingsCheckedIn'].eq(0)
has_revenue = bookings['LodgingRevenue'].ne(0) | bookings['OtherRevenue'].ne(0)
bookings[no_checkin & has_revenue]

Unnamed: 0,DocIDHash,NameHash,DistributionChannel,Nationality,Age,DaysSinceCreation,AverageLeadTime,LodgingRevenue,OtherRevenue,BookingsCanceled,BookingsNoShowed,BookingsCheckedIn,PersonsNights,RoomNights,MarketSegment,SRHighFloor,SRLowFloor,SRAccessibleRoom,SRMediumFloor,SRBathtub,SRShower,SRCrib,SRKingSizeBed,SRTwinBed,SRNearElevator,SRAwayFromElevator,SRNoAlcoholInMiniBar,SRQuietRoom


# TODO: it is not yet clear how to handle this case

<br>

**Number of check-ins higher than the number of persons/nights, and persons/nights lower than room/nights**

In [34]:
# Customers with more check-ins than person-nights.
more_checkins_than_person_nights = bookings['BookingsCheckedIn'].gt(bookings['PersonsNights'])
bookings[more_checkins_than_person_nights]

Unnamed: 0,DocIDHash,NameHash,DistributionChannel,Nationality,Age,DaysSinceCreation,AverageLeadTime,LodgingRevenue,OtherRevenue,BookingsCanceled,BookingsNoShowed,BookingsCheckedIn,PersonsNights,RoomNights,MarketSegment,SRHighFloor,SRLowFloor,SRAccessibleRoom,SRMediumFloor,SRBathtub,SRShower,SRCrib,SRKingSizeBed,SRTwinBed,SRNearElevator,SRAwayFromElevator,SRNoAlcoholInMiniBar,SRQuietRoom
2080,0x050B3FEBB73D2924810D9C068846F14550B03BCFDC7C...,0xFC913F891030DD576FDA085E7859D44AB7EE8B000D59...,Travel Agent/Operator,DEU,52.0,214,36.0,2514.0,103.5,0,0,1,0,2,Travel Agent/Operator,0,0,0,0,0,0,0,0,0,0,0,0,0
38782,0x5DC0150F4FDDED43B313C3782560597AC6665A129EF5...,0x7A8179ADB882FC633830D7EABDCD64D41B3CD539FEA2...,Corporate,PRT,46.0,466,3.0,269.0,7.0,0,1,2,1,2,Aviation,0,0,0,0,0,0,0,1,0,0,0,0,0
43009,0x67D8A25C96313AEF8A5842A87F2F6AB29DE01DAC80BE...,0x65E5986F79B75C8A171FEEA2E534AAD806D5ACB083D6...,Travel Agent/Operator,NLD,33.0,641,41.0,267.0,109.0,0,0,1,0,4,Groups,0,0,0,0,0,0,0,0,1,0,0,0,0
53376,0x8186236B5A280E296DC622D0B419BBEBB5BC8E1D4338...,0x0596170AE19299CB3CEBA98BC2FAF9BF5333031C6A32...,Travel Agent/Operator,BRA,35.0,336,63.0,242.0,24.0,0,0,1,0,2,Other,0,0,0,0,0,0,1,1,0,0,0,0,0
93943,0xE3B0C44298FC1C149AFBF4C8996FB92427AE41E4649B...,0x14A77C11BA7C3B13207E72D4BA395580338D206DCA65...,Travel Agent/Operator,ESP,,377,69.0,488.0,49.0,0,0,2,1,4,Groups,0,0,0,0,0,0,0,0,0,0,0,0,0
96155,0xE3B0C44298FC1C149AFBF4C8996FB92427AE41E4649B...,0xDCA87EABFF56A1A313E731ED676F25361722AB5AE790...,Travel Agent/Operator,PRT,,1314,8.0,0.0,5105.5,0,0,3,2,12,Corporate,0,0,0,0,0,0,0,0,0,0,0,0,0
97109,0xE5123E18BD091C1108E145BE447859B469199AAAC5D7...,0x9EEAA6DAE9241E36FF1A13DF7ED180939C37FC54E6BA...,Travel Agent/Operator,FIN,51.0,729,9.0,798.0,177.5,0,0,2,1,1,Other,1,0,0,0,0,0,0,1,0,0,0,0,0


In [35]:
# Customers with fewer person-nights than room-nights.
fewer_person_nights_than_room_nights = bookings['PersonsNights'].lt(bookings['RoomNights'])
bookings[fewer_person_nights_than_room_nights]

Unnamed: 0,DocIDHash,NameHash,DistributionChannel,Nationality,Age,DaysSinceCreation,AverageLeadTime,LodgingRevenue,OtherRevenue,BookingsCanceled,BookingsNoShowed,BookingsCheckedIn,PersonsNights,RoomNights,MarketSegment,SRHighFloor,SRLowFloor,SRAccessibleRoom,SRMediumFloor,SRBathtub,SRShower,SRCrib,SRKingSizeBed,SRTwinBed,SRNearElevator,SRAwayFromElevator,SRNoAlcoholInMiniBar,SRQuietRoom
2080,0x050B3FEBB73D2924810D9C068846F14550B03BCFDC7C...,0xFC913F891030DD576FDA085E7859D44AB7EE8B000D59...,Travel Agent/Operator,DEU,52.0,214,36.0,2514.0,103.5,0,0,1,0,2,Travel Agent/Operator,0,0,0,0,0,0,0,0,0,0,0,0,0
3319,0x08007328E88BA93B551B9F9F056AAFE2C00E94CCEE69...,0x84DF93E126886E0003E45A8B0A1127EDDDA6A3BEC276...,Travel Agent/Operator,PRT,45.0,1288,75.0,138.0,38.5,0,0,2,2,88,Groups,0,0,0,0,0,0,0,0,0,0,0,0,0
16583,0x27A6504F8E07DC750D565994E86CB3837146A501E609...,0x53FB0EB1CE1FD50424F79500C54F8149081D5A3D82F6...,Travel Agent/Operator,ESP,50.0,736,16.5,1528.0,102.0,0,0,4,6,25,Travel Agent/Operator,0,0,0,0,0,0,0,0,0,0,0,0,0
17079,0x28D83B97B0D5B78BCBA1C36FDF79FF276A1E32C9CB5C...,0xFB5F026540ECB47362289BB2507D92BED41D80F632F6...,Direct,PRT,34.0,489,0.0,459.0,42.0,0,0,2,2,12,Complementary,0,0,0,0,0,0,0,1,0,0,0,0,0
18019,0x2B17E9D2CCEF2EA0FE752EE345BEDFB06741FFC8ECEC...,0xFB64B4B6AB53A6A549A620009CA24A1C3A668A460D87...,Corporate,PRT,48.0,1323,34.066667,11740.65,3066.5,5,1,90,112,132,Corporate,0,0,0,0,0,0,0,1,0,0,0,0,0
26462,0x3FAE56E6A0168E9130F12E58CF45CDAAE18477EF809C...,0x04F37DBBE0138585CD31BC820E2C780F992915176507...,Corporate,GRC,52.0,901,7.0,3066.0,347.3,0,0,5,28,30,Aviation,0,0,0,0,0,0,0,1,0,0,0,0,0
27462,0x420A533D66150B9173FAB8F5AF0DB8C54EBDFA001663...,0xD0C14D44BB72D7FA9673B2EE2EE9C8021EBDF81A7EF0...,Direct,USA,67.0,261,0.5,442.4,157.1,0,0,2,2,4,Direct,0,0,0,0,0,0,0,1,0,0,0,0,0
32252,0x4DA48EED943299176444C244EAA74FACB04E8E66DE82...,0xE779FF69A9BC8AA6FE091759A095386C0E2A8F680DA8...,Corporate,PRT,66.0,1189,38.0,1239.0,230.0,4,0,19,20,22,Corporate,0,0,0,0,0,0,0,1,0,0,0,0,0
38782,0x5DC0150F4FDDED43B313C3782560597AC6665A129EF5...,0x7A8179ADB882FC633830D7EABDCD64D41B3CD539FEA2...,Corporate,PRT,46.0,466,3.0,269.0,7.0,0,1,2,1,2,Aviation,0,0,0,0,0,0,0,1,0,0,0,0,0
41903,0x6541AF2C1E55C2D3EC17E89DEDD22FE0EBB47AB92305...,0xF4F7ED49904EB1F7A72F33A071F96CE533019C81DF96...,Direct,PRT,49.0,1112,0.0,858.15,114.0,0,0,3,4,11,Direct,0,0,0,0,0,0,0,0,0,0,0,0,0


<br>

**Customers under age when they registered**

It is also important to check whether there are customers who created their account when they were younger than 16.

In [38]:
# Estimate each customer's age when the record was created (current age minus
# the record's age in years, using 365 days per year) and list those who would
# have been younger than 16 at that time.
years_since_creation = bookings['DaysSinceCreation'].div(365)
age_at_creation = bookings['Age'].sub(years_since_creation)
bookings[age_at_creation.lt(16)]

Unnamed: 0,DocIDHash,NameHash,DistributionChannel,Nationality,Age,DaysSinceCreation,AverageLeadTime,LodgingRevenue,OtherRevenue,BookingsCanceled,BookingsNoShowed,BookingsCheckedIn,PersonsNights,RoomNights,MarketSegment,SRHighFloor,SRLowFloor,SRAccessibleRoom,SRMediumFloor,SRBathtub,SRShower,SRCrib,SRKingSizeBed,SRTwinBed,SRNearElevator,SRAwayFromElevator,SRNoAlcoholInMiniBar,SRQuietRoom
228,0x00894B7E1CABFA8EED3F3EBD07775EC805097E4C8731...,0x5EB470337A85ABCB5B93641CCF89831F0132CE4272B0...,Direct,FRA,17.0,977,9.0,1021.20,239.94,0,0,1,8,4,Direct,0,0,0,0,0,0,0,0,0,0,0,0,0
593,0x017761CDC0557FA4B549E9B6E57EFF9F2D9999CDD0DF...,0x0BD21A30BDA465A795AA961D3DFDB180904CB786E567...,Travel Agent/Operator,FRA,16.0,909,114.0,456.00,79.50,0,0,1,8,4,Other,0,0,0,0,0,0,0,1,0,0,0,0,0
625,0x018BC244424177263A49E155E0155703A55EB5DE5688...,0x463B57C1BF57C60D45AE958F37608B1CE2EDFF695147...,Travel Agent/Operator,NLD,16.0,366,12.0,497.00,6.00,0,0,1,6,3,Other,0,0,0,0,0,0,0,0,0,0,0,0,1
986,0x025918E07BFFA32395D931CAD20281E0F2B7306F37BF...,0x44538360D90434312302A54D95005529CDA5713B75CB...,Travel Agent/Operator,ESP,16.0,676,9.0,342.00,40.00,0,0,1,6,2,Other,0,0,0,0,0,0,0,0,1,0,0,0,0
1045,0x0283C4490521CA28CAB3399496A12119EBF56034E47D...,0xC8BCD3E725F9AC4133E2560781DEB0B448A0F21193CC...,Direct,DEU,19.0,1301,130.0,283.50,56.00,0,0,1,8,4,Direct,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107720,0xFED7CA45EDCD56E0569A1DD3299896E10FF6BA2D0B53...,0x0A48007FB86C37A62DC1A25E16CACA32CD73E7593921...,Travel Agent/Operator,NLD,19.0,1163,183.0,239.40,42.00,0,0,1,6,2,Other,0,0,0,0,0,0,0,0,1,0,0,0,0
107778,0xFEFB52E2FC271B128FD396F95C9798E06DCDE9B8F317...,0xE7E2BD819585F3A1DAAE61012B860291848AEA985E70...,Travel Agent/Operator,UMI,16.0,462,10.0,1199.66,69.50,0,0,1,7,7,Other,0,0,0,0,0,0,0,0,0,0,0,0,0
107802,0xFF091167E8C620839C5886914BA70DFDBA507096579E...,0x28D865CAA16F0FA0992FD6AAA7391D90A7DD032ABBB7...,Travel Agent/Operator,GBR,17.0,1167,164.0,189.00,62.00,0,0,1,4,2,Other,0,0,0,0,0,0,0,0,1,0,0,0,0
107911,0xFF51F390B351649222FFE9FEF664AC9182C834A42E0C...,0x7B44D5B34FC5911E1303D123C73059EB1B0F89096AD6...,Travel Agent/Operator,BEL,17.0,796,203.0,337.50,45.50,0,0,1,10,5,Travel Agent/Operator,0,0,0,0,0,0,0,0,0,0,0,0,0
