# Match searches with bookings

• For every search in the searches file, find out whether the search ended up in a booking or not (using the info in the bookings file). For instance, search and booking origin and destination should match. 

• For the bookings file, origin and destination are the columns dep_port and arr_port, respectively. 

• Generate a CSV file with the search data, and an additional field, containing 1 if the search ended up in a booking, and 0 otherwise.

## Import libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# The bare 'seaborn' style sheet was renamed to 'seaborn-v0_8' in matplotlib 3.6
# and the old name was later removed, so pick whichever this installation provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
%matplotlib inline

## Data path on my computer

In [4]:
# Shell escape: list the files currently present in the challenge data directory.
!ls /home/dsc/Data/challenge/

bookings.csv.bz2  searches.csv.bz2  searches_without_duplicates.csv


## Step 1: Let's briefly explore both datasets

I will use the samples I created in Exercise 1.

In [5]:
# Load both bz2-compressed, caret-separated sample files with identical reader
# settings; the files are resolved relative to the notebook's working directory.
bookings_sample, searches_sample = (
    pd.read_csv(sample_file, compression='bz2', sep='^')
    for sample_file in ('bookings.sample.csv.bz2', 'searches.sample.csv.bz2')
)

In [6]:
# Preview the first rows of the bookings sample.
bookings_sample.head()

Unnamed: 0,act_date,source,pos_ctry,pos_iata,pos_oid,rloc,cre_date,duration,distance,dep_port,...,route,carrier,bkg_class,cab_class,brd_time,off_time,pax,year,month,oid
0,2013-03-05 00:00:00,1A,DE,a68dd7ae953c8acfb187a1af2dcbe123,1a11ae49fcbf545fd2afc1a24d88d2b7,ea65900e72d71f4626378e2ebd298267,2013-02-22 00:00:00,1708,0,ZRH,...,LHRZRH,VI,T,Y,2013-03-07 08:50:00,2013-03-07 11:33:37,-1,2013,3,
1,2013-03-26 00:00:00,1A,US,e612b9eeeee6f17f42d9b0d3b79e75ca,7437560d8f276d6d05eeb806d9e7edee,737295a86982c941f1c2da9a46a14043,2013-03-26 00:00:00,135270,0,SAL,...,SALATLCLT,NV,L,Y,2013-04-12 13:04:00,2013-04-12 22:05:40,1,2013,3,
2,2013-03-26 00:00:00,1A,US,e612b9eeeee6f17f42d9b0d3b79e75ca,7437560d8f276d6d05eeb806d9e7edee,737295a86982c941f1c2da9a46a14043,2013-03-26 00:00:00,135270,0,SAL,...,CLTATLSAL,NV,U,Y,2013-07-15 07:00:00,2013-07-15 11:34:51,1,2013,3,
3,2013-03-26 00:00:00,1A,AU,0f984b3bb6bd06661c95529bbd6193bc,36472c6dbaf7afec9136ac40364e2794,5ecf00fdcbcec761c43dc7285253d0c1,2013-03-26 00:00:00,30885,0,AKL,...,AKLHKGSVO,XK,G,Y,2013-04-24 23:59:00,2013-04-25 16:06:31,1,2013,3,SYDA82546
4,2013-03-26 00:00:00,1A,AU,0f984b3bb6bd06661c95529bbd6193bc,36472c6dbaf7afec9136ac40364e2794,5ecf00fdcbcec761c43dc7285253d0c1,2013-03-26 00:00:00,30885,0,AKL,...,SVOHKGAKL,XK,G,Y,2013-05-14 20:15:00,2013-05-16 10:44:50,1,2013,3,SYDA82546


In [7]:
# Inspect the raw column labels; the output shows several padded with trailing
# spaces (e.g. 'act_date           '), which would break exact-name lookups.
bookings_sample.columns

Index(['act_date           ', 'source', 'pos_ctry', 'pos_iata', 'pos_oid  ',
       'rloc          ', 'cre_date           ', 'duration', 'distance',
       'dep_port', 'dep_city', 'dep_ctry', 'arr_port', 'arr_city', 'arr_ctry',
       'lst_port', 'lst_city', 'lst_ctry', 'brd_port', 'brd_city', 'brd_ctry',
       'off_port', 'off_city', 'off_ctry', 'mkt_port', 'mkt_city', 'mkt_ctry',
       'intl', 'route          ', 'carrier', 'bkg_class', 'cab_class',
       'brd_time           ', 'off_time           ', 'pax', 'year', 'month',
       'oid      '],
      dtype='object')

In [14]:
# Remove leading/trailing whitespace from every column label (modifies
# bookings_sample in place) so columns can be addressed by their clean names.
bookings_sample.columns = bookings_sample.columns.str.strip()

In [15]:
# Re-display the column labels to confirm the padding has been removed.
bookings_sample.columns

Index(['act_date', 'source', 'pos_ctry', 'pos_iata', 'pos_oid', 'rloc',
       'cre_date', 'duration', 'distance', 'dep_port', 'dep_city', 'dep_ctry',
       'arr_port', 'arr_city', 'arr_ctry', 'lst_port', 'lst_city', 'lst_ctry',
       'brd_port', 'brd_city', 'brd_ctry', 'off_port', 'off_city', 'off_ctry',
       'mkt_port', 'mkt_city', 'mkt_ctry', 'intl', 'route', 'carrier',
       'bkg_class', 'cab_class', 'brd_time', 'off_time', 'pax', 'year',
       'month', 'oid'],
      dtype='object')

In [8]:
# Preview the first ten rows of the searches sample.
searches_sample.head(10)

Unnamed: 0,Date,Time,TxnCode,OfficeID,Country,Origin,Destination,RoundTrip,NbSegments,Seg1Departure,...,Seg6Arrival,Seg6Date,Seg6Carrier,Seg6BookingCode,From,IsPublishedForNeg,IsFromInternet,IsFromVista,TerminalID,InternetOffice
0,2013-01-01,20:25:57,MPT,624d8c3ac0b3a7ca03e3c167e0f48327,DE,TXL,AUH,1,2,TXL,...,,,,,1ASIWS,0,0,0,d41d8cd98f00b204e9800998ecf8427e,FRA
1,2013-01-01,10:15:33,MPT,b0af35b31588dc4ab06d5cf2986e8e02,MD,ATH,MIL,0,1,ATH,...,,,,,1ASIWS,0,0,0,d41d8cd98f00b204e9800998ecf8427e,KIV
2,2013-01-01,18:04:49,MPT,3561a60621de06ab1badc8ca55699ef3,US,ICT,SFO,1,2,ICT,...,,,,,1ASIWS,0,0,0,d41d8cd98f00b204e9800998ecf8427e,NYC
3,2013-01-01,17:42:40,FXP,1864e5e8013d9414150e91d26b6a558b,SE,RNB,ARN,0,1,RNB,...,,,,,1ASI,0,0,0,d41d8cd98f00b204e9800998ecf8427e,STO
4,2013-01-01,17:48:29,MPT,1ec336348f44207d2e0027dc3a68c118,NO,OSL,MAD,1,2,OSL,...,,,,,1ASIWS,0,0,0,d41d8cd98f00b204e9800998ecf8427e,OSL
5,2013-01-01,22:00:28,MPT,3561a60621de06ab1badc8ca55699ef3,US,IAH,BLR,1,2,IAH,...,,,,,1ASIWS,0,0,0,d41d8cd98f00b204e9800998ecf8427e,NYC
6,2013-01-01,10:47:14,MPT,d327ca6e35cc6732d4709828327ac7c1,DK,CPH,PAR,1,2,CPH,...,,,,,1ASI,0,0,0,d41d8cd98f00b204e9800998ecf8427e,CPH
7,2013-01-01,23:39:49,MPT,38a3abb0a28e3f00fa79a11f552a5052,FR,PAR,DUB,1,2,PAR,...,,,,,1ASIWS,0,0,0,d41d8cd98f00b204e9800998ecf8427e,PAR
8,2013-01-01,17:08:46,MPT,c8daef4f8bf73a61aa2c928705f7b82d,ES,DUS,ACE,1,2,DUS,...,,,,,1ASIWS,0,0,0,d41d8cd98f00b204e9800998ecf8427e,MAD
9,2013-01-01,19:57:57,MPT,28d7a8c95e4db88589d3d35b66920e78,DE,FRA,BGW,1,2,FRA,...,,,,,1ASI,0,0,0,d41d8cd98f00b204e9800998ecf8427e,BNJ


In [9]:
# List the searches column labels to look for fields comparable to the bookings ones.
searches_sample.columns

Index(['Date', 'Time', 'TxnCode', 'OfficeID', 'Country', 'Origin',
       'Destination', 'RoundTrip', 'NbSegments', 'Seg1Departure',
       'Seg1Arrival', 'Seg1Date', 'Seg1Carrier', 'Seg1BookingCode',
       'Seg2Departure', 'Seg2Arrival', 'Seg2Date', 'Seg2Carrier',
       'Seg2BookingCode', 'Seg3Departure', 'Seg3Arrival', 'Seg3Date',
       'Seg3Carrier', 'Seg3BookingCode', 'Seg4Departure', 'Seg4Arrival',
       'Seg4Date', 'Seg4Carrier', 'Seg4BookingCode', 'Seg5Departure',
       'Seg5Arrival', 'Seg5Date', 'Seg5Carrier', 'Seg5BookingCode',
       'Seg6Departure', 'Seg6Arrival', 'Seg6Date', 'Seg6Carrier',
       'Seg6BookingCode', 'From', 'IsPublishedForNeg', 'IsFromInternet',
       'IsFromVista', 'TerminalID', 'InternetOffice'],
      dtype='object')

In [10]:
# Display the first-segment date, carrier and booking code for the sample
# (no .head(), so pandas shows its default truncated view of the whole frame).
searches_sample[['Seg1Date','Seg1Carrier','Seg1BookingCode']]

Unnamed: 0,Seg1Date,Seg1Carrier,Seg1BookingCode
0,2013-01-26,D2,
1,2013-01-04,,
2,2013-08-02,,
3,2013-01-02,DU,W
4,2013-03-22,,
...,...,...,...
994,2013-01-04,,
995,2013-03-31,,
996,2013-01-29,BP,B
997,2013-01-05,,


We conclude we can relate searches to bookings by Origin, Destination and Date.

## Step 2: Let's prepare our datasets

### Drop duplicates

As I did in the last exercise with the searches file, I will drop duplicates from the bookings file.

In [3]:
# Number of rows pandas reads per chunk when streaming the bookings file below.
chksize = 100000

In [4]:
%%time
# Stream the compressed bookings file `chksize` rows at a time and maintain a
# running de-duplicated frame, so the raw file never has to fit in memory at once.
reader = pd.read_csv('/home/dsc/Data/challenge/bookings.csv.bz2',compression='bz2',sep='^', iterator=True, chunksize=chksize)
all_chunks= pd.DataFrame()
chunk_counter=0

for df in reader:
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # replacement. Skip the empty seed frame on the first pass so it cannot
    # influence the dtypes of the concatenated result.
    pieces = [df] if all_chunks.empty else [all_chunks, df]
    # Drop duplicates immediately (first occurrence wins, original row index is
    # kept) so the accumulated frame only ever holds unique rows.
    all_chunks = pd.concat(pieces).drop_duplicates()
    chunk_counter+=1
    print(chunk_counter)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50




51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
CPU times: user 16min 57s, sys: 1min 24s, total: 18min 21s
Wall time: 16min 16s


In [5]:
# (rows, columns) of the de-duplicated bookings frame.
all_chunks.shape

(1016377, 38)

In [6]:
# Spot-check ten random rows; no random_state is passed, so the rows shown
# differ on every run.
all_chunks.sample(10)

Unnamed: 0,act_date,source,pos_ctry,pos_iata,pos_oid,rloc,cre_date,duration,distance,dep_port,...,route,carrier,bkg_class,cab_class,brd_time,off_time,pax,year,month,oid
916646,2013-08-13 00:00:00,1P,BE,668c460478079d697fff9efeb1b57f90,b2e8d2bc0461e36027441614e8b59784,b78208fa80709a4ea508c33493b1d570,2013-08-13 00:00:00,120865,0,ABJ,...,BRUABJ,DX,E,Y,2013-12-16 11:30:00,2013-12-16 17:20:13,1.0,2013.0,8.0,
292090,2013-01-18 00:00:00,1S,US,04bc62cd0e78c30955cc9fcdac1009ec,0aaa90c3d2eca192e5d7b79af1f80ec1,422e22d582f495b4812e80402aeb1223,2013-01-17 00:00:00,18554,0,ORD,...,FCOORD,KK,N,Y,2013-06-30 10:00:00,2013-06-30 12:54:13,-2.0,2013.0,1.0,
475592,2013-10-04 00:00:00,1G,GB,a26ab5606d727b25c6aed7b82fa1d9d2,1b6a35afcdb871c78e5fc8a832722620,c196e503a13d1eb4c7fb295f3e2bdbf2,2013-10-04 00:00:00,23174,0,LHR,...,LHRAMSHRE,UV,L,Y,2013-10-05 06:30:00,2013-10-05 20:47:29,1.0,2013.0,10.0,
590294,2013-08-23 00:00:00,1S,AU,9222ab44126311c65b4965519830863d,0e83b40f689bf20e2bb9c32e27cff0c3,ad26e2ed7319eb4ad6693f331872171b,2013-08-20 00:00:00,26787,0,SYD,...,CDGDXBSYD,BP,R,Y,2013-10-22 21:50:00,2013-10-24 07:17:37,-1.0,2013.0,8.0,
603047,2013-03-21 00:00:00,1A,SE,aae9c3d28247b809dbaf0064024ef7a7,e0b23ea1bcc88056ceaa3bdf03f0bbae,0c8c429b6a8c3d542ed67d5dbe98e45b,2013-03-04 00:00:00,6535,0,ARN,...,ARNAGP,NJ,G,Y,2013-09-04 07:00:00,2013-09-04 11:20:45,-25.0,2013.0,3.0,
25224,2013-03-05 00:00:00,1P,US,d57eafa5e2fd19da6c09d8a48d38f77b,9d7d19fb7b579dd25577487c89084732,31d504b2326ad5400e93eb9197c672c4,2013-03-05 00:00:00,97412,0,JFK,...,JFKDOHKTM,BC,E,Y,2013-04-28 23:00:00,2013-04-30 09:30:18,1.0,2013.0,3.0,
842574,2013-07-03 00:00:00,1A,SD,b6d25bcd18c9724273f333eabb4c89bf,378db6a47de416847fc6207225d8f12e,20427a621741fbae01b7de1af3d50e24,2013-07-02 00:00:00,5686,0,KRT,...,DXBDOHKRT,BC,U,Y,2013-07-11 15:00:00,2013-07-11 19:16:44,-2.0,2013.0,7.0,
594208,2013-07-04 00:00:00,1S,IT,62ac702c3387f3c99c7d0aaa338ed00f,ddccc1688a8963b71a0bc1149e67a2e8,569045e9faea4e0ea39a6a6b582c4e6d,2013-07-04 00:00:00,12819,0,FCO,...,FCODXBHKG,OU,I,C,2013-09-09 15:25:00,2013-09-10 15:05:42,2.0,2013.0,7.0,
688088,2013-11-18 00:00:00,1A,IL,0ef1bd6518eddba2e6931f784b63e607,f7bd86c828db5021cfb1b1571d55320d,3a624d18148f46853e0d14730a402625,2013-10-29 00:00:00,58850,0,AMM,...,MNLDXBAMM,OU,Q,Y,2013-12-26 00:20:00,2013-12-26 09:05:36,-1.0,2013.0,11.0,
918092,2013-12-16 00:00:00,1S,CA,96ac3a7c13c9d34e39c11f9989c654da,3906e13efcda3332a3b208d7172b803a,9ede92da25c9b1e8a06397755c4b6bc0,2013-12-16 00:00:00,4918,0,YYZ,...,YYZLAS,HD,M,Y,2014-09-18 09:40:00,2014-09-18 11:08:07,1.0,2013.0,12.0,


In [10]:
# Persist the de-duplicated bookings as an uncompressed, caret-separated CSV.
# NOTE: `index` is left at its default, so the row index is written as an extra
# unnamed first column that readers of this file must ignore or use as index.
all_chunks.to_csv('/home/dsc/Data/challenge/bookings_without_duplicates.csv',sep='^')

In [11]:
# Shell escape: list the data directory again to confirm the new CSV was written.
!ls /home/dsc/Data/challenge/

bookings.csv.bz2		 searches.csv.bz2
bookings_without_duplicates.csv  searches_without_duplicates.csv


### Selecting, cleaning and formatting data

In [15]:
# Rows per chunk for the streaming read below (same value as used for the
# de-duplication pass; redefined so this section can run on its own).
chksize = 100000

In [16]:
%%time
# Stream the de-duplicated bookings back in chunks, keep only the columns needed
# to match against searches, normalise the airport codes, and collect the result
# in `all_chunks`.
bookings = pd.read_csv('/home/dsc/Data/challenge/bookings_without_duplicates.csv',sep='^', iterator=True, chunksize=chksize)
cleaned_chunks = []
chunk_counter=0

for df in bookings:
    # Header labels in the bookings file are padded with spaces.
    df.columns = df.columns.str.strip()
    # .copy() gives an independent frame, so the assignments below do not write
    # into a slice of the original chunk (avoids SettingWithCopyWarning).
    df = df[['act_date','dep_port','arr_port','brd_time','off_time']].copy()
    # Normalise BOTH airport columns; previously dep_port was cleaned twice and
    # arr_port was left untouched.
    df['dep_port']= df['dep_port'].str.strip().str.upper()
    df['arr_port']= df['arr_port'].str.strip().str.upper()
    # Keep the cleaned chunk; previously it was discarded at the next iteration.
    cleaned_chunks.append(df)
    chunk_counter+=1
    print(chunk_counter)

# Concatenate once at the end (linear) instead of growing a frame inside the loop.
all_chunks = pd.concat(cleaned_chunks) if cleaned_chunks else pd.DataFrame()

1
2
3
4
5
6
7
8
9
10
11
CPU times: user 8.85 s, sys: 433 ms, total: 9.28 s
Wall time: 9.51 s
