## Match searches with bookings

- For every search in the searches file, find out whether the search ended up in a booking or not (using the info in the bookings file). For instance, search and booking origin and destination should match. 

- For the bookings file, origin and destination are the columns dep_port and arr_port, respectively. 

- Generate a CSV file with the search data, and an additional field, containing 1 if the search ended up in a booking, and 0 otherwise.



## 2) Prepare the data for processing

### Booking

#### We haven't checked for duplicates so far... What if the file has duplicated lines?

In [1]:
import pandas as pd

In [2]:
# List the working directory to see which raw and sample data files are available.
ls -l

total 1019240
-rw-rw-r-- 1 dsc dsc      5833 dic 13 18:18 bad_search_lines.csv
-rw-rw-r-- 1 dsc dsc 554970628 may 31  2019 [0m[01;31mbookings.csv.bz2[0m
-rw-rw-r-- 1 dsc dsc    535893 nov 30 09:18 [01;31mbookings.sample.csv.bz2[0m
-rw-rw-r-- 1 dsc dsc     18340 nov 30 14:00 ch_01-clase.ipynb
-rw-rw-r-- 1 dsc dsc    144634 dic  1 09:10 ch_02-clase.ipynb
-rw-rw-r-- 1 dsc dsc    182508 dic 13 20:00 ch_03-class.ipynb
-rw-rw-r-- 1 dsc dsc      7895 dic 13 20:22 ch_04-Empty.ipynb
-rw-rw-r-- 1 dsc dsc     19188 dic 13 17:32 ch_05b-empty.ipynb
-rw-rw-r-- 1 dsc dsc      8629 dic  1 09:14 ch_05-clase.ipynb
-rw-rw-r-- 1 dsc dsc     25161 dic 13 20:01 ch_05-empty.ipynb
-rwxrwxrwx 1 dsc dsc   4232732 nov 30 09:54 [01;32msample.csv[0m*
-rw-rw-r-- 1 dsc dsc 483188920 may 31  2019 [01;31msearches.csv.bz2[0m
-rw-rw-r-- 1 dsc dsc    244720 nov 30 09:18 [01;31msearches.sample.csv.bz2[0m
-rw-rw-r-- 1 dsc dsc     77871 nov 30 13:29 top_airports.csv


In [5]:
%%time
bc=pd.read_csv('./bookings.csv.bz2', sep="^", chunksize=500000, low_memory=False, dtype=str)

all_chunks=pd.DataFrame()
for i, chunk in enumerate(bc):
    all_chunks=all_chunks.append(chunk)
    all_chunks.drop_duplicates(inplace=True)
    print((i+1)*500000, len(all_chunks))

all_chunks.to_csv('bookings_no_dup.csv', sep="^", index=False)
! bzip2 -f bookings_no_dup.csv

500000 500000
1000000 1000000
1500000 1000000
2000000 1000000
2500000 1000000
3000000 1000000
3500000 1000000
4000000 1000000
4500000 1000000
5000000 1000000
5500000 1000003
6000000 1000003
6500000 1000003
7000000 1000003
7500000 1000003
8000000 1000003
8500000 1000003
9000000 1000003
9500000 1000003
10000000 1000003
10500000 1000003
CPU times: user 12min 34s, sys: 22.8 s, total: 12min 57s
Wall time: 13min 23s


In [6]:
%%time
bc=pd.read_csv('./searches.csv.bz2', sep="^", chunksize=500000, low_memory=False, dtype=str)

all_chunks=pd.DataFrame()
for i, chunk in enumerate(bc):
    all_chunks=all_chunks.append(chunk)
    all_chunks.drop_duplicates(inplace=True)
    print((i+1)*500000, len(all_chunks))

all_chunks.to_csv('searches_no_dup.csv', sep="^", index=False)
! bzip2 -f searches_no_dup.csv

500000 358999
1000000 358999
1500000 359003
2000000 359003
2500000 359003
3000000 359003
3500000 359003
4000000 359003
4500000 359003
5000000 359003
5500000 359003
6000000 359003
6500000 359003
7000000 359003
7500000 359003
8000000 359003
8500000 359003
9000000 359003
9500000 359003
10000000 359003
10500000 359003
11000000 359003
11500000 359003
12000000 359003
12500000 359003
13000000 359003
13500000 359003
14000000 359003
14500000 359003
15000000 359003
15500000 359003
16000000 359003
16500000 359003
17000000 359003
17500000 359003
18000000 359003
18500000 359003
19000000 359003
19500000 359003
20000000 359003
20500000 359004
CPU times: user 16min 3s, sys: 31.6 s, total: 16min 35s
Wall time: 16min 14s


#### We have seen that some columns contain white space...

In [None]:
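Action plan
1. remove duplicate lines from both raw files (done above)
2. read 3 columns from bookings (cre_date, dep_port, arr_port) and all columns from searches
3. remove white spaces from bookings dep_port/arr_port
4. format the date in bookings (keep only the YYYY-MM-DD part of cre_date)
5. remove duplicates from bookings again, now on the 3 selected columns
6. create column "booked" with 1 in bookings table
7. left-merge searches with bookings on the 3 columns
8. fill "booked" of merged table with 0 where no booking matched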


In [8]:
# Load only the first 1000 rows: enough to inspect the raw header names
# (several of them are padded with trailing spaces).
b = pd.read_csv('bookings_no_dup.csv.bz2', sep="^", nrows=1000, low_memory=False)
list(b.columns)

['act_date           ',
 'source',
 'pos_ctry',
 'pos_iata',
 'pos_oid  ',
 'rloc          ',
 'cre_date           ',
 'duration',
 'distance',
 'dep_port',
 'dep_city',
 'dep_ctry',
 'arr_port',
 'arr_city',
 'arr_ctry',
 'lst_port',
 'lst_city',
 'lst_ctry',
 'brd_port',
 'brd_city',
 'brd_ctry',
 'off_port',
 'off_city',
 'off_ctry',
 'mkt_port',
 'mkt_city',
 'mkt_ctry',
 'intl',
 'route          ',
 'carrier',
 'bkg_class',
 'cab_class',
 'brd_time           ',
 'off_time           ',
 'pax',
 'year',
 'month',
 'oid      ']

In [25]:
import pandas as pd
pd.set_option('display.max_columns',None)

# Bookings columns that identify a trip; they are matched against the
# searches' Date / Origin / Destination.
booking_keys = ['cre_date', 'dep_port', 'arr_port']

# Searches are loaded with every column: the final output is the search data
# plus the extra "booked" field.
s = pd.read_csv('searches_no_dup.csv.bz2', sep="^", low_memory=False)

# The bookings header is padded with spaces (e.g. 'cre_date           '), so
# select the columns by their stripped name instead of hard-coding the padding.
b = pd.read_csv('bookings_no_dup.csv.bz2', sep="^",
                usecols=lambda name: name.strip() in booking_keys)
b.columns = b.columns.str.strip()
b = (
    b.assign(
        dep_port=b['dep_port'].str.strip(),   # remove white space around the airport codes
        arr_port=b['arr_port'].str.strip(),
        cre_date=b['cre_date'].str[0:10],     # keep the first 10 characters (YYYY-MM-DD)
    )
    # pandas merges match null keys with each other, so a booking with a
    # missing key could otherwise flag searches that lack the same field.
    .dropna(subset=booking_keys)
    # One row per (date, origin, destination) so a left merge cannot
    # duplicate search rows.
    .drop_duplicates(subset=booking_keys)
    .assign(booked=1)
)



In [26]:
# Preview the first five cleaned booking keys together with the "booked" flag.
b.iloc[:5]

Unnamed: 0,cre_date,dep_port,arr_port,booked
0,2013-02-22,ZRH,LHR,1
1,2013-03-26,SAL,CLT,1
3,2013-03-26,AKL,SVO,1
5,2013-03-20,DEN,LGA,1
7,2013-03-25,NRT,SIN,1


In [28]:
# Show the last five search rows.
s.iloc[-5:]

Unnamed: 0,Date,Time,TxnCode,OfficeID,Country,Origin,Destination,RoundTrip,NbSegments,Seg1Departure,Seg1Arrival,Seg1Date,Seg1Carrier,Seg1BookingCode,Seg2Departure,Seg2Arrival,Seg2Date,Seg2Carrier,Seg2BookingCode,Seg3Departure,Seg3Arrival,Seg3Date,Seg3Carrier,Seg3BookingCode,Seg4Departure,Seg4Arrival,Seg4Date,Seg4Carrier,Seg4BookingCode,Seg5Departure,Seg5Arrival,Seg5Date,Seg5Carrier,Seg5BookingCode,Seg6Departure,Seg6Arrival,Seg6Date,Seg6Carrier,Seg6BookingCode,From,IsPublishedForNeg,IsFromInternet,IsFromVista,TerminalID,InternetOffice
358999,2013-01-01,20:25:57,MPT,624d8c3ac0b3a7ca03e3c167e0f48327,DE,TXL,AUH,1.0,2.0,TXL,AUH,2013-01-26,D2,,AUH,TXL,2013-02-02,D2,,,,,,,,,,,,,,,,,,,1ASIWS,0,0.0,0,d41d8cd98f00b204e9800998ecf8427e,FRA,,,
359000,"2013-01-01,10:15:33,MPT,b0af35b31588dc4ab06d5c...",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
359001,2013-01-01,18:04:49,MPT,3561,US,ICT,SFO,1.0,2.0,ICT,SFO,2013-08-02,,,SFO,ICT,2013-08-09,,,,,,,,,,,,,,,,,,,,,,,1ASIWS,0,0,0,d41d8cd98f00b204e9800998ecf8427e,NYC
359002,2013-01-01,19:57:57,MPT,28d7a8c95e4db88589d3d35b66920e78,DE,FRA,BGW,1.0,2.0,FRA,BGW,2013-02-26,,,BGW,FRA,2013-04-08,,,,,,,,,,,,,,,,,,,,,1ASI,0.0,0,0,d41d8cd98f00b204e9800998ecf8427e,BNJ,,
359003,2013-10-13,18:57:54,MTP,e41c9d833aa74600552f2ed688b67d81,AT,VIE,HA,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [23]:
# Row count before and after collapsing identical booking rows.
print(b.shape[0])
b = b.drop_duplicates()
print(b.shape[0])

1000003
334876


In [29]:
# Number of search rows before merging.
s.shape[0]

359004

In [30]:
# Left join keeps every search; a search with no booking on the same date and
# route gets NaN in the bookings columns (including "booked").
# validate='many_to_one' makes pandas raise MergeError if the booking keys are
# not unique, which would otherwise silently duplicate search rows.
s_b = s.merge(
    b,
    how='left',
    left_on=['Date', 'Origin', 'Destination'],
    right_on=['cre_date', 'dep_port', 'arr_port'],
    validate='many_to_one',
)

In [31]:
# Row count of the merged frame.
s_b.shape[0]

359004

In [33]:
# The bookings-side key columns were only needed for the join.
# Reassigning (instead of inplace) with errors='ignore' lets the cell be
# re-run without raising KeyError once the columns are already gone.
s_b = s_b.drop(columns=['cre_date', 'dep_port', 'arr_port'], errors='ignore')

In [34]:
# Last five merged rows; "booked" is still NaN where no booking matched.
s_b.iloc[-5:]

Unnamed: 0,Date,Time,TxnCode,OfficeID,Country,Origin,Destination,RoundTrip,NbSegments,Seg1Departure,Seg1Arrival,Seg1Date,Seg1Carrier,Seg1BookingCode,Seg2Departure,Seg2Arrival,Seg2Date,Seg2Carrier,Seg2BookingCode,Seg3Departure,Seg3Arrival,Seg3Date,Seg3Carrier,Seg3BookingCode,Seg4Departure,Seg4Arrival,Seg4Date,Seg4Carrier,Seg4BookingCode,Seg5Departure,Seg5Arrival,Seg5Date,Seg5Carrier,Seg5BookingCode,Seg6Departure,Seg6Arrival,Seg6Date,Seg6Carrier,Seg6BookingCode,From,IsPublishedForNeg,IsFromInternet,IsFromVista,TerminalID,InternetOffice,booked
358999,2013-01-01,20:25:57,MPT,624d8c3ac0b3a7ca03e3c167e0f48327,DE,TXL,AUH,1.0,2.0,TXL,AUH,2013-01-26,D2,,AUH,TXL,2013-02-02,D2,,,,,,,,,,,,,,,,,,,1ASIWS,0,0.0,0,d41d8cd98f00b204e9800998ecf8427e,FRA,,,,
359000,"2013-01-01,10:15:33,MPT,b0af35b31588dc4ab06d5c...",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
359001,2013-01-01,18:04:49,MPT,3561,US,ICT,SFO,1.0,2.0,ICT,SFO,2013-08-02,,,SFO,ICT,2013-08-09,,,,,,,,,,,,,,,,,,,,,,,1ASIWS,0,0,0,d41d8cd98f00b204e9800998ecf8427e,NYC,
359002,2013-01-01,19:57:57,MPT,28d7a8c95e4db88589d3d35b66920e78,DE,FRA,BGW,1.0,2.0,FRA,BGW,2013-02-26,,,BGW,FRA,2013-04-08,,,,,,,,,,,,,,,,,,,,,1ASI,0.0,0,0,d41d8cd98f00b204e9800998ecf8427e,BNJ,,,
359003,2013-10-13,18:57:54,MTP,e41c9d833aa74600552f2ed688b67d81,AT,VIE,HA,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [35]:
# First five searches that matched a booking (NaN compares as False, so unmatched rows are excluded).
is_booked = s_b['booked'].gt(0)
s_b.loc[is_booked].head()

Unnamed: 0,Date,Time,TxnCode,OfficeID,Country,Origin,Destination,RoundTrip,NbSegments,Seg1Departure,Seg1Arrival,Seg1Date,Seg1Carrier,Seg1BookingCode,Seg2Departure,Seg2Arrival,Seg2Date,Seg2Carrier,Seg2BookingCode,Seg3Departure,Seg3Arrival,Seg3Date,Seg3Carrier,Seg3BookingCode,Seg4Departure,Seg4Arrival,Seg4Date,Seg4Carrier,Seg4BookingCode,Seg5Departure,Seg5Arrival,Seg5Date,Seg5Carrier,Seg5BookingCode,Seg6Departure,Seg6Arrival,Seg6Date,Seg6Carrier,Seg6BookingCode,From,IsPublishedForNeg,IsFromInternet,IsFromVista,TerminalID,InternetOffice,booked
27,2013-01-01,18:33:28,CCP,3565e31495ecfd46fa018339d20382b1,SA,RUH,JED,0.0,1.0,RUH,JED,2013-01-04,KP,,,,,,,,,,,,,,,,,,,,,,,,,,,1ASIWS,0,0,0,d41d8cd98f00b204e9800998ecf8427e,JED,1.0
40,2013-01-01,06:36:57,FFP,86f167b84e77346849f9439ae87c02a6,SA,DMM,MNL,1.0,2.0,DMM,MNL,2013-06-01,OJ,,MNL,DMM,2013-06-30,OJ,,,,,,,,,,,,,,,,,,,,,,1ASIWS,0,0,0,d41d8cd98f00b204e9800998ecf8427e,JED,1.0
59,2013-01-01,07:00:38,FQD,e8741eaf2fa2f71f931475d18fa72096,US,ATL,MIA,0.0,1.0,ATL,MIA,2013-01-10,NV,,,,,,,,,,,,,,,,,,,,,,,,,,,1ASI,0,0,0,d41d8cd98f00b204e9800998ecf8427e,SEA,1.0
134,2013-01-01,23:30:44,MPT,5215502d9524c3183f3839b0d9a5e4f9,AU,MEL,SYD,0.0,1.0,MEL,SYD,2013-01-31,,,,,,,,,,,,,,,,,,,,,,,,,,,,1ASIWS,0,0,0,d41d8cd98f00b204e9800998ecf8427e,SYD,1.0
172,2013-01-01,18:34:27,MPT,fd4afff0035bec8f8e305d38804c33f6,IN,BOM,JED,1.0,2.0,BOM,JED,2013-01-26,,,JED,BOM,2013-01-29,,,,,,,,,,,,,,,,,,,,,,,1ASI,0,0,0,d41d8cd98f00b204e9800998ecf8427e,DEL,1.0


In [36]:
# Searches without a matching booking got NaN from the left merge; mark them as 0.
s_b = s_b.assign(booked=s_b['booked'].fillna(0))

In [37]:
# The NaN placeholders made the flag float; cast it to an integer 0/1 column.
s_b['booked'] = s_b['booked'].astype(int)

In [39]:
# Last three rows, to confirm "booked" is now an integer flag.
s_b.iloc[-3:]

Unnamed: 0,Date,Time,TxnCode,OfficeID,Country,Origin,Destination,RoundTrip,NbSegments,Seg1Departure,Seg1Arrival,Seg1Date,Seg1Carrier,Seg1BookingCode,Seg2Departure,Seg2Arrival,Seg2Date,Seg2Carrier,Seg2BookingCode,Seg3Departure,Seg3Arrival,Seg3Date,Seg3Carrier,Seg3BookingCode,Seg4Departure,Seg4Arrival,Seg4Date,Seg4Carrier,Seg4BookingCode,Seg5Departure,Seg5Arrival,Seg5Date,Seg5Carrier,Seg5BookingCode,Seg6Departure,Seg6Arrival,Seg6Date,Seg6Carrier,Seg6BookingCode,From,IsPublishedForNeg,IsFromInternet,IsFromVista,TerminalID,InternetOffice,booked
359001,2013-01-01,18:04:49,MPT,3561,US,ICT,SFO,1.0,2.0,ICT,SFO,2013-08-02,,,SFO,ICT,2013-08-09,,,,,,,,,,,,,,,,,,,,,,,1ASIWS,0.0,0,0,d41d8cd98f00b204e9800998ecf8427e,NYC,0
359002,2013-01-01,19:57:57,MPT,28d7a8c95e4db88589d3d35b66920e78,DE,FRA,BGW,1.0,2.0,FRA,BGW,2013-02-26,,,BGW,FRA,2013-04-08,,,,,,,,,,,,,,,,,,,,,1ASI,0.0,0,0.0,d41d8cd98f00b204e9800998ecf8427e,BNJ,,,0
359003,2013-10-13,18:57:54,MTP,e41c9d833aa74600552f2ed688b67d81,AT,VIE,HA,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0


In [40]:
import pandas as pd
pd.set_option('display.max_columns',None)

# Columns that identify a trip on each side of the join.
search_keys = ['Date', 'Origin', 'Destination']
booking_keys = ['cre_date', 'dep_port', 'arr_port']

# Searches are loaded with every column: the final output is the search data
# plus the extra "booked" field.
s = pd.read_csv('searches_no_dup.csv.bz2', sep="^", low_memory=False)

# The bookings header is padded with spaces (e.g. 'cre_date           '), so
# select the columns by their stripped name instead of hard-coding the padding.
b = pd.read_csv('bookings_no_dup.csv.bz2', sep="^",
                usecols=lambda name: name.strip() in booking_keys)
b.columns = b.columns.str.strip()
b = (
    b.assign(
        dep_port=b['dep_port'].str.strip(),   # remove white space around the airport codes
        arr_port=b['arr_port'].str.strip(),
        cre_date=b['cre_date'].str[0:10],     # keep the first 10 characters (YYYY-MM-DD)
    )
    # pandas merges match null keys with each other, so a booking with a
    # missing key could otherwise flag searches that lack the same field.
    .dropna(subset=booking_keys)
    # One row per (date, origin, destination) so the left merge below cannot
    # duplicate search rows.
    .drop_duplicates(subset=booking_keys)
    .assign(booked=1)
)

s_b = (
    # Left join keeps every search; validate raises MergeError if the booking
    # keys are unexpectedly not unique.
    s.merge(b,
            how='left',
            left_on=search_keys,
            right_on=booking_keys,
            validate='many_to_one')
    # The bookings-side key columns were only needed for the join.
    .drop(columns=booking_keys)
    # Unmatched searches have NaN in "booked": turn the flag into integer 0/1.
    .assign(booked=lambda merged: merged['booked'].fillna(0).astype(int))
)

In [41]:
# Persist the searches plus the "booked" flag, using the same '^' separator as the input files.
s_b.to_csv(
    "searches_with_book.csv",
    index=False,
    sep='^',
)