# Amadeus Challenge

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import bz2

## Exercise 1

- Count the number of lines in Python for each file

### - What type of file do I have?

bz2 is a free and open-source file compression program that uses the Burrows–Wheeler algorithm.

bzip2 compresses most files more effectively than the older LZW (.Z) and Deflate (.zip and .gz) compression algorithms, but is considerably slower. 

In [4]:
# List the challenge data directory (long format) to see which archives exist and how large they are.
!ls -l ../../Data/challenge

total 1019188
-rwxr-x--- 1 dsc dsc 554970628 mar 13 17:24 bookings.csv.bz2
-rw-r--r-- 1 dsc dsc   5473249 may  5 10:01 bookings.sample.csv.bz2
-rwxr-x--- 1 dsc dsc 483188920 mar 13 17:24 searches.csv.bz2


In [11]:
# Print the notebook's working directory; relative paths in shell commands resolve against it.
!pwd

/home/dsc/Master-Data-Science/Amadeus Challenge


In [None]:
# Build a small sample archive: decompress the bookings file, keep the first 100000 lines
# and recompress them with bzip2 into bookings.sample.csv.bz2.
# NOTE(review): `path_to_zips` is interpolated here but is only assigned in the next cell,
# so on a fresh kernel that cell has to be executed before this one.
! bzcat {path_to_zips}bookings.csv.bz2 | head -100000 | bzip2 -c > {path_to_zips}bookings.sample.csv.bz2

In [6]:
# Directory holding the compressed challenge datasets. The trailing slash matters:
# file names are appended to it by plain string concatenation in the cells below.
path_to_zips = "/home/dsc/Data/challenge/"

# Archive names inside that directory: full bookings, full searches, bookings sample.
f_bookings, f_searches, f_bookings_sample = (
    "bookings.csv.bz2",
    "searches.csv.bz2",
    "bookings.sample.csv.bz2",
)

### - How can I read it?

I created a small sample to check that the code below is correct.<br>
First result, using `with ... as file`: 5000006, which is wrong because I used `readlines()`.<br>
Second result, using `with ... as file` and iterating over the file line by line: 10000011.<br>

In [8]:
# Validate the counting approach on the small bookings sample first.

with bz2.BZ2File(path_to_zips + f_bookings_sample, "r") as archive:
    # Iterating over the BZ2File yields one decompressed line at a time.
    count = sum(1 for _ in archive)

print(count)

100000


In [7]:
# Count the lines of the full bookings archive.

with bz2.BZ2File(path_to_zips + f_bookings, "r") as archive:
    # Iterating over the BZ2File yields one decompressed line at a time.
    count = sum(1 for _ in archive)

print(count)

10000011


In [10]:
# Count the lines of the full searches archive.

with bz2.BZ2File(path_to_zips + f_searches, "r") as archive:
    # Iterating over the BZ2File yields one decompressed line at a time.
    count = sum(1 for _ in archive)

print(count)

20390199


## Exercise 2

Top 10 arrival airports in the world in 2013 (using the bookings file)

The arrival airport is the column `arr_port`; it holds the IATA code of the airport.
To get the total number of passengers for an airport, sum the column `pax`, grouping by `arr_port`. Note that some `pax` values are negative; these correspond to cancellations. So, to get the total number of passengers that have actually booked, sum including the negatives (this removes the cancelled bookings).
Print the top 10 arrival airports to standard output, including the number of passengers.

In [9]:
# Load the '^'-separated bookings sample and clear the spaces from the header names.
df_bookings = (
    pd.read_csv(path_to_zips + f_bookings_sample, compression='bz2', sep='^')
    .rename(columns=str.strip)
)
df_bookings.columns

Index(['act_date', 'source', 'pos_ctry', 'pos_iata', 'pos_oid', 'rloc',
       'cre_date', 'duration', 'distance', 'dep_port', 'dep_city', 'dep_ctry',
       'arr_port', 'arr_city', 'arr_ctry', 'lst_port', 'lst_city', 'lst_ctry',
       'brd_port', 'brd_city', 'brd_ctry', 'off_port', 'off_city', 'off_ctry',
       'mkt_port', 'mkt_city', 'mkt_ctry', 'intl', 'route', 'carrier',
       'bkg_class', 'cab_class', 'brd_time', 'off_time', 'pax', 'year',
       'month', 'oid'],
      dtype='object')

In [11]:
# Keep only the 2013 bookings and the two columns needed for the ranking.
# .copy() makes the result an independent frame, so the in-place clean-up of
# `arr_port` in a later cell does not write into a slice of df_bookings
# (which can raise SettingWithCopyWarning).
df_bookings_arr_pax = df_bookings.loc[df_bookings['year'] == 2013, ['arr_port', 'pax']].copy()
df_bookings_arr_pax.head()

Unnamed: 0,arr_port,pax
0,LHR,-1
1,CLT,1
2,CLT,1
3,SVO,1
4,SVO,1


In [13]:
# Normalise the airport codes (strip surrounding whitespace, upper-case).
# assign() builds a new frame instead of assigning a column into the filtered
# slice of df_bookings, which can trigger SettingWithCopyWarning.
df_bookings_arr_pax = df_bookings_arr_pax.assign(
    arr_port=df_bookings_arr_pax['arr_port'].str.strip().str.upper()
)

In [16]:
# Stream the full bookings archive in chunks of 100000 rows, keeping only the
# 2013 rows and the two columns needed for the ranking.
# The filtered chunks are collected in a list and concatenated once at the end:
# growing a DataFrame with DataFrame.append inside the loop re-copies all
# accumulated rows on every iteration, and DataFrame.append was removed in pandas 2.0.
chunks_2013 = []

for chunk in pd.read_csv(path_to_zips+f_bookings, compression='bz2', sep='^', chunksize=100000, low_memory=False):
    chunk.columns = chunk.columns.str.strip().str.lower() #clear spaces from headers
    chunks_2013.append(chunk.loc[chunk['year'] == 2013, ['arr_port', 'pax']])

# pd.concat raises on an empty list, so fall back to an empty frame with the expected columns.
if chunks_2013:
    df_bookings_all = pd.concat(chunks_2013)
else:
    df_bookings_all = pd.DataFrame(columns=['arr_port', 'pax'])

In [94]:
# Normalise the airport codes before grouping: strip surrounding whitespace and
# upper-case them, the same clean-up that is applied to the sample frame, so that
# differently formatted spellings of one code end up in the same group.
df_bookings_all['arr_port'] = df_bookings_all['arr_port'].str.strip().str.upper()

In [112]:
# Net passengers per arrival airport (negative pax cancel out bookings),
# ordered from busiest to least busy, keeping the ten largest totals.
df_final_10 = (
    df_bookings_all
    .groupby('arr_port', as_index=False)['pax']
    .sum()
    .sort_values(by='pax', ascending=False)
    .head(10)
)
df_final_10

Unnamed: 0,arr_port,pax
1088,LHR,88809.0
1190,MCO,70930.0
1050,LAX,70530.0
1047,LAS,69630.0
886,JFK,66270.0
315,CDG,64490.0
216,BKK,59460.0
1228,MIA,58150.0
1719,SFO,58000.0
517,DXB,55590.0


#### Bonus point

Get the name of the city or airport corresponding to that airport code (programmatically; we suggest having a look at GeoBases on GitHub).

In [99]:
# Airport reference table published in the GeoBases repository.
url = 'https://raw.githubusercontent.com/opentraveldata/geobases/public/GeoBases/DataSources/Airports/GeoNames/airports_geonames_only_clean.csv'

# The file is read without a header row and only its first two fields are kept
# (they are given names in the next cell).
df_airports = pd.read_csv(url, sep='^', header=None, usecols=[0,1])
# Fixed seed so the preview shows the same rows on every run.
df_airports.sample(5, random_state=0)

Unnamed: 0,0,1
2958,NRS,Ream Field
439,GOU,Garoua
2135,TBJ,Tabarka
2884,#SDL,Scottsdale Airport
2200,JRO,Kilimanjaro International Airport


In [110]:
# Name the two positional columns: the code is called `arr_port` so it can be
# matched against the bookings column of the same name; the second field is the name.
df_airports = df_airports.rename(columns={0: 'arr_port', 1: 'city'})
df_airports.head()

Unnamed: 0,arr_port,city
0,AUH,Abu Dhabi International Airport
1,AZI,Abu Dhabi Bateen Airport
2,AAN,Al Ain International Airport
3,DXB,Dubai International Airport
4,FJR,Fujairah


In [113]:
# Attach the airport name to each of the top-10 rows by looking up `arr_port`
# in the airport reference table (DataFrame.join keeps every left-hand row).
# Fixed: the original joined from `df_final`, a name that is not defined anywhere
# in the visible notebook; the ranking frame built above is `df_final_10`.
df_final_10_joined = df_final_10.join(df_airports.set_index('arr_port'), on='arr_port')
df_final_10_joined

Unnamed: 0,arr_port,pax,city
1088,LHR,88809.0,London Heathrow Airport
1190,MCO,70930.0,Orlando International Airport
1050,LAX,70530.0,Los Angeles International Airport
1047,LAS,69630.0,McCarran International Airport
886,JFK,66270.0,John F Kennedy International Airport
315,CDG,64490.0,Paris - Charles-de-Gaulle
216,BKK,59460.0,Suvarnabhumi
1228,MIA,58150.0,Miami International Airport
1719,SFO,58000.0,San Francisco International Airport
517,DXB,55590.0,Dubai International Airport
