# Write a Web Service

•  Wrap the output of the second exercise in a web service that returns the data in JSON format (instead of printing to the standard output).

• The web service should accept a parameter n > 0: for the top 10 airports, n is 10; for the top X airports, n is X.

## Step 1: Let's start with a sample

#### Pandas dataframe and json output

In [18]:
%%writefile top_arrival_airports_2013.py
import streamlit as st
import pandas as pd

st.title('Top arrival airports in 2013')
st.subheader('This web service will allow you to get the top arrival airports in terms of passengers in 2013 on a JSON format')
st.markdown('Please insert the number of TOP airports you want to get. For instance, for the TOP 10 airports you will have to specify 10.')


def parse_top_n(raw_value):
    """Convert the text typed by the user into a positive integer.

    Args:
        raw_value: Text returned by the ``st.text_input`` widget.

    Returns:
        The parsed integer when it is greater than 0, otherwise ``None``
        (empty text, non-numeric text, zero or a negative number).
    """
    try:
        value = int(raw_value)
    except ValueError:
        return None
    return value if value > 0 else None


raw_n = st.text_input("Insert a number:")
n = parse_top_n(raw_n)

if n is None:
    # The text box starts out empty, so only report an error once the user
    # has actually typed something that is not a valid n.
    if raw_n.strip():
        st.error('Only whole numbers greater than 0 are allowed')
else:
    # Only the validation above is guarded: errors while reading the file
    # are left to surface as they are instead of being reported as bad input.
    bookings_sample = pd.read_csv(
        'bookings.sample.csv.bz2',
        compression='bz2',
        sep='^',
        usecols=['year', 'arr_port', 'pax'],
    )
    bookings_sample_2013 = bookings_sample[bookings_sample['year'] == 2013]

    # Total passengers per arrival airport, largest first, limited to n rows.
    top_airports = (
        bookings_sample_2013.groupby('arr_port')['pax']
        .sum()
        .sort_values(ascending=False)
        .head(n)
    )
    st.table(top_airports)

    result_json = top_airports.to_json()
    st.json(result_json)

Overwriting top_arrival_airports_2013.py


In [None]:
cat 

In [15]:
%%writefile top_arrival_airports_2013.py
import streamlit as st
import pandas as pd

st.title('Top arrival airports in 2013')
st.subheader('This web service will allow you to get the top arrival airports in terms of passengers in 2013 on a JSON format')
st.markdown('Please insert the number of TOP airports you want to get. For instance, for the TOP 10 airports you will have to specify 10.')


def parse_top_n(raw_value):
    """Convert the text typed by the user into a positive integer.

    Args:
        raw_value: Text returned by the ``st.text_input`` widget.

    Returns:
        The parsed integer when it is greater than 0, otherwise ``None``
        (empty text, non-numeric text, zero or a negative number).
    """
    try:
        value = int(raw_value)
    except ValueError:
        return None
    return value if value > 0 else None


raw_n = st.text_input("Insert a number:")
n = parse_top_n(raw_n)

if n is None:
    # The text box starts out empty, so only report an error once the user
    # has actually typed something that is not a valid n.
    if raw_n.strip():
        st.error('Only whole numbers greater than 0 are allowed')
else:
    bookings_sample = pd.read_csv(
        'bookings.sample.csv.bz2',
        compression='bz2',
        sep='^',
        usecols=['year', 'arr_port', 'pax'],
    )

    bookings_sample_2013 = bookings_sample[bookings_sample['year'] == 2013]

    # Total passengers per arrival airport, largest first, limited to n rows.
    top_airports = (
        bookings_sample_2013.groupby('arr_port')['pax']
        .sum()
        .sort_values(ascending=False)
        .head(n)
    )

    result_json = top_airports.to_json()
    st.json(result_json)

Overwriting top_arrival_airports_2013.py


## Step 2: Let's do it now with the whole dataset using chunks

#### Pandas dataframe and json output

In [13]:
%%writefile top_arrival_airports_2013.py
import streamlit as st
import pandas as pd

st.title('Top arrival airports in 2013')
st.subheader('This web service will allow you to get the top arrival airports in terms of passengers in 2013 on a JSON format')
st.markdown('Please insert the number of TOP airports you want to get. For instance, for the TOP 10 airports you will have to specify 10.')


def parse_top_n(raw_value):
    """Convert the text typed by the user into a positive integer.

    Args:
        raw_value: Text returned by the ``st.text_input`` widget.

    Returns:
        The parsed integer when it is greater than 0, otherwise ``None``
        (empty text, non-numeric text, zero or a negative number).
    """
    try:
        value = int(raw_value)
    except ValueError:
        return None
    return value if value > 0 else None


raw_n = st.text_input("Insert a number:")
n = parse_top_n(raw_n)

if n is None:
    # The text box starts out empty, so only report an error once the user
    # has actually typed something that is not a valid n.
    if raw_n.strip():
        st.error('Only whole numbers greater than 0 are allowed')
else:
    chksize = 100000
    # Passing chunksize makes read_csv return an iterator of DataFrames, so
    # the file is processed piece by piece.
    reader = pd.read_csv(
        '/home/dsc/Data/challenge/bookings_without_duplicates.csv',
        sep='^',
        usecols=['year', 'arr_port', 'pax'],
        chunksize=chksize,
    )
    all_chunks = []

    for df in reader:
        df = df[df['year'] == 2013]
        # Partial passenger totals per arrival airport for this chunk only.
        all_chunks.append(df.groupby('arr_port')['pax'].sum())

    # The same airport can appear in several chunks: stack the partial totals
    # and add them up per airport (the index level holds 'arr_port').
    pax_per_airport_2013 = pd.concat(all_chunks)
    top_airports = (
        pax_per_airport_2013.groupby(level=0)
        .sum()
        .sort_values(ascending=False)
        .head(n)
    )
    st.table(top_airports)

    result_json = top_airports.to_json()
    st.json(result_json)

Overwriting top_arrival_airports_2013.py


#### Only json output

In [12]:
%%writefile top_arrival_airports_2013.py
import streamlit as st
import pandas as pd

st.title('Top arrival airports in 2013')
st.subheader('This web service will allow you to get the top arrival airports in terms of passengers in 2013 on a JSON format')
st.markdown('Please insert the number of TOP airports you want to get. For instance, for the TOP 10 airports you will have to specify 10.')


def parse_top_n(raw_value):
    """Convert the text typed by the user into a positive integer.

    Args:
        raw_value: Text returned by the ``st.text_input`` widget.

    Returns:
        The parsed integer when it is greater than 0, otherwise ``None``
        (empty text, non-numeric text, zero or a negative number).
    """
    try:
        value = int(raw_value)
    except ValueError:
        return None
    return value if value > 0 else None


raw_n = st.text_input("Insert a number:")
n = parse_top_n(raw_n)

if n is None:
    # The text box starts out empty, so only report an error once the user
    # has actually typed something that is not a valid n.
    if raw_n.strip():
        st.error('Only whole numbers greater than 0 are allowed')
else:
    chksize = 100000
    # Passing chunksize makes read_csv return an iterator of DataFrames, so
    # the file is processed piece by piece.
    reader = pd.read_csv(
        '/home/dsc/Data/challenge/bookings_without_duplicates.csv',
        sep='^',
        usecols=['year', 'arr_port', 'pax'],
        chunksize=chksize,
    )
    all_chunks = []

    for df in reader:
        df = df[df['year'] == 2013]
        # Partial passenger totals per arrival airport for this chunk only.
        all_chunks.append(df.groupby('arr_port')['pax'].sum())

    # The same airport can appear in several chunks: stack the partial totals
    # and add them up per airport (the index level holds 'arr_port').
    pax_per_airport_2013 = pd.concat(all_chunks)
    top_airports = (
        pax_per_airport_2013.groupby(level=0)
        .sum()
        .sort_values(ascending=False)
        .head(n)
    )

    result_json = top_airports.to_json()
    st.json(result_json)

Overwriting top_arrival_airports_2013.py


## Step 3: Let's do it again with our CSV file hosted online

In [31]:
import pandas as pd
chksize = 100000
# Passing chunksize makes read_csv return an iterator of DataFrames.
reader = pd.read_csv(
    '/home/dsc/Data/challenge/bookings_without_duplicates.csv',
    sep='^',
    usecols=['year', 'arr_port', 'pax'],
    chunksize=chksize,
)
# Collect the per-chunk results in a list and combine them once at the end.
# DataFrame.append was removed in pandas 2.0, copied the whole frame on every
# call, and turned each appended Series into a row (one column per airport).
all_chunks = []

for df in reader:
    df = df[df['year'] == 2013]
    # Partial passenger totals per arrival airport for this chunk only.
    all_chunks.append(df.groupby('arr_port')['pax'].sum())

# The same airport can appear in several chunks: add the partial totals up
# per airport (the index level holds 'arr_port').
pax_per_airport_2013 = pd.concat(all_chunks).groupby(level=0).sum()

EmptyDataError: No columns to parse from file

In [33]:
# read_csv needs a file, not a directory. Point it at the de-duplicated
# bookings export used by the neighbouring cells.
# NOTE(review): confirm this is the file that was meant to be loaded here.
bookings = pd.read_csv('/home/dsc/Data/challenge/bookings_without_duplicates.csv', sep='^')
bookings.head()


EmptyDataError: No columns to parse from file

In [3]:
import pandas as pd

In [4]:
# Number of rows per chunk, passed as `chunksize` to the chunked read_csv call.
chksize = 100000

In [5]:
%%time
# Read the compressed bookings file chunk by chunk and keep only the distinct
# rows seen so far.
reader = pd.read_csv(
    '/home/dsc/Data/challenge/bookings.csv.bz2',
    compression='bz2',
    sep='^',
    chunksize=chksize,
)
all_chunks = pd.DataFrame()
chunk_counter = 0

for chunk_counter, df in enumerate(reader, start=1):
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    # The initial empty frame is left out of the first concat.
    frames = [df] if all_chunks.empty else [all_chunks, df]
    # De-duplicate after every chunk, as before, so the accumulated frame
    # holds only distinct rows before the next chunk is added.
    all_chunks = pd.concat(frames).drop_duplicates()
    # Progress indicator: number of chunks processed so far.
    print(chunk_counter)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50




51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
CPU times: user 16min 38s, sys: 1min 16s, total: 17min 54s
Wall time: 15min 44s


In [6]:
# Dimensions (rows, columns) of the accumulated, de-duplicated DataFrame.
all_chunks.shape

(1016377, 38)

In [7]:
# Write the de-duplicated bookings to disk using '^' as the separator and
# without the DataFrame index.
all_chunks.to_csv(
    '/home/dsc/Data/challenge/bookings_without_duplicates.csv',
    index=False,
    sep='^',
)

In [9]:
# Read the exported file back and display its dimensions as a sanity check
# against the shape of the frame that was written.
bookings_test = pd.read_csv(
    '/home/dsc/Data/challenge/bookings_without_duplicates.csv',
    sep='^',
)
bookings_test.shape


(1016377, 38)

In [10]:
# Preview the first rows of the reloaded file.
bookings_test.head()

Unnamed: 0,act_date,source,pos_ctry,pos_iata,pos_oid,rloc,cre_date,duration,distance,dep_port,...,route,carrier,bkg_class,cab_class,brd_time,off_time,pax,year,month,oid
0,2013-03-05 00:00:00,1A,DE,a68dd7ae953c8acfb187a1af2dcbe123,1a11ae49fcbf545fd2afc1a24d88d2b7,ea65900e72d71f4626378e2ebd298267,2013-02-22 00:00:00,1708,0,ZRH,...,LHRZRH,VI,T,Y,2013-03-07 08:50:00,2013-03-07 11:33:37,-1.0,2013.0,3.0,
1,2013-03-26 00:00:00,1A,US,e612b9eeeee6f17f42d9b0d3b79e75ca,7437560d8f276d6d05eeb806d9e7edee,737295a86982c941f1c2da9a46a14043,2013-03-26 00:00:00,135270,0,SAL,...,SALATLCLT,NV,L,Y,2013-04-12 13:04:00,2013-04-12 22:05:40,1.0,2013.0,3.0,
2,2013-03-26 00:00:00,1A,US,e612b9eeeee6f17f42d9b0d3b79e75ca,7437560d8f276d6d05eeb806d9e7edee,737295a86982c941f1c2da9a46a14043,2013-03-26 00:00:00,135270,0,SAL,...,CLTATLSAL,NV,U,Y,2013-07-15 07:00:00,2013-07-15 11:34:51,1.0,2013.0,3.0,
3,2013-03-26 00:00:00,1A,AU,0f984b3bb6bd06661c95529bbd6193bc,36472c6dbaf7afec9136ac40364e2794,5ecf00fdcbcec761c43dc7285253d0c1,2013-03-26 00:00:00,30885,0,AKL,...,AKLHKGSVO,XK,G,Y,2013-04-24 23:59:00,2013-04-25 16:06:31,1.0,2013.0,3.0,SYDA82546
4,2013-03-26 00:00:00,1A,AU,0f984b3bb6bd06661c95529bbd6193bc,36472c6dbaf7afec9136ac40364e2794,5ecf00fdcbcec761c43dc7285253d0c1,2013-03-26 00:00:00,30885,0,AKL,...,SVOHKGAKL,XK,G,Y,2013-05-14 20:15:00,2013-05-16 10:44:50,1.0,2013.0,3.0,SYDA82546


In [21]:
%%writefile top_arrival_airports_2013.py
import streamlit as st
import pandas as pd

st.title('Top arrival airports in 2013')
st.subheader('This web service will allow you to get the top arrival airports in terms of passengers in 2013 on a JSON format')
st.markdown('Please insert the number of TOP airports you want to get. For instance, for the TOP 10 airports you will have to specify 10.')


def parse_top_n(raw_value):
    """Convert the text typed by the user into a positive integer.

    Args:
        raw_value: Text returned by the ``st.text_input`` widget.

    Returns:
        The parsed integer when it is greater than 0, otherwise ``None``
        (empty text, non-numeric text, zero or a negative number).
    """
    try:
        value = int(raw_value)
    except ValueError:
        return None
    return value if value > 0 else None


raw_n = st.text_input("Insert a number:")
n = parse_top_n(raw_n)

if n is None:
    # The text box starts out empty, so only report an error once the user
    # has actually typed something that is not a valid n.
    if raw_n.strip():
        st.error('Only whole numbers greater than 0 are allowed')
else:
    # The '/blob/' URL returns GitHub's HTML page for the file; '/raw/'
    # serves the actual bz2 bytes that read_csv can decompress.
    bookings_sample = pd.read_csv(
        'https://github.com/Laurajmoreno/DS_Challenge/raw/main/bookings.sample.csv.bz2',
        compression='bz2',
        sep='^',
        usecols=['year', 'arr_port', 'pax'],
    )

    bookings_sample_2013 = bookings_sample[bookings_sample['year'] == 2013]

    # Honour the requested n instead of a hard-coded 10, and show every
    # selected row in the table rather than only the first five.
    top_airports = (
        bookings_sample_2013.groupby('arr_port')['pax']
        .sum()
        .sort_values(ascending=False)
        .head(n)
    )
    st.table(top_airports)

    result_json = top_airports.to_json()
    st.json(result_json)

Overwriting top_arrival_airports_2013.py


In [None]:
%%writefile top_arrival_airports_2013.py
import streamlit as st
import pandas as pd

st.title('Top arrival airports in 2013')
st.subheader('This web service will allow you to get the top arrival airports in terms of passengers in 2013 on a JSON format')
st.markdown('Please insert the number of TOP airports you want to get. For instance, for the TOP 10 airports you will have to specify 10.')

# The value is collected but not used yet: this version only previews the data.
n = st.text_input("Insert a number:")

@st.cache
def get_csv():
    """Download the sample bookings file and return it as a DataFrame.

    Only the 'year', 'arr_port' and 'pax' columns are loaded. The result is
    cached by Streamlit so the download does not repeat on every rerun.
    """
    # The '/blob/' URL returns GitHub's HTML page for the file; '/raw/'
    # serves the actual bz2 bytes that read_csv can decompress.
    return pd.read_csv('https://github.com/Laurajmoreno/DS_Challenge/raw/main/bookings.sample.csv.bz2', compression='bz2', sep='^', usecols=['year','arr_port','pax'])
bookings_sample = get_csv()

# Keep the 2013 bookings and show the first rows as a quick visual check.
bookings_sample_2013 = bookings_sample[bookings_sample['year'] == 2013]
st.table(bookings_sample_2013.head())

In [8]:
%%writefile top_arrival_airports_2013.py
import streamlit as st
import pandas as pd

st.title('Top arrival airports in 2013')
st.subheader('This web service will allow you to get the top arrival airports in terms of passengers in 2013 on a JSON format')
st.markdown('Please insert the number of TOP airports you want to get. For instance, for the TOP 10 airports you will have to specify 10.')


def parse_top_n(raw_value):
    """Convert the text typed by the user into a positive integer.

    Args:
        raw_value: Text returned by the ``st.text_input`` widget.

    Returns:
        The parsed integer when it is greater than 0, otherwise ``None``
        (empty text, non-numeric text, zero or a negative number).
    """
    try:
        value = int(raw_value)
    except ValueError:
        return None
    return value if value > 0 else None


raw_n = st.text_input("Insert a number:")
n = parse_top_n(raw_n)

chksize = 100


@st.cache
def csv_():
    """Return the total 2013 passengers per arrival airport as a Series.

    The sample file is downloaded and aggregated chunk by chunk inside the
    cached function. Caching the chunk reader itself would not work, because
    it is a one-shot iterator that is exhausted after the first run.
    """
    # The '/blob/' URL returns GitHub's HTML page for the file; '/raw/'
    # serves the actual bz2 bytes that read_csv can decompress.
    reader = pd.read_csv(
        'https://github.com/Laurajmoreno/DS_Challenge/raw/main/bookings.sample.csv.bz2',
        compression='bz2',
        sep='^',
        usecols=['year', 'arr_port', 'pax'],
        chunksize=chksize,
    )
    all_chunks = []

    for df in reader:
        df = df[df['year'] == 2013]
        # Partial passenger totals per arrival airport for this chunk only.
        all_chunks.append(df.groupby('arr_port')['pax'].sum())

    # The same airport can appear in several chunks: add the partial totals
    # up per airport (the index level holds 'arr_port').
    return pd.concat(all_chunks).groupby(level=0).sum()


pax_per_airport_2013 = csv_()

if n is None:
    # The text box starts out empty, so only report an error once the user
    # has actually typed something that is not a valid n.
    if raw_n.strip():
        st.error('Only whole numbers greater than 0 are allowed')
else:
    # sort_values returns a new Series, so the cached object is not mutated.
    result = pax_per_airport_2013.sort_values(ascending=False).head(n)
    result_json = result.to_json()

    st.json(result_json)


Overwriting top_arrival_airports_2013.py


In [7]:
!ls

 bookings.sample.csv.bz2
'Exercise 1 - Counting the number of lines in a big file.ipynb'
'Exercise 2 - Top 10 arrival airports in 2013 .ipynb'
'Exercise 3 - Number of searches for Madrid, Barcelona and Malaga.ipynb'
'Exercise 4 - Searches with bookings match.ipynb'
'Exercise 5 - Write a Web Service.ipynb'
 README.md
 searches.sample.csv.bz2
 top_arrival_airports_2013.py
