# Ian's Data Discovery

## All the tables
Just to get all the tables first

In [1]:
import datetime as dt
import os

import numpy as np
import pandas as pd

import cadspy

In [2]:
# Open the cadspy connection to the 'ICW' system.
# The user ID can be overridden with the ICW_USER environment variable so the
# notebook can be run by someone else without editing this cell; when the
# variable is not set, the original ID is used, so existing behaviour is unchanged.
icw = cadspy.DatabaseConnection(system='ICW', user=os.environ.get('ICW_USER', 'u243696'))

In [3]:
# Display all columns and all rows whenever a DataFrame is rendered
# (e.g. with .head() or .tail()) instead of pandas' truncated default view.
# NOTE: with max_rows unlimited, evaluating a whole large frame renders every
# row, so keep previewing the big tables through .head() / .tail().
pd.options.display.max_columns = None
pd.options.display.max_rows = None

### Lounge Eligibility Data

In [4]:
# Query text selecting every column and row of the lounge-eligibility table.
# This query is written with "sel" whereas the other queries in this notebook
# spell out "select" (presumably a shorthand accepted by the database — confirm).
query = """

sel * from LDB_SBOX_OR.HACKATHON_OPS_LOUNGE_ELIGIBILITY

"""

# Run the query through the open `icw` connection and keep the result for the
# lounge-eligibility checks further down.
df_lounge_eligibility = icw.queryToDataframe(query)

In [5]:
# Preview the first five rows (five is the default for .head()).
df_lounge_eligibility.head()

Unnamed: 0,Skew_Id,OPERATING_AIRLINE_CD,OPERATING_FLT_NO,GMT_UPLIFT_DT,UPLIFT_STN_CD,DISCHARGE_STN_CD,BOOKED_CABIN_CD,TRAVEL_CABIN_CD,BA_PAX_TIER,ONEWORLD_TIER,Lounge_eligibility_tier,pax
0,2019-08-101414,BA,1414,2019-08-10,LHR,BHD,M,M,Gold,EMER,Tier 2,3
1,2023-05-17920,BA,920,2023-05-17,LHR,STR,C,C,,,Tier 3,13
2,2019-04-15770,BA,770,2019-04-15,LHR,OSL,M,M,,,Not eligible,75
3,2019-09-23348,BA,348,2019-09-23,LHR,NCE,M,M,,EMER,Tier 2,6
4,2019-09-16155,BA,155,2019-09-16,LHR,CAI,J,J,,,Tier 3,23


### Flight Info

In [6]:
# Query text selecting every column and row of the flight-info table.
query = """

select * from LDB_SBOX_OR.HACKATHON_OPS_FLIGHT_INFO

"""

# Run the query through the open `icw` connection; the result is the frame
# used by the flight-level checks and the de-duplication later on.
df_flight_info = icw.queryToDataframe(query)

In [7]:
# Preview the first five rows (five is the default for .head()).
df_flight_info.head()

Unnamed: 0,OPG_ALN_CD,OPG_FLT_NO,GMT_PLND_DEP_TS,GMT_ACT_DEP_TS,ACT_DEP_STN_CD,ACT_DEP_TML_CD,PLND_ARR_STN_CD,ACT_ARR_STN_CD,IATA_AC_TYP_CD,ACT_AC_TYP_CD,ROUTE,COUNTRY_CD,COUNTRY_NM,CORP_GEOG_CTRY_GRP_NM,CORP_GEOG_CONTINENT_NM
0,BA,548,2019-06-03 07:30:00,2019-06-03 07:27:00,LHR,5,FCO,FCO,321,V6,LHRFCO,IT,Italy,MEDITERRANEAN,EUROPE EXC UK
1,BA,490,2019-06-12 10:35:00,2019-06-12 10:31:00,LHR,3,GIB,GIB,320,A3,LHRGIB,GI,Gibraltar,MEDITERRANEAN,EUROPE EXC UK
2,BA,386,2019-07-19 17:35:00,2019-07-19 18:14:00,LHR,5,NTE,NTE,321,V6,LHRNTE,FR,France,WEST EUROPE,EUROPE EXC UK
3,BA,582,2019-05-22 19:35:00,2019-05-22 19:32:00,LHR,5,MXP,VRN,32A,H3,LHRMXP,IT,Italy,MEDITERRANEAN,EUROPE EXC UK
4,BA,275,2019-05-18 15:40:00,2019-05-18 15:38:00,LHR,3,LAS,LAS,744,V4,LHRLAS,US,United States,USA,NORTH AMERICA


### Station Code Decode

In [8]:
# Query text selecting every column and row of the country-decode table.
query = """

select * from LDB_SBOX_OR.HACKATHON_OPS_COUNTRY_DECODE

"""

# Run the query through the open `icw` connection. The result is looked up by
# its ROUTE column later in the notebook.
df_country = icw.queryToDataframe(query)

In [9]:
# Preview the first five rows (five is the default for .head()).
df_country.head()

Unnamed: 0,ROUTE,COUNTRY_CD,COUNTRY_NM,CORP_GEOG_CTRY_GRP_NM,CORP_GEOG_CONTINENT_NM
0,LHRINV,GB,United Kingdom and Northern Ireland,UK,UK
1,LHRSVO,RU,Russia in Europe,EASTERN EUROPE,EUROPE EXC UK
2,LHREZE,AR,Argentina,SOUTH AMERICA,SOUTH AMERICA INC CARIBBEAN
3,LHRLUX,LU,Luxembourg,BENELUX,EUROPE EXC UK
4,LHRCAI,EG,Egypt,NORTH AFRICA,AFRICA


### Aircraft Type

In [10]:
# Query text selecting every column and row of the aircraft-type table.
query = """

select * from LDB_SBOX_OR.HACKATHON_OPS_AC_TYPE

"""

# Run the query through the open `icw` connection and keep the result as the
# aircraft-type reference frame.
df_acft_typ = icw.queryToDataframe(query)

In [11]:
# Displayed in full rather than through .head(): the frame rendered as only
# ten rows when this notebook was last run.
df_acft_typ

Unnamed: 0,IATA_AC_TYP_CD,ACT_AC_TYP_CD,WB_NB_CAT,FIRST_SEATS_QTY,CLUB_SEATS_QTY,PREM_ECONOMY_SEATS_QTY,ECONOMY_SEATS_QTY
0,320,A3,NB,0,24,0,132
1,777,K7,WB,0,48,24,203
2,32N,N3,NB,0,20,0,150
3,321,M6,NB,0,23,0,131
4,32Q,N6,NB,0,32,0,172
5,789,L8,WB,8,42,39,127
6,32A,H3,NB,0,20,0,150
7,319,A4,NB,0,16,0,119
8,788,B8,WB,0,35,25,154
9,77W,G7,WB,14,56,44,183


# Checks
Basically just to check duplicated data

## 1. Lounge Eligibility
Normally each Skew ID should have only one destination

- We first group the lounge eligibility data by Skew ID, flight number and discharge station code, and sum the passenger count for each group

In [12]:
# Total passengers for every (Skew ID, flight number, discharge station)
# combination. as_index=False keeps the three keys as ordinary columns.
df_lounge_by_skew = (
    df_lounge_eligibility
    .groupby(
        by=['Skew_Id', 'OPERATING_FLT_NO', 'DISCHARGE_STN_CD'],
        as_index=False,
    )
    .agg(pax_count=('pax', 'sum'))
)

df_lounge_by_skew.head()

Unnamed: 0,Skew_Id,OPERATING_FLT_NO,DISCHARGE_STN_CD,pax_count
0,2019-03-31103,103,YYC,197
1,2019-03-31105,105,DXB,209
2,2019-03-31107,107,DXB,201
3,2019-03-31109,109,DXB,217
4,2019-03-3111,11,SIN,443


- We then find all duplicated Skew IDs and sort them by Skew ID for readability

In [13]:
# keep=False marks every row whose Skew_Id appears more than once,
# not only the second and later occurrences.
duplicated_flag = df_lounge_by_skew.duplicated(subset='Skew_Id', keep=False)

# Keep the flagged rows and order them so rows sharing a Skew_Id sit together.
df_lounge_by_skew_duplicated = (
    df_lounge_by_skew
    .loc[duplicated_flag]
    .sort_values(by=['Skew_Id', 'OPERATING_FLT_NO'])
)

df_lounge_by_skew_duplicated.head(n=20)

Unnamed: 0,Skew_Id,OPERATING_FLT_NO,DISCHARGE_STN_CD,pax_count
38,2019-03-311448,1448,ABZ,1
39,2019-03-311448,1448,EDI,139
55,2019-03-3115,15,SIN,165
56,2019-03-3115,15,SYD,114
165,2019-03-31462,462,MAD,173
166,2019-03-31462,462,PHX,15
348,2019-04-011314,1314,ABZ,119
349,2019-04-011314,1314,BHD,1
395,2019-04-0115,15,SIN,97
396,2019-04-0115,15,SYD,114


In [14]:
# Last ten rows of the duplicated-Skew-ID listing.
df_lounge_by_skew_duplicated.iloc[-10:]

Unnamed: 0,Skew_Id,OPERATING_FLT_NO,DISCHARGE_STN_CD,pax_count
133890,2023-10-12253,253,NAS,87
133891,2023-10-12253,253,PLS,72
133928,2023-10-12362,362,GLA,13
133929,2023-10-12362,362,LYS,88
133940,2023-10-12404,404,BRU,10
133941,2023-10-12404,404,WAW,5
133989,2023-10-12554,554,EDI,1
133990,2023-10-12554,554,FCO,61
134009,2023-10-12606,606,GLA,2
134010,2023-10-12606,606,PSA,83


- Check Flight 1448 on 2019-3-31

In [15]:
# Flight-info rows for flight 1448 whose planned departure timestamp
# falls on 2019-03-31.
df_flight_info.loc[
    df_flight_info['OPG_FLT_NO'].eq(1448)
    & pd.to_datetime(df_flight_info['GMT_PLND_DEP_TS']).dt.date.eq(dt.date(2019, 3, 31))
]

Unnamed: 0,OPG_ALN_CD,OPG_FLT_NO,GMT_PLND_DEP_TS,GMT_ACT_DEP_TS,ACT_DEP_STN_CD,ACT_DEP_TML_CD,PLND_ARR_STN_CD,ACT_ARR_STN_CD,IATA_AC_TYP_CD,ACT_AC_TYP_CD,ROUTE,COUNTRY_CD,COUNTRY_NM,CORP_GEOG_CTRY_GRP_NM,CORP_GEOG_CONTINENT_NM
101260,BA,1448,2019-03-31 14:20:00,2019-03-31 14:24:00,LHR,5,EDI,EDI,320,A3,LHREDI,GB,United Kingdom and Northern Ireland,UK,UK


Has this flight ever flown to Aberdeen?

- Check Flight 15 on 2019-3-31

In [16]:
# Flight-info rows for flight 15 whose planned departure timestamp
# falls on 2019-03-31.
df_flight_info.loc[
    df_flight_info['OPG_FLT_NO'].eq(15)
    & pd.to_datetime(df_flight_info['GMT_PLND_DEP_TS']).dt.date.eq(dt.date(2019, 3, 31))
]

Unnamed: 0,OPG_ALN_CD,OPG_FLT_NO,GMT_PLND_DEP_TS,GMT_ACT_DEP_TS,ACT_DEP_STN_CD,ACT_DEP_TML_CD,PLND_ARR_STN_CD,ACT_ARR_STN_CD,IATA_AC_TYP_CD,ACT_AC_TYP_CD,ROUTE,COUNTRY_CD,COUNTRY_NM,CORP_GEOG_CTRY_GRP_NM,CORP_GEOG_CONTINENT_NM
80189,BA,15,2019-03-31 20:30:00,2019-03-31 20:36:00,LHR,5,SIN,SIN,77W,G7,LHRSIN,SG,Singapore,FAR EAST JSA HUBS,AUSTRALASIA PACIFIC


No record to Sydney? Let's check for any flight to Sydney

In [17]:
# Any flight whose planned or actual arrival station is SYD.
df_flight_info.loc[
    df_flight_info[['PLND_ARR_STN_CD', 'ACT_ARR_STN_CD']].eq("SYD").any(axis=1)
]

Unnamed: 0,OPG_ALN_CD,OPG_FLT_NO,GMT_PLND_DEP_TS,GMT_ACT_DEP_TS,ACT_DEP_STN_CD,ACT_DEP_TML_CD,PLND_ARR_STN_CD,ACT_ARR_STN_CD,IATA_AC_TYP_CD,ACT_AC_TYP_CD,ROUTE,COUNTRY_CD,COUNTRY_NM,CORP_GEOG_CTRY_GRP_NM,CORP_GEOG_CONTINENT_NM


No record to Sydney. Is it because the Sydney leg is already included in the LHR-SIN route?

## 2. Flight Info

Check if there are any duplicated flights

In [18]:
# Flag every row that shares its (flight number, planned departure) pair
# with at least one other row; keep=False flags all members of the group.
duplicated_flag = df_flight_info.duplicated(
    subset=['OPG_FLT_NO', 'GMT_PLND_DEP_TS'],
    keep=False,
)

# Order the flagged rows so the repeats sit next to each other, then keep only
# the columns needed to compare them.
df_flight_info_duplicated = (
    df_flight_info
    .loc[duplicated_flag]
    .sort_values(by=['OPG_FLT_NO', 'GMT_PLND_DEP_TS', 'GMT_ACT_DEP_TS'])
    .loc[:, ['OPG_FLT_NO', 'GMT_PLND_DEP_TS', 'GMT_ACT_DEP_TS',
             'ROUTE', 'PLND_ARR_STN_CD', 'ACT_ARR_STN_CD']]
)

df_flight_info_duplicated.head(n=10)

Unnamed: 0,OPG_FLT_NO,GMT_PLND_DEP_TS,GMT_ACT_DEP_TS,ROUTE,PLND_ARR_STN_CD,ACT_ARR_STN_CD
6366,31,2023-09-06 17:00:00,2023-09-06 17:29:00,LHRLHR,HKG,LHR
63963,31,2023-09-06 17:00:00,2023-09-07 16:59:00,LHRHKG,HKG,HKG
5050,55,2023-08-06 18:05:00,2023-08-06 18:21:00,LHRLHR,JNB,LHR
70885,55,2023-08-06 18:05:00,2023-08-07 22:17:00,LHRJNB,JNB,JNB
4604,65,2023-04-15 09:10:00,2023-04-15 09:13:00,LHRLHR,NBO,LHR
41149,65,2023-04-15 09:10:00,2023-04-15 20:55:00,LHRNBO,NBO,NBO
1285,119,2019-07-03 13:15:00,2019-07-03 14:01:00,LHRBLR,BLR,LHR
3349,119,2019-07-03 13:15:00,2019-07-03 20:28:00,LHRBLR,BLR,BLR
3705,207,2023-06-22 09:25:00,2023-06-22 11:11:00,LHRLHR,MIA,LHR
65148,207,2023-06-22 09:25:00,2023-06-23 14:15:00,LHRMIA,MIA,MIA


It seems like the flights are just being rescheduled.

Let's also check if the LHRLHR route actually exists

In [19]:
# Look for an LHRLHR entry in the country-decode table.
df_country.loc[df_country['ROUTE'].eq('LHRLHR')]

Unnamed: 0,ROUTE,COUNTRY_CD,COUNTRY_NM,CORP_GEOG_CTRY_GRP_NM,CORP_GEOG_CONTINENT_NM


Of course not — why would it exist? The BA 31 case is due to Typhoon Hanna (Haikui 海葵).

## Data Cleaning

# Flight info
- First we sort all flights by planned departure time, flight number and actual departure time

In [20]:
# Order by planned departure, then flight number, then actual departure, so
# that within one (planned departure, flight number) pair the row with the
# earliest actual departure comes first.
df_flight_info = df_flight_info.sort_values(
    by=['GMT_PLND_DEP_TS', 'OPG_FLT_NO', 'GMT_ACT_DEP_TS'],
)

df_flight_info.head(n=10)

Unnamed: 0,OPG_ALN_CD,OPG_FLT_NO,GMT_PLND_DEP_TS,GMT_ACT_DEP_TS,ACT_DEP_STN_CD,ACT_DEP_TML_CD,PLND_ARR_STN_CD,ACT_ARR_STN_CD,IATA_AC_TYP_CD,ACT_AC_TYP_CD,ROUTE,COUNTRY_CD,COUNTRY_NM,CORP_GEOG_CTRY_GRP_NM,CORP_GEOG_CONTINENT_NM
50194,BA,472,2019-03-31 05:15:00,2019-03-31 05:08:00,LHR,3,BCN,BCN,320,A3,LHRBCN,ES,Spain,WEST EUROPE,EUROPE EXC UK
74947,BA,456,2019-03-31 05:20:00,2019-03-31 05:16:00,LHR,5,MAD,MAD,32Q,N6,LHRMAD,ES,Spain,WEST EUROPE,EUROPE EXC UK
22731,BA,360,2019-03-31 05:35:00,2019-03-31 05:36:00,LHR,3,LYS,LYS,320,A3,LHRLYS,FR,France,WEST EUROPE,EUROPE EXC UK
105274,BA,428,2019-03-31 05:40:00,2019-03-31 05:40:00,LHR,5,AMS,AMS,32A,H3,LHRAMS,NL,Netherlands,BENELUX,EUROPE EXC UK
49888,BA,638,2019-03-31 05:45:00,2019-03-31 06:41:00,LHR,5,ATH,ATH,321,V6,LHRATH,GR,Greece,MEDITERRANEAN,EUROPE EXC UK
100028,BA,724,2019-03-31 05:45:00,2019-03-31 05:44:00,LHR,5,GVA,GVA,319,A4,LHRGVA,CH,Switzerland,WEST EUROPE,EUROPE EXC UK
9711,BA,948,2019-03-31 05:45:00,2019-03-31 05:48:00,LHR,5,MUC,MUC,321,V6,LHRMUC,DE,Germany,WEST EUROPE,EUROPE EXC UK
48578,BA,538,2019-03-31 05:50:00,2019-03-31 05:47:00,LHR,5,BLQ,BLQ,321,V6,LHRBLQ,IT,Italy,MEDITERRANEAN,EUROPE EXC UK
121504,BA,812,2019-03-31 05:55:00,2019-03-31 05:51:00,LHR,5,CPH,CPH,32N,N3,LHRCPH,DK,Denmark,SCANDINAVIA,EUROPE EXC UK
82165,BA,552,2019-03-31 06:00:00,2019-03-31 05:56:00,LHR,5,FCO,FCO,320,A3,LHRFCO,IT,Italy,MEDITERRANEAN,EUROPE EXC UK


We would like to keep the first occurrence of each duplicate

In [21]:
# De-duplicate flights on (flight number, planned departure), keeping the row
# with the earliest actual departure in each group.
#
# keep="first" depends entirely on row order, so the sort is applied here as
# part of the same chain. The result no longer relies on df_flight_info having
# been sorted by an earlier cell, and is unchanged if it already was.
#
# NOTE(review): in the duplicate pairs listed earlier, the earlier actual
# departure is the one with ACT_ARR_STN_CD == 'LHR' (often ROUTE 'LHRLHR'),
# so that is the record retained here — confirm this is the intended one.
df_flight_info_unique = (
    df_flight_info
    .sort_values(['GMT_PLND_DEP_TS', 'OPG_FLT_NO', 'GMT_ACT_DEP_TS'])
    .drop_duplicates(subset=['OPG_FLT_NO', 'GMT_PLND_DEP_TS'], keep="first")
)
df_flight_info_unique.head(10)

Unnamed: 0,OPG_ALN_CD,OPG_FLT_NO,GMT_PLND_DEP_TS,GMT_ACT_DEP_TS,ACT_DEP_STN_CD,ACT_DEP_TML_CD,PLND_ARR_STN_CD,ACT_ARR_STN_CD,IATA_AC_TYP_CD,ACT_AC_TYP_CD,ROUTE,COUNTRY_CD,COUNTRY_NM,CORP_GEOG_CTRY_GRP_NM,CORP_GEOG_CONTINENT_NM
50194,BA,472,2019-03-31 05:15:00,2019-03-31 05:08:00,LHR,3,BCN,BCN,320,A3,LHRBCN,ES,Spain,WEST EUROPE,EUROPE EXC UK
74947,BA,456,2019-03-31 05:20:00,2019-03-31 05:16:00,LHR,5,MAD,MAD,32Q,N6,LHRMAD,ES,Spain,WEST EUROPE,EUROPE EXC UK
22731,BA,360,2019-03-31 05:35:00,2019-03-31 05:36:00,LHR,3,LYS,LYS,320,A3,LHRLYS,FR,France,WEST EUROPE,EUROPE EXC UK
105274,BA,428,2019-03-31 05:40:00,2019-03-31 05:40:00,LHR,5,AMS,AMS,32A,H3,LHRAMS,NL,Netherlands,BENELUX,EUROPE EXC UK
49888,BA,638,2019-03-31 05:45:00,2019-03-31 06:41:00,LHR,5,ATH,ATH,321,V6,LHRATH,GR,Greece,MEDITERRANEAN,EUROPE EXC UK
100028,BA,724,2019-03-31 05:45:00,2019-03-31 05:44:00,LHR,5,GVA,GVA,319,A4,LHRGVA,CH,Switzerland,WEST EUROPE,EUROPE EXC UK
9711,BA,948,2019-03-31 05:45:00,2019-03-31 05:48:00,LHR,5,MUC,MUC,321,V6,LHRMUC,DE,Germany,WEST EUROPE,EUROPE EXC UK
48578,BA,538,2019-03-31 05:50:00,2019-03-31 05:47:00,LHR,5,BLQ,BLQ,321,V6,LHRBLQ,IT,Italy,MEDITERRANEAN,EUROPE EXC UK
121504,BA,812,2019-03-31 05:55:00,2019-03-31 05:51:00,LHR,5,CPH,CPH,32N,N3,LHRCPH,DK,Denmark,SCANDINAVIA,EUROPE EXC UK
82165,BA,552,2019-03-31 06:00:00,2019-03-31 05:56:00,LHR,5,FCO,FCO,320,A3,LHRFCO,IT,Italy,MEDITERRANEAN,EUROPE EXC UK


Check that no duplicates remain

In [22]:
# Re-run the duplicate check on the de-duplicated frame; an empty result means
# no (flight number, planned departure) pair occurs more than once.
duplicated_flag = df_flight_info_unique.duplicated(
    subset=['OPG_FLT_NO', 'GMT_PLND_DEP_TS'],
    keep=False,
)

df_flight_info_duplicated = (
    df_flight_info_unique
    .loc[duplicated_flag]
    .sort_values(by=['OPG_FLT_NO', 'GMT_PLND_DEP_TS', 'GMT_ACT_DEP_TS'])
    .loc[:, ['OPG_FLT_NO', 'GMT_PLND_DEP_TS', 'GMT_ACT_DEP_TS',
             'ROUTE', 'PLND_ARR_STN_CD', 'ACT_ARR_STN_CD']]
)

df_flight_info_duplicated.head(n=10)

Unnamed: 0,OPG_FLT_NO,GMT_PLND_DEP_TS,GMT_ACT_DEP_TS,ROUTE,PLND_ARR_STN_CD,ACT_ARR_STN_CD


# Lounge Eligibility