## Connect to ICW:

In [1]:
import cadspy

In [2]:
# Ask for the ICW user number; input() already returns a string.
username = input('User number')
# Open the ICW connection that the queries below run against.
icw = cadspy.DatabaseConnection(user=username, system='ICW')

User number u243700
Enter Password:  ··············


<br>
<div class="alert alert-" style = "border-radius:10px;border-width:3px;border-color:salmon;font-family:Verdana,sans-serif;font-size:16px;">

<font size="4">Can't connect to ICW? Instructions on how to get access to it in the links below:

</font>
</div>

- Step 1: [Requesting Access to GitHub](https://baplc.sharepoint.com/sites/ask/SitePages/Requesting-Access-to-GitHub.aspx)

Once your access to GitHub has been approved, you need to:

- Step 2: [Request access to British-Ent GitHub organisation](https://github.com/BritishAirways-Ent/insight-processes/blob/main/onboarding/Corporate_Directory_git.md)

And

- Step 3: [Setup Sagemaker Studio to access GitHub](https://github.com/BritishAirways-Ent/insight-processes/blob/main/onboarding/sagemaker_to_github_setup.md)

<br>

Below are some packages to get you started. You don't have to use them but you may find them useful!

In [3]:
import numpy as np
import pandas as pd
import datetime as dt

In [4]:
# Display options: show every column and every row whenever a DataFrame is rendered
# (for example with 'dataframe'.head() or 'dataframe'.tail()).
# NOTE(review): max_rows=None removes the row cap entirely, so rendering a large frame
# without .head()/.tail() will output all of its rows - consider a finite cap.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

<br>

### Data

#### S19/S23 Lounge Eligibility data

In [5]:
# Load the whole lounge eligibility sandbox table into a DataFrame through the ICW connection.
# NOTE(review): `sel` is presumably the database shorthand for `select` (the other
# queries in this notebook spell it out) - confirm.
query = """

sel * from LDB_SBOX_OR.HACKATHON_OPS_LOUNGE_ELIGIBILITY

"""

df_lounge_eligibility = icw.queryToDataframe(query)

In [6]:
df_lounge_eligibility.head(2)

Unnamed: 0,Skew_Id,OPERATING_AIRLINE_CD,OPERATING_FLT_NO,GMT_UPLIFT_DT,UPLIFT_STN_CD,DISCHARGE_STN_CD,BOOKED_CABIN_CD,TRAVEL_CABIN_CD,BA_PAX_TIER,ONEWORLD_TIER,Lounge_eligibility_tier,pax
0,2023-08-05618,BA,618,2023-08-05,LHR,OLB,C,C,Gold For Life,EMER,Tier 2,2
1,2023-05-17920,BA,920,2023-05-17,LHR,STR,C,C,,,Tier 3,13


In [7]:
df_lounge_eligibility.shape

(1534750, 12)

In [8]:
# a look to a particular flight number and date

#mask = (df_lounge_eligibility['DISCHARGE_STN_CD'] == 'GCM   ') & (df_lounge_eligibility['GMT_UPLIFT_DT'] == dt.date(2019,9,12) )

#df_lounge_eligibility[mask]

#### S19/S23 Flight info

In [9]:
query = """

select * from LDB_SBOX_OR.HACKATHON_OPS_FLIGHT_INFO

"""

df_flight_info = icw.queryToDataframe(query)

In [10]:
# Only the last expression of a cell is rendered automatically, so the preview of the
# first rows has to be displayed explicitly; otherwise it is silently discarded.
display(df_flight_info.head(2))
df_flight_info.dtypes

GMT_PLND_DEP_TS    datetime64[ns]
GMT_ACT_DEP_TS     datetime64[ns]
OPG_ALN_CD                 object
OPG_FLT_NO                  int64
ACT_DEP_STN_CD             object
ACT_DEP_TML_CD             object
PLND_ARR_STN_CD            object
ACT_ARR_STN_CD             object
IATA_AC_TYP_CD             object
ACT_AC_TYP_CD              object
ROUTE                      object
dtype: object

#### Station Code Decode

In [11]:
query = """

select * from LDB_SBOX_OR.HACKATHON_OPS_COUNTRY_DECODE

"""

df_country = icw.queryToDataframe(query)

In [12]:
df_country.head(2)

Unnamed: 0,ROUTE,COUNTRY_CD,COUNTRY_NM,CORP_GEOG_CTRY_GRP_NM,CORP_GEOG_CONTINENT_NM
0,LHRINV,GB,United Kingdom and Northern Ireland,UK,UK
1,LHRSVO,RU,Russia in Europe,EASTERN EUROPE,EUROPE EXC UK


#### Additional Station Decodes from ICW reference table

In [13]:
# There are lots of missing destinations which are in lounge_elig but not in df_country.
# They are all new (since ~2019) arrivals to Heathrow.
# They are not in sandbox country dataset, but they are in a reference table on ICW.
# Load that ICW reference table and filter for results not in our merged table, but that are in lounge_elig.
# This should result in zero nan values.
query = """
SELECT STN_CD, COUNTRY_CD, COUNTRY_NM, CORP_GEOG_CTRY_GRP_NM, CORP_GEOG_CONTINENT_NM
FROM REF_GEOG_LOC_HIERARCHY
"""
df_additional_country_decodes=icw.queryToDataframe(query)
df_additional_country_decodes.shape # we will remove destinations already in df_country and not in df_lounge_elig later.

(10975, 5)

In [14]:
df_additional_country_decodes.head(5)

Unnamed: 0,STN_CD,COUNTRY_CD,COUNTRY_NM,CORP_GEOG_CTRY_GRP_NM,CORP_GEOG_CONTINENT_NM
0,RFS,NI,Nicaragua,CENTRAL AMERICA,SOUTH AMERICA INC CARIBBEAN
1,COO,BJ,Benin,WEST AFRICA,AFRICA
2,SFY,US,United States,USA,NORTH AMERICA
3,HGO,CI,Cote d'Ivoire,WEST AFRICA,AFRICA
4,RKA,PF,French Polynesia,PACIFIC ISLANDS,AUSTRALASIA PACIFIC


#### Aircraft Type

In [15]:
query = """

select * from LDB_SBOX_OR.HACKATHON_OPS_AC_TYPE

"""

df_acft_typ = icw.queryToDataframe(query)

In [16]:
df_acft_typ.head(2)

Unnamed: 0,IATA_AC_TYP_CD,ACT_AC_TYP_CD,WB_NB_CAT,FIRST_SEATS_QTY,CLUB_SEATS_QTY,PREM_ECONOMY_SEATS_QTY,ECONOMY_SEATS_QTY
0,320,A3,NB,0,24,0,132
1,332,W9,WB,0,30,21,235


In [17]:
df_acft_typ.shape

(35, 7)

In [18]:
df_acft_typ.head(5)

Unnamed: 0,IATA_AC_TYP_CD,ACT_AC_TYP_CD,WB_NB_CAT,FIRST_SEATS_QTY,CLUB_SEATS_QTY,PREM_ECONOMY_SEATS_QTY,ECONOMY_SEATS_QTY
0,320,A3,NB,0,24,0,132
1,332,W9,WB,0,30,21,235
2,339,W9,WB,0,30,21,234
3,777,K7,WB,0,48,24,203
4,781,X8,WB,7,49,35,165


<br>

### Pre-processing

*Hint:* It is always worth checking the format of each of the columns in your dataframes before trying to do any work with them. To do so, you can make use of the `headers_and_first_row` function below.


In [19]:
def headers_and_first_row(df):
    '''
    Return a dictionary mapping every column name of ``df`` to the value stored in
    its first row, as a quick way to inspect the format of each column.

    The first row is taken by position (``.iloc[0]``), not by the index label 0, so
    the result is correct after sorting, filtering or concatenating, when label 0
    may be missing, duplicated or no longer the first row.

    For an empty dataframe every column maps to ``None``.
    '''
    if len(df) == 0:
        # no first row to read: still report the headers
        return {col: None for col in df.columns}

    return {col: df[col].iloc[0] for col in df.columns}

In [20]:
# applying headers_and_first_row to df_lounge_eligibility
format_df = headers_and_first_row(df_lounge_eligibility)
format_2 = headers_and_first_row(df_flight_info)

In [21]:
# Note that some columns have blank spaces!
# Only the last expression of a cell is rendered automatically, so the first
# dictionary is displayed explicitly instead of being silently discarded.
display(format_df)
format_2

{'GMT_PLND_DEP_TS': Timestamp('2023-04-30 20:25:00'),
 'GMT_ACT_DEP_TS': Timestamp('2023-04-30 20:34:00'),
 'OPG_ALN_CD': 'BA    ',
 'OPG_FLT_NO': 57,
 'ACT_DEP_STN_CD': 'LHR   ',
 'ACT_DEP_TML_CD': '5 ',
 'PLND_ARR_STN_CD': 'JNB   ',
 'ACT_ARR_STN_CD': 'JNB   ',
 'IATA_AC_TYP_CD': '388   ',
 'ACT_AC_TYP_CD': 'A8  ',
 'ROUTE': 'LHRJNB      '}

<br>
<div class="alert alert-" style = "border-radius:10px;border-width:3px;border-color:lightblue;font-family:Verdana,sans-serif;font-size:16px;">

<font size="3">**Exercise 1:** Pre-process **all** the tables above (df_lounge_eligibility, df_flight_info, df_country, df_acft_typ).

</font>

</div>

We will first strip all frames of spaces. We will then go through each frame to properly prepare it.

We will then load some additional datasets that will be useful for analysis.

In [22]:
# Stripping strings

tables = [df_lounge_eligibility,df_flight_info,df_country,df_acft_typ,df_additional_country_decodes]

# Define a function that finds all string fields and removes leading/trailing blank spaces
def data_cleaning_string(df):
    '''
    Strip leading and trailing whitespace from every string value of ``df``.

    The dataframe is modified in place and is also returned.

    Text columns are detected from the column itself rather than from the type of
    the value stored under index label 0. A column whose first value is missing is
    therefore still cleaned, and the index does not need to contain the label 0.
    Non-string values (None, NaN, dates, numbers) are left untouched.
    '''
    def _strip_if_str(value):
        # strip real strings only; anything else passes through unchanged
        return value.strip() if isinstance(value, str) else value

    for f in df.columns:
        col = df[f]
        if col.dtype == object:
            # object columns may mix strings with other values: only rewrite the
            # column when it actually holds at least one string
            if col.map(lambda v: isinstance(v, str)).any():
                df[f] = col.map(_strip_if_str)
        elif hasattr(col, 'str'):
            # non-object columns that expose the .str accessor hold text only
            # (accessing .str raises AttributeError otherwise, so hasattr is False)
            df[f] = col.str.strip()
    return df

# Clean every table; each slot of `tables` is replaced by the frame returned for it.
tables[:] = [data_cleaning_string(table) for table in tables]

In [23]:
# Removing duplicates of flight_info

# Order the rows by planned departure, flight number and actual departure, then keep
# only the first row of every (flight number, planned departure timestamp) pair.
df_flight_info = (
    df_flight_info
    .sort_values(by=['GMT_PLND_DEP_TS','OPG_FLT_NO','GMT_ACT_DEP_TS'])
    .drop_duplicates(subset=['OPG_FLT_NO','GMT_PLND_DEP_TS'], keep='first')
)


In [24]:
# 2. Flight info
# Split the planned departure timestamp into separate date and time columns.
# GMT_PLND_DEP_DT is one of the keys used when joining flight info to the lounge eligibility data.
df_flight_info['GMT_PLND_DEP_DT'] = df_flight_info['GMT_PLND_DEP_TS'].dt.date
df_flight_info['GMT_PLND_DEP_TIME'] = df_flight_info['GMT_PLND_DEP_TS'].dt.time

In [25]:
# 3. Country
df_country['DEP_STN_CD'] = df_country['ROUTE'].str.slice(0,3)
df_country['ARR_STN_CD'] = df_country['ROUTE'].str.slice(3,6)


In [26]:
# 4. Aircraft type
df_acft_typ.sort_values('FIRST_SEATS_QTY').tail(10)

Unnamed: 0,IATA_AC_TYP_CD,ACT_AC_TYP_CD,WB_NB_CAT,FIRST_SEATS_QTY,CLUB_SEATS_QTY,PREM_ECONOMY_SEATS_QTY,ECONOMY_SEATS_QTY
8,789,L8,WB,8,42,39,127
21,77W,F7,WB,8,76,40,130
34,777,R7,WB,10,52,40,134
29,777,W7,WB,12,48,32,127
12,77W,G7,WB,14,56,44,183
13,744,S4,WB,14,86,30,145
22,744,V4,WB,14,52,36,235
15,388,A8,WB,14,97,55,303
27,777,V7,WB,14,48,40,124
24,777,P7,WB,17,48,24,127


In [27]:
df_lounge_eligibility.shape

(1534750, 12)

<br>
<div class="alert alert-" style = "border-radius:10px;border-width:3px;border-color:lightblue;font-family:Verdana,sans-serif;font-size:16px;">

<font size="3">**Exercise 2:** Join the tables below
   
    - df_flight_info
    - df_country
    - df_acft_typ
    
to the table df_lounge_eligibility to generate a final dataset.

</font>

</div>

In [28]:
# df_lounge_eligibility.head(5)

In [29]:
# Left-join the flight information onto the lounge eligibility rows, matching on
# airline, flight number and departure date. Rows without a matching flight keep
# NaN/NaT in the flight columns (hence OPG_FLT_NO is shown as a float, e.g. 618.0).
df_lounge_elig_flight_info = pd.merge(df_lounge_eligibility,# left table
                                     df_flight_info, # right table
                                     left_on = ['OPERATING_AIRLINE_CD','OPERATING_FLT_NO','GMT_UPLIFT_DT'], # lounge eligibility keys: airline, flight number, uplift date
                                     right_on = ['OPG_ALN_CD','OPG_FLT_NO','GMT_PLND_DEP_DT'] , # flight info keys: airline, flight number, planned departure date
                                     how = "left" # keep every lounge eligibility row
                                     )

df_lounge_elig_flight_info.head(5)

Unnamed: 0,Skew_Id,OPERATING_AIRLINE_CD,OPERATING_FLT_NO,GMT_UPLIFT_DT,UPLIFT_STN_CD,DISCHARGE_STN_CD,BOOKED_CABIN_CD,TRAVEL_CABIN_CD,BA_PAX_TIER,ONEWORLD_TIER,Lounge_eligibility_tier,pax,GMT_PLND_DEP_TS,GMT_ACT_DEP_TS,OPG_ALN_CD,OPG_FLT_NO,ACT_DEP_STN_CD,ACT_DEP_TML_CD,PLND_ARR_STN_CD,ACT_ARR_STN_CD,IATA_AC_TYP_CD,ACT_AC_TYP_CD,ROUTE,GMT_PLND_DEP_DT,GMT_PLND_DEP_TIME
0,2023-08-05618,BA,618,2023-08-05,LHR,OLB,C,C,Gold For Life,EMER,Tier 2,2,2023-08-05 14:25:00,2023-08-05 14:57:00,BA,618.0,LHR,3,OLB,OLB,319,M4,LHROLB,2023-08-05,14:25:00
1,2023-05-17920,BA,920,2023-05-17,LHR,STR,C,C,,,Tier 3,13,2023-05-17 16:35:00,2023-05-17 16:43:00,BA,920.0,LHR,3,STR,STR,319,M4,LHRSTR,2023-05-17,16:35:00
2,2019-04-15770,BA,770,2019-04-15,LHR,OSL,M,M,,,Not eligible,75,2019-04-15 18:25:00,2019-04-15 18:52:00,BA,770.0,LHR,5,OSL,OSL,32A,H3,LHROSL,2019-04-15,18:25:00
3,2019-09-23348,BA,348,2019-09-23,LHR,NCE,M,M,,EMER,Tier 2,6,2019-09-23 14:40:00,2019-09-23 14:45:00,BA,348.0,LHR,5,NCE,NCE,319,A4,LHRNCE,2019-09-23,14:40:00
4,2019-09-16155,BA,155,2019-09-16,LHR,CAI,J,J,,,Tier 3,23,2019-09-16 16:15:00,2019-09-16 16:45:00,BA,155.0,LHR,5,CAI,CAI,343,W9,LHRCAI,2019-09-16,16:15:00


In [30]:
# your code here!
df_lounge_elig_flight_info.shape

(1534750, 25)

Get rows with any null values

In [31]:
print(df_lounge_elig_flight_info[df_lounge_elig_flight_info.isna().any(axis=1)].shape)
# There are currently 1183 rows for which flights are not available.

(1183, 25)


In [32]:
# Example of missing flight number.
print(df_flight_info[df_flight_info['OPG_FLT_NO'] == 8642].head(5))
print(df_lounge_elig_flight_info[df_lounge_elig_flight_info['OPERATING_FLT_NO']==8642]['Skew_Id'].head(5))

Empty DataFrame
Columns: [GMT_PLND_DEP_TS, GMT_ACT_DEP_TS, OPG_ALN_CD, OPG_FLT_NO, ACT_DEP_STN_CD, ACT_DEP_TML_CD, PLND_ARR_STN_CD, ACT_ARR_STN_CD, IATA_AC_TYP_CD, ACT_AC_TYP_CD, ROUTE, GMT_PLND_DEP_DT, GMT_PLND_DEP_TIME]
Index: []
300      2019-09-148642
876      2019-09-148642
1452     2019-09-148642
16459    2019-06-158642
17035    2019-06-158642
Name: Skew_Id, dtype: object


### Merge `df_country` to table

In [33]:
[headers_and_first_row(d) for d in [df_lounge_elig_flight_info,df_country]]

[{'Skew_Id': '2023-08-05618',
  'OPERATING_AIRLINE_CD': 'BA',
  'OPERATING_FLT_NO': 618,
  'GMT_UPLIFT_DT': datetime.date(2023, 8, 5),
  'UPLIFT_STN_CD': 'LHR',
  'DISCHARGE_STN_CD': 'OLB',
  'BOOKED_CABIN_CD': 'C',
  'TRAVEL_CABIN_CD': 'C',
  'BA_PAX_TIER': 'Gold For Life',
  'ONEWORLD_TIER': 'EMER',
  'Lounge_eligibility_tier': 'Tier 2',
  'pax': 2,
  'GMT_PLND_DEP_TS': Timestamp('2023-08-05 14:25:00'),
  'GMT_ACT_DEP_TS': Timestamp('2023-08-05 14:57:00'),
  'OPG_ALN_CD': 'BA',
  'OPG_FLT_NO': 618.0,
  'ACT_DEP_STN_CD': 'LHR',
  'ACT_DEP_TML_CD': '3',
  'PLND_ARR_STN_CD': 'OLB',
  'ACT_ARR_STN_CD': 'OLB',
  'IATA_AC_TYP_CD': '319',
  'ACT_AC_TYP_CD': 'M4',
  'ROUTE': 'LHROLB',
  'GMT_PLND_DEP_DT': datetime.date(2023, 8, 5),
  'GMT_PLND_DEP_TIME': datetime.time(14, 25)},
 {'ROUTE': 'LHRINV',
  'COUNTRY_CD': 'GB',
  'COUNTRY_NM': 'United Kingdom and Northern Ireland',
  'CORP_GEOG_CTRY_GRP_NM': 'UK',
  'CORP_GEOG_CONTINENT_NM': 'UK',
  'DEP_STN_CD': 'LHR',
  'ARR_STN_CD': 'INV'}]

In [34]:
# FIRST, organise the additional countries decoding.
# Compare the columns of both frames. Only the last expression of a cell is rendered
# automatically, so the first one is displayed explicitly instead of being discarded.
display(df_additional_country_decodes.columns)
df_country.columns

Index(['ROUTE', 'COUNTRY_CD', 'COUNTRY_NM', 'CORP_GEOG_CTRY_GRP_NM',
       'CORP_GEOG_CONTINENT_NM', 'DEP_STN_CD', 'ARR_STN_CD'],
      dtype='object')

In [35]:
# For efficiency, we should merge df_country and df_additional_country_decodes first.
# Firstly, organise df_additional_country_decodes to have the same columns.
# The station code column is 'STN_CD' on a fresh load and 'ARR_STN_CD' once the renaming
# from the cell below has been done; fail with a clear message if neither is present
# instead of hitting a NameError (or silently reusing a stale value) further down.
if 'STN_CD' in df_additional_country_decodes.columns:
    station_code_col = 'STN_CD'
elif 'ARR_STN_CD' in df_additional_country_decodes.columns:
    station_code_col = 'ARR_STN_CD'
else:
    raise KeyError("df_additional_country_decodes has neither a 'STN_CD' nor an 'ARR_STN_CD' column")

station_codes = df_additional_country_decodes[station_code_col]
# keep only the stations that are missing from df_country ...
not_in_df_country = ~station_codes.isin(df_country['ARR_STN_CD'])
# ... and, for more efficiency, only those that appear as a destination in the lounge eligibility rows.
in_lounge_elig = station_codes.isin(df_lounge_elig_flight_info['DISCHARGE_STN_CD'])
# .copy() gives an independent frame, so later column assignments do not act on a slice.
df_additional_country_decodes = df_additional_country_decodes[not_in_df_country & in_lounge_elig].copy()
print(df_additional_country_decodes.shape)

(30, 5)


In [36]:
# We need df_additional_country_decodes to have the same column names and order as df_country, ready for pd.concat.
# By merging now, we eliminate the need to merge separately with df_lounge_elig.
# rename() ignores a missing 'STN_CD', so this cell can safely be re-run after the
# column has already been renamed to 'ARR_STN_CD'.
df_additional_country_decodes = df_additional_country_decodes.rename(columns={'STN_CD':'ARR_STN_CD'})
# assign() returns a new frame, which avoids writing into a filtered slice.
df_additional_country_decodes = df_additional_country_decodes.assign(
    ROUTE='LHR' + df_additional_country_decodes['ARR_STN_CD'],
    DEP_STN_CD='LHR',
)
# We must ensure column names and order of df_additional_country_decodes match df_country.
df_additional_country_decodes = df_additional_country_decodes[df_country.columns]

In [37]:
# concat together.
# ignore_index gives the combined frame a clean 0..n-1 index (the two inputs may carry
# overlapping labels). Dropping exact duplicate rows keeps the cell safe to re-run:
# appending the same decodes twice would duplicate stations and multiply rows in the
# later left join on ARR_STN_CD.
df_country = (
    pd.concat([df_country,df_additional_country_decodes], ignore_index=True)
    .drop_duplicates()
    .reset_index(drop=True)
)

In [38]:
# Left-join the country decode onto the lounge/flight rows by destination station.
# Both frames carry a ROUTE column, so the result holds ROUTE_x (flight info) and ROUTE_y (country decode).
df_lounge_elig_flight_info_country = pd.merge(df_lounge_elig_flight_info,# left table
                                     df_country, # right table
                                     left_on = ['DISCHARGE_STN_CD'], # destination station of the lounge eligibility row
                                     right_on = ['ARR_STN_CD'] , # arrival station of the country decode
                                     how = "left" # keep every lounge eligibility row
                                     )

# df_lounge_elig_flight_info = df_lounge_elig_flight_info.drop_duplicates()

df_lounge_elig_flight_info_country.head(5)

Unnamed: 0,Skew_Id,OPERATING_AIRLINE_CD,OPERATING_FLT_NO,GMT_UPLIFT_DT,UPLIFT_STN_CD,DISCHARGE_STN_CD,BOOKED_CABIN_CD,TRAVEL_CABIN_CD,BA_PAX_TIER,ONEWORLD_TIER,Lounge_eligibility_tier,pax,GMT_PLND_DEP_TS,GMT_ACT_DEP_TS,OPG_ALN_CD,OPG_FLT_NO,ACT_DEP_STN_CD,ACT_DEP_TML_CD,PLND_ARR_STN_CD,ACT_ARR_STN_CD,IATA_AC_TYP_CD,ACT_AC_TYP_CD,ROUTE_x,GMT_PLND_DEP_DT,GMT_PLND_DEP_TIME,ROUTE_y,COUNTRY_CD,COUNTRY_NM,CORP_GEOG_CTRY_GRP_NM,CORP_GEOG_CONTINENT_NM,DEP_STN_CD,ARR_STN_CD
0,2023-08-05618,BA,618,2023-08-05,LHR,OLB,C,C,Gold For Life,EMER,Tier 2,2,2023-08-05 14:25:00,2023-08-05 14:57:00,BA,618.0,LHR,3,OLB,OLB,319,M4,LHROLB,2023-08-05,14:25:00,LHROLB,IT,Italy,MEDITERRANEAN,EUROPE EXC UK,LHR,OLB
1,2023-05-17920,BA,920,2023-05-17,LHR,STR,C,C,,,Tier 3,13,2023-05-17 16:35:00,2023-05-17 16:43:00,BA,920.0,LHR,3,STR,STR,319,M4,LHRSTR,2023-05-17,16:35:00,LHRSTR,DE,Germany,WEST EUROPE,EUROPE EXC UK,LHR,STR
2,2019-04-15770,BA,770,2019-04-15,LHR,OSL,M,M,,,Not eligible,75,2019-04-15 18:25:00,2019-04-15 18:52:00,BA,770.0,LHR,5,OSL,OSL,32A,H3,LHROSL,2019-04-15,18:25:00,LHROSL,NO,Norway,SCANDINAVIA,EUROPE EXC UK,LHR,OSL
3,2019-09-23348,BA,348,2019-09-23,LHR,NCE,M,M,,EMER,Tier 2,6,2019-09-23 14:40:00,2019-09-23 14:45:00,BA,348.0,LHR,5,NCE,NCE,319,A4,LHRNCE,2019-09-23,14:40:00,LHRNCE,FR,France,WEST EUROPE,EUROPE EXC UK,LHR,NCE
4,2019-09-16155,BA,155,2019-09-16,LHR,CAI,J,J,,,Tier 3,23,2019-09-16 16:15:00,2019-09-16 16:45:00,BA,155.0,LHR,5,CAI,CAI,343,W9,LHRCAI,2019-09-16,16:15:00,LHRCAI,EG,Egypt,NORTH AFRICA,AFRICA,LHR,CAI


In [39]:
df_lounge_elig_flight_info_country.shape

(1534750, 32)

Check for null values

In [40]:
df_lounge_elig_flight_info_country[df_lounge_elig_flight_info_country['CORP_GEOG_CONTINENT_NM'].isna()].head(5)
# There are no null values arising from lack of country data.

Unnamed: 0,Skew_Id,OPERATING_AIRLINE_CD,OPERATING_FLT_NO,GMT_UPLIFT_DT,UPLIFT_STN_CD,DISCHARGE_STN_CD,BOOKED_CABIN_CD,TRAVEL_CABIN_CD,BA_PAX_TIER,ONEWORLD_TIER,Lounge_eligibility_tier,pax,GMT_PLND_DEP_TS,GMT_ACT_DEP_TS,OPG_ALN_CD,OPG_FLT_NO,ACT_DEP_STN_CD,ACT_DEP_TML_CD,PLND_ARR_STN_CD,ACT_ARR_STN_CD,IATA_AC_TYP_CD,ACT_AC_TYP_CD,ROUTE_x,GMT_PLND_DEP_DT,GMT_PLND_DEP_TIME,ROUTE_y,COUNTRY_CD,COUNTRY_NM,CORP_GEOG_CTRY_GRP_NM,CORP_GEOG_CONTINENT_NM,DEP_STN_CD,ARR_STN_CD


### `df_acft_typ`

In [41]:
# Attach the aircraft type attributes (body category and seat counts) by matching on
# both aircraft type codes; every row of the left table is kept.
df_lounge_elig_flight_info_country_acft_typ = df_lounge_elig_flight_info_country.merge(
    df_acft_typ,
    how='left',
    on=['IATA_AC_TYP_CD','ACT_AC_TYP_CD'],
)

df_lounge_elig_flight_info_country_acft_typ.head(5)

Unnamed: 0,Skew_Id,OPERATING_AIRLINE_CD,OPERATING_FLT_NO,GMT_UPLIFT_DT,UPLIFT_STN_CD,DISCHARGE_STN_CD,BOOKED_CABIN_CD,TRAVEL_CABIN_CD,BA_PAX_TIER,ONEWORLD_TIER,Lounge_eligibility_tier,pax,GMT_PLND_DEP_TS,GMT_ACT_DEP_TS,OPG_ALN_CD,OPG_FLT_NO,ACT_DEP_STN_CD,ACT_DEP_TML_CD,PLND_ARR_STN_CD,ACT_ARR_STN_CD,IATA_AC_TYP_CD,ACT_AC_TYP_CD,ROUTE_x,GMT_PLND_DEP_DT,GMT_PLND_DEP_TIME,ROUTE_y,COUNTRY_CD,COUNTRY_NM,CORP_GEOG_CTRY_GRP_NM,CORP_GEOG_CONTINENT_NM,DEP_STN_CD,ARR_STN_CD,WB_NB_CAT,FIRST_SEATS_QTY,CLUB_SEATS_QTY,PREM_ECONOMY_SEATS_QTY,ECONOMY_SEATS_QTY
0,2023-08-05618,BA,618,2023-08-05,LHR,OLB,C,C,Gold For Life,EMER,Tier 2,2,2023-08-05 14:25:00,2023-08-05 14:57:00,BA,618.0,LHR,3,OLB,OLB,319,M4,LHROLB,2023-08-05,14:25:00,LHROLB,IT,Italy,MEDITERRANEAN,EUROPE EXC UK,LHR,OLB,NB,0.0,21.0,0.0,113.0
1,2023-05-17920,BA,920,2023-05-17,LHR,STR,C,C,,,Tier 3,13,2023-05-17 16:35:00,2023-05-17 16:43:00,BA,920.0,LHR,3,STR,STR,319,M4,LHRSTR,2023-05-17,16:35:00,LHRSTR,DE,Germany,WEST EUROPE,EUROPE EXC UK,LHR,STR,NB,0.0,21.0,0.0,113.0
2,2019-04-15770,BA,770,2019-04-15,LHR,OSL,M,M,,,Not eligible,75,2019-04-15 18:25:00,2019-04-15 18:52:00,BA,770.0,LHR,5,OSL,OSL,32A,H3,LHROSL,2019-04-15,18:25:00,LHROSL,NO,Norway,SCANDINAVIA,EUROPE EXC UK,LHR,OSL,NB,0.0,20.0,0.0,150.0
3,2019-09-23348,BA,348,2019-09-23,LHR,NCE,M,M,,EMER,Tier 2,6,2019-09-23 14:40:00,2019-09-23 14:45:00,BA,348.0,LHR,5,NCE,NCE,319,A4,LHRNCE,2019-09-23,14:40:00,LHRNCE,FR,France,WEST EUROPE,EUROPE EXC UK,LHR,NCE,NB,0.0,16.0,0.0,119.0
4,2019-09-16155,BA,155,2019-09-16,LHR,CAI,J,J,,,Tier 3,23,2019-09-16 16:15:00,2019-09-16 16:45:00,BA,155.0,LHR,5,CAI,CAI,343,W9,LHRCAI,2019-09-16,16:15:00,LHRCAI,EG,Egypt,NORTH AFRICA,AFRICA,LHR,CAI,WB,0.0,45.0,0.0,212.0


In [42]:
df_lounge_elig_flight_info_country_acft_typ.shape

(1534750, 37)

Check for null

In [43]:
df_lounge_elig_flight_info_country_acft_typ[df_lounge_elig_flight_info_country_acft_typ.isna().any(axis=1)].head(5)

Unnamed: 0,Skew_Id,OPERATING_AIRLINE_CD,OPERATING_FLT_NO,GMT_UPLIFT_DT,UPLIFT_STN_CD,DISCHARGE_STN_CD,BOOKED_CABIN_CD,TRAVEL_CABIN_CD,BA_PAX_TIER,ONEWORLD_TIER,Lounge_eligibility_tier,pax,GMT_PLND_DEP_TS,GMT_ACT_DEP_TS,OPG_ALN_CD,OPG_FLT_NO,ACT_DEP_STN_CD,ACT_DEP_TML_CD,PLND_ARR_STN_CD,ACT_ARR_STN_CD,IATA_AC_TYP_CD,ACT_AC_TYP_CD,ROUTE_x,GMT_PLND_DEP_DT,GMT_PLND_DEP_TIME,ROUTE_y,COUNTRY_CD,COUNTRY_NM,CORP_GEOG_CTRY_GRP_NM,CORP_GEOG_CONTINENT_NM,DEP_STN_CD,ARR_STN_CD,WB_NB_CAT,FIRST_SEATS_QTY,CLUB_SEATS_QTY,PREM_ECONOMY_SEATS_QTY,ECONOMY_SEATS_QTY
300,2019-09-148642,BA,8642,2019-09-14,LHR,ADB,M,M,Silver,SAPP,Tier 3,2,NaT,NaT,,,,,,,,,,,,LHRADB,TR,Turkey,MEDITERRANEAN,EUROPE EXC UK,LHR,ADB,,,,,
466,2019-09-298598,BA,8598,2019-09-29,LHR,CFU,M,M,,,Not eligible,43,NaT,NaT,,,,,,,,,,,,LHRCFU,GR,Greece,MEDITERRANEAN,EUROPE EXC UK,LHR,CFU,,,,,
876,2019-09-148642,BA,8642,2019-09-14,LHR,ADB,M,M,Silver,,Tier 3,1,NaT,NaT,,,,,,,,,,,,LHRADB,TR,Turkey,MEDITERRANEAN,EUROPE EXC UK,LHR,ADB,,,,,
1452,2019-09-148642,BA,8642,2019-09-14,LHR,ADB,M,M,,,Not eligible,144,NaT,NaT,,,,,,,,,,,,LHRADB,TR,Turkey,MEDITERRANEAN,EUROPE EXC UK,LHR,ADB,,,,,
11711,2019-06-168586,BA,8586,2019-06-16,LHR,KLX,M,M,Gold,EMER,Tier 2,1,NaT,NaT,,,,,,,,,,,,LHRKLX,GR,Greece,MEDITERRANEAN,EUROPE EXC UK,LHR,KLX,,,,,


In [44]:
df_acft_typ[df_acft_typ['ACT_AC_TYP_CD'] == 'M4']

Unnamed: 0,IATA_AC_TYP_CD,ACT_AC_TYP_CD,WB_NB_CAT,FIRST_SEATS_QTY,CLUB_SEATS_QTY,PREM_ECONOMY_SEATS_QTY,ECONOMY_SEATS_QTY
20,319,M4,NB,0,21,0,113


### More cleaning for columns redundancy

In [45]:
headers_and_first_row(df_lounge_elig_flight_info_country_acft_typ)

{'Skew_Id': '2023-08-05618',
 'OPERATING_AIRLINE_CD': 'BA',
 'OPERATING_FLT_NO': 618,
 'GMT_UPLIFT_DT': datetime.date(2023, 8, 5),
 'UPLIFT_STN_CD': 'LHR',
 'DISCHARGE_STN_CD': 'OLB',
 'BOOKED_CABIN_CD': 'C',
 'TRAVEL_CABIN_CD': 'C',
 'BA_PAX_TIER': 'Gold For Life',
 'ONEWORLD_TIER': 'EMER',
 'Lounge_eligibility_tier': 'Tier 2',
 'pax': 2,
 'GMT_PLND_DEP_TS': Timestamp('2023-08-05 14:25:00'),
 'GMT_ACT_DEP_TS': Timestamp('2023-08-05 14:57:00'),
 'OPG_ALN_CD': 'BA',
 'OPG_FLT_NO': 618.0,
 'ACT_DEP_STN_CD': 'LHR',
 'ACT_DEP_TML_CD': '3',
 'PLND_ARR_STN_CD': 'OLB',
 'ACT_ARR_STN_CD': 'OLB',
 'IATA_AC_TYP_CD': '319',
 'ACT_AC_TYP_CD': 'M4',
 'ROUTE_x': 'LHROLB',
 'GMT_PLND_DEP_DT': datetime.date(2023, 8, 5),
 'GMT_PLND_DEP_TIME': datetime.time(14, 25),
 'ROUTE_y': 'LHROLB',
 'COUNTRY_CD': 'IT',
 'COUNTRY_NM': 'Italy',
 'CORP_GEOG_CTRY_GRP_NM': 'MEDITERRANEAN',
 'CORP_GEOG_CONTINENT_NM': 'EUROPE EXC UK',
 'DEP_STN_CD': 'LHR',
 'ARR_STN_CD': 'OLB',
 'WB_NB_CAT': 'NB',
 'FIRST_SEATS_QTY': 0.0

In [46]:
# drop redundant columns.

to_drop = ['GMT_PLND_DEP_TS','GMT_ACT_DEP_TS','OPG_ALN_CD','OPG_FLT_NO','ACT_DEP_STN_CD','PLND_ARR_STN_CD','ACT_ARR_STN_CD','ROUTE_x',
          'GMT_PLND_DEP_DT','ROUTE_y','DEP_STN_CD','ARR_STN_CD']

# drop() already returns a new frame, so the merged table is left untouched without
# an extra full copy of ~1.5M rows.
df_final = df_lounge_elig_flight_info_country_acft_typ.drop(columns = to_drop)
# year of the uplift date, computed vectorised instead of with a Python-level loop over every row
df_final['GMT_UPLIFT_YR'] = pd.to_datetime(df_final['GMT_UPLIFT_DT']).dt.year

print(df_final.shape)
df_final.head(5)

(1534750, 26)


Unnamed: 0,Skew_Id,OPERATING_AIRLINE_CD,OPERATING_FLT_NO,GMT_UPLIFT_DT,UPLIFT_STN_CD,DISCHARGE_STN_CD,BOOKED_CABIN_CD,TRAVEL_CABIN_CD,BA_PAX_TIER,ONEWORLD_TIER,Lounge_eligibility_tier,pax,ACT_DEP_TML_CD,IATA_AC_TYP_CD,ACT_AC_TYP_CD,GMT_PLND_DEP_TIME,COUNTRY_CD,COUNTRY_NM,CORP_GEOG_CTRY_GRP_NM,CORP_GEOG_CONTINENT_NM,WB_NB_CAT,FIRST_SEATS_QTY,CLUB_SEATS_QTY,PREM_ECONOMY_SEATS_QTY,ECONOMY_SEATS_QTY,GMT_UPLIFT_YR
0,2023-08-05618,BA,618,2023-08-05,LHR,OLB,C,C,Gold For Life,EMER,Tier 2,2,3,319,M4,14:25:00,IT,Italy,MEDITERRANEAN,EUROPE EXC UK,NB,0.0,21.0,0.0,113.0,2023
1,2023-05-17920,BA,920,2023-05-17,LHR,STR,C,C,,,Tier 3,13,3,319,M4,16:35:00,DE,Germany,WEST EUROPE,EUROPE EXC UK,NB,0.0,21.0,0.0,113.0,2023
2,2019-04-15770,BA,770,2019-04-15,LHR,OSL,M,M,,,Not eligible,75,5,32A,H3,18:25:00,NO,Norway,SCANDINAVIA,EUROPE EXC UK,NB,0.0,20.0,0.0,150.0,2019
3,2019-09-23348,BA,348,2019-09-23,LHR,NCE,M,M,,EMER,Tier 2,6,5,319,A4,14:40:00,FR,France,WEST EUROPE,EUROPE EXC UK,NB,0.0,16.0,0.0,119.0,2019
4,2019-09-16155,BA,155,2019-09-16,LHR,CAI,J,J,,,Tier 3,23,5,343,W9,16:15:00,EG,Egypt,NORTH AFRICA,AFRICA,WB,0.0,45.0,0.0,212.0,2019


Save as CSV

In [47]:
#df_final.to_csv('scragg_merged_df_final_2023-11-01-0945.csv')

<br><br>
<div class="alert alert-" style = "border-radius:10px;border-width:3px;border-color:lightblue;font-family:Verdana,sans-serif;font-size:16px;">

<font size="3">Exercise 3: Based on S2019/S2023 data, provide a lookup table of Lounge eligibility assumptions that can be applied to a future schedule. To do so, answer each of the following questions in the Markdown cell provided below. 

- What level of granularity do you use?
- What metric do you use to come up with Lounge eligibility profiles?

    
Note 1: **Provide evidence for your assumptions.** This can be in the form of tables, graphs, correlation matrix, etc.
    
Note 2: Make use of the examples below to give structure to your answer. Feel free to attend the Hackathon Clinics if you have any questions. 
</font>


    
</div>

**Reasoning (Example 1)**:


Assume:
- Data has been preprocessed.
- Data has been joined, and a final dataset has been created. This dataset is the result of joining the 4 tables.

The final dataset has been called `df_lounge_elig_flight_info_country_acft_typ`.

<u>What level of granularity do you use?</u>
- I have decided to split all flights in the network based on their aircraft type. I will therefore have a lookup table with two categories: Narrowbody and Widebody. 

<u>What metric do you use to come up with Lounge eligibility profiles?</u>
- I have summed up all the passengers by Aircraft Type, by Tier. Then I have divided them by the total number of passengers by Aircraft Type. For example:
    - For NB aircraft, and for Tier 1 passengers: in the S19/S23 data we had 53,292 pax eligible for Tier 1 out of 12,779,888 pax flying on Narrowbody aircraft. This represents 0.4% of the customers and I assume that this will be the share of customers eligible for this specific Lounge in a future schedule.

In [48]:
# your code here!

In [49]:
# grouping keys: aircraft body category and lounge eligibility tier
list_groupby = ['WB_NB_CAT','Lounge_eligibility_tier']

# total passengers for every (WB_NB_CAT, Lounge_eligibility_tier) pair, as a flat table
df_groupby_wb_nb = (
    df_lounge_elig_flight_info_country_acft_typ
    .groupby(list_groupby)['pax']
    .sum()
    .reset_index(name='pax_count')
)

In [50]:
# a look at the data
df_groupby_wb_nb

Unnamed: 0,WB_NB_CAT,Lounge_eligibility_tier,pax_count
0,NB,Not eligible,9741188
1,NB,Tier 1,53292
2,NB,Tier 2,744829
3,NB,Tier 3,2240579
4,WB,Not eligible,5730922
5,WB,Tier 1,236775
6,WB,Tier 2,391264
7,WB,Tier 3,1523611


In [51]:
# getting the number of pax by aircraft type

# columns that you want to group by
list_groupby = ['WB_NB_CAT']

# grouping by WB_NB_CAT
df_groupby_wb_nb_ttl = df_lounge_elig_flight_info_country_acft_typ.groupby(list_groupby).agg(
                pax_count_ttl = ('pax','sum')
).reset_index()

In [52]:
# a look at the data
df_groupby_wb_nb_ttl

Unnamed: 0,WB_NB_CAT,pax_count_ttl
0,NB,12779888
1,WB,7882572


In [53]:
# now, let's left join df_groupby_wb_nb_ttl onto df_groupby_wb_nb
# this adds a new column to the df_groupby_wb_nb table (pax_count_ttl) that will be used to get the percentage of passenger eligible by Tier

df_groupby_wb_nb = pd.merge(df_groupby_wb_nb,
                            df_groupby_wb_nb_ttl,
                            on = ['WB_NB_CAT'],
                            how = 'left'
                           )

In [54]:
# a look at the data
df_groupby_wb_nb

Unnamed: 0,WB_NB_CAT,Lounge_eligibility_tier,pax_count,pax_count_ttl
0,NB,Not eligible,9741188,12779888
1,NB,Tier 1,53292,12779888
2,NB,Tier 2,744829,12779888
3,NB,Tier 3,2240579,12779888
4,WB,Not eligible,5730922,7882572
5,WB,Tier 1,236775,7882572
6,WB,Tier 2,391264,7882572
7,WB,Tier 3,1523611,7882572


In [55]:
# getting the percentage of pax eligible for each of the Tiers:
# tier passengers divided by all passengers of the same aircraft category, times 100

df_groupby_wb_nb['pax_eligible%'] = (df_groupby_wb_nb['pax_count'] / df_groupby_wb_nb['pax_count_ttl'] )*100 

# getting the pax_eligible% column in the right format: one decimal place plus a '%' sign.
# Note that the column holds strings (not numbers) after this step.
df_groupby_wb_nb['pax_eligible%'] = df_groupby_wb_nb['pax_eligible%'].map('{:,.1f}%'.format)

In [56]:
# dropping pax_count, pax_count_ttl columns - not needed anymore
df_groupby_wb_nb.drop(columns=['pax_count','pax_count_ttl'],inplace = True)

In [57]:
# a look at the data
df_groupby_wb_nb

Unnamed: 0,WB_NB_CAT,Lounge_eligibility_tier,pax_eligible%
0,NB,Not eligible,76.2%
1,NB,Tier 1,0.4%
2,NB,Tier 2,5.8%
3,NB,Tier 3,17.5%
4,WB,Not eligible,72.7%
5,WB,Tier 1,3.0%
6,WB,Tier 2,5.0%
7,WB,Tier 3,19.3%


In [58]:
# dropping 'not eligible' rows - not needed anymore
mask = df_groupby_wb_nb['Lounge_eligibility_tier'] == 'Not eligible'

df_groupby_wb_nb = df_groupby_wb_nb[~mask].copy()

<br>

Please save your final lookup table below in the form of a pandas dataframe. It must contain the categories you have come up with as rows, and the Tier 1, Tier 2, and Tier 3 percentage of customers as columns.

In [59]:
# using set_index to come up with the final lookup table
df_groupby_wb_nb = df_groupby_wb_nb.set_index(['WB_NB_CAT','Lounge_eligibility_tier'],drop = True).unstack('Lounge_eligibility_tier')

In [60]:
# final table
df_groupby_wb_nb

Unnamed: 0_level_0,pax_eligible%,pax_eligible%,pax_eligible%
Lounge_eligibility_tier,Tier 1,Tier 2,Tier 3
WB_NB_CAT,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
NB,0.4%,5.8%,17.5%
WB,3.0%,5.0%,19.3%


<br>

**Feedback:** This analysis provides a lookup table in the format needed to be input into a future schedule. Nevertheless, this analysis is too high level and you haven't provided any evidence for your assumptions. To further enhance your answer use insights from the data and provide evidence for your assumptions. Please find some ideas below: 

- Using the same categories (WB, NB), plot the data over time to better understand the peaks for the different lounges. 
- Is there a way to split Widebody into more categories? Do the Haul, Region, Time of Day, or Country play a role in the number of passengers that are eligible in Tier 1, Tier 2 and Tier 3? Etc...

<br><br>

**Reasoning (Example 2)**:



<u>What level of granularity do you use?</u>
- I have decided to split all flights in the network based on their flight number. I will therefore have a lookup table with a lot of categories as each flight number is a category. 

<u>What metric do you use to come up with Lounge eligibility profiles?</u>
- I have summed up all the passengers by flight number, by Tier. Then I have divided them by the total number of passengers on that flight number.  

In [61]:
# your code here!

In [62]:
# columns that you want to group by
list_groupby = ['OPERATING_FLT_NO','DISCHARGE_STN_CD','Lounge_eligibility_tier']

# grouping by OPERATING_FLT_NO, DISCHARGE_STN_CD and Lounge_eligibility_tier
df_groupby_flt_no = df_lounge_elig_flight_info_country_acft_typ.groupby(list_groupby).agg(
                pax_count = ('pax','sum')
).reset_index()

# a look at the data
df_groupby_flt_no.head(4)

Unnamed: 0,OPERATING_FLT_NO,DISCHARGE_STN_CD,Lounge_eligibility_tier,pax_count
0,5,HND,Not eligible,15825
1,5,HND,Tier 1,220
2,5,HND,Tier 2,888
3,5,HND,Tier 3,4469


In [63]:
# getting the number of pax by OPERATING_FLT_NO and DISCHARGE_STN_CD

# columns that you want to group by
list_groupby = ['OPERATING_FLT_NO','DISCHARGE_STN_CD']

# grouping by OPERATING_FLT_NO and DISCHARGE_STN_CD: total pax per flight number and destination
df_groupby_flt_no_ttl = df_lounge_elig_flight_info_country_acft_typ.groupby(list_groupby).agg(
                pax_count_ttl = ('pax','sum')
).reset_index()

In [64]:
# a look at the data
df_groupby_flt_no_ttl.head(2)

Unnamed: 0,OPERATING_FLT_NO,DISCHARGE_STN_CD,pax_count_ttl
0,5,HND,21402
1,5,NRT,38378


In [65]:
# now, let's left join df_groupby_flt_no_ttl onto df_groupby_flt_no
# this adds a new column to the df_groupby_flt_no table (pax_count_ttl) that will be used to get the percentage of passenger eligible by Tier

df_groupby_flt_no = pd.merge(df_groupby_flt_no,
                            df_groupby_flt_no_ttl,
                            on = ['OPERATING_FLT_NO','DISCHARGE_STN_CD'],
                            how = 'left'
                           )

In [66]:
# a look at the data: each tier row should now carry its flight's pax_count_ttl
df_groupby_flt_no.head(4)

Unnamed: 0,OPERATING_FLT_NO,DISCHARGE_STN_CD,Lounge_eligibility_tier,pax_count,pax_count_ttl
0,5,HND,Not eligible,15825,21402
1,5,HND,Tier 1,220,21402
2,5,HND,Tier 2,888,21402
3,5,HND,Tier 3,4469,21402


In [67]:
# Percentage of each flight's passengers that falls in each tier.
tier_ratio = df_groupby_flt_no['pax_count'].div(df_groupby_flt_no['pax_count_ttl'])

# Store it as display text with one decimal place and a trailing '%'.
df_groupby_flt_no['pax_eligible%'] = tier_ratio.mul(100).map(lambda pct: f'{pct:,.1f}%')

# The raw counts are no longer needed once the percentage exists.
df_groupby_flt_no.drop(['pax_count', 'pax_count_ttl'], axis=1, inplace=True)

In [68]:
# a look at the data: pax_eligible% is now formatted text, the count columns are gone
df_groupby_flt_no.head(4)

Unnamed: 0,OPERATING_FLT_NO,DISCHARGE_STN_CD,Lounge_eligibility_tier,pax_eligible%
0,5,HND,Not eligible,73.9%
1,5,HND,Tier 1,1.0%
2,5,HND,Tier 2,4.1%
3,5,HND,Tier 3,20.9%


In [69]:
# The lookup table only needs Tier 1/2/3, so flag the 'Not eligible' rows...
mask = df_groupby_flt_no['Lounge_eligibility_tier'].eq('Not eligible')

# ...and keep everything else. The copy gives an independent frame rather than a slice.
df_groupby_flt_no = df_groupby_flt_no.loc[~mask].copy()

<br>

Please save your final lookup table below in the form of a pandas dataframe. It must contain the categories you have come up with as rows, and the Tier 1, Tier 2, and Tier 3 percentage of customers as columns.

In [70]:
# Build the final lookup table: flight number and destination form the rows,
# and the tier level is pivoted out into columns (Tier 1 / Tier 2 / Tier 3).
df_groupby_flt_no = (
    df_groupby_flt_no
    .set_index(['OPERATING_FLT_NO', 'DISCHARGE_STN_CD', 'Lounge_eligibility_tier'])
    .unstack(level='Lounge_eligibility_tier')
)

In [71]:
# final table: one row per flight number / destination, one pax_eligible% column per tier
df_groupby_flt_no.head(4)

Unnamed: 0_level_0,Unnamed: 1_level_0,pax_eligible%,pax_eligible%,pax_eligible%
Unnamed: 0_level_1,Lounge_eligibility_tier,Tier 1,Tier 2,Tier 3
OPERATING_FLT_NO,DISCHARGE_STN_CD,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
5,HND,1.0%,4.1%,20.9%
5,NRT,3.6%,4.8%,19.1%
7,HND,3.3%,5.2%,19.8%
9,BKK,0.3%,4.2%,15.9%


<br>

**Feedback:** This analysis is very granular, so we might end up with missing values if we apply these lounge eligibility profiles to a future schedule. Here are some ideas to further enhance your answer: 

- What would happen if we fly to a new destination in the future? How do we ensure we have a lounge eligibility profile for this new route?
- As you can see in the example above, the share of pax eligible for Tier 1 on the BKK flight is significantly different from the rest. What's the most used aircraft type for this route? Why does it differ that much from the rest? Is it because of the route characteristics instead?
- For SH routes, we might change the time of departure for a specific flight number from one year to the next. Explore the possibility of using a classification that takes that into account, like using Time of Day instead of flight number. 

### Ian  
First of all I would like to be able to split the data into 2019/23 data separately, which can be done by adding a year field into the table.

In [72]:
# Original merged table: list every column name alongside its first-row value.
# NOTE(review): headers_and_first_row is a helper defined elsewhere in the notebook — confirm it is run before this cell.
headers_and_first_row(df_lounge_elig_flight_info_country_acft_typ)

{'Skew_Id': '2023-08-05618',
 'OPERATING_AIRLINE_CD': 'BA',
 'OPERATING_FLT_NO': 618,
 'GMT_UPLIFT_DT': datetime.date(2023, 8, 5),
 'UPLIFT_STN_CD': 'LHR',
 'DISCHARGE_STN_CD': 'OLB',
 'BOOKED_CABIN_CD': 'C',
 'TRAVEL_CABIN_CD': 'C',
 'BA_PAX_TIER': 'Gold For Life',
 'ONEWORLD_TIER': 'EMER',
 'Lounge_eligibility_tier': 'Tier 2',
 'pax': 2,
 'GMT_PLND_DEP_TS': Timestamp('2023-08-05 14:25:00'),
 'GMT_ACT_DEP_TS': Timestamp('2023-08-05 14:57:00'),
 'OPG_ALN_CD': 'BA',
 'OPG_FLT_NO': 618.0,
 'ACT_DEP_STN_CD': 'LHR',
 'ACT_DEP_TML_CD': '3',
 'PLND_ARR_STN_CD': 'OLB',
 'ACT_ARR_STN_CD': 'OLB',
 'IATA_AC_TYP_CD': '319',
 'ACT_AC_TYP_CD': 'M4',
 'ROUTE_x': 'LHROLB',
 'GMT_PLND_DEP_DT': datetime.date(2023, 8, 5),
 'GMT_PLND_DEP_TIME': datetime.time(14, 25),
 'ROUTE_y': 'LHROLB',
 'COUNTRY_CD': 'IT',
 'COUNTRY_NM': 'Italy',
 'CORP_GEOG_CTRY_GRP_NM': 'MEDITERRANEAN',
 'CORP_GEOG_CONTINENT_NM': 'EUROPE EXC UK',
 'DEP_STN_CD': 'LHR',
 'ARR_STN_CD': 'OLB',
 'WB_NB_CAT': 'NB',
 'FIRST_SEATS_QTY': 0.0

In [73]:
# Add the uplift year so that 2019 and 2023 can be analysed separately.
# The year is read from each date value individually via its .year attribute.
df_lounge_elig_flight_info_country_acft_typ['GMT_UPLIFT_YR'] = (
    df_lounge_elig_flight_info_country_acft_typ['GMT_UPLIFT_DT']
    .map(lambda uplift_date: uplift_date.year)
)

df_lounge_elig_flight_info_country_acft_typ.head(1)

Unnamed: 0,Skew_Id,OPERATING_AIRLINE_CD,OPERATING_FLT_NO,GMT_UPLIFT_DT,UPLIFT_STN_CD,DISCHARGE_STN_CD,BOOKED_CABIN_CD,TRAVEL_CABIN_CD,BA_PAX_TIER,ONEWORLD_TIER,Lounge_eligibility_tier,pax,GMT_PLND_DEP_TS,GMT_ACT_DEP_TS,OPG_ALN_CD,OPG_FLT_NO,ACT_DEP_STN_CD,ACT_DEP_TML_CD,PLND_ARR_STN_CD,ACT_ARR_STN_CD,IATA_AC_TYP_CD,ACT_AC_TYP_CD,ROUTE_x,GMT_PLND_DEP_DT,GMT_PLND_DEP_TIME,ROUTE_y,COUNTRY_CD,COUNTRY_NM,CORP_GEOG_CTRY_GRP_NM,CORP_GEOG_CONTINENT_NM,DEP_STN_CD,ARR_STN_CD,WB_NB_CAT,FIRST_SEATS_QTY,CLUB_SEATS_QTY,PREM_ECONOMY_SEATS_QTY,ECONOMY_SEATS_QTY,GMT_UPLIFT_YR
0,2023-08-05618,BA,618,2023-08-05,LHR,OLB,C,C,Gold For Life,EMER,Tier 2,2,2023-08-05 14:25:00,2023-08-05 14:57:00,BA,618.0,LHR,3,OLB,OLB,319,M4,LHROLB,2023-08-05,14:25:00,LHROLB,IT,Italy,MEDITERRANEAN,EUROPE EXC UK,LHR,OLB,NB,0.0,21.0,0.0,113.0,2023


In [74]:
# Grouping keys: one output row per uplift year and lounge tier.
list_groupby = ['GMT_UPLIFT_YR', 'Lounge_eligibility_tier']

# Sum the passengers within each year / tier combination.
df_groupby_year = (
    df_lounge_elig_flight_info_country_acft_typ
    .groupby(list_groupby)['pax']
    .sum()
    .reset_index(name='pax_count')
)

# The table is small (one row per year and tier), so show all of it.
df_groupby_year

Unnamed: 0,GMT_UPLIFT_YR,Lounge_eligibility_tier,pax_count
0,2019,Not eligible,8147159
1,2019,Tier 1,196564
2,2019,Tier 2,635765
3,2019,Tier 3,1996226
4,2023,Not eligible,7372440
5,2023,Tier 1,93555
6,2023,Tier 2,501044
7,2023,Tier 3,1770087


In [75]:
# Preview the first five rows of df_final.
# NOTE(review): df_final is not created in the nearby cells — confirm it is defined earlier in the notebook.
df_final.head(5)

Unnamed: 0,Skew_Id,OPERATING_AIRLINE_CD,OPERATING_FLT_NO,GMT_UPLIFT_DT,UPLIFT_STN_CD,DISCHARGE_STN_CD,BOOKED_CABIN_CD,TRAVEL_CABIN_CD,BA_PAX_TIER,ONEWORLD_TIER,Lounge_eligibility_tier,pax,ACT_DEP_TML_CD,IATA_AC_TYP_CD,ACT_AC_TYP_CD,GMT_PLND_DEP_TIME,COUNTRY_CD,COUNTRY_NM,CORP_GEOG_CTRY_GRP_NM,CORP_GEOG_CONTINENT_NM,WB_NB_CAT,FIRST_SEATS_QTY,CLUB_SEATS_QTY,PREM_ECONOMY_SEATS_QTY,ECONOMY_SEATS_QTY,GMT_UPLIFT_YR
0,2023-08-05618,BA,618,2023-08-05,LHR,OLB,C,C,Gold For Life,EMER,Tier 2,2,3,319,M4,14:25:00,IT,Italy,MEDITERRANEAN,EUROPE EXC UK,NB,0.0,21.0,0.0,113.0,2023
1,2023-05-17920,BA,920,2023-05-17,LHR,STR,C,C,,,Tier 3,13,3,319,M4,16:35:00,DE,Germany,WEST EUROPE,EUROPE EXC UK,NB,0.0,21.0,0.0,113.0,2023
2,2019-04-15770,BA,770,2019-04-15,LHR,OSL,M,M,,,Not eligible,75,5,32A,H3,18:25:00,NO,Norway,SCANDINAVIA,EUROPE EXC UK,NB,0.0,20.0,0.0,150.0,2019
3,2019-09-23348,BA,348,2019-09-23,LHR,NCE,M,M,,EMER,Tier 2,6,5,319,A4,14:40:00,FR,France,WEST EUROPE,EUROPE EXC UK,NB,0.0,16.0,0.0,119.0,2019
4,2019-09-16155,BA,155,2019-09-16,LHR,CAI,J,J,,,Tier 3,23,5,343,W9,16:15:00,EG,Egypt,NORTH AFRICA,AFRICA,WB,0.0,45.0,0.0,212.0,2019


### JAMIE

In [76]:
# Preview the first row of the merged table to see which columns are available.
df_lounge_elig_flight_info_country_acft_typ.head(1)

Unnamed: 0,Skew_Id,OPERATING_AIRLINE_CD,OPERATING_FLT_NO,GMT_UPLIFT_DT,UPLIFT_STN_CD,DISCHARGE_STN_CD,BOOKED_CABIN_CD,TRAVEL_CABIN_CD,BA_PAX_TIER,ONEWORLD_TIER,Lounge_eligibility_tier,pax,GMT_PLND_DEP_TS,GMT_ACT_DEP_TS,OPG_ALN_CD,OPG_FLT_NO,ACT_DEP_STN_CD,ACT_DEP_TML_CD,PLND_ARR_STN_CD,ACT_ARR_STN_CD,IATA_AC_TYP_CD,ACT_AC_TYP_CD,ROUTE_x,GMT_PLND_DEP_DT,GMT_PLND_DEP_TIME,ROUTE_y,COUNTRY_CD,COUNTRY_NM,CORP_GEOG_CTRY_GRP_NM,CORP_GEOG_CONTINENT_NM,DEP_STN_CD,ARR_STN_CD,WB_NB_CAT,FIRST_SEATS_QTY,CLUB_SEATS_QTY,PREM_ECONOMY_SEATS_QTY,ECONOMY_SEATS_QTY,GMT_UPLIFT_YR
0,2023-08-05618,BA,618,2023-08-05,LHR,OLB,C,C,Gold For Life,EMER,Tier 2,2,2023-08-05 14:25:00,2023-08-05 14:57:00,BA,618.0,LHR,3,OLB,OLB,319,M4,LHROLB,2023-08-05,14:25:00,LHROLB,IT,Italy,MEDITERRANEAN,EUROPE EXC UK,LHR,OLB,NB,0.0,21.0,0.0,113.0,2023


In [77]:
# Flight-level pivot table: one row per flight (every remaining df_final column
# acts as the row key) and one passenger-count column per lounge eligibility tier.
# GMT_UPLIFT_YR is already part of df_final, so it is carried through as a key.

df_final_col_list = list(df_final.columns)

# Passenger-level attributes that must not be part of the row key.
pivot_table_drop = ['BA_PAX_TIER', 'ONEWORLD_TIER', 'Lounge_eligibility_tier',
                    'pax', 'BOOKED_CABIN_CD', 'TRAVEL_CABIN_CD']

pivot_df_final = (
    df_final
    .pivot_table(
        index=df_final.columns.drop(pivot_table_drop).tolist(),
        columns='Lounge_eligibility_tier',
        values='pax',
        aggfunc='sum',
        fill_value=0,
    )
    .reset_index()
)

# Total passengers per flight is the sum of the four tier columns.
pivot_df_final['TOTAL_PAX'] = pivot_df_final[['Not eligible', 'Tier 1', 'Tier 2', 'Tier 3']].sum(axis=1)
pivot_df_final.head(5)

Lounge_eligibility_tier,Skew_Id,OPERATING_AIRLINE_CD,OPERATING_FLT_NO,GMT_UPLIFT_DT,UPLIFT_STN_CD,DISCHARGE_STN_CD,ACT_DEP_TML_CD,IATA_AC_TYP_CD,ACT_AC_TYP_CD,GMT_PLND_DEP_TIME,COUNTRY_CD,COUNTRY_NM,CORP_GEOG_CTRY_GRP_NM,CORP_GEOG_CONTINENT_NM,WB_NB_CAT,FIRST_SEATS_QTY,CLUB_SEATS_QTY,PREM_ECONOMY_SEATS_QTY,ECONOMY_SEATS_QTY,GMT_UPLIFT_YR,Not eligible,Tier 1,Tier 2,Tier 3,TOTAL_PAX
0,2019-03-31103,BA,103,2019-03-31,LHR,YYC,5,788,B8,17:30:00,CA,Canada,CANADA,NORTH AMERICA,WB,0.0,35.0,25.0,154.0,2019,154,1,9,33,197
1,2019-03-31105,BA,105,2019-03-31,LHR,DXB,5,777,V7,21:25:00,AE,United Arab Emirates,SOUTHERN GULF,MIDDLE EAST,WB,14.0,48.0,40.0,124.0,2019,137,13,13,46,209
2,2019-03-31107,BA,107,2019-03-31,LHR,DXB,5,777,V7,11:50:00,AE,United Arab Emirates,SOUTHERN GULF,MIDDLE EAST,WB,14.0,48.0,40.0,124.0,2019,126,11,18,46,201
3,2019-03-31109,BA,109,2019-03-31,LHR,DXB,5,777,V7,20:15:00,AE,United Arab Emirates,SOUTHERN GULF,MIDDLE EAST,WB,14.0,48.0,40.0,124.0,2019,139,14,22,42,217
4,2019-03-3111,BA,11,2019-03-31,LHR,SIN,5,388,A8,18:55:00,SG,Singapore,FAR EAST JSA HUBS,AUSTRALASIA PACIFIC,WB,14.0,97.0,55.0,303.0,2019,308,17,43,75,443




**Create new column 'Status' based on BA and Oneworld Status.**

In [78]:
df_final_status_cols_list = list(df_final.columns)

# Work on a copy of df_final with a placeholder STATUS column;
# the real status values are filled in by the next cell.
df_final_status = df_final.assign(STATUS='NA')

In [79]:
# BA tiers that are treated as Oneworld Emerald in this analysis.
ba_emerald_list = ['Gold', 'Gold For Life', 'Gold Guest List', 'Gold Guest List For Life', 'Premier']

ba_tier = df_final_status['BA_PAX_TIER']
ow_tier = df_final_status['ONEWORLD_TIER']

# Row selectors for each status level, plus the rows whose BA tier is blank.
# All of them are computed before anything is written back to the frame.
no_status_rows = ba_tier.isin(['None', '']) | ow_tier.eq('')
sapphire_rows = ba_tier.eq('Silver') | ow_tier.eq('SAPP')
emerald_rows = ba_tier.isin(ba_emerald_list) | ow_tier.eq('EMER')
blank_ba_tier_rows = ba_tier.eq('')

# Apply from lowest to highest status: later assignments overwrite earlier
# ones, so the highest reported status prevails.
for status_label, status_rows in (
    ('None', no_status_rows),
    ('Sapphire', sapphire_rows),
    ('Emerald', emerald_rows),
):
    df_final_status.loc[status_rows, 'STATUS'] = status_label

# In the BA tier column itself, a blank should read 'None'.
df_final_status.loc[blank_ba_tier_rows, 'BA_PAX_TIER'] = 'None'

In [80]:
# Preview the first five rows, including the new STATUS column.
df_final_status.head(5)

Unnamed: 0,Skew_Id,OPERATING_AIRLINE_CD,OPERATING_FLT_NO,GMT_UPLIFT_DT,UPLIFT_STN_CD,DISCHARGE_STN_CD,BOOKED_CABIN_CD,TRAVEL_CABIN_CD,BA_PAX_TIER,ONEWORLD_TIER,Lounge_eligibility_tier,pax,ACT_DEP_TML_CD,IATA_AC_TYP_CD,ACT_AC_TYP_CD,GMT_PLND_DEP_TIME,COUNTRY_CD,COUNTRY_NM,CORP_GEOG_CTRY_GRP_NM,CORP_GEOG_CONTINENT_NM,WB_NB_CAT,FIRST_SEATS_QTY,CLUB_SEATS_QTY,PREM_ECONOMY_SEATS_QTY,ECONOMY_SEATS_QTY,GMT_UPLIFT_YR,STATUS
0,2023-08-05618,BA,618,2023-08-05,LHR,OLB,C,C,Gold For Life,EMER,Tier 2,2,3,319,M4,14:25:00,IT,Italy,MEDITERRANEAN,EUROPE EXC UK,NB,0.0,21.0,0.0,113.0,2023,Emerald
1,2023-05-17920,BA,920,2023-05-17,LHR,STR,C,C,,,Tier 3,13,3,319,M4,16:35:00,DE,Germany,WEST EUROPE,EUROPE EXC UK,NB,0.0,21.0,0.0,113.0,2023,
2,2019-04-15770,BA,770,2019-04-15,LHR,OSL,M,M,,,Not eligible,75,5,32A,H3,18:25:00,NO,Norway,SCANDINAVIA,EUROPE EXC UK,NB,0.0,20.0,0.0,150.0,2019,
3,2019-09-23348,BA,348,2019-09-23,LHR,NCE,M,M,,EMER,Tier 2,6,5,319,A4,14:40:00,FR,France,WEST EUROPE,EUROPE EXC UK,NB,0.0,16.0,0.0,119.0,2019,Emerald
4,2019-09-16155,BA,155,2019-09-16,LHR,CAI,J,J,,,Tier 3,23,5,343,W9,16:15:00,EG,Egypt,NORTH AFRICA,AFRICA,WB,0.0,45.0,0.0,212.0,2019,


In [81]:
df_final_col_list = list(df_final.columns)

# Passenger-level attributes (and STATUS itself) that must not be part of the row key.
pivot_table_drop = ['BA_PAX_TIER', 'ONEWORLD_TIER', 'Lounge_eligibility_tier',
                    'pax', 'BOOKED_CABIN_CD', 'TRAVEL_CABIN_CD', 'STATUS']

# Passengers per flight, with one column per derived STATUS value.
# The 'None' column is renamed to 'None_OW' so it stays distinct from the
# BA-tier 'None' column once the two pivots are combined.
pivot_df_final_status_OW = (
    df_final_status
    .pivot_table(
        index=df_final_status.columns.drop(pivot_table_drop).tolist(),
        columns='STATUS',
        values='pax',
        aggfunc='sum',
        fill_value=0,
    )
    .reset_index()
    .rename(columns={'None': 'None_OW'})
)
pivot_df_final_status_OW.head(5)

STATUS,Skew_Id,OPERATING_AIRLINE_CD,OPERATING_FLT_NO,GMT_UPLIFT_DT,UPLIFT_STN_CD,DISCHARGE_STN_CD,ACT_DEP_TML_CD,IATA_AC_TYP_CD,ACT_AC_TYP_CD,GMT_PLND_DEP_TIME,COUNTRY_CD,COUNTRY_NM,CORP_GEOG_CTRY_GRP_NM,CORP_GEOG_CONTINENT_NM,WB_NB_CAT,FIRST_SEATS_QTY,CLUB_SEATS_QTY,PREM_ECONOMY_SEATS_QTY,ECONOMY_SEATS_QTY,GMT_UPLIFT_YR,Emerald,None_OW,Sapphire
0,2019-03-31103,BA,103,2019-03-31,LHR,YYC,5,788,B8,17:30:00,CA,Canada,CANADA,NORTH AMERICA,WB,0.0,35.0,25.0,154.0,2019,10,173,14
1,2019-03-31105,BA,105,2019-03-31,LHR,DXB,5,777,V7,21:25:00,AE,United Arab Emirates,SOUTHERN GULF,MIDDLE EAST,WB,14.0,48.0,40.0,124.0,2019,22,153,34
2,2019-03-31107,BA,107,2019-03-31,LHR,DXB,5,777,V7,11:50:00,AE,United Arab Emirates,SOUTHERN GULF,MIDDLE EAST,WB,14.0,48.0,40.0,124.0,2019,25,147,29
3,2019-03-31109,BA,109,2019-03-31,LHR,DXB,5,777,V7,20:15:00,AE,United Arab Emirates,SOUTHERN GULF,MIDDLE EAST,WB,14.0,48.0,40.0,124.0,2019,32,153,32
4,2019-03-3111,BA,11,2019-03-31,LHR,SIN,5,388,A8,18:55:00,SG,Singapore,FAR EAST JSA HUBS,AUSTRALASIA PACIFIC,WB,14.0,97.0,55.0,303.0,2019,51,351,41


In [82]:
df_final_col_list = list(df_final.columns)

# Passenger-level attributes (and STATUS) that must not be part of the row key.
pivot_table_drop = ['BA_PAX_TIER', 'ONEWORLD_TIER', 'Lounge_eligibility_tier',
                    'pax', 'BOOKED_CABIN_CD', 'TRAVEL_CABIN_CD', 'STATUS']

# Passengers per flight, with one column per BA tier.
pivot_df_final_status = (
    df_final_status
    .pivot_table(
        index=df_final_status.columns.drop(pivot_table_drop).tolist(),
        columns=['BA_PAX_TIER'],
        values='pax',
        aggfunc='sum',
        fill_value=0,
    )
    .reset_index()
)
pivot_df_final_status.head(5)

BA_PAX_TIER,Skew_Id,OPERATING_AIRLINE_CD,OPERATING_FLT_NO,GMT_UPLIFT_DT,UPLIFT_STN_CD,DISCHARGE_STN_CD,ACT_DEP_TML_CD,IATA_AC_TYP_CD,ACT_AC_TYP_CD,GMT_PLND_DEP_TIME,COUNTRY_CD,COUNTRY_NM,CORP_GEOG_CTRY_GRP_NM,CORP_GEOG_CONTINENT_NM,WB_NB_CAT,FIRST_SEATS_QTY,CLUB_SEATS_QTY,PREM_ECONOMY_SEATS_QTY,ECONOMY_SEATS_QTY,GMT_UPLIFT_YR,Gold,Gold For Life,Gold Guest List,Gold Guest List For Life,None,Premier,Silver
0,2019-03-31103,BA,103,2019-03-31,LHR,YYC,5,788,B8,17:30:00,CA,Canada,CANADA,NORTH AMERICA,WB,0.0,35.0,25.0,154.0,2019,5,3,1,0,180,0,8
1,2019-03-31105,BA,105,2019-03-31,LHR,DXB,5,777,V7,21:25:00,AE,United Arab Emirates,SOUTHERN GULF,MIDDLE EAST,WB,14.0,48.0,40.0,124.0,2019,17,0,4,0,156,0,32
2,2019-03-31107,BA,107,2019-03-31,LHR,DXB,5,777,V7,11:50:00,AE,United Arab Emirates,SOUTHERN GULF,MIDDLE EAST,WB,14.0,48.0,40.0,124.0,2019,18,1,2,0,157,0,23
3,2019-03-31109,BA,109,2019-03-31,LHR,DXB,5,777,V7,20:15:00,AE,United Arab Emirates,SOUTHERN GULF,MIDDLE EAST,WB,14.0,48.0,40.0,124.0,2019,28,0,1,0,163,0,25
4,2019-03-3111,BA,11,2019-03-31,LHR,SIN,5,388,A8,18:55:00,SG,Singapore,FAR EAST JSA HUBS,AUSTRALASIA PACIFIC,WB,14.0,97.0,55.0,303.0,2019,29,5,6,0,368,0,35


In [83]:
# FINAL OUTPUT
# NOTE: The values are not cumulative, i.e. the OneWorld and BA Tiers must be treated separately.
# OneWorld Tiers are inclusive of BA tiers.
#
# The join key is Skew_Id alone, so it only works if every Skew_Id appears exactly once in each
# pivot. validate='one_to_one' makes pandas raise a MergeError if that ever stops being true,
# instead of silently duplicating flight rows.
#
# This cell overwrites pivot_df_final_status, so it is not safe to re-run on its own: a second
# run would merge the OneWorld columns again and pandas would suffix them (_x / _y).
# Re-run the BA-tier pivot cell first.
pivot_df_final_status = pivot_df_final_status.merge(
    pivot_df_final_status_OW[['Skew_Id', 'Emerald', 'None_OW', 'Sapphire']],
    on='Skew_Id',
    how='left',
    validate='one_to_one',
)
pivot_df_final_status.head(5)

Unnamed: 0,Skew_Id,OPERATING_AIRLINE_CD,OPERATING_FLT_NO,GMT_UPLIFT_DT,UPLIFT_STN_CD,DISCHARGE_STN_CD,ACT_DEP_TML_CD,IATA_AC_TYP_CD,ACT_AC_TYP_CD,GMT_PLND_DEP_TIME,COUNTRY_CD,COUNTRY_NM,CORP_GEOG_CTRY_GRP_NM,CORP_GEOG_CONTINENT_NM,WB_NB_CAT,FIRST_SEATS_QTY,CLUB_SEATS_QTY,PREM_ECONOMY_SEATS_QTY,ECONOMY_SEATS_QTY,GMT_UPLIFT_YR,Gold,Gold For Life,Gold Guest List,Gold Guest List For Life,None,Premier,Silver,Emerald,None_OW,Sapphire
0,2019-03-31103,BA,103,2019-03-31,LHR,YYC,5,788,B8,17:30:00,CA,Canada,CANADA,NORTH AMERICA,WB,0.0,35.0,25.0,154.0,2019,5,3,1,0,180,0,8,10,173,14
1,2019-03-31105,BA,105,2019-03-31,LHR,DXB,5,777,V7,21:25:00,AE,United Arab Emirates,SOUTHERN GULF,MIDDLE EAST,WB,14.0,48.0,40.0,124.0,2019,17,0,4,0,156,0,32,22,153,34
2,2019-03-31107,BA,107,2019-03-31,LHR,DXB,5,777,V7,11:50:00,AE,United Arab Emirates,SOUTHERN GULF,MIDDLE EAST,WB,14.0,48.0,40.0,124.0,2019,18,1,2,0,157,0,23,25,147,29
3,2019-03-31109,BA,109,2019-03-31,LHR,DXB,5,777,V7,20:15:00,AE,United Arab Emirates,SOUTHERN GULF,MIDDLE EAST,WB,14.0,48.0,40.0,124.0,2019,28,0,1,0,163,0,25,32,153,32
4,2019-03-3111,BA,11,2019-03-31,LHR,SIN,5,388,A8,18:55:00,SG,Singapore,FAR EAST JSA HUBS,AUSTRALASIA PACIFIC,WB,14.0,97.0,55.0,303.0,2019,29,5,6,0,368,0,35,51,351,41


In [84]:
# Top 20 rows by the 'Premier' passenger count, highest first.
pivot_df_final_status.sort_values(by='Premier',ascending=False).head(20)

Unnamed: 0,Skew_Id,OPERATING_AIRLINE_CD,OPERATING_FLT_NO,GMT_UPLIFT_DT,UPLIFT_STN_CD,DISCHARGE_STN_CD,ACT_DEP_TML_CD,IATA_AC_TYP_CD,ACT_AC_TYP_CD,GMT_PLND_DEP_TIME,COUNTRY_CD,COUNTRY_NM,CORP_GEOG_CTRY_GRP_NM,CORP_GEOG_CONTINENT_NM,WB_NB_CAT,FIRST_SEATS_QTY,CLUB_SEATS_QTY,PREM_ECONOMY_SEATS_QTY,ECONOMY_SEATS_QTY,GMT_UPLIFT_YR,Gold,Gold For Life,Gold Guest List,Gold Guest List For Life,None,Premier,Silver,Emerald,None_OW,Sapphire
130473,2023-10-11668,BA,668,2023-10-11,LHR,RAK,5,320,R3,09:15:00,MA,Morocco,NORTH AFRICA,AFRICA,NB,0.0,28.0,0.0,139.0,2023,35,12,11,0,62,7,22,72,50,27
56350,2019-09-16117,BA,117,2019-09-16,LHR,JFK,5,744,S4,07:25:00,US,United States,USA,NORTH AMERICA,WB,14.0,86.0,30.0,145.0,2019,44,3,14,1,165,5,28,71,157,32
17186,2019-05-21117,BA,117,2019-05-21,LHR,JFK,5,744,S4,07:25:00,US,United States,USA,NORTH AMERICA,WB,14.0,86.0,30.0,145.0,2019,17,4,8,0,170,5,24,41,161,26
7857,2019-04-23173,BA,173,2019-04-23,LHR,JFK,5,77W,G7,10:20:00,US,United States,USA,NORTH AMERICA,WB,14.0,56.0,44.0,183.0,2019,28,9,4,0,184,5,33,49,181,33
65210,2019-10-13203,BA,203,2019-10-13,LHR,BOS,5,777,P7,16:15:00,US,United States,USA,NORTH AMERICA,WB,17.0,48.0,24.0,127.0,2019,12,1,1,0,166,5,28,21,151,41
2752,2019-04-08179,BA,179,2019-04-08,LHR,JFK,5,777,V7,17:05:00,US,United States,USA,NORTH AMERICA,WB,14.0,48.0,40.0,124.0,2019,15,1,9,0,141,5,24,33,133,29
62872,2019-10-06183,BA,183,2019-10-06,LHR,JFK,5,744,S4,18:50:00,US,United States,USA,NORTH AMERICA,WB,14.0,86.0,30.0,145.0,2019,29,8,10,1,187,5,29,62,162,45
12491,2019-05-07117,BA,117,2019-05-07,LHR,JFK,5,744,S4,07:25:00,US,United States,USA,NORTH AMERICA,WB,14.0,86.0,30.0,145.0,2019,42,3,9,0,142,5,26,62,139,26
56078,2019-09-15179,BA,179,2019-09-15,LHR,JFK,5,77W,G7,17:05:00,US,United States,USA,NORTH AMERICA,WB,14.0,56.0,44.0,183.0,2019,24,5,3,1,194,5,36,44,184,40
52394,2019-09-02179,BA,179,2019-09-02,LHR,JFK,5,777,W7,17:05:00,US,United States,USA,NORTH AMERICA,WB,12.0,48.0,32.0,127.0,2019,12,3,9,0,170,5,10,33,157,19


In [85]:
# Combine the status counts with the lounge-eligibility counts (Not eligible,
# Tier 1/2/3, TOTAL_PAX) per flight, ready for export.
pivot_df_final_merged = pd.merge(
    pivot_df_final_status,
    pivot_df_final[['GMT_UPLIFT_DT', 'Skew_Id', 'GMT_PLND_DEP_TIME',
                    'Not eligible', 'Tier 1', 'Tier 2', 'Tier 3', 'TOTAL_PAX']],
    how='left',
    on=['Skew_Id', 'GMT_PLND_DEP_TIME', 'GMT_UPLIFT_DT'],
)

In [86]:
# Sanity check that the merges did not add or lose rows: the merged table and
# both status pivots should report the same row count (should all be 130976).
for checked_frame in (pivot_df_final_merged, pivot_df_final_status, pivot_df_final_status_OW):
    print(checked_frame.shape)

# Compare with the number of distinct Skew_Id values in the original dataset,
# then count the original rows whose Skew_Id never made it into the merged table.
# NOTE(review): if these differ, some flights are being dropped upstream — worth investigating.
print(df_lounge_eligibility['Skew_Id'].nunique())
merged_skew_ids = pivot_df_final_merged['Skew_Id'].unique()
df_lounge_eligibility[~df_lounge_eligibility['Skew_Id'].isin(merged_skew_ids)].sort_values(by='Skew_Id').shape


(130976, 35)
(130976, 30)
(130976, 23)
131362


(1606, 12)

In [88]:
#pivot_df_final_merged.to_csv('../../scragg_pivot_df_final_merged.csv')

--

--

-

-

-

-

-

-


### Linear Regression

In [78]:
# import numpy as np
# from sklearn import datasets, linear_model
# import matplotlib.pyplot as plt

In [79]:
# headers_and_first_row(df_lounge_elig_flight_info_country_acft_typ)

Separate the large tables to different tiers

In [80]:
# df_merged_T1 = df_lounge_elig_flight_info_country_acft_typ[df_lounge_elig_flight_info_country_acft_typ['Lounge_eligibility_tier'] == 'Tier 1'].reset_index()
# df_merged_T2 = df_lounge_elig_flight_info_country_acft_typ[df_lounge_elig_flight_info_country_acft_typ['Lounge_eligibility_tier'] == 'Tier 2'].reset_index()
# df_merged_T3 = df_lounge_elig_flight_info_country_acft_typ[df_lounge_elig_flight_info_country_acft_typ['Lounge_eligibility_tier'] == 'Tier 3'].reset_index()

# df_merged_T1 = df_merged_T1.rename(columns={'pax':'Tier_1_pax'})
# df_merged_T2 = df_merged_T2.rename(columns={'pax':'Tier_2_pax'})
# df_merged_T3 = df_merged_T3.rename(columns={'pax':'Tier_3_pax'})

In [81]:
# df_merged_T3.columns
# headers_and_first_row(df_merged_T1)


In [82]:
# y = np.array([1,2,3])
# x = np.array([-1,0,1])

# # y = y.to_numpy()
# # x = x.to_numpy().reshape(-1, 1)
# x = x.reshape(-1,1)

# # print(x.shape, y.shape)

# regr = linear_model.LinearRegression()
# regr.fit(x,y)

# regr.coef_, regr.intercept_, regr.rank_

### Compare by Year

In [83]:
# Grouping key: uplift year only.
list_groupby = ['GMT_UPLIFT_YR']

# Total passengers per year across all tiers, used as the denominator for the yearly percentages.
df_groupby_year_ttl = (
    df_lounge_elig_flight_info_country_acft_typ
    .groupby(list_groupby)['pax']
    .sum()
    .reset_index(name='pax_count_ttl')
)

# Show the yearly totals.
df_groupby_year_ttl

Unnamed: 0,GMT_UPLIFT_YR,pax_count_ttl
0,2019,10975714
1,2023,9737126


In [84]:
# Attach each year's total passenger count to its per-tier rows.
df_groupby_year = df_groupby_year.merge(
    df_groupby_year_ttl,
    how='left',
    on=['GMT_UPLIFT_YR'],
)

df_groupby_year

Unnamed: 0,GMT_UPLIFT_YR,Lounge_eligibility_tier,pax_count,pax_count_ttl
0,2019,Not eligible,8147159,10975714
1,2019,Tier 1,196564,10975714
2,2019,Tier 2,635765,10975714
3,2019,Tier 3,1996226,10975714
4,2023,Not eligible,7372440,9737126
5,2023,Tier 1,93555,9737126
6,2023,Tier 2,501044,9737126
7,2023,Tier 3,1770087,9737126


In [85]:
# Percentage of each year's passengers that falls in each tier.
yearly_tier_ratio = df_groupby_year['pax_count'].div(df_groupby_year['pax_count_ttl'])

# Store it as display text with one decimal place and a trailing '%'.
df_groupby_year['pax_eligible%'] = yearly_tier_ratio.mul(100).map(lambda pct: f'{pct:,.1f}%')

# The raw counts are no longer needed once the percentage exists.
df_groupby_year.drop(['pax_count', 'pax_count_ttl'], axis=1, inplace=True)

In [86]:
# Keep only the rows for the eligible tiers (Tier 1 / 2 / 3).
df_groupby_year = df_groupby_year.loc[
    lambda yearly: yearly['Lounge_eligibility_tier'] != "Not eligible"
]

df_groupby_year

Unnamed: 0,GMT_UPLIFT_YR,Lounge_eligibility_tier,pax_eligible%
1,2019,Tier 1,1.8%
2,2019,Tier 2,5.8%
3,2019,Tier 3,18.2%
5,2023,Tier 1,1.0%
6,2023,Tier 2,5.1%
7,2023,Tier 3,18.2%


Not much difference

### Compare by location and year

In [88]:
# Grouping keys: continent, country group, uplift year and lounge tier.
list_groupby = ['CORP_GEOG_CONTINENT_NM','CORP_GEOG_CTRY_GRP_NM','GMT_UPLIFT_YR','Lounge_eligibility_tier']

# Passengers per continent / country group / year / tier.
df_groupby_country_group_year = df_lounge_elig_flight_info_country_acft_typ.groupby(list_groupby).agg(
                pax_count = ('pax','sum')
).reset_index()

# Same keys without the tier: total passengers per continent / country group / year,
# used as the denominator for the percentages.
list_groupby = ['CORP_GEOG_CONTINENT_NM','CORP_GEOG_CTRY_GRP_NM','GMT_UPLIFT_YR']

df_groupby_country_group_year_ttl = df_lounge_elig_flight_info_country_acft_typ.groupby(list_groupby).agg(
                pax_count_ttl = ('pax','sum')
).reset_index()

# Attach the totals to every tier row.
df_groupby_country_group_year = pd.merge(df_groupby_country_group_year,
                                         df_groupby_country_group_year_ttl,
                                         on = list_groupby,
                                         how = "left")

# Keep only the eligible tiers. The .copy() matters: a column is added just below, and
# writing to a filtered slice of the merged frame triggers pandas' SettingWithCopyWarning.
df_groupby_country_group_year = df_groupby_country_group_year[
    df_groupby_country_group_year['Lounge_eligibility_tier'] != 'Not eligible'
].copy()

# Percentage of passengers eligible for each tier.
df_groupby_country_group_year['pax_eligible%'] = (df_groupby_country_group_year['pax_count'] / df_groupby_country_group_year['pax_count_ttl'] )*100 

# Format the percentage as text with one decimal place and a trailing '%'.
df_groupby_country_group_year['pax_eligible%'] = df_groupby_country_group_year['pax_eligible%'].map('{:,.1f}%'.format)

# The raw counts are no longer needed.
df_groupby_country_group_year.drop(columns=['pax_count','pax_count_ttl'],inplace = True)

df_groupby_country_group_year.head(6)


Unnamed: 0,CORP_GEOG_CONTINENT_NM,CORP_GEOG_CTRY_GRP_NM,GMT_UPLIFT_YR,Lounge_eligibility_tier,pax_eligible%
1,AFRICA,EAST AND CENTRAL AFRICA,2019,Tier 1,3.7%
2,AFRICA,EAST AND CENTRAL AFRICA,2019,Tier 2,4.0%
3,AFRICA,EAST AND CENTRAL AFRICA,2019,Tier 3,17.6%
5,AFRICA,EAST AND CENTRAL AFRICA,2023,Tier 1,0.2%
6,AFRICA,EAST AND CENTRAL AFRICA,2023,Tier 2,4.3%
7,AFRICA,EAST AND CENTRAL AFRICA,2023,Tier 3,16.0%


In [89]:
# Pivot the tier level into columns: rows are continent / country group / year.
df_groupby_country_group_year = (
    df_groupby_country_group_year
    .set_index(['CORP_GEOG_CONTINENT_NM', 'CORP_GEOG_CTRY_GRP_NM',
                'GMT_UPLIFT_YR', 'Lounge_eligibility_tier'])
    .unstack(level='Lounge_eligibility_tier')
)

df_groupby_country_group_year

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,pax_eligible%,pax_eligible%,pax_eligible%
Unnamed: 0_level_1,Unnamed: 1_level_1,Lounge_eligibility_tier,Tier 1,Tier 2,Tier 3
CORP_GEOG_CONTINENT_NM,CORP_GEOG_CTRY_GRP_NM,GMT_UPLIFT_YR,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
AFRICA,EAST AND CENTRAL AFRICA,2019,3.7%,4.0%,17.6%
AFRICA,EAST AND CENTRAL AFRICA,2023,0.2%,4.3%,16.0%
AFRICA,NORTH AFRICA,2019,0.5%,4.9%,20.9%
AFRICA,NORTH AFRICA,2023,0.3%,3.6%,19.4%
AFRICA,SOUTHERN AFRICA,2019,3.8%,5.3%,20.2%
AFRICA,SOUTHERN AFRICA,2023,2.4%,4.4%,18.7%
AFRICA,WEST AFRICA,2019,3.7%,4.5%,16.0%
AFRICA,WEST AFRICA,2023,2.3%,4.5%,17.2%
ASIA FAR EAST,CHINA,2019,4.6%,4.2%,18.8%
ASIA FAR EAST,CHINA,2023,0.2%,2.6%,19.4%


### By continent

In [90]:
# Grouping keys: continent, uplift year and lounge tier.
list_groupby = ['CORP_GEOG_CONTINENT_NM','GMT_UPLIFT_YR','Lounge_eligibility_tier']

# Passengers per continent / year / tier.
df_groupby_cont_year = df_lounge_elig_flight_info_country_acft_typ.groupby(list_groupby).agg(
                pax_count = ('pax','sum')
).reset_index()

# Same keys without the tier: total passengers per continent / year,
# used as the denominator for the percentages.
list_groupby = ['CORP_GEOG_CONTINENT_NM','GMT_UPLIFT_YR']

df_groupby_cont_year_ttl = df_lounge_elig_flight_info_country_acft_typ.groupby(list_groupby).agg(
                pax_count_ttl = ('pax','sum')
).reset_index()

# Attach the totals to every tier row.
df_groupby_cont_year = pd.merge(df_groupby_cont_year,
                                df_groupby_cont_year_ttl,
                                on = list_groupby,
                                how = "left")

# Keep only the eligible tiers. The .copy() matters: a column is added just below, and
# writing to a filtered slice of the merged frame triggers pandas' SettingWithCopyWarning.
df_groupby_cont_year = df_groupby_cont_year[
    df_groupby_cont_year['Lounge_eligibility_tier'] != 'Not eligible'
].copy()

# Percentage of passengers eligible for each tier.
df_groupby_cont_year['pax_eligible%'] = (df_groupby_cont_year['pax_count'] / df_groupby_cont_year['pax_count_ttl'] )*100 

# Format the percentage as text with one decimal place and a trailing '%'.
df_groupby_cont_year['pax_eligible%'] = df_groupby_cont_year['pax_eligible%'].map('{:,.1f}%'.format)

# The raw counts are no longer needed.
df_groupby_cont_year.drop(columns=['pax_count','pax_count_ttl'],inplace = True)

df_groupby_cont_year.head(6)

Unnamed: 0,CORP_GEOG_CONTINENT_NM,GMT_UPLIFT_YR,Lounge_eligibility_tier,pax_eligible%
1,AFRICA,2019,Tier 1,3.5%
2,AFRICA,2019,Tier 2,4.8%
3,AFRICA,2019,Tier 3,18.5%
5,AFRICA,2023,Tier 1,1.7%
6,AFRICA,2023,Tier 2,4.3%
7,AFRICA,2023,Tier 3,18.2%


In [91]:
# Pivot the tier level into columns: rows are continent / year.
df_groupby_cont_year = (
    df_groupby_cont_year
    .set_index(['CORP_GEOG_CONTINENT_NM', 'GMT_UPLIFT_YR', 'Lounge_eligibility_tier'])
    .unstack(level='Lounge_eligibility_tier')
)

df_groupby_cont_year

Unnamed: 0_level_0,Unnamed: 1_level_0,pax_eligible%,pax_eligible%,pax_eligible%
Unnamed: 0_level_1,Lounge_eligibility_tier,Tier 1,Tier 2,Tier 3
CORP_GEOG_CONTINENT_NM,GMT_UPLIFT_YR,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
AFRICA,2019,3.5%,4.8%,18.5%
AFRICA,2023,1.7%,4.3%,18.2%
ASIA FAR EAST,2019,3.6%,4.8%,18.3%
ASIA FAR EAST,2023,1.0%,3.5%,18.3%
ASIA SOUTH EAST AND CENTRAL,2019,2.8%,4.0%,19.1%
ASIA SOUTH EAST AND CENTRAL,2023,0.4%,3.0%,17.0%
AUSTRALASIA PACIFIC,2019,2.3%,5.6%,18.3%
AUSTRALASIA PACIFIC,2023,3.4%,7.4%,26.3%
EUROPE EXC UK,2019,0.6%,6.4%,17.1%
EUROPE EXC UK,2023,0.4%,5.5%,18.1%


### By aircraft

In [92]:
# Grouping keys: aircraft type codes, the four cabin seat counts, and the lounge tier.
list_groupby = ['IATA_AC_TYP_CD','ACT_AC_TYP_CD','FIRST_SEATS_QTY','CLUB_SEATS_QTY','PREM_ECONOMY_SEATS_QTY','ECONOMY_SEATS_QTY','Lounge_eligibility_tier']

# Mean of `pax` over the source rows in each aircraft configuration / tier group.
# NOTE(review): a source row looks like one passenger group on a flight rather than a whole
# flight, so this is presumably not the mean per flight — confirm this is the intended metric.
df_groupby_acft = df_lounge_elig_flight_info_country_acft_typ.groupby(list_groupby).agg(
                mean_pax_count = ('pax','mean')).reset_index()

# Keep only the eligible tiers.
df_groupby_acft = df_groupby_acft[df_groupby_acft['Lounge_eligibility_tier'] != 'Not eligible']

# Pivot the tier level into columns while the values are still numeric.
df_groupby_acft = df_groupby_acft.set_index(list_groupby,drop = True).unstack('Lounge_eligibility_tier')

# Configurations with no rows for a tier come out of unstack as NaN; treat them as 0.
df_groupby_acft = df_groupby_acft.fillna(0)

# Format last, so every cell (including the filled zeros) is text with one decimal place.
# Formatting before the unstack left the table with a mix of strings and integer 0s.
df_groupby_acft = df_groupby_acft.apply(lambda tier_column: tier_column.map('{:,.1f}'.format))

df_groupby_acft


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,mean_pax_count,mean_pax_count,mean_pax_count
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Lounge_eligibility_tier,Tier 1,Tier 2,Tier 3
IATA_AC_TYP_CD,ACT_AC_TYP_CD,FIRST_SEATS_QTY,CLUB_SEATS_QTY,PREM_ECONOMY_SEATS_QTY,ECONOMY_SEATS_QTY,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
319,A4,0.0,16.0,0.0,119.0,1.2,2.1,4.6
319,M4,0.0,21.0,0.0,113.0,1.1,1.8,4.3
320,A3,0.0,24.0,0.0,132.0,1.2,2.2,5.0
320,E9,0.0,29.0,0.0,130.0,1.2,2.0,5.7
320,M2,0.0,24.0,0.0,144.0,1.1,1.9,4.9
320,R3,0.0,28.0,0.0,139.0,1.2,2.1,5.4
320,T3,0.0,26.0,0.0,142.0,1.2,1.9,5.3
321,M6,0.0,23.0,0.0,131.0,1.3,2.0,3.9
321,V6,0.0,38.0,0.0,160.0,1.2,2.5,5.9
321,W6,0.0,42.0,0.0,154.0,1.3,2.4,5.8


<br><br><br>

In [93]:
#--------------------------------
#
# Your turn!!!
#
#--------------------------------

<u>What level of granularity do you use?</u>
- ... (your answer here)

<u>What metric do you use to come up with Lounge eligibility profiles?</u>
- ... (your answer here)

In [None]:
# your code here

<br>

Please save your final lookup table below in the form of a pandas dataframe. It must contain the categories you have come up with as rows, and the Tier 1, Tier 2, and Tier 3 percentage of customers as columns.