# Notebook objectives:
Create a file that can be used for:
- Comparing yearly, weekly ins attachment rate and RPF
- Comparing yearly, monthly ins attachment rate and RPF

# Change log

- 2019/01/14 - NM - Included a column that indicates if customers' residency is NL or not
- 2019/01/14 - NM - Renamed cancelled orders Phase 1&2 variable 
- 2019/08/07 - NM - Rename column Travix_booking to distributor_purchase so that it can be more generic
- 2019/08/07 - NM - Changed BI_drive network export location as Travix different channels files were moved to one folder 
- 2019/08/12 - NM - Removing duplicated rows in Travix data as a result of insurance purchasers column indicating an additional row with admin cost of 0  and insurance premium cost as the sum of premium and admin cost
- 2019/08/27 - MW - Update BI server IP address from 192.168.140.69 to 192.168.8.142 
- 2019/09/18 -NM - Change location from which the data is imported to local computer while waiting for BI drive to be     updated
- 2019/09/18 - NM - Update BI server IP address from 192.168.0.112 to 192.168.140.130
- 2019/09/20 -NM - Changed the code for exporting Travix data such that it merge all Travix datadump gz folders in Hepstar files into one


# Preparation

## Import packages and data

In [1]:
import numpy as np
import pandas as pd
from datetime import date
import glob
import errno
import os

# Data location Nicola
input_local = r'C:\\Users\\dataintern.HEPSTAR\\Documents\\Data Intern NM\\# Travix\\# YoY\\# Input\\'
export_local = r'C:\\Users\\dataintern.HEPSTAR\\Documents\\Data Intern NM\\\# Travix\\# YoY\\# Output\\'

In [2]:
# Set working directory to export location, for consequent data exports
# Name local external folder location
export_folder_name = os.getcwd() + '\\data'

# Create export folder based on specified name. Skip if already exists
try:
    os.mkdir(export_folder_name)
except OSError as exc:
    if exc.errno != errno.EEXIST:
        raise
    pass

In [3]:
# Set working directory for interim location, for consequent data inputs
# Name local input folder location
interim_folder_name = os.getcwd() + '\\data\\interim'

# Create input folder based on specified name. Skip if already exists
try:
    os.mkdir(interim_folder_name)
except OSError as exc:
    if exc.errno != errno.EEXIST:
        raise
    pass

In [4]:
# Set working directory to export location, for consequent data exports
# Name local external folder location
export_folder_name = os.getcwd() + '\\data\\external'
## Remove # before folder name

# Create export folder based on specified name. Skip if already exists
try:
    os.mkdir(export_folder_name)
except OSError as exc:
    if exc.errno != errno.EEXIST:
        raise
    pass

In [4]:
# Network locations for data import & export
input_TravixDatadump_BI = r'\\192.168.140.37\Business-Share\bi-report\\Data analysis resources\\Travix\\DatadumpTravix\\'
export_BI_drive= r'\\192.168.140.37\\Business-Share\\BI resources\\Dashboards\\Travix\\# Travix NL\\# Data\\'

##Data location Michiel
input_local = r'C:\\Users\\michiel.HEPSTAR\\Documents\\Hepstar data analysis\\Travix data\\20180801 Travix y2y segmentation\\# input\\'
export_local = r'C:\\Users\\michiel.HEPSTAR\\Documents\\Hepstar data analysis\\Travix data\\20180801 Travix y2y segmentation\\# output\\'
bireport_export= r'\\hepstar-files\\Business-Share\\bi-report\Dashboards\\Travix NL\\Data\\'

In [5]:
# Import data
#travix_df = pd.read_csv(input_TravixDatadump_BI +'hepstar_export.csv.gz', compression='gzip',low_memory=False)
# Import Travix data
# Determine files to import from Hepstar files server
AllData = glob.glob(input_TravixDatadump_BI + "/*.gz")
# Merge all Travix datadumps into one
Travix_data_df = pd.concat((pd.read_csv(f,compression='gzip',low_memory=False) for f in AllData))

In [6]:
#Select only Channels of interest so that the script does not take long to run
travix_df=Travix_data_df.loc[Travix_data_df['Brand'].isin(['CHEAPTICKETSNL','BUDGETAIR','VLIEGWINKEL','BUDGETAIRCA'])]
#make a copy of the data
travix_df=travix_df.copy()

In [7]:
# Select data range starting from the time we went live
Start_date = ('2018-05-29 00:00:00')
Start_date = pd.to_datetime(Start_date)
Start_date

Timestamp('2018-05-29 00:00:00')

In [8]:
travix_df['PolicyType'].value_counts()

HEPSTAR-CANCELLATION          27206
CANCELLATION-INSURANCE        13132
Cancellation Insurance         7269
HEPSTAR-TRAVELCANCELLATION     5467
HEPSTAR-TRAVEL                 5056
TRAVEL-INSURANCE               2900
COMBINED-INSURANCE             2898
Travel Insurance               2022
COMBINED INSURANCE CANADA       702
EXTRAS                          218
reissverzekering                  1
Name: PolicyType, dtype: int64

# Data Cleaning

In [9]:
#Excluding time from the Order Date and changing orderdate to date format
travix_df['OrderDateUTC'] = pd.to_datetime(travix_df['OrderDateUTC'],infer_datetime_format=True,exact=False).dt.date
travix_df['OrderDateUTC'] = pd.to_datetime(travix_df['OrderDateUTC'])

In [10]:
#Check the maximum date in the dataset
travix_df['OrderDateUTC'].min()

Timestamp('2018-01-01 00:00:00')

In [11]:
#make a copy of the dataset
travix_base_df=travix_df.copy()

In [12]:
#Removing duplicated rows in Travix data as a result of insurance purchasers column indicating an additional row with total premium and admin cost  
#Create a column that indicates duplicates by ordernumber 
travix_base_df['Dups'] = travix_base_df.duplicated(subset=['OrderNumber'], keep=False).astype(int).astype(str)

In [13]:
# Remove duplicated insurance purchase information
travix_base_df.drop(travix_base_df[(travix_base_df['PolicyNumber'].notnull()) & (travix_base_df['InsuranceCost_AdministrativeCost']==0) & (travix_base_df['Dups']=='1')].index, inplace=True)

## Removing incomplete last day

In [14]:
# end date report
End_date = travix_base_df['OrderDateUTC'].max()
End_date

Timestamp('2019-10-28 00:00:00')

In [15]:
# Remove incomplete last day in a dataset
travix_base_df=travix_base_df[travix_base_df['OrderDateUTC']< End_date]

In [16]:
#Check maximum date after removing incomplete last date
travix_base_df['OrderDateUTC'].max()

Timestamp('2019-10-27 00:00:00')

## Selecting NL brand only

In [17]:
#Limit data to NL brand only
#travix_base_df=travix_base_df[travix_base_df['Brand'].isin(['CHEAPTICKETSNL', 'BUDGETAIR', 'VLIEGWINKEL'])]
#check brands that are now available in the dataset
#travix_base_df['Brand'].value_counts()

In [18]:
#rename Brand to Brand/Channel 
travix_base_df=travix_base_df.rename(columns={'Brand':'Brand/Channel'})

## Haul

In [19]:
#Check haul column before adding 'No insurance' category
travix_base_df['Haul'].value_counts()

Short Haul    668039
Long Haul     350131
Name: Haul, dtype: int64

In [20]:
travix_base_df.columns

Index(['Brand/Channel', 'OrderNumber', 'SessionId', 'ChannelType',
       'CustomerIdentifier', 'RepeatCustomer', 'TotalNumberOfOrders',
       'ReceivedDiscount', 'OrderDateUTC', 'DepartureDate', 'ReturnDate',
       'Duration', 'OneWayOrReturn', 'Haul', 'DomesticOrInternational',
       'Airline', 'NumberOfPassengers', 'NumberOfInfants', 'NumberOfChildren',
       'NumberOfAdults', 'BookerDateOfBirth', 'BookerCountry_Code',
       'OriginAirportCode', 'OriginCountryCode', 'DestinationAirportCode',
       'DestinationCountryCode', 'FlewEconomyClass', 'FlewFirstClass',
       'FlewBusinessClass', 'FlewPremiumEconomyClass', 'Insured', 'PolicyType',
       'PolicyName', 'InsuranceStartDate', 'InsuranceEndDate', 'Supplier',
       'PolicyNumber', 'InsuranceSalesCurrency',
       'InsuranceCost_AdministrativeCost', 'InsuranceCost_Premium',
       'InsuranceCost_InsuranceTax', 'InsuranceCost_AdditionalCoverageCost',
       'TotalInsuranceCost', 'InsuredAmount', 'TotalTicketSale_EUR',
      

In [21]:
# Creating a new column that also indicates 'No haul' if haul type is not indicated in the Haul column
#fill missing values with 'No insurance'
travix_base_df.Haul.fillna(value='No haul', inplace=True)

In [22]:
#Check haul column after adding 'No insurance' category
travix_base_df['Haul'].value_counts()

Short Haul    668039
Long Haul     350131
No haul        38999
Name: Haul, dtype: int64

## Policy Type

In [23]:
#Creating a new column that also indicates 'No insurance' if a customer didnt buy insurance
#making a copy of Policy type
travix_base_df['Policy_type']=travix_base_df['PolicyType']
#fill missing values with 'No insurance'
travix_base_df.Policy_type.fillna(value='No insurance', inplace=True)

In [24]:
#rename policy type categories to travel, cancellation and combi insurance
travix_base_df['Policy_type'].replace(to_replace=('HEPSTAR-CANCELLATION','HEPSTAR-COMBINED-INSURANCE','HEPSTAR-TRAVEL', 'HEPSTAR-TRAVELCANCELLATION', 'reissverzekering', 'Travel Insurance ', 'cancellationinsurance'), value=('Cancellation Insurance','Combi Insurance','Travel Insurance', 'Combi Insurance', 'Travel Insurance', 'Travel Insurance', 'Cancellation Insurance' ), inplace=True)
#Check
travix_base_df['Policy_type'].value_counts()

No insurance                 999857
Cancellation Insurance        33795
CANCELLATION-INSURANCE         7371
Travel Insurance               6885
Combi Insurance                5322
TRAVEL-INSURANCE               1621
COMBINED-INSURANCE             1614
COMBINED INSURANCE CANADA       699
EXTRAS                            5
Name: Policy_type, dtype: int64

## Travix booking

In [25]:
# Creating a new column that indicates 1 if a customer bought flight ticket 
travix_base_df['distributor_purchase'] = travix_base_df['CustomerIdentifier'].where (travix_base_df['CustomerIdentifier'].isnull(), 1).fillna(0).astype(int)
#check if the new column is correct
travix_base_df['CustomerIdentifier'].count() - travix_base_df['distributor_purchase'].sum()

0

In [26]:
# Creating a new column that indicates 1 if a customer bought flight ticket 
travix_base_df['distributor_purchase'] = travix_base_df['CustomerIdentifier'].where (travix_base_df['CustomerIdentifier'].isnull(), 1).fillna(0).astype(int)
#check if the new column is correct
travix_base_df['distributor_purchase'].count() - travix_base_df['distributor_purchase'].sum()

0

In [27]:
#check if every row has brand identified
travix_base_df['Brand/Channel'].count()

1057169

## Insurance Purchase

In [28]:
# Creating a new column that indicates 1 if a customer bought insurance and 0 otherwise
#setting insurance purchase to either 1 or zero
#travix_base_df['Insurance_purch'] = travix_base_df['PolicyType'].where (travix_base_df['PolicyType'].isnull(), 1).fillna(0).astype(int)
#check if the new column is correct
#travix_base_df['PolicyType'].count() - travix_base_df['Insurance_purch'].sum()

In [29]:
# Create variable confirming insurance purchase
def Insurance_purchase(travix_base_df):
    if travix_base_df['TotalInsuranceCost'] > 0:
        return 1
    else:
        return 0
travix_base_df['Insurance_purch'] = travix_base_df.apply (lambda row: Insurance_purchase (row), axis=1)

## Test product

In [30]:
travix_base_df['Policy_type'].value_counts()

No insurance                 999857
Cancellation Insurance        33795
CANCELLATION-INSURANCE         7371
Travel Insurance               6885
Combi Insurance                5322
TRAVEL-INSURANCE               1621
COMBINED-INSURANCE             1614
COMBINED INSURANCE CANADA       699
EXTRAS                            5
Name: Policy_type, dtype: int64

In [31]:
#Remove test products 
#travix_base_df.drop(travix_base_df[travix_base_df['Policy_type']=='test'].index, inplace=True)

## Year, Month Week variables

In [32]:
# create order month/year variable
travix_base_df['Order_month'] = travix_base_df['OrderDateUTC'].dt.month
travix_base_df['Order_year'] = travix_base_df['OrderDateUTC'].dt.year.astype(str)
# create  order week variable
travix_base_df['Orderweek'] = travix_base_df['OrderDateUTC'].dt.week
# create order month and order year variable
travix_base_df['YearMonth'] = travix_base_df['OrderDateUTC'].apply(lambda x:x.strftime('%Y-%m'))
#create year and week variable
travix_base_df['Yearweek'] = travix_base_df['OrderDateUTC'].dt.year.astype(str) + '-' + travix_base_df['OrderDateUTC'].dt.week.astype(str)

In [33]:
#Change the week number of the last day of the year given as week 1
def change_year_week(travix_base_df):
    if (travix_base_df['Order_month'] == 12) and (travix_base_df['Orderweek'] == 1):
        return travix_base_df['Order_year']+'-'+'52'
    else:
        return travix_base_df['Yearweek']
travix_base_df['Yearweek'] = travix_base_df.apply (lambda row: change_year_week (row),axis=1)

In [34]:
#Check max date for the date
travix_base_df['OrderDateUTC'].max()

Timestamp('2019-10-27 00:00:00')

In [35]:
#convert order_year to string (so as to be able to later join a multiindex level when creating a pivot table)
travix_base_df['Order_year']=travix_base_df['Order_year'].apply(str)

## Duplicates

In [36]:
#select data from the day we went live
Tr_live_df = travix_base_df[travix_base_df['OrderDateUTC']>='2018-05-29 00:00:00']
Tr_live_df=Tr_live_df.copy()

In [37]:
Tr_live_df.shape

(929193, 62)

In [38]:
#select data before we went live
Tr_B4_live= travix_base_df[travix_base_df['OrderDateUTC']<'2018-05-29 00:00:00']
Tr_B4_live=Tr_B4_live.copy()

In [39]:
#drop duplicates with the same ordernumber and policy type from the beggining of phase 1
Tr_live_df.drop_duplicates(['OrderNumber', 'PolicyType'],keep='first',inplace=True)
#Dups=Travix_df[Travix_df.duplicated(subset=['OrderNumber','sessionid'], keep=False)]

In [40]:
# create a list of columns which are constant per order number
constant_columns = ['OrderNumber', 'OrderDateUTC', 'Brand/Channel', 'OneWayOrReturn', 'ReceivedDiscount','DepartureDate', 'ReturnDate', 'Duration',
                    'Insured', 'Haul','SessionId', 'ChannelType', 'CustomerIdentifier', 'TotalNumberOfOrders', 'DomesticOrInternational',
                     'Airline','NumberOfPassengers','NumberOfInfants','NumberOfChildren','NumberOfAdults', 'BookerDateOfBirth','OriginAirportCode',
                   'FlewEconomyClass','FlewFirstClass','FlewBusinessClass','FlewPremiumEconomyClass','Order_year', 'Orderweek','Order_month', 
                    'PolicyNumber', 'distributor_purchase', 'Insurance_purch','InsuranceSalesCurrency', 'OrderCanceled', 'Policy_type', 'YearMonth', 'Yearweek',
                   'BookerCountry_Code','OriginCountryCode', 'DestinationAirportCode', 'DestinationCountryCode',
                    'PolicyName', 'InsuranceStartDate', 'InsuranceEndDate', 'Supplier','PolicyNumber']                            

In [41]:
#join the dataset with the constant columns
excl_dups_df =Tr_live_df[constant_columns]
# create index based on the 'OrderNumber' column
excl_dups_df.set_index('OrderNumber', inplace=True)

In [42]:
#create a new database with a list of names for columns to be summed per order number
columns_to_sum = ['InsuranceCost_AdministrativeCost',
                  'InsuranceCost_Premium', 'InsuranceCost_InsuranceTax',
                  'InsuranceCost_AdditionalCoverageCost', 'TotalInsuranceCost',
                  'InsuredAmount','TotalTicketSale_EUR']                             

In [43]:
# create a new dataframe with sums for each order number
sums_excl_df =Tr_live_df.groupby('OrderNumber')[columns_to_sum].sum()

In [44]:
Tr_live_df['PolicyType'].fillna('Nan',inplace=True)

In [45]:
# create a series with concatenated policy types to identify products sold later
policy_series = Tr_live_df.groupby('OrderNumber')['PolicyType'].apply(lambda x: 
                            ' + '.join(x.sort_values()) if x.any() else np.nan)

In [46]:
 #MERGE CONSTANT AND TRANSFORMED DATASETS
# merge the base data frame with the one containing sums
excl_dups_df = excl_dups_df.merge(sums_excl_df, left_index=True, right_index=True)
# add policy names
excl_dups_df['Policy_names'] = policy_series

In [47]:
#Reset index so as to merge the two different datasets
excl_dups_df.reset_index(inplace=True)

In [48]:
#drop duplicates by ordernumber
excl_dups_df.drop_duplicates(['OrderNumber'],keep='first',inplace=True)

In [49]:
#output_df[output_df['Policy_names']=='HEPSTAR-TRAVEL + HEPSTAR-TRAVELCANCELLATION']['OrderNumber']

In [50]:
#Merge the two dataset (before and after we went live)
Travix_data = pd.concat([excl_dups_df,Tr_B4_live], join_axes = [excl_dups_df.columns])

In [51]:
#Rename supplier name so as to have same format of the names
Travix_data['Supplier'].replace('Unigarant verzekeringen', 'Unigarant Verzekeringen',inplace=True)

## Travix revenue

In [52]:
Travix_data.columns

Index(['OrderNumber', 'OrderDateUTC', 'Brand/Channel', 'OneWayOrReturn',
       'ReceivedDiscount', 'DepartureDate', 'ReturnDate', 'Duration',
       'Insured', 'Haul', 'SessionId', 'ChannelType', 'CustomerIdentifier',
       'TotalNumberOfOrders', 'DomesticOrInternational', 'Airline',
       'NumberOfPassengers', 'NumberOfInfants', 'NumberOfChildren',
       'NumberOfAdults', 'BookerDateOfBirth', 'OriginAirportCode',
       'FlewEconomyClass', 'FlewFirstClass', 'FlewBusinessClass',
       'FlewPremiumEconomyClass', 'Order_year', 'Orderweek', 'Order_month',
       'PolicyNumber', 'distributor_purchase', 'Insurance_purch',
       'InsuranceSalesCurrency', 'OrderCanceled', 'Policy_type', 'YearMonth',
       'Yearweek', 'BookerCountry_Code', 'OriginCountryCode',
       'DestinationAirportCode', 'DestinationCountryCode', 'PolicyName',
       'InsuranceStartDate', 'InsuranceEndDate', 'Supplier', 'PolicyNumber',
       'InsuranceCost_AdministrativeCost', 'InsuranceCost_Premium',
       'Insu

In [53]:
#Calculate 58% Insurance premium
Travix_data['58%Insurance_Premium']=0.58*Travix_data['InsuranceCost_Premium']

In [54]:
#check
test= Travix_data[['58%Insurance_Premium', 'InsuranceCost_Premium']]
test[test.InsuranceCost_Premium.notnull()].head(5)

Unnamed: 0,58%Insurance_Premium,InsuranceCost_Premium
0,0.0,0.0
1,0.0,0.0
2,0.0,0.0
3,0.0,0.0
4,0.0,0.0


In [55]:
#function to add a columns with masterpolicyfee
Travix_data['Masterpolicy_fee'] = Travix_data['Policy_type'].where (Travix_data['Policy_type'].isnull(), 6).fillna(0).astype(int)
Travix_data['Masterpolicy_fee'].value_counts()

6    825003
Name: Masterpolicy_fee, dtype: int64

In [56]:
#function to add a columns indicating the phase in which the orders occured
def Travix_Phase(Travix_data):
    if (Travix_data['OrderDateUTC']>= Start_date)  and (Travix_data['Supplier']=='Unigarant Verzekeringen'):
        return 'Phase1&2'
    else:
        return ''
Travix_data['Phase'] = Travix_data.apply (lambda row: Travix_Phase (row),axis=1)

In [57]:
#function to add a columns with masterpolicyfee without tax (at the beginning of Phase 2a going onwards)
def MP_fee_exc_tax(Travix_data):
    if (Travix_data['Insurance_purch'] == 1) and (Travix_data['OrderDateUTC']>= Start_date) and (Travix_data['Supplier']=='Unigarant Verzekeringen'):
        return 5.8
    if (Travix_data['Insurance_purch'] == 1) and (Travix_data['OrderDateUTC']< Start_date) and (Travix_data['Supplier']=='Unigarant Verzekeringen'):
        return 6
    else:
        return 0
Travix_data['MP_fee_exc_tax&HS'] = Travix_data.apply (lambda row: MP_fee_exc_tax (row),axis=1)

In [58]:
Travix_data['Supplier'].value_counts()

Unigarant Verzekeringen    43541
AIG Canada                   699
Unigarant verzekeringen        6
Refund Protect                 4
Name: Supplier, dtype: int64

#Travix Revenue
#Calculate Distributor's revenue per supplier
def Ditributor_revenue(Travix_data):
    if Travix_data['Supplier'] == 'unigarant verzekeringen':
        return Travix_data['MP_fee_exc_tax&HS']+Travix_data['58%Insurance_Premium']
    if Travix_data['Supplier'].str.lower() == 'refund protect':
        return  Travix_data['45%Insurance_Premium']
    else:
        return 0
Travix_data['HS_purchased_dist_revenue'] = Travix_data.apply (lambda row: Ditributor_revenue (row),axis=1)

In [59]:
#Calculate Distributor's revenue per supplier
Travix_data['Travix_revenue']=0
Travix_data['Travix_revenue']= np.where(Travix_data['Supplier'].str.lower()=='unigarant verzekeringen', Travix_data['MP_fee_exc_tax&HS']+Travix_data['58%Insurance_Premium'],Travix_data['Travix_revenue'])
Travix_data['Travix_revenue']= np.where(Travix_data['Supplier']=='Refund protect', Travix_data['InsuranceCost_Premium']*0.5, Travix_data['Travix_revenue'])

In [60]:
#create a column indicating the total retail value
Travix_data['TotalRetail_value']=Travix_data['TotalInsuranceCost']

## Cancelled Orders

In [61]:
# Removing Cancelled Orders starting from phase 1 going onwards
#function to add a columns that indicates if a policy was cancelled or not
def Order_cancelled(Travix_data):
    if (Travix_data['OrderCanceled'] == True) and (Travix_data['OrderDateUTC']>= Start_date):
        return True
    else:
        return False
Travix_data['OrderCancel_P1&2'] = Travix_data.apply (lambda row: Order_cancelled (row),axis=1)

In [62]:
#Check number of cancelled orders in Phase 1&2 using the created variable
Travix_data['OrderCancel_P1&2'].value_counts()

False    798780
True      26223
Name: OrderCancel_P1&2, dtype: int64

In [63]:
#Check if indication of cancelled orders is in Phase 1 and 2 only
Travix_data[Travix_data['Phase']=='Phase1&2']['OrderCanceled'].value_counts()

False    32786
True      1210
Name: OrderCanceled, dtype: int64

In [64]:
#drop cancelled orders in Phase 1&2
#travix_base_df.drop(travix_base_df[travix_base_df['CancelledOrders_Phase1&2']==True].index, inplace=True)

## Booker Country 

In [65]:
# Creating a new column that indicates NL if a customer's booking country code is NL and non_NL otherwise
#function to add a columns that indicates Booker country
def Booker_Country(Travix_data):
    if (Travix_data['BookerCountry_Code'] == 'NL'):
        return 'NL'
    else:
        return 'non_NL'
Travix_data['Booker_Country'] = Travix_data.apply (lambda row: Booker_Country (row),axis=1)

In [66]:
#Check the maximum date in the dataset
Travix_data['OrderDateUTC'].max()

Timestamp('2019-10-27 00:00:00')

# Create a pivot table

In [67]:
#select columns for the pivot table
clmns_to_keep_df=Travix_data[['OrderDateUTC','Order_year','Order_month','Orderweek','Yearweek','Haul','Policy_type','PolicyName','Insurance_purch','distributor_purchase','58%Insurance_Premium', 'Brand/Channel', 'Travix_revenue', 'MP_fee_exc_tax&HS', 'OrderCancel_P1&2', 'TotalRetail_value','OneWayOrReturn','Booker_Country']]

## NL Channels

In [68]:
TravixNL_df=clmns_to_keep_df.loc[clmns_to_keep_df['Brand/Channel'].isin(['CHEAPTICKETSNL','BUDGETAIR','VLIEGWINKEL'])]

In [69]:
#segment data per orderdate, year, week, haul and policy type
TravixNL_pivot_df=TravixNL_df.pivot_table(index=['OrderDateUTC','Order_year','Yearweek','Order_month','Orderweek','OrderCancel_P1&2','Haul', 'Brand/Channel','Policy_type','OneWayOrReturn', 'Booker_Country'], aggfunc='sum')

In [70]:
#Reset the index for the pivot table 
TravixNL_pivot_df.reset_index(inplace=True)

### Export data

In [71]:
#export to bi-report drive
TravixNL_pivot_df.to_csv(export_BI_drive+'Travix YoY Analysis Data excluding duplicates.csv')
#export to local
TravixNL_pivot_df.to_csv(export_folder_name + '\\' + +'Travix YoY Analysis Data excluding duplicates.csv')

## CA Channel

In [72]:
TravixCA_df=clmns_to_keep_df.loc[clmns_to_keep_df['Brand/Channel'].isin(['BUDGETAIRCA'])]

In [74]:
#segment data per orderdate, year, week, haul and policy type
TravixCA_pivot_df=TravixCA_df.pivot_table(index=['OrderDateUTC','Order_year','Yearweek','Order_month','Orderweek','OrderCancel_P1&2','Haul', 'Brand/Channel','Policy_type','OneWayOrReturn', 'Booker_Country'], aggfunc='sum')

In [75]:
#Reset the index for the pivot table 
TravixCA_pivot_df.reset_index(inplace=True)

### Export data

In [76]:
#export to bi-report drive
TravixCA_pivot_df.to_csv(export_BI_drive+'TravixCA YoY Analysis Data excluding duplicates.csv')
#export to local
TravixCA_pivot_df.to_csv(export_folder_name + '\\' ++'TravixCA YoY Analysis Data excluding duplicates.csv')