# Notebook objectives:
Create a file that can be used for:
- Comparing yearly, weekly ins attachment rate and RPF
- Comparing yearly, monthly ins attachment rate and RPF

# Change log

- 2019/01/14 - NM - Included a column that indicates if customers' residency is NL or not
- 2019/01/14 - NM - Renamed cancelled orders Phase 1&2 variable 
- 2019/08/07 - NM - Rename column Travix_booking to distributor_purchase so that it can be more generic
- 2019/08/07 - NM - Changed BI_drive network export location as Travix different channels files were moved to one folder 
- 2019/08/12 - NM - Removing duplicated rows in Travix data as a result of insurance purchasers column indicating an additional row with admin cost of 0  and insurance premium cost as the sum of premium and admin cost
- 2019/08/27 - MW - Update BI server IP address from 192.168.140.69 to 192.168.8.142 
- 2019/09/18 -NM - Change location from which the data is imported to local computer while waiting for BI drive to be     updated
- 2019/09/18 - NM - Update BI server IP address from 192.168.0.112 to 192.168.140.130
- 2019/09/20 -NM - Changed the code for exporting Travix data such that it merge all Travix datadump gz folders in Hepstar files into one
- 2020/03/25  - NM - Import Travix datadump csv files and merge them with the zipped files
- 2020/03/25  - NM - Include data imported from SFTP server


# Preparation

## Import packages and data

In [1]:
from datetime import date
import glob
import pandas as pd
import numpy as np
from datetime import date
from datetime import datetime, timedelta
import time
import pytz
import os
import errno

In [2]:
# Data location 
folder_external = r'C:\Users\BI-LOCAL\Documents\Python Scripts\Travix\data\external\\'

In [3]:
# Network locations for data import & export### Import data from BI drive
input_TravixDatadump = r'\\192.168.140.37\\Business-Share\\BI resources\\Dashboards\\Travix\\# Data\\Travix Datadump\\'
export_BI_drive= r'\\192.168.140.37\\Business-Share\\BI resources\\Dashboards\\Travix\\# Travix NL\\# Data\\'
Hepstar_path =r'\\192.168.140.37\\Business-Share\\bi-report\\'

### Import data from BI drive

In [4]:
#Specify all bi-report files directory 
All_TR_Data = glob.glob(input_TravixDatadump + "/*.csv")
#Specify files you want to exclude
Data_TR_2018= glob.glob(input_TravixDatadump + "/2018*.csv")
Data_TR_201901= glob.glob(input_TravixDatadump + "/201901*.csv")
#Data_TR_201904= glob.glob(input_TravixDatadump + "/201904*.csv")
# Specify relevant di-report files 
Select_TR_Data = set(All_TR_Data)-set(Data_TR_2018)-set(Data_TR_201901)

In [5]:
#Import historical data
Travix_base_df = pd.concat((pd.read_csv(f,low_memory=False) for f in Select_TR_Data),sort=True)

### Import downloaded SFTP Archive data 

In [6]:
# Import all distributor data from local folder 
AllData = glob.glob(folder_external + "/external_travix*.csv")
Travix_SFTP_df = pd.concat((pd.read_csv(f, low_memory=False) for f in AllData))

In [7]:
# extract the date & time from string in LastUpdatedDate column and convert type of column to datetime64
Travix_SFTP_df['LastUpdatedDate'] = pd.to_datetime(Travix_SFTP_df['LastUpdatedDate'].astype(str).str[:19],format='%Y-%m-%d %H:%M:%S')

In [8]:
# Select the latest session ID based on last updated date column and drop the rest
Travix_SFTP_df = Travix_SFTP_df.sort_values('LastUpdatedDate').drop_duplicates(['SessionId', 'OrderNumber'], keep='last')

### Merge Travix data files

In [9]:
# Merge Travix csv files in folders with other csv files not in folders
travix_df = pd.concat([Travix_base_df,Travix_SFTP_df], sort = False)

### Travix data cleaning

In [10]:
# Change date variable to format that can be used for data analysis
######### RUNTIME NEEDS TO BE DECREASED, NOW >3 min #############################
#Travix_df['OrderDateUTC'] = pd.to_datetime(Travix_df['OrderDateUTC'],infer_datetime_format=True,exact=False,utc=True).dt.date
travix_df['OrderDateUTC'] = pd.to_datetime(travix_df['OrderDateUTC'].astype(str).str[:19],format='%Y-%m-%d')
travix_df['OrderDateUTC'].max()

Timestamp('2020-05-12 00:00:00')

In [11]:
# Select data range starting from the time we went live
Start_date = ('2018-05-29 00:00:00')
Start_date = pd.to_datetime(Start_date)
Start_date

Timestamp('2018-05-29 00:00:00')

In [12]:
travix_df['PolicyType'].value_counts()

TRIP CANCELLATION INSURANCE                                               50812
Travel Insurance                                                          47323
CANCELLATION-INSURANCE                                                    22636
Trip insurance                                                            16102
EXTRAS                                                                    13814
COMBINED-INSURANCE                                                         9083
COMBINED INSURANCE WORLDWIDE                                               8291
HEPSTAR-CANCELLATION                                                       6804
COMBINED INSURANCE EUROPE                                                  5138
TRAVEL-INSURANCE                                                           4409
TRAVEL INSURANCE WORLDWIDE                                                 4285
HEPSTAR-TRAVEL                                                             2472
HEPSTAR-COMBINED-INSURANCE              

# Data Cleaning

In [13]:
#Excluding time from the Order Date and changing orderdate to date format
travix_df['OrderDateUTC'] = pd.to_datetime(travix_df['OrderDateUTC'],infer_datetime_format=True,exact=False,utc=True).dt.date
travix_df['OrderDateUTC'] = pd.to_datetime(travix_df['OrderDateUTC'])

In [14]:
#Check the maximum date in the dataset
travix_df['OrderDateUTC'].max()

Timestamp('2020-05-12 00:00:00')

In [15]:
#make a copy of the dataset
travix_base_df=travix_df.copy()

In [16]:
#Removing duplicated rows in Travix data as a result of insurance purchasers column indicating an additional row with total premium and admin cost  
#Create a column that indicates duplicates by ordernumber 
travix_base_df['Dups'] = travix_base_df.duplicated(subset=['OrderNumber'], keep=False).astype(int).astype(str)

In [17]:
# Remove duplicated insurance purchase information
travix_base_df.drop(travix_base_df[(travix_base_df['PolicyNumber'].notnull()) & (travix_base_df['InsuranceCost_AdministrativeCost']==0) & (travix_base_df['Dups']=='1')].index, inplace=True)

## Removing incomplete last day

In [18]:
# end date report
End_date = travix_base_df['OrderDateUTC'].max()
End_date

Timestamp('2020-05-12 00:00:00')

In [19]:
# Remove incomplete last day in a dataset
travix_base_df=travix_base_df[travix_base_df['OrderDateUTC']< End_date]

In [20]:
#Check maximum date after removing incomplete last date
travix_base_df['OrderDateUTC'].max()

Timestamp('2020-05-11 00:00:00')

## Selecting NL brand only

In [21]:
#Limit data to NL brand only
travix_base_df=travix_base_df[travix_base_df['Brand'].isin(['CHEAPTICKETSNL', 'BUDGETAIR', 'VLIEGWINKEL'])]
#check brands that are now available in the dataset
travix_base_df['Brand'].value_counts()

CHEAPTICKETSNL    267415
BUDGETAIR          86839
VLIEGWINKEL        49291
Name: Brand, dtype: int64

In [22]:
#rename Brand to Brand/Channel 
travix_base_df=travix_base_df.rename(columns={'Brand':'Brand/Channel'})

## Haul

In [23]:
#Check haul column before adding 'No insurance' category
travix_base_df['Haul'].value_counts()

Short Haul    255754
Long Haul     132084
Name: Haul, dtype: int64

In [24]:
travix_base_df.columns

Index(['AirhelpPlusMargin', 'Airline', 'BookerCountry_Code',
       'BookerDateOfBirth', 'Brand/Channel', 'CancelledDate', 'ChannelType',
       'CustomerIdentifier', 'DepartureDate', 'DestinationAirportCode',
       'DestinationCountryCode', 'DeviceType', 'DomesticOrInternational',
       'Duration', 'FlewBusinessClass', 'FlewEconomyClass', 'FlewFirstClass',
       'FlewPremiumEconomyClass', 'HasAirhelpPlus', 'HasServicePackage',
       'HasTicketGuarantee', 'Haul', 'InsuranceCost_AdditionalCoverageCost',
       'InsuranceCost_AdministrativeCost', 'InsuranceCost_InsuranceTax',
       'InsuranceCost_Premium', 'InsuranceEndDate', 'InsuranceSalesCurrency',
       'InsuranceStartDate', 'Insured', 'InsuredAmount', 'LastUpdatedDate',
       'NumberOfAdults', 'NumberOfChildren', 'NumberOfInfants',
       'NumberOfPassengers', 'OneWayOrReturn', 'OrderCanceled', 'OrderDateUTC',
       'OrderNumber', 'OriginAirportCode', 'OriginCountryCode', 'PolicyName',
       'PolicyNumber', 'PolicyType', 'R

In [25]:
# Creating a new column that also indicates 'No haul' if haul type is not indicated in the Haul column
#fill missing values with 'No insurance'
travix_base_df.Haul.fillna(value='No haul', inplace=True)

In [26]:
#Check haul column after adding 'No insurance' category
travix_base_df['Haul'].value_counts()

Short Haul    255754
Long Haul     132084
No haul        15707
Name: Haul, dtype: int64

## Policy Type

In [27]:
#Creating a new column that also indicates 'No insurance' if a customer didnt buy insurance
#making a copy of Policy type
travix_base_df['Policy_type']=travix_base_df['PolicyType']
#fill missing values with 'No insurance'
travix_base_df.Policy_type.fillna(value='No insurance', inplace=True)

In [28]:
#rename policy type categories to travel, cancellation and combi insurance
travix_base_df['Policy_type'].replace(to_replace=('HEPSTAR-CANCELLATION','HEPSTAR-COMBINED-INSURANCE','HEPSTAR-TRAVEL', 'HEPSTAR-TRAVELCANCELLATION', 'reissverzekering', 'Travel Insurance ', 'cancellationinsurance'), value=('Cancellation Insurance','Combi Insurance','Travel Insurance', 'Combi Insurance', 'Travel Insurance', 'Travel Insurance', 'Cancellation Insurance' ), inplace=True)
#Check
travix_base_df['Policy_type'].value_counts()

No insurance              376491
CANCELLATION-INSURANCE     13481
Cancellation Insurance      5947
COMBINED-INSURANCE          2460
TRAVEL-INSURANCE            2089
Combi Insurance             1338
Travel Insurance            1324
EXTRAS                       415
Name: Policy_type, dtype: int64

## Travix booking

In [29]:
# Creating a new column that indicates 1 if a customer bought flight ticket 
travix_base_df['distributor_purchase'] = travix_base_df['CustomerIdentifier'].where (travix_base_df['CustomerIdentifier'].isnull(), 1).fillna(0).astype(int)
#check if the new column is correct
travix_base_df['CustomerIdentifier'].count() - travix_base_df['distributor_purchase'].sum()

0

In [30]:
# Creating a new column that indicates 1 if a customer bought flight ticket 
travix_base_df['distributor_purchase'] = travix_base_df['CustomerIdentifier'].where (travix_base_df['CustomerIdentifier'].isnull(), 1).fillna(0).astype(int)
#check if the new column is correct
travix_base_df['distributor_purchase'].count() - travix_base_df['distributor_purchase'].sum()

0

In [31]:
#check if every row has brand identified
travix_base_df['Brand/Channel'].count()

403545

## Insurance Purchase

In [32]:
# Creating a new column that indicates 1 if a customer bought insurance and 0 otherwise
#setting insurance purchase to either 1 or zero
#travix_base_df['Insurance_purch'] = travix_base_df['PolicyType'].where (travix_base_df['PolicyType'].isnull(), 1).fillna(0).astype(int)
#check if the new column is correct
#travix_base_df['PolicyType'].count() - travix_base_df['Insurance_purch'].sum()

In [33]:
# Create variable confirming insurance purchase
def Insurance_purchase(travix_base_df):
    if travix_base_df['TotalInsuranceCost'] > 0:
        return 1
    else:
        return 0
travix_base_df['Insurance_purch'] = travix_base_df.apply (lambda row: Insurance_purchase (row), axis=1)

## Test product

In [34]:
travix_base_df['Policy_type'].value_counts()

No insurance              376491
CANCELLATION-INSURANCE     13481
Cancellation Insurance      5947
COMBINED-INSURANCE          2460
TRAVEL-INSURANCE            2089
Combi Insurance             1338
Travel Insurance            1324
EXTRAS                       415
Name: Policy_type, dtype: int64

In [35]:
#Remove test products 
#travix_base_df.drop(travix_base_df[travix_base_df['Policy_type']=='test'].index, inplace=True)

## Year, Month Week variables

In [36]:
# create order month/year variable
travix_base_df['Order_month'] = travix_base_df['OrderDateUTC'].dt.month
travix_base_df['Order_year'] = travix_base_df['OrderDateUTC'].dt.year.astype(str)
# create  order week variable
travix_base_df['Orderweek'] = travix_base_df['OrderDateUTC'].dt.week
# create order month and order year variable
travix_base_df['YearMonth'] = travix_base_df['OrderDateUTC'].apply(lambda x:x.strftime('%Y-%m'))
#create year and week variable
travix_base_df['Yearweek'] = travix_base_df['OrderDateUTC'].dt.year.astype(str) + '-' + travix_base_df['OrderDateUTC'].dt.week.astype(str)

In [37]:
#Change the week number of the last day of the year given as week 1
def change_year_week(travix_base_df):
    if (travix_base_df['Order_month'] == 12) and (travix_base_df['Orderweek'] == 1):
        return travix_base_df['Order_year']+'-'+'52'
    else:
        return travix_base_df['Yearweek']
travix_base_df['Yearweek'] = travix_base_df.apply (lambda row: change_year_week (row),axis=1)

In [38]:
#Check max date for the date
travix_base_df['OrderDateUTC'].max()

Timestamp('2020-05-11 00:00:00')

In [39]:
#convert order_year to string (so as to be able to later join a multiindex level when creating a pivot table)
travix_base_df['Order_year']=travix_base_df['Order_year'].apply(str)

## Duplicates

In [40]:
#select data from the day we went live
Tr_live_df = travix_base_df[travix_base_df['OrderDateUTC']>='2018-05-29 00:00:00']
Tr_live_df=Tr_live_df.copy()

In [41]:
Tr_live_df.shape

(403545, 65)

In [42]:
#select data before we went live
Tr_B4_live= travix_base_df[travix_base_df['OrderDateUTC']<'2018-05-29 00:00:00']
Tr_B4_live=Tr_B4_live.copy()

In [43]:
#drop duplicates with the same ordernumber and policy type from the beggining of phase 1
Tr_live_df.drop_duplicates(['OrderNumber', 'PolicyType'],keep='first',inplace=True)

In [44]:
# create a list of columns which are constant per order number
constant_columns = ['OrderNumber', 'OrderDateUTC', 'Brand/Channel', 'OneWayOrReturn', 'ReceivedDiscount','DepartureDate', 'ReturnDate', 'Duration',
                    'Insured', 'Haul','SessionId', 'ChannelType', 'CustomerIdentifier', 'TotalNumberOfOrders', 'DomesticOrInternational',
                     'Airline','NumberOfPassengers','NumberOfInfants','NumberOfChildren','NumberOfAdults', 'BookerDateOfBirth','OriginAirportCode',
                   'FlewEconomyClass','FlewFirstClass','FlewBusinessClass','FlewPremiumEconomyClass','Order_year', 'Orderweek','Order_month', 
                    'PolicyNumber', 'distributor_purchase', 'Insurance_purch','InsuranceSalesCurrency', 'OrderCanceled', 'Policy_type', 'YearMonth', 'Yearweek',
                   'BookerCountry_Code','OriginCountryCode', 'DestinationAirportCode', 'DestinationCountryCode',
                    'PolicyName', 'InsuranceStartDate', 'InsuranceEndDate', 'Supplier','PolicyNumber']                            

In [45]:
#join the dataset with the constant columns
excl_dups_df =Tr_live_df[constant_columns]
# create index based on the 'OrderNumber' column
excl_dups_df.set_index('OrderNumber', inplace=True)

In [46]:
#create a new database with a list of names for columns to be summed per order number
columns_to_sum = ['InsuranceCost_AdministrativeCost',
                  'InsuranceCost_Premium', 'InsuranceCost_InsuranceTax',
                  'InsuranceCost_AdditionalCoverageCost', 'TotalInsuranceCost',
                  'InsuredAmount','TotalTicketSale_EUR']                             

In [47]:
# create a new dataframe with sums for each order number
sums_excl_df =Tr_live_df.groupby('OrderNumber')[columns_to_sum].sum()

In [48]:
Tr_live_df['PolicyType'].fillna('Nan',inplace=True)

In [49]:
# create a series with concatenated policy types to identify products sold later
policy_series = Tr_live_df.groupby('OrderNumber')['PolicyType'].apply(lambda x: 
                            ' + '.join(x.sort_values()) if x.any() else np.nan)

In [50]:
 #MERGE CONSTANT AND TRANSFORMED DATASETS
# merge the base data frame with the one containing sums
excl_dups_df = excl_dups_df.merge(sums_excl_df, left_index=True, right_index=True)
# add policy names
excl_dups_df['Policy_names'] = policy_series

In [51]:
#Reset index so as to merge the two different datasets
excl_dups_df.reset_index(inplace=True)

In [52]:
#drop duplicates by ordernumber
excl_dups_df.drop_duplicates(['OrderNumber'],keep='first',inplace=True)

In [53]:
#output_df[output_df['Policy_names']=='HEPSTAR-TRAVEL + HEPSTAR-TRAVELCANCELLATION']['OrderNumber']

In [54]:
#Merge the two dataset (before and after we went live)
Travix_data = pd.concat([excl_dups_df,Tr_B4_live], join_axes = [excl_dups_df.columns])

  


In [55]:
Travix_data =Travix_data.copy()

In [56]:
#Rename supplier name so as to have same format of the names
Travix_data['Supplier'].replace('Unigarant verzekeringen', 'Unigarant Verzekeringen',inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


## Travix revenue

In [57]:
Travix_data.columns

Index(['OrderNumber', 'OrderDateUTC', 'Brand/Channel', 'OneWayOrReturn',
       'ReceivedDiscount', 'DepartureDate', 'ReturnDate', 'Duration',
       'Insured', 'Haul', 'SessionId', 'ChannelType', 'CustomerIdentifier',
       'TotalNumberOfOrders', 'DomesticOrInternational', 'Airline',
       'NumberOfPassengers', 'NumberOfInfants', 'NumberOfChildren',
       'NumberOfAdults', 'BookerDateOfBirth', 'OriginAirportCode',
       'FlewEconomyClass', 'FlewFirstClass', 'FlewBusinessClass',
       'FlewPremiumEconomyClass', 'Order_year', 'Orderweek', 'Order_month',
       'PolicyNumber', 'distributor_purchase', 'Insurance_purch',
       'InsuranceSalesCurrency', 'OrderCanceled', 'Policy_type', 'YearMonth',
       'Yearweek', 'BookerCountry_Code', 'OriginCountryCode',
       'DestinationAirportCode', 'DestinationCountryCode', 'PolicyName',
       'InsuranceStartDate', 'InsuranceEndDate', 'Supplier', 'PolicyNumber',
       'InsuranceCost_AdministrativeCost', 'InsuranceCost_Premium',
       'Insu

In [58]:
#Calculate 58% Insurance premium
Travix_data['58%Insurance_Premium']=0.58*Travix_data['InsuranceCost_Premium']

In [59]:
#check
test= Travix_data[['58%Insurance_Premium', 'InsuranceCost_Premium']]
test[test.InsuranceCost_Premium.notnull()].head(5)

Unnamed: 0,58%Insurance_Premium,InsuranceCost_Premium
0,0.0,0.0
1,0.0,0.0
2,0.0,0.0
3,0.0,0.0
4,0.0,0.0


In [60]:
#function to add a columns with masterpolicyfee
Travix_data['Masterpolicy_fee'] = Travix_data['Policy_type'].where (Travix_data['Policy_type'].isnull(), 6).fillna(0).astype(int)
Travix_data['Masterpolicy_fee'].value_counts()

6    336686
Name: Masterpolicy_fee, dtype: int64

In [61]:
#function to add a columns indicating the phase in which the orders occured
#Start_date = ('2018-06-01 00:00:00')
def Travix_Phase(Travix_data):
    if (Travix_data['OrderDateUTC']>= Start_date)  and (Travix_data['Supplier']=='Unigarant Verzekeringen'):
        return 'Phase1&2'
    else:
        return ''
Travix_data['Phase'] = Travix_data.apply (lambda row: Travix_Phase (row),axis=1)

In [62]:
#function to add a columns with masterpolicyfee without tax (at the beginning of Phase 2a going onwards)
def MP_fee_exc_tax(Travix_data):
    if (Travix_data['Insurance_purch'] == 1) and (Travix_data['OrderDateUTC']>= Start_date) and (Travix_data['Supplier']=='Unigarant Verzekeringen'):
        return 5.8
    if (Travix_data['Insurance_purch'] == 1) and (Travix_data['OrderDateUTC']< Start_date) and (Travix_data['Supplier']=='Unigarant Verzekeringen'):
        return 6
    else:
        return 0
Travix_data['MP_fee_exc_tax&HS'] = Travix_data.apply (lambda row: MP_fee_exc_tax (row),axis=1)

In [63]:
Travix_data['Supplier'].value_counts()

Unigarant Verzekeringen    20450
Refund Protect               411
Name: Supplier, dtype: int64

#Travix Revenue
#Calculate Distributor's revenue per supplier
def Ditributor_revenue(Travix_data):
    if Travix_data['Supplier'] == 'unigarant verzekeringen':
        return Travix_data['MP_fee_exc_tax&HS']+Travix_data['58%Insurance_Premium']
    if Travix_data['Supplier'].str.lower() == 'refund protect':
        return  Travix_data['45%Insurance_Premium']
    else:
        return 0
Travix_data['HS_purchased_dist_revenue'] = Travix_data.apply (lambda row: Ditributor_revenue (row),axis=1)

In [64]:
#Calculate Distributor's revenue per supplier
Travix_data['Travix_revenue']=0
Travix_data['Travix_revenue']= np.where(Travix_data['Supplier'].str.lower()=='unigarant verzekeringen', Travix_data['MP_fee_exc_tax&HS']+Travix_data['58%Insurance_Premium'],Travix_data['Travix_revenue'])
Travix_data['Travix_revenue']= np.where(Travix_data['Supplier']=='Refund protect', Travix_data['InsuranceCost_Premium']*0.5, Travix_data['Travix_revenue'])

In [65]:
#create a column indicating the total retail value
Travix_data['TotalRetail_value']=Travix_data['TotalInsuranceCost']

## Cancelled Orders

In [66]:
# Removing Cancelled Orders starting from phase 1 going onwards
#function to add a columns that indicates if a policy was cancelled or not
def Order_cancelled(Travix_data):
    if (Travix_data['OrderCanceled'] == True) and (Travix_data['OrderDateUTC']>= Start_date):
        return True
    else:
        return False
Travix_data['OrderCancel_P1&2'] = Travix_data.apply (lambda row: Order_cancelled (row),axis=1)

In [67]:
#Check number of cancelled orders in Phase 1&2 using the created variable
Travix_data['OrderCancel_P1&2'].value_counts()

False    324910
True      11776
Name: OrderCancel_P1&2, dtype: int64

In [68]:
#Check if indication of cancelled orders is in Phase 1 and 2 only
Travix_data[Travix_data['Phase']=='Phase1&2']['OrderCanceled'].value_counts()

False    19615
True       835
Name: OrderCanceled, dtype: int64

In [69]:
#drop cancelled orders in Phase 1&2
#travix_base_df.drop(travix_base_df[travix_base_df['CancelledOrders_Phase1&2']==True].index, inplace=True)

## Booker Country 

In [70]:
# Creating a new column that indicates NL if a customer's booking country code is NL and non_NL otherwise
#function to add a columns that indicates Booker country
def Booker_Country(Travix_data):
    if (Travix_data['BookerCountry_Code'] == 'NL'):
        return 'NL'
    else:
        return 'non_NL'
Travix_data['Booker_Country'] = Travix_data.apply (lambda row: Booker_Country (row),axis=1)

In [71]:
#Check the maximum date in the dataset
Travix_data['OrderDateUTC'].max()

Timestamp('2020-05-11 00:00:00')

# Create a pivot table

In [72]:
#select columns for the pivot table
clmns_to_keep_df=Travix_data[['OrderDateUTC','Order_year','Order_month','Orderweek','Yearweek','Haul','Policy_type','PolicyName','Insurance_purch','distributor_purchase','58%Insurance_Premium', 'Brand/Channel', 'Travix_revenue', 'MP_fee_exc_tax&HS', 'OrderCancel_P1&2', 'TotalRetail_value','OneWayOrReturn','Booker_Country']]

## NL Channels

In [73]:
TravixNL_df=clmns_to_keep_df.loc[clmns_to_keep_df['Brand/Channel'].isin(['CHEAPTICKETSNL','BUDGETAIR','VLIEGWINKEL'])]

In [74]:
#segment data per orderdate, year, week, haul and policy type
TravixNL_pivot_df=TravixNL_df.pivot_table(index=['OrderDateUTC','Order_year','Yearweek','Order_month','Orderweek','OrderCancel_P1&2','Haul', 'Brand/Channel','Policy_type','OneWayOrReturn', 'Booker_Country'], aggfunc='sum')

In [75]:
#Reset the index for the pivot table 
TravixNL_pivot_df.reset_index(inplace=True)

In [76]:
TravixNL_pivot_df['OrderDateUTC'].max()

Timestamp('2020-05-11 00:00:00')

In [77]:
TravixNL_pivot_df = TravixNL_pivot_df[TravixNL_pivot_df['OrderDateUTC']>='2019-01-01']

### Export data

In [78]:
#export to bi-report drive
TravixNL_pivot_df.to_csv(export_BI_drive+'Travix YoY Analysis Data excluding duplicates.csv')
#export to local
#TravixNL_pivot_df.to_csv(export_local+'Travix YoY Analysis Data excluding duplicates.csv')
TravixNL_pivot_df.to_csv(os.path.join('Travix YoY Analysis Data excluding duplicates.csv'))