In [1]:
import pandas as pd
import numpy as np


In [2]:
#load data
data = pd.read_csv('Data/OptiFly_data.csv')
data.columns


Index(['OriginApt', 'OriginCty', 'OriginCtry', 'DestinationApt',
       'DestinationCty', 'DestinationCtry', 'TravelHorizonDays',
       'TravelDistanceKm', 'SelfTransfer', 'Stops', 'DurationMin',
       'Seg_0_OriginIATA', 'Seg_0_OriginName', 'Seg_0_DestinationIATA',
       'Seg_0_DestinationName', 'Seg_0_OperatingCarrierIATA',
       'Seg_0_TravelDistanceKm', 'Seg_1_OriginIATA', 'Seg_1_OriginName',
       'Seg_1_DestinationIATA', 'Seg_1_DestinationName',
       'Seg_1_OperatingCarrierIATA', 'Seg_1_TravelDistanceKm',
       'Seg_2_OriginIATA', 'Seg_2_OriginName', 'Seg_2_DestinationIATA',
       'Seg_2_DestinationName', 'Seg_2_OperatingCarrierIATA',
       'Seg_2_TravelDistanceKm', 'Seg_3_OriginIATA', 'Seg_3_OriginName',
       'Seg_3_DestinationIATA', 'Seg_3_DestinationName',
       'Seg_3_OperatingCarrierIATA', 'Seg_3_TravelDistanceKm', 'passengers',
       'SearchLowestRatio', 'ItinLowestRatio', 'Price', 'PricePerPax',
       'ItineraryRedirects', 'ODRedirects'],
      dtype='object

In [3]:
#keep only relevant columns for now
keep_columns = ['OriginApt', 'DestinationApt',
       'TravelDistanceKm', 'Stops', 'DurationMin', 'PricePerPax']
data_2 = data[keep_columns]
data_2


Unnamed: 0,OriginApt,DestinationApt,TravelDistanceKm,Stops,DurationMin,PricePerPax
0,TLV,DLM,767.0,0,105,175.356667
1,LIS,GRU,7935.0,0,600,465.965000
2,PNH,SIN,1136.0,0,125,86.700000
3,KSY,SAW,1160.0,0,130,137.510000
4,LGW,BCN,1109.0,0,130,116.275000
...,...,...,...,...,...,...
2497506,FOR,GIG,2176.0,0,195,80.410000
2497507,HEL,IAD,6937.0,1,800,1230.750000
2497508,ISG,NGO,1689.0,1,1200,124.580000
2497509,JAI,BLR,1528.0,0,145,64.990000


In [4]:
#create column that combine origin and destination
data_2['Itinerary'] = data_2['OriginApt'] + data_2['DestinationApt']
data_2 = data_2.dropna()
test_data = data_2.sample(100000)
test_data


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2['Itinerary'] = data_2['OriginApt'] + data_2['DestinationApt']


Unnamed: 0,OriginApt,DestinationApt,TravelDistanceKm,Stops,DurationMin,PricePerPax,Itinerary
1199788,HKT,LGW,9893.0,1,1090,1105.76000,HKTLGW
1273646,DEL,BOM,1140.0,0,140,57.08000,DELBOM
862008,CDG,ARN,1540.0,0,155,95.01000,CDGARN
716399,MAN,JED,4960.0,1,720,355.41000,MANJED
29551,FUK,PUS,226.0,0,60,163.25500,FUKPUS
...,...,...,...,...,...,...,...
2202167,PSA,TIA,804.0,0,100,46.14500,PSATIA
2340709,AMD,BHJ,303.0,0,65,36.51000,AMDBHJ
2281847,NAP,CDG,1290.0,0,155,61.56000,NAPCDG
505973,DPS,DRW,1764.0,1,980,203.12500,DPSDRW


In [5]:
# create a function to score the itinerary on duration and price
def calculate_score(group):
    conditions = [
        group >= group.quantile(0.9),
        group >= group.quantile(0.8),
        group >= group.quantile(0.7),
        group >= group.quantile(0.6),
        group <= group.quantile(0.1),
        group <= group.quantile(0.2),
        group <= group.quantile(0.3),
        group <= group.quantile(0.4),
        group <= group.quantile(0.5),
    ]
    choices = [-5, -4, -3, -2, 5, 4, 3, 2, 1]
    return np.select(conditions, choices, default=0)

test_data['Duration_score'] = test_data.groupby('Itinerary')['DurationMin'].transform(calculate_score)
test_data['Price_score'] = test_data.groupby('Itinerary')['PricePerPax'].transform(calculate_score)


In [6]:
test_data['Stops_score'] = np.select(
    [
        test_data['Stops'] == 3,
        test_data['Stops'] == 2,
        test_data['Stops'] == 1,
        test_data['Stops'] == 0,


    ],
    [-6, -3, 3, 6],
    default=0
)


In [7]:
test_data['Total_score'] = \
    test_data['Stops_score'] + \
    test_data['Duration_score'] + \
    test_data['Price_score']


In [8]:
test_data


Unnamed: 0,OriginApt,DestinationApt,TravelDistanceKm,Stops,DurationMin,PricePerPax,Itinerary,Duration_score,Price_score,Stops_score,Total_score
1199788,HKT,LGW,9893.0,1,1090,1105.76000,HKTLGW,-2,-5,3,-4
1273646,DEL,BOM,1140.0,0,140,57.08000,DELBOM,-4,-3,6,-1
862008,CDG,ARN,1540.0,0,155,95.01000,CDGARN,2,-2,6,6
716399,MAN,JED,4960.0,1,720,355.41000,MANJED,-2,-3,3,-2
29551,FUK,PUS,226.0,0,60,163.25500,FUKPUS,-4,-5,6,-3
...,...,...,...,...,...,...,...,...,...,...,...
2202167,PSA,TIA,804.0,0,100,46.14500,PSATIA,-5,1,6,2
2340709,AMD,BHJ,303.0,0,65,36.51000,AMDBHJ,-5,-5,6,-4
2281847,NAP,CDG,1290.0,0,155,61.56000,NAPCDG,-5,-2,6,-1
505973,DPS,DRW,1764.0,1,980,203.12500,DPSDRW,-5,-5,3,-7
