## Set up environment

In [None]:
from ast import literal_eval
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from queryrunner_client import Client

# Use matplotlib's 'ggplot' style sheet for the figures drawn in this notebook.
plt.style.use('ggplot')
# Query-runner client; `qr` is the handle used further down to submit the
# Presto queries and to fetch their results.
qr = Client(user_email='abhishek.sharma@uber.com')

In [None]:
# First go through this presentation to get an overview of what's happening here
# https://docs.google.com/presentation/d/1HM1wt3vnSfcT6tW3tYr8hqXAucc9tG9E3zyV91O4AU8/edit?usp=sharing
# Some terminology differences between the notebook and the presentation:
# - "candidate segments" in the code are the "prefiltered segments" of the presentation

In [None]:
# ---- Settings shared by every weekly experiment ----
# SQL predicate (spliced into the query templates) selecting the cities to consider
city_filter = "in (20, 8, 14, 198, 5, 12, 134, 26, 23, 25, 24, 208, 27, 1541, 7, 6, 45, 227, 4, 35)"
# umm build uuid; it should be very close to when the tickets were reported
builduuid = '6dc8e44c-be20-11ea-bb97-000af7f88c50'
# database where the tables computed by spark live
db = 'maps_automation'
email = 'abhishek.sharma@uber.com'

# ---- Week 1 ----
# `experiment` is the table-name suffix of one week worth of tickets,
# `date_filter` bounds the ticket report dates, and the two
# `date_filter_streaks*` ranges bound the aggregated-streak look-back windows.
experiment = '_23_07_29_07_20c'
date_filter = "between '2020-07-23' and '2020-07-29'"
date_filter_streaks1 = "between '2020-07-16' and '2020-07-22'"
date_filter_streaks2 = "between '2020-07-09' and '2020-07-15'"

config1 = dict(experiment=experiment,
               date_filter=date_filter,
               date_filter_streaks1=date_filter_streaks1,
               date_filter_streaks2=date_filter_streaks2,
               city_filter=city_filter,
               builduuid=builduuid,
               db=db)

# ---- Week 2 ----
experiment = '_30_07_05_08_20c2'
date_filter = "between '2020-07-30' and '2020-08-05'"
date_filter_streaks1 = "between '2020-07-23' and '2020-07-29'"
date_filter_streaks2 = "between '2020-07-16' and '2020-07-22'"

config2 = dict(experiment=experiment,
               date_filter=date_filter,
               date_filter_streaks1=date_filter_streaks1,
               date_filter_streaks2=date_filter_streaks2,
               city_filter=city_filter,
               builduuid=builduuid,
               db=db)

# ---- Week 3 ----
experiment = '_06_08_12_08_20c'
date_filter = "between '2020-08-06' and '2020-08-12'"
date_filter_streaks1 = "between '2020-07-30' and '2020-08-05'"
date_filter_streaks2 = "between '2020-07-23' and '2020-07-29'"

config3 = dict(experiment=experiment,
               date_filter=date_filter,
               date_filter_streaks1=date_filter_streaks1,
               date_filter_streaks2=date_filter_streaks2,
               city_filter=city_filter,
               builduuid=builduuid,
               db=db)

# Collecting features

In [None]:
# This dictionary (in format name: query_template) contains all the queries templates to calculate features
# (and training label) for a particular experiment. You could probably put this dictionary in another file and import it.

# Placeholders understood by every template (filled in by str.format):
#   {db}, {experiment}, {builduuid}, {city_filter}, {date_filter},
#   {date_filter_streaks1}, {date_filter_streaks2}
# The SQL must therefore not contain any other curly braces.
features_query_templates = {
# query to load candidate segments from table
# (one output row per ticket / candidate segment pair)
'candidate_segments': '''
  select
    map_ticket_id,
    trip_id,
    city_id,
    report_lat,
    report_long,
    report_date,
    candidate_segment
  from
    {db}.candidate_segments{experiment} t
    cross join unnest(t.candidate_segments) as temp (candidate_segment)
''',

# query to find distance between candidate segment and report location
'report_location_dist': '''
with candidate_segments as (
  select distinct
    map_ticket_id,
    report_lat,
    report_long,
    candidate_segment
  from
    {db}.candidate_segments{experiment} t
    cross join unnest(t.candidate_segments) as temp (candidate_segment)
),
subset_segments as (
  select
    uuid as segment_id,
    geometry.polyline.points as polyline_points
  FROM
    umm.map_feature_segments_tomtom t
    inner JOIN dwh.dim_city c on ST_Contains(
      ST_GeometryFromText(c.simplified_shape),
      ST_Point(
        geometry.polyline.points[1].lnge7 / 1e7,
        geometry.polyline.points[1].late7 / 1e7
      )
    )
  where
    builduuid = '{builduuid}'
    and c.city_id {city_filter}
)
select
  t.map_ticket_id,
  t.candidate_segment,
  ST_DISTANCE(
    ST_LineString(
      transform(
        s.polyline_points,
        x -> ST_POINT(x.lnge7 / 1e7, x.late7 / 1e7)
      )
    ),
    ST_POINT(t.report_long, t.report_lat)
  ) * 111321 * cos(radians(t.report_lat)) as report_location_dist
from
  subset_segments s
  join candidate_segments t on s.segment_id = t.candidate_segment
''',

# query to find if the candidate segment is suggested anywhere in the trip
'segment_suggested': '''SET session join_distribution_type = BROADCAST;

with candidate_segments as (
  select
    distinct trip_id,
    candidate_segment
  from
    {db}.candidate_segments{experiment} t
    cross join unnest(t.candidate_segments) as temp (candidate_segment)
),
suggested_segments as (
  select
    trip_id,
    flatten(array_agg(transform(msg.segments, x -> x.segmentuuid))) as suggested_segments
  from
    rawdata_user.kafka_hp_gurafu_route_logs_nodedup s
    join {db}.candidate_segments{experiment} t on s.msg.tripuuid = t.trip_id
  where
    s.datestr {date_filter}
    and s.msg.cityid {city_filter}
  group by
    1
)
select
  c.trip_id,
  c.candidate_segment,
  contains(suggested_segments, candidate_segment) as segment_suggested
from
  suggested_segments s
  join candidate_segments c on s.trip_id = c.trip_id''',

# query to check if the candidate_segment is in the path
'segment_traversed': '''SET session join_distribution_type = BROADCAST;

with candidate_segments as (
  select
    distinct trip_id,
    candidate_segment
  from
    {db}.candidate_segments{experiment} t
    cross join unnest(t.candidate_segments) as temp (candidate_segment)
),
traversed_segments as (
  select
    trip_id,
    array_distinct(flatten(array_agg(transform(msg.fittedlocations, x -> x.segmentuuid)))) as traversed_segments
  from
    rawdata_user.kafka_hp_gmatching_map_matched_trips_nodedup s
    join {db}.candidate_segments{experiment} t on s.msg.tripuuid = t.trip_id
  where
    s.datestr {date_filter}
    and s.msg.cityid {city_filter}
  group by
    1
)
select
  c.trip_id,
  c.candidate_segment,
  contains(traversed_segments, candidate_segment) as segment_traversed
from
  traversed_segments s
  join candidate_segments c on s.trip_id = c.trip_id'''
,

# Calculates average speed on segments traversed in the trip
# Fixes relative to the earlier version of this template:
#  * subset_segments is now `select distinct`: candidate_segments holds one row
#    per (trip, segment), so a segment shared by N trips used to appear N times
#    and, because the final join is on segment_id only, every trip's streak
#    counts and speeds were summed N times.
#  * trip_streaks reads {db}.candidate_segments... instead of a hard-coded
#    maps_automation database, like every other template.
'avg_trip_speed' : '''SET session join_distribution_type = BROADCAST;

with candidate_segments as (
  select
    distinct trip_id,
    candidate_segment
  from
    {db}.candidate_segments{experiment} t
    cross join unnest(t.candidate_segments) as temp (candidate_segment)
),
subset_segments as (
  select distinct
    uuid as segment_id,
    data.segment.startjunctionuuid,
    data.segment.endjunctionuuid
  from
    umm.map_feature_segments_tomtom s
    join candidate_segments t on s.uuid = t.candidate_segment
  where
    builduuid = '{builduuid}'
),
trip_streaks as (
  select
    trip_id,
    msg.graphsegment.segmentuuid as segment_id,
    msg.graphsegment.startjunctionuuid,
    msg.graphsegment.endjunctionuuid,
    count(uuid) as streaks,
    avg(msg.speedkmph) as avg_speed
  from
    rawdata_user.kafka_hp_maps_historical_streaks_tomtom_nodedup s
    join {db}.candidate_segments{experiment} c on s.msg.jobuuid = c.trip_id
  where
    datestr {date_filter}
  group by
    1, 2, 3, 4
)
select
  t.trip_id,
  t.segment_id,
  sum(case when (t.startjunctionuuid = s.startjunctionuuid and t.endjunctionuuid = s.endjunctionuuid) then t.streaks else 0 end) as trip_streaks_forward,
  sum(case when (t.startjunctionuuid = s.endjunctionuuid and t.endjunctionuuid = s.startjunctionuuid) then t.streaks else 0 end) as trip_streaks_backward,
  sum(case when (t.startjunctionuuid = s.startjunctionuuid and t.endjunctionuuid = s.endjunctionuuid) then t.avg_speed else 0 end) as trip_avg_speed_forward,
  sum(case when (t.startjunctionuuid = s.endjunctionuuid and t.endjunctionuuid = s.startjunctionuuid) then t.avg_speed else 0 end) as trip_avg_speed_backward
from
  subset_segments s
  join trip_streaks t on s.segment_id = t.segment_id
group by
  1, 2
''',

# query to find segment features
'segment_features': '''
with candidate_segments as (
  select distinct
    candidate_segment
  from
    {db}.candidate_segments{experiment} t
    cross join unnest(t.candidate_segments) as temp (candidate_segment)
),
subset_segments as (
  select
    s.uuid as segment_id,
    s.data.segment.carriageway,
    s.data.segment.controlledaccessroad,
    s.data.segment.defaultspeednegkph,
    s.data.segment.defaultspeedposkph,
    s.data.segment.drivingside,
    s.data.segment.groundlevel,
    s.data.segment.hovroad,
    s.data.segment.inintersection,
    s.data.segment.lanecount,
    s.data.segment.privateroad,
    s.data.segment.roadclass,
    s.data.segment.roadusage,
    s.data.segment.segmentattributes.maxspeednegkph,
    s.data.segment.segmentattributes.maxspeedposkph,
    s.data.segment.segmentattributes.nothroughtraffic,
    s.data.segment.segmentattributes.trafficdirection,
    s.data.segment.surfacetype,
    s.data.segment.toll,
    s.data.segment.type
  FROM
    umm.map_feature_segments_tomtom s
    inner JOIN dwh.dim_city c on ST_Contains(
      ST_GeometryFromText(c.simplified_shape),
      ST_Point(
        s.geometry.polyline.points [1].lnge7 / 1e7,
        s.geometry.polyline.points [1].late7 / 1e7
      )
    )
  where
    builduuid = '{builduuid}'
    and c.city_id {city_filter}
)
select
  t.candidate_segment,
  carriageway,
  controlledaccessroad,
  defaultspeednegkph,
  defaultspeedposkph,
  drivingside,
  groundlevel,
  hovroad,
  inintersection,
  lanecount,
  privateroad,
  roadclass,
  roadusage,
  maxspeednegkph,
  maxspeedposkph,
  nothroughtraffic,
  trafficdirection,
  surfacetype,
  toll,
  type
from
  subset_segments s
  join candidate_segments t on s.segment_id = t.candidate_segment
''',

# query to find actual streaks on the segment
# Each week is now reduced to ONE row per segment (directional_streaks_w1/_w2)
# before the two weeks are joined. The earlier version joined the per-direction
# rows of week 1 directly against the per-direction rows of week 2, so a segment
# with both a forward and a backward row in one week had the other week's sums
# counted twice. As before, only segments with streaks in both weeks are returned.
'actual_streaks': '''
with candidate_segments as (
  select distinct
    candidate_segment
  from
    {db}.candidate_segments{experiment} t
    cross join unnest(t.candidate_segments) as temp (candidate_segment)
),
subset_segments as (
  select
    uuid as segment_id,
    data.segment.startjunctionuuid,
    data.segment.endjunctionuuid
  from
    umm.map_feature_segments_tomtom s
    join candidate_segments t on s.uuid = t.candidate_segment
  where
    builduuid = '{builduuid}'
),
subset_aggregated_actual_streaks_w1 as (
  select
    msg.graphsegment.segmentuuid,
    msg.graphsegment.startjunctionuuid,
    msg.graphsegment.endjunctionuuid,
    count(uuid) as streaks,
    avg(msg.speedkmph) as avg_speed
  from
    rawdata_user.kafka_hp_maps_historical_streaks_tomtom_nodedup
    join candidate_segments on msg.graphsegment.segmentuuid = candidate_segment
  where
    datestr {date_filter_streaks1}
    and msg.classification = 'valid'
  group by
    1, 2, 3
),
subset_aggregated_actual_streaks_w2 as (
  select
    msg.graphsegment.segmentuuid,
    msg.graphsegment.startjunctionuuid,
    msg.graphsegment.endjunctionuuid,
    count(uuid) as streaks,
    avg(msg.speedkmph) as avg_speed
  from
    rawdata_user.kafka_hp_maps_historical_streaks_tomtom_nodedup
    join candidate_segments on msg.graphsegment.segmentuuid = candidate_segment
  where
    datestr {date_filter_streaks2}
    and msg.classification = 'valid'
  group by
    1, 2, 3
),
directional_streaks_w1 as (
  select
    s.segment_id,
    sum(case when (a.startjunctionuuid = s.startjunctionuuid and a.endjunctionuuid = s.endjunctionuuid) then a.streaks else 0 end) as actual_streaks_forward,
    sum(case when (a.startjunctionuuid = s.endjunctionuuid and a.endjunctionuuid = s.startjunctionuuid) then a.streaks else 0 end) as actual_streaks_backward,
    sum(case when (a.startjunctionuuid = s.startjunctionuuid and a.endjunctionuuid = s.endjunctionuuid) then a.avg_speed else 0 end) as avg_speed_forward,
    sum(case when (a.startjunctionuuid = s.endjunctionuuid and a.endjunctionuuid = s.startjunctionuuid) then a.avg_speed else 0 end) as avg_speed_backward
  from
    subset_segments s
    join subset_aggregated_actual_streaks_w1 a on s.segment_id = a.segmentuuid
  group by
    1
),
directional_streaks_w2 as (
  select
    s.segment_id,
    sum(case when (a.startjunctionuuid = s.startjunctionuuid and a.endjunctionuuid = s.endjunctionuuid) then a.streaks else 0 end) as actual_streaks_forward,
    sum(case when (a.startjunctionuuid = s.endjunctionuuid and a.endjunctionuuid = s.startjunctionuuid) then a.streaks else 0 end) as actual_streaks_backward,
    sum(case when (a.startjunctionuuid = s.startjunctionuuid and a.endjunctionuuid = s.endjunctionuuid) then a.avg_speed else 0 end) as avg_speed_forward,
    sum(case when (a.startjunctionuuid = s.endjunctionuuid and a.endjunctionuuid = s.startjunctionuuid) then a.avg_speed else 0 end) as avg_speed_backward
  from
    subset_segments s
    join subset_aggregated_actual_streaks_w2 a on s.segment_id = a.segmentuuid
  group by
    1
)
select
  w1.segment_id as segment_id,
  w1.actual_streaks_forward as actual_streaks_forward_w1,
  w2.actual_streaks_forward as actual_streaks_forward_w2,
  w1.actual_streaks_backward as actual_streaks_backward_w1,
  w2.actual_streaks_backward as actual_streaks_backward_w2,
  w1.avg_speed_forward as avg_speed_forward_w1,
  w2.avg_speed_forward as avg_speed_forward_w2,
  w1.avg_speed_backward as avg_speed_backward_w1,
  w2.avg_speed_backward as avg_speed_backward_w2
from
  directional_streaks_w1 w1
  join directional_streaks_w2 w2 on w1.segment_id = w2.segment_id
''',

# query to find suggested streaks on the segment
'suggested_streaks': '''
with candidate_segments as (
  select
    distinct candidate_segment
  from
    {db}.candidate_segments{experiment} t
    cross join unnest(t.candidate_segments) as temp (candidate_segment)
),
subset_segments as (
  select
    uuid as segment_id,
    data.segment.startjunctionuuid,
    data.segment.endjunctionuuid
  from
    umm.map_feature_segments_tomtom s
    join candidate_segments t on s.uuid = t.candidate_segment
  where
    builduuid = '{builduuid}'
)
select
  asus.segmentuuid as segment_id,
  sum(case when ((asus.startjunctionuuid = s.startjunctionuuid) and (asus.endjunctionuuid = s.endjunctionuuid) and (asus.datestr {date_filter_streaks1})) then asus.suggested_streaks else 0 end) as suggested_streaks_forward_w1,
  sum(case when ((asus.startjunctionuuid = s.startjunctionuuid) and (asus.endjunctionuuid = s.endjunctionuuid) and (asus.datestr {date_filter_streaks2})) then asus.suggested_streaks else 0 end) as suggested_streaks_forward_w2,
  sum(case when ((asus.startjunctionuuid = s.endjunctionuuid) and (asus.endjunctionuuid = s.startjunctionuuid) and (asus.datestr {date_filter_streaks1})) then asus.suggested_streaks else 0 end) as suggested_streaks_backward_w1,
  sum(case when ((asus.startjunctionuuid = s.endjunctionuuid) and (asus.endjunctionuuid = s.startjunctionuuid) and (asus.datestr {date_filter_streaks2})) then asus.suggested_streaks else 0 end) as suggested_streaks_backward_w2
from
  {db}.agg_suggested_streaks{experiment} asus
  join subset_segments s on s.segment_id = asus.segmentuuid
group by
  1
''',

# query to find if the candidate segment was fixed
# (this is the training label, exposed as the `fix` column)
'fixed_segments': '''
with candidate_segments as (
  select distinct
    map_ticket_id,
    candidate_segment
  from
    {db}.candidate_segments{experiment} t
    cross join unnest(t.candidate_segments) as temp (candidate_segment)
),
fixed_segments as (
  select distinct
    msg.map_ticket_id,
    msg.fixed_map_feature_ids as fixed_segment_ids
  from
    rawdata_user.kafka_hp_umm_hotfix_backend_living_maps_nodedup
  where
    msg.fixed_map_feature_ids is not null
    and cardinality(msg.fixed_map_feature_ids) > 0
)
select
  t.map_ticket_id,
  t.candidate_segment,
  cast(contains(f.fixed_segment_ids, t.candidate_segment) as tinyint) as fix
from
  candidate_segments t
  join fixed_segments f on t.map_ticket_id = f.map_ticket_id
'''
}

In [None]:
def make_queries(config, query_templates=None):
    """Fill the query templates with the parameters of one experiment.

    Args:
        config: dict mapping placeholder name -> value; every placeholder used
            by a template must be present.
        query_templates: optional dict (name -> template string). Defaults to
            the notebook-level ``features_query_templates``; pass a subset to
            render (and later re-run) only some of the feature queries.

    Returns:
        dict (name -> query string) with one entry per template.

    Raises:
        KeyError: if a template references a placeholder missing from ``config``.
    """
    if query_templates is None:
        query_templates = features_query_templates

    queries = {}
    for name, query_template in query_templates.items():
        try:
            queries[name] = query_template.format(**config)
        except KeyError as exc:
            # Re-raise as the same exception type, but say which template failed.
            raise KeyError("query template '{}' needs config key {} which is missing"
                           .format(name, exc)) from exc
    return queries

In [None]:
def collect_features(queries, sleep_time=120, timeout=None):
    """Run all the queries of an experiment and return their results.

    Every query is submitted to Presto through the notebook-level ``qr`` client
    (on behalf of the notebook-level ``email``), then all executions are polled
    together until each one reports completion.

    Args:
        queries: dict (name -> query string).
        sleep_time: seconds to wait between two polling rounds.
        timeout: optional overall limit in seconds for the polling phase.
            ``None`` (the default) waits indefinitely.

    Returns:
        dict (name -> pandas DataFrame) with one frame per query.

    Raises:
        TimeoutError: if ``timeout`` is set and some query is still running
            once it has elapsed.
    """
    print('Submitting queries...')
    execution_ids = {
        name: qr.submit_execution('presto', query, datacenter='dca1', user_email=email)
        for name, query in queries.items()
    }
    print('Queries submitted.')

    deadline = None if timeout is None else time.monotonic() + timeout

    while True:
        status = [qr.is_complete(execution_id) for execution_id in execution_ids.values()]

        if all(status):
            print('All queries executed. Collecting results...')
            dfs = {
                name: pd.DataFrame(qr.get_result(execution_id).load_data())
                for name, execution_id in execution_ids.items()
            }
            print('Results collected.')
            return dfs

        print('{}/{} queries executed.'.format(sum(status), len(status)))

        # Give up (only when a timeout was requested) instead of polling forever.
        if deadline is not None and time.monotonic() >= deadline:
            pending = [name for name, done in zip(execution_ids, status) if not done]
            raise TimeoutError('Queries still running after {} seconds: {}'
                               .format(timeout, ', '.join(pending)))

        time.sleep(sleep_time)

In [None]:
# Collecting features for week 1
# Render every query template with the week-1 config, then run them all;
# dfs1 maps each query name to its result DataFrame.
queries = make_queries(config1)
dfs1 = collect_features(queries)

In [None]:
# Collecting features for week 2
# Same steps as for week 1, using config2; note that `queries` is overwritten.
queries = make_queries(config2)
dfs2 = collect_features(queries)

In [None]:
# Collecting features for week 3
# Same steps as for week 1, using config3; note that `queries` is overwritten.
queries = make_queries(config3)
dfs3 = collect_features(queries)

# Merging features

In [None]:
def merge_features(dfs):
    """Left-join every feature table of one experiment onto its candidate segments.

    ``dfs`` is the dict (query name -> DataFrame) produced for one experiment.
    The result has exactly one row per row of ``dfs['candidate_segments']``;
    an AssertionError is raised if any join changed the row count.
    """
    ticket_keys = ['map_ticket_id', 'candidate_segment']
    trip_keys = ['trip_id', 'candidate_segment']
    segment_key = ['candidate_segment']

    # (feature table, key columns on the left, key columns on the right).
    # A right-hand key of None means the table shares the left-hand column names.
    merge_plan = [
        # Ticket based features
        ('report_location_dist', ticket_keys, None),
        # Trip based features
        ('segment_suggested', trip_keys, None),
        ('segment_traversed', trip_keys, None),
        ('avg_trip_speed', trip_keys, ['trip_id', 'segment_id']),
        # Segment based features
        ('segment_features', segment_key, None),
        # Aggregate streak based features
        ('actual_streaks', segment_key, ['segment_id']),
        ('suggested_streaks', segment_key, ['segment_id']),
        # Training label
        ('fixed_segments', ticket_keys, None),
    ]

    base_df = dfs['candidate_segments']
    merged_df = base_df
    for table_name, left_keys, right_keys in merge_plan:
        feature_df = dfs[table_name]
        if right_keys is None:
            merged_df = merged_df.merge(feature_df, on=left_keys, how='left')
        else:
            merged_df = merged_df.merge(feature_df, left_on=left_keys, right_on=right_keys, how='left')
            # the right-hand key duplicates candidate_segment, so discard it
            merged_df = merged_df.drop(columns=['segment_id'])

    assert len(base_df) == len(merged_df) # Number of rows should remain same
    return merged_df

In [None]:
# One merged feature table per weekly experiment, built in week order.
merged_df1, merged_df2, merged_df3 = [merge_features(week_dfs) for week_dfs in (dfs1, dfs2, dfs3)]
# Stack the three weeks on top of each other (the original row labels are kept).
merged_df = pd.concat([merged_df1, merged_df2, merged_df3])

In [None]:
import pickle
# Persist the concatenated feature table to 'merged_df.p' in the working directory.
with open('merged_df.p', 'wb') as f:
    pickle.dump(merged_df, f) # Save the merged df for later use
    
# Left disabled: reload a previously saved table instead of recomputing it.
# Note the example reads 'three_weeks.p', not the 'merged_df.p' written above.
# Only unpickle files you created yourself - pickle.load can execute arbitrary code.
# with open('three_weeks.p', 'rb') as f:
#     merged_df = pickle.load(f)

In [None]:
# Only candidates with a known label are usable: rows whose 'fix' value is
# missing are discarded and the remaining rows are renumbered from 0.
df = merged_df.loc[merged_df['fix'].notna()].reset_index(drop=True).copy()
# Make sure the label column is numeric.
df['fix'] = pd.to_numeric(df['fix'])

In [None]:
# Number of rows per label value (how many candidate rows carry each 'fix' value).
df['fix'].value_counts()

## Split data

In [None]:
# Rows reported on one of these dates form the test set; all other rows are used for training.
# Alternative split: test_dates = ['2020-07-23', '2020-07-24', '2020-07-25']
test_dates = ['2020-07-27', '2020-07-28', '2020-07-29']
test_df = df.loc[df['report_date'].isin(test_dates)].copy()
train_df = df.loc[~df['report_date'].isin(test_dates)].copy()

## Process features

In [None]:
# These columns will be one hot encoded
categorical_features = ['carriageway', 'drivingside', 'groundlevel', 'roadclass',
                        'roadusage', 'trafficdirection', 'surfacetype', 'toll', 'type']
# These two lists should be changed in tandem: categories[i] lists the values
# that get their own one-hot column for categorical_features[i].
categories = [['SINGLE_CARRIAGE_WAY', 'DUAL_CARRIAGE_WAY'],
              ['LEFT', 'RIGHT'],
              ['TUNNEL', 'BRIDGE'],
              ['MOTORWAY' , 'MINOR_HIGHWAY', 'LOCAL_ROAD', 'MAJOR_ARTERY', 'MINOR_ARTERY', 'HIGHWAY'],
              ['SLIP_ROAD', 'WALKWAY', 'PARKING_ROAD', 'CONNECTOR', 'RAMP', 'STAIRS', 'ROUNDABOUT'],
              ['BOTH', 'BACKWARD', 'NONE', 'FORWARD'],
              ['POOR', 'PAVED'],
              ['FORWARD', 'BOTH', 'BACKWARD'],
              ['ROAD', 'FERRY']]
assert len(categorical_features) == len(categories), \
    'categorical_features and categories must have one entry per column'

# Missing categorical values (NaN) are replaced by the constant 'na'. Since 'na'
# is not among the listed categories, the encoder below (handle_unknown='ignore')
# turns it into an all-zero one-hot group.
from sklearn.impute import SimpleImputer
categorical_imputer = SimpleImputer(strategy='constant', fill_value='na', missing_values=np.nan)
categorical_imputer = categorical_imputer.fit(train_df[categorical_features])

from sklearn.preprocessing import OneHotEncoder
try:
    # scikit-learn >= 1.2 spells the dense-output switch `sparse_output`
    one_hot_encoder = OneHotEncoder(categories=categories, handle_unknown='ignore', sparse_output=False)
except TypeError:
    # older scikit-learn releases only know the original `sparse` keyword
    one_hot_encoder = OneHotEncoder(categories=categories, handle_unknown='ignore', sparse=False)
one_hot_encoder = one_hot_encoder.fit(categorical_imputer.transform(train_df[categorical_features]))

In [None]:
def process_features(t_df, return_feature_names=False):
    """Turn a merged feature table into the numeric matrix fed to the model.

    Steps: drop the identifier columns, cast the flag columns to float, impute
    and one-hot encode the categorical columns (using the notebook-level
    ``categorical_features``, ``categorical_imputer`` and ``one_hot_encoder``)
    and append the ratio features derived from the streak/speed columns.

    The input frame is NOT modified; all work happens on a copy.

    Args:
        t_df: DataFrame with the merged feature columns (without the label).
        return_feature_names: when True, also return the column names of the matrix.

    Returns:
        ``X_t`` (2-D numpy array: one-hot columns first, then the remaining
        columns), or ``(X_t, feature_names)`` when ``return_feature_names`` is True.
    """
    drop_cols = ['map_ticket_id', 'trip_id', 'city_id', 'report_lat', 'report_long',
                 'report_date', 'candidate_segment']
    # drop() returns a new frame, so the caller's DataFrame stays untouched
    t_df = t_df.drop(drop_cols, axis=1)

    convert_to_numeric_cols = ['segment_suggested', 'segment_traversed', 'controlledaccessroad', 'hovroad',
                               'inintersection', 'privateroad', 'nothroughtraffic']
    for col in convert_to_numeric_cols:
        # values that cannot be parsed as numbers become NaN
        t_df[col] = pd.to_numeric(t_df[col], errors='coerce').astype('float')

    t_df[categorical_features] = categorical_imputer.transform(t_df[categorical_features])

    # (new column, numerator column, denominator column); the order of this list
    # fixes the order of the resulting feature columns.
    # Zero denominators are not special-cased: they yield inf (or NaN for 0/0).
    ratio_features = [
        # forward by backward ratios
        ('actual_streaks_fb_ratio_w1', 'actual_streaks_forward_w1', 'actual_streaks_backward_w1'),
        ('actual_streaks_fb_ratio_w2', 'actual_streaks_forward_w2', 'actual_streaks_backward_w2'),
        ('avg_speed_fb_ratio_w1', 'avg_speed_forward_w1', 'avg_speed_backward_w1'),
        ('avg_speed_fb_ratio_w2', 'avg_speed_forward_w2', 'avg_speed_backward_w2'),
        ('suggested_streaks_fb_ratio_w1', 'suggested_streaks_forward_w1', 'suggested_streaks_backward_w1'),
        ('suggested_streaks_fb_ratio_w2', 'suggested_streaks_forward_w2', 'suggested_streaks_backward_w2'),
        # suggested by actual ratios
        ('suggested_actual_forward_ratio_w1', 'suggested_streaks_forward_w1', 'actual_streaks_forward_w1'),
        ('suggested_actual_forward_ratio_w2', 'suggested_streaks_forward_w2', 'actual_streaks_forward_w2'),
        ('suggested_actual_backward_ratio_w1', 'suggested_streaks_backward_w1', 'actual_streaks_backward_w1'),
        ('suggested_actual_backward_ratio_w2', 'suggested_streaks_backward_w2', 'actual_streaks_backward_w2'),
        # w1 by w2 ratios
        ('actual_streaks_forward_ratio12', 'actual_streaks_forward_w1', 'actual_streaks_forward_w2'),
        ('actual_streaks_backward_ratio12', 'actual_streaks_backward_w1', 'actual_streaks_backward_w2'),
        ('avg_speed_forward_ratio12', 'avg_speed_forward_w1', 'avg_speed_forward_w2'),
        ('avg_speed_backward_ratio12', 'avg_speed_backward_w1', 'avg_speed_backward_w2'),
        ('suggested_streaks_forward_ratio12', 'suggested_streaks_forward_w1', 'suggested_streaks_forward_w2'),
        ('suggested_streaks_backward_ratio12', 'suggested_streaks_backward_w1', 'suggested_streaks_backward_w2'),
        # trip by aggregate ratios
        ('avg_speed_forward_ratio_trip_agg1', 'trip_avg_speed_forward', 'avg_speed_forward_w1'),
        ('avg_speed_forward_ratio_trip_agg2', 'trip_avg_speed_forward', 'avg_speed_forward_w2'),
        ('avg_speed_backward_ratio_trip_agg1', 'trip_avg_speed_backward', 'avg_speed_backward_w1'),
        ('avg_speed_backward_ratio_trip_agg2', 'trip_avg_speed_backward', 'avg_speed_backward_w2'),
    ]
    for ratio_col, numerator_col, denominator_col in ratio_features:
        t_df[ratio_col] = t_df[numerator_col] / t_df[denominator_col]

    # everything that is not one-hot encoded is passed through as-is
    numeric_df = t_df.drop(categorical_features, axis=1)
    X_t = np.concatenate([one_hot_encoder.transform(t_df[categorical_features]),
                          numeric_df.values], axis=1)

    if return_feature_names:
        # scikit-learn >= 1.0 offers get_feature_names_out (and later dropped
        # get_feature_names); older releases only have get_feature_names.
        if hasattr(one_hot_encoder, 'get_feature_names_out'):
            one_hot_names = one_hot_encoder.get_feature_names_out()
        else:
            one_hot_names = one_hot_encoder.get_feature_names()
        feature_names = list(one_hot_names) + list(numeric_df.columns)
        return X_t, feature_names
    else:
        return X_t

## Training model

In [None]:
# Training label: the 'fix' column of the training rows.
y = train_df['fix']
# Feature matrix and matching column names, built from the training rows
# without the label column.
X, feature_names = process_features(train_df.drop(['fix'], axis=1), return_feature_names=True)

In [None]:
from itertools import groupby
import xgboost as xgb

# required for ranking model: number of rows in each group, where a group is
# the run of consecutive rows that belong to one map ticket
group_lengths = [len(list(group)) for key, group in groupby(train_df['map_ticket_id'])]

# itertools.groupby only merges *consecutive* equal keys. If the rows of one
# ticket were scattered, it would silently be split into several groups.
assert len(group_lengths) == train_df['map_ticket_id'].nunique(), \
    'rows of the same map_ticket_id must be contiguous in train_df'

# Split into train and val: the last fifth of the groups is held out for validation.
total_groups = len(group_lengths) # in train and val
n_val_groups = total_groups // 5
# Slice from the front: a negative-index slice such as [-0:] would select
# everything when there are fewer than 5 groups.
n_train_groups = total_groups - n_val_groups
train_groups = group_lengths[:n_train_groups]
val_groups = group_lengths[n_train_groups:]

# Row boundary between the two parts (X and y share the row order of train_df).
n_train_rows = sum(train_groups)
X_train, y_train = X[:n_train_rows], y.iloc[:n_train_rows]
X_val, y_val = X[n_train_rows:], y.iloc[n_train_rows:]

train_dmatrix = xgb.DMatrix(X_train, y_train, feature_names=feature_names)
valid_dmatrix = xgb.DMatrix(X_val, y_val, feature_names=feature_names)

# for ranking
train_dmatrix.set_group(train_groups)
valid_dmatrix.set_group(val_groups)

In [None]:
# Booster settings: ranking objective 'rank:map', evaluated with the 'map' metric.
params = {'objective': 'rank:map',
          'eval_metric': 'map',
          'max_depth': 5, # increase to overfit (default: 6)
          'min_child_weight': 1, # increase to underfit (default: 1)
          'min_split_loss': 0, # increase to underfit (default: 0)
          'subsample': 0.8, # decrease to underfit (default: 1)
          'colsample_bytree': 0.5, # decrease to underfit (default: 1)
          'learning_rate': 0.1, # decrease to underfit
          'verbosity': 3} 

# Train for at most 200 rounds with early_stopping_rounds=20. Both the training
# and the validation matrix are evaluated each round, and maximize=True tells
# xgboost that a larger metric value is better.
model = xgb.train(params, train_dmatrix, maximize=True, num_boost_round=200, early_stopping_rounds=20,
                  verbose_eval=True, evals=[(train_dmatrix, 'train'), (valid_dmatrix, 'validation')])

## Evaluating

In [None]:
# Fetch, for every map ticket that has any, the ids of the map features that were fixed.

query = '''
select distinct
  msg.map_ticket_id,
  msg.fixed_map_feature_ids as fixed_segment_ids
from
  rawdata_user.kafka_hp_umm_hotfix_backend_living_maps_nodedup
where
  msg.fixed_map_feature_ids is not null
  and cardinality(msg.fixed_map_feature_ids) > 0
'''

query_result = qr.execute('presto', query)

# map_ticket_id: [fixed_segment, fixed_segment, ...]
# The ids arrive as one string and are split on the '\x02' character. If a
# ticket appears in several result rows, the last row wins.
fixed_segments_dict = {
    row['map_ticket_id']: row['fixed_segment_ids'].split('\x02')
    for row in query_result.load_data()
}

In [None]:
# Build the model input for the test rows (label column removed)
X_test = process_features(test_df.drop(['fix'], axis=1))
test_dmatrix = xgb.DMatrix(X_test, feature_names=feature_names)

# Score every (ticket, candidate segment) pair of the test set.
# The "confidence" is not really a confidence, just an arbitrary ranking score.
confidence_df = test_df[['map_ticket_id', 'candidate_segment']].copy()
confidence_df['confidence'] = model.predict(test_dmatrix, ntree_limit=model.best_ntree_limit)

# Group all the segments, together with their scores, by the ticket they belong to:
# {map_ticket_id: [(candidate_segment, score), (candidate_segment, score), ...]}
confidence_scores = {}
for ticket_id, segment_id, score in zip(confidence_df['map_ticket_id'],
                                        confidence_df['candidate_segment'],
                                        confidence_df['confidence']):
    confidence_scores.setdefault(ticket_id, []).append((segment_id, score))

# Order each ticket's candidates from the highest to the lowest score
for map_ticket_id in confidence_scores:
    confidence_scores[map_ticket_id].sort(key=lambda pair: pair[1], reverse=True)

In [None]:
def show_results(k):
    """Compare the k best-scored candidates of every test ticket with its fixed segments.

    Reads the notebook-level ``confidence_scores`` (ticket -> candidates sorted
    by score) and ``fixed_segments_dict`` (ticket -> fixed segment ids).

    Returns a DataFrame indexed by map_ticket_id whose columns hold sets:
    candidate_segments, fixed_segments, top_k_segments and the intersections of
    the candidates / the top k with the fixed segments.
    """
    rows = []
    for ticket_id, scored_segments in confidence_scores.items():
        fixed = set(fixed_segments_dict[ticket_id])
        ranked_ids = [entry[0] for entry in scored_segments]
        rows.append({'map_ticket_id': ticket_id,
                     'candidate_segments': set(ranked_ids),
                     'fixed_segments': fixed,
                     'top_k_segments': set(ranked_ids[:k])})
    result = pd.DataFrame(rows).set_index('map_ticket_id')

    # element-wise set intersection of each segment column with the fixed segments
    for prefix in ('candidate', 'top_k'):
        result[prefix + '_fixed_intersection'] = [
            segments & fixed
            for segments, fixed in zip(result[prefix + '_segments'], result['fixed_segments'])
        ]
    return result

In [None]:
# For k = 1..5, print the share of test tickets for which at least one fixed
# segment is among the k best-scored candidates.
for k in range(1, 6):
    results = show_results(k = k)
    print("k: {}, precision: {}".format(k, results['top_k_fixed_intersection'].map(len).gt(0).mean()))

## Analysis

In [None]:
# Plot the feature importance of the trained model, using the 'weight' importance type.
xgb.plot_importance(model, color='#0e9453', importance_type='weight')
# Enlarge the current figure to 20 x 15 inches.
fig = plt.gcf()
fig.set_size_inches(20, 15)