### **Inference**

In [1]:
import json
import pandas as pd
import numpy as np
import joblib

# Decision cut-off applied to the positive-class probability in the
# prediction cell further down.
threshold = 0.48

# Keep rendered frames compact: at most 10 rows and 50 columns.
pd.options.display.max_rows = 10
pd.options.display.max_columns = 50

In [2]:
# Load the two persisted models used by this notebook:
#   * kmeans_model - assigns each GPS fix to a cluster (see the next cells).
#   * lgbm_model   - classifier whose predict_proba drives the final decision
#                    (presumably LightGBM, judging by the file name).
# NOTE: joblib.load unpickles the file, so only load artefacts from a trusted source.
kmeans_model = joblib.load('../server/models/kmeans_model.joblib')
lgbm_model = joblib.load('../server/models/lgbm.joblib')

In [3]:
# Read the request payload. JSON is UTF-8 encoded, so the encoding is passed
# explicitly instead of relying on the platform's locale default.
with open('../payload.json', encoding='utf-8') as f:
    data = json.load(f)

# 'user_attributes' is a flat dict of scalars -> a single-row frame (index 0).
user_attributes_df = pd.DataFrame(data['user_attributes'], index=[0])
# 'user_gpx_fixes' is a list of records -> one row per GPS fix.
user_gpx_fixes_df = pd.DataFrame(data['user_gpx_fixes'])

In [4]:
# Label every fix with the cluster predicted by the K-Means model from its
# (latitude, longitude) pair. `assign` returns a new frame, so the cell does
# not modify a frame that an earlier cell already displayed.
user_gpx_fixes_df = user_gpx_fixes_df.assign(
    cluster=kmeans_model.predict(user_gpx_fixes_df[['latitude', 'longitude']])
)

# Use the fix id as the index. Guarded so the cell can be re-run: once 'id' is
# the index it is no longer a column and an unconditional set_index('id')
# would raise KeyError.
if user_gpx_fixes_df.index.name != 'id':
    user_gpx_fixes_df = user_gpx_fixes_df.set_index('id')

user_gpx_fixes_df

Unnamed: 0_level_0,gps_fix_at,server_upload_at,longitude,latitude,accuracy,altitude,bearing,location_provider,user_id,cluster
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,2017-06-22 09:37:20,2017-06-22 09:43:42,36.840540,-1.294342,68.4,0.0,0.0,fused,1,5
2,2017-08-14 07:50:27,2017-08-14 09:05:27,36.895270,-1.341928,1409.0,0.0,0.0,fused,1,5
3,2017-06-13 10:34:29,2017-06-13 10:54:48,36.811903,-1.307220,68.4,0.0,0.0,fused,1,5
4,2017-06-18 12:16:20,2017-06-18 12:16:24,36.907049,-1.309984,1581.0,0.0,0.0,fused,1,5
5,2017-06-28 09:39:08,2017-06-28 09:58:12,36.839396,-1.280310,1396.0,0.0,0.0,fused,1,5
...,...,...,...,...,...,...,...,...,...,...
66,2017-06-25 18:24:01,2017-06-25 19:14:58,37.525418,-2.796975,4152.0,0.0,0.0,fused,1,20
67,2017-04-05 08:38:57,2017-04-05 08:39:00,36.991799,-1.261228,5.4,1509.7,276.0,fused,1,5
68,2017-06-28 06:00:35,2017-06-28 07:08:54,36.874843,-1.341133,2007.0,0.0,0.0,fused,1,5
69,2017-01-24 15:27:24,2017-01-24 15:28:32,36.903354,-1.331759,39.0,0.0,0.0,fused,1,5


#### **Inference Feature Extractor**

In [11]:
# Integer codes for the known location providers; the order fixes which
# `location_provider_count_*` feature each provider feeds.
_LOCATION_PROVIDER_CODES = {'fused': 0, 'gps': 1, 'network': 2, 'local_database': 3}


def _extract_user_features(fixes):
    """Add per-fix timing features and per-user constants for ONE user's fixes.

    Parameters
    ----------
    fixes : pandas.DataFrame
        All GPS fixes of a single user. Must contain `gps_fix_at`,
        `server_upload_at`, `location_provider`, `altitude` and `bearing`.

    Returns
    -------
    pandas.DataFrame
        A new frame, sorted by `gps_fix_at`, with the two timestamp columns
        removed and the derived feature columns added. The input is not
        modified.
    """
    # Work on a copy so the caller's frame is never touched.
    fixes = fixes.copy()
    fixes['gps_fix_at'] = pd.to_datetime(fixes['gps_fix_at'])
    fixes['server_upload_at'] = pd.to_datetime(fixes['server_upload_at'])
    fixes = fixes.sort_values(by=['gps_fix_at'])

    # Upload latency of each fix, in seconds.
    fixes['time_in_out_diff'] = (
        fixes['server_upload_at'] - fixes['gps_fix_at']
    ).dt.total_seconds()
    # Gap to the previous fix, in days; the first fix has no predecessor -> 0.
    fixes['time_gps_fix_shift'] = (
        fixes['gps_fix_at'].diff().dt.total_seconds() / 86400
    ).fillna(0)
    # Whole days between the earliest fix and the upload of the latest fix.
    fixes['gps_first_server_last_diff'] = (
        fixes['server_upload_at'].iloc[-1] - fixes['gps_fix_at'].iloc[0]
    ).days

    # Number of fixes per location provider. Unknown provider values are left
    # as they are and therefore counted under none of the four features.
    encoded = fixes['location_provider'].map(
        lambda provider: _LOCATION_PROVIDER_CODES.get(provider, provider)
    )
    # A plain dict avoids Series.get falling back to positional lookup.
    provider_counts = dict(encoded.value_counts())
    for label, code in _LOCATION_PROVIDER_CODES.items():
        fixes[f'location_provider_count_{label}'] = int(provider_counts.get(code, 0))

    # altitude_native: 0 when most fixes report altitude 0, otherwise 1
    # (the original comment labels these "city native" / "hilly native").
    zero_count = (fixes['altitude'] == 0).sum()
    non_zero_count = (fixes['altitude'] != 0).sum()
    fixes['altitude_native'] = 0 if zero_count > non_zero_count else 1
    # Fixes with a non-zero bearing are counted as travels.
    fixes['num_travels'] = int((fixes['bearing'] != 0).sum())

    # Drop the "gps_fix_at" and "server_upload_at" columns; only the derived
    # timing features are needed from here on.
    return fixes.drop(columns=['gps_fix_at', 'server_upload_at'])


def _aggregate_user_features(fix_features, n_clusters):
    """Collapse the per-fix feature frame into one feature row per `user_id`.

    Column names are `<source column>_<aggregator name>`, so the aggregator
    function names below (including the one literally called `_`) are part of
    the output schema and must not be renamed.
    """
    # The aggregators' names become column suffixes -- keep them as they are.
    def max_(x):
        return np.max(x)

    def min_(x):
        return np.min(x)

    def std_(x):
        # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.
        return np.std(x) if len(x) > 1 else np.nan

    def mean_(x):
        return np.mean(x)

    def _(x):
        # Used for per-user constants: every row holds the same value.
        return int(np.mean(x))

    def quantile_01_(x):
        return np.quantile(x, 0.1)

    def quantile_03_(x):
        return np.quantile(x, 0.3)

    def quantile_05_(x):
        return np.quantile(x, 0.5)

    def quantile_07_(x):
        return np.quantile(x, 0.7)

    def quantile_09_(x):
        return np.quantile(x, 0.9)

    quantiles = [quantile_01_, quantile_03_, quantile_05_, quantile_07_, quantile_09_]
    aggregations = {
        'time_in_out_diff': [max_, min_, std_, mean_, *quantiles],
        'time_gps_fix_shift': [max_, std_, mean_, *quantiles],
        'longitude': [std_],
        'latitude': [std_],
        'altitude': [mean_, std_, *quantiles],
        'bearing': [mean_, std_, *quantiles],
        'accuracy': [mean_, std_, *quantiles],
        'gps_first_server_last_diff': [_],
        'location_provider_count_fused': [_],
        'location_provider_count_gps': [_],
        'location_provider_count_network': [_],
        'location_provider_count_local_database': [_],
        'altitude_native': [_],
        'num_travels': [_],
    }

    # Only the columns that feed the output are kept, then missing values are
    # replaced by 0 before aggregating.
    needed = ['user_id', 'cluster', *aggregations]
    fix_features = fix_features[needed].fillna(0)

    aggregated = fix_features.groupby('user_id').agg(aggregations).reset_index()
    # Flatten the (column, aggregator) MultiIndex into single-level names.
    aggregated.columns = ['_'.join(col).strip() for col in aggregated.columns.values]
    aggregated = aggregated.rename(columns={'user_id_': 'user_id'})

    # Number of fixes per cluster id for every user. Both this groupby and the
    # one above sort by user_id, so the rows line up positionally.
    clusters = fix_features['cluster']
    user_ids = fix_features['user_id']
    for i in range(n_clusters):
        aggregated[f'cluster_{i}_count_'] = (
            clusters.eq(i).groupby(user_ids).sum().to_numpy()
        )
    return aggregated


def feature_extractor(df, n_clusters=24):
    """Build the model's feature row(s) from raw GPS fixes.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per GPS fix with the columns `user_id`, `gps_fix_at`,
        `server_upload_at`, `longitude`, `latitude`, `accuracy`, `altitude`,
        `bearing`, `location_provider` and `cluster`. The frame is not
        modified.
    n_clusters : int, default 24
        Number of cluster ids to emit `cluster_<i>_count_` columns for
        (ids `0 .. n_clusters - 1`).

    Returns
    -------
    pandas.DataFrame
        One row per `user_id`: `user_id`, the aggregated features named
        `<column>_<aggregator>`, then the `cluster_<i>_count_` columns.
        Rows containing NaN are kept -- the `std_` features are NaN for a user
        with a single fix. (The original called `dropna()` without using its
        result, so keeping those rows is the established behaviour.)

    Raises
    ------
    ValueError
        If `df` is None, empty, or contains no rows with a usable `user_id`.
    """
    if df is None or df.empty:
        raise ValueError('feature_extractor needs at least one GPS fix')

    # Iterating the groups explicitly always passes `user_id` along with each
    # group, which groupby(...).apply(...) no longer guarantees on newer pandas.
    per_user = [
        _extract_user_features(user_fixes)
        for _user_id, user_fixes in df.groupby('user_id')
    ]
    if not per_user:
        raise ValueError('feature_extractor found no rows with a user_id')

    fix_features = pd.concat(per_user, ignore_index=True)
    return _aggregate_user_features(fix_features, n_clusters)

In [12]:
# Aggregate the cluster-labelled GPS fixes into one feature row per user_id.
df = feature_extractor(user_gpx_fixes_df)

In [13]:
# Display the aggregated feature frame returned by feature_extractor.
df

Unnamed: 0,user_id,time_in_out_diff_max_,time_in_out_diff_min_,time_in_out_diff_std_,time_in_out_diff_mean_,time_in_out_diff_quantile_01_,time_in_out_diff_quantile_03_,time_in_out_diff_quantile_05_,time_in_out_diff_quantile_07_,time_in_out_diff_quantile_09_,time_gps_fix_shift_max_,time_gps_fix_shift_std_,time_gps_fix_shift_mean_,time_gps_fix_shift_quantile_01_,time_gps_fix_shift_quantile_03_,time_gps_fix_shift_quantile_05_,time_gps_fix_shift_quantile_07_,time_gps_fix_shift_quantile_09_,longitude_std_,latitude_std_,altitude_mean_,altitude_std_,altitude_quantile_01_,altitude_quantile_03_,altitude_quantile_05_,...,num_travels__,cluster_0_count_,cluster_1_count_,cluster_2_count_,cluster_3_count_,cluster_4_count_,cluster_5_count_,cluster_6_count_,cluster_7_count_,cluster_8_count_,cluster_9_count_,cluster_10_count_,cluster_11_count_,cluster_12_count_,cluster_13_count_,cluster_14_count_,cluster_15_count_,cluster_16_count_,cluster_17_count_,cluster_18_count_,cluster_19_count_,cluster_20_count_,cluster_21_count_,cluster_22_count_,cluster_23_count_
0,1,6735.0,3.0,1361.84606,999.1,9.7,44.5,389.0,1230.7,3093.0,28.945139,5.111458,2.900228,0.00736,0.110153,0.544126,1.812738,10.698556,0.539118,0.528805,75.69,323.532169,0.0,0.0,0.0,...,3,0,0,0,0,0,63,2,0,0,0,0,0,0,0,0,0,1,0,0,0,3,0,0,1


In [14]:
# Attach the payload's top-level user_id to the attribute row; it is the key
# used to join the attributes onto the extracted features further down.
user_attributes_df['user_id'] = data['user_id']

In [15]:
# Show the raw payload dict (user_id, user_attributes, user_gpx_fixes).
# NOTE(review): this renders every GPS fix of the user; consider displaying
# only a summary and clearing this output before sharing the notebook.
data

{'user_id': 1,
 'user_attributes': {'age': 42, 'cash_incoming_30days': 8988.12},
 'user_gpx_fixes': [{'id': 1,
   'gps_fix_at': '2017-06-22 09:37:20',
   'server_upload_at': '2017-06-22 09:43:42',
   'longitude': 36.8405401,
   'latitude': -1.2943417,
   'accuracy': 68.4,
   'altitude': 0.0,
   'bearing': 0.0,
   'location_provider': 'fused',
   'user_id': 1,
   'cluster': 5},
  {'id': 2,
   'gps_fix_at': '2017-08-14 07:50:27',
   'server_upload_at': '2017-08-14 09:05:27',
   'longitude': 36.8952702,
   'latitude': -1.3419282,
   'accuracy': 1409.0,
   'altitude': 0.0,
   'bearing': 0.0,
   'location_provider': 'fused',
   'user_id': 1,
   'cluster': 5},
  {'id': 3,
   'gps_fix_at': '2017-06-13 10:34:29',
   'server_upload_at': '2017-06-13 10:54:48',
   'longitude': 36.8119029,
   'latitude': -1.3072201,
   'accuracy': 68.4,
   'altitude': 0.0,
   'bearing': 0.0,
   'location_provider': 'fused',
   'user_id': 1,
   'cluster': 5},
  {'id': 4,
   'gps_fix_at': '2017-06-18 12:16:20',
   '

In [16]:
# Join the user attributes onto the feature row(s) by user_id.
#
# The cell reassigns `df` from itself, so it must be safe to re-run: attribute
# columns left over from a previous run are dropped first, otherwise a second
# merge would produce `age_x` / `age_y`-style duplicates and change the model
# input. The attribute columns still end up last, in their original order.
attribute_columns = user_attributes_df.columns.difference(['user_id'])
df = (
    df.drop(columns=attribute_columns, errors='ignore')
      # validate= makes duplicated user_ids on either side fail loudly.
      .merge(user_attributes_df, on='user_id', validate='one_to_one')
)
# The merge is an inner join: an id mismatch would silently yield zero rows.
assert not df.empty, 'no user_attributes row matches the extracted user_id'
df

Unnamed: 0,user_id,time_in_out_diff_max_,time_in_out_diff_min_,time_in_out_diff_std_,time_in_out_diff_mean_,time_in_out_diff_quantile_01_,time_in_out_diff_quantile_03_,time_in_out_diff_quantile_05_,time_in_out_diff_quantile_07_,time_in_out_diff_quantile_09_,time_gps_fix_shift_max_,time_gps_fix_shift_std_,time_gps_fix_shift_mean_,time_gps_fix_shift_quantile_01_,time_gps_fix_shift_quantile_03_,time_gps_fix_shift_quantile_05_,time_gps_fix_shift_quantile_07_,time_gps_fix_shift_quantile_09_,longitude_std_,latitude_std_,altitude_mean_,altitude_std_,altitude_quantile_01_,altitude_quantile_03_,altitude_quantile_05_,...,cluster_1_count_,cluster_2_count_,cluster_3_count_,cluster_4_count_,cluster_5_count_,cluster_6_count_,cluster_7_count_,cluster_8_count_,cluster_9_count_,cluster_10_count_,cluster_11_count_,cluster_12_count_,cluster_13_count_,cluster_14_count_,cluster_15_count_,cluster_16_count_,cluster_17_count_,cluster_18_count_,cluster_19_count_,cluster_20_count_,cluster_21_count_,cluster_22_count_,cluster_23_count_,age,cash_incoming_30days
0,1,6735.0,3.0,1361.84606,999.1,9.7,44.5,389.0,1230.7,3093.0,28.945139,5.111458,2.900228,0.00736,0.110153,0.544126,1.812738,10.698556,0.539118,0.528805,75.69,323.532169,0.0,0.0,0.0,...,0,0,0,0,63,2,0,0,0,0,0,0,0,0,0,1,0,0,0,3,0,0,1,42,8988.12


In [17]:
# Binary decision for the first (only) row: 1 when the probability in column 1
# of predict_proba exceeds `threshold`, else 0. user_id is an identifier, not a
# feature, so it is removed before scoring.
model_inputs = df.drop(columns=['user_id'])
positive_probability = lgbm_model.predict_proba(model_inputs)[0][1]
int(positive_probability > threshold)

1

In [19]:
# Raw class probabilities behind the decision above, one row per user.
# Presumably the columns follow lgbm_model.classes_ order, with column 1 the
# positive class that is compared against `threshold` -- TODO confirm.
lgbm_model.predict_proba(df.drop(columns=['user_id']))

array([[0.4942382, 0.5057618]])