In [77]:
import pyspark
from pyspark.sql.types import *
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
import pyspark.sql.functions as F
from pyspark.ml.pipeline import PipelineModel
import pyspark.ml.feature as ML
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassificationModel

In [78]:
account_name = "REDACTED"
account_key = "REDACTED"

spark = (
    SparkSession
        .builder
        .master('local[*]')
        .appName("Ingestion")
        .config("spark.driver.memory", "4g")
        .config("fs.azure.account.auth.type." + account_name + ".dfs.core.windows.net", "SharedKey")
        .config("fs.azure.account.key." + account_name + ".dfs.core.windows.net", account_key)
        .getOrCreate()
)

sc = spark.sparkContext

In [79]:
container_name = 'data'
path_to_table = '/features/'

def readDataframeFromAdls(spark_session, container_name, path_to_table, table_name):
    return (
        spark_session
            .read.parquet(f"abfss://{container_name}@REDACTED.dfs.core.windows.net{path_to_table}{table_name}")
    )

In [80]:
df_features = readDataframeFromAdls(spark, container_name, path_to_table, 'df_features').drop('ID_FILE', 'ID_STATE', 'DAY', 'MONTH', 'YEAR', 'STIME', 'STIME_SEC', 'LTIME', 'LON', 'LAT', 'LTIME_SEC', 'SLOT', 'LAG_LAT', 'LAG_LON', 'WEEKDAY', 'HOUR')

In [81]:
path = '/model/'
name = 'rfc_model'
rfc_model = RandomForestClassificationModel.load(f"abfss://{container_name}@REDACTED.dfs.core.windows.net{path}{name}")

In [82]:
df_duplicates = df_features.drop_duplicates()

In [83]:
types = ['GSM','LTE','UNKNOWN','WCDMA']
gps_sources = ['FUSED', 'GPS','NET']
stypes = ['LOCATION' ,'NF_BOOST_TECHNOLOGY', 'NF_SERVICE', 'NF_BOOST_COVERAGE','TIMER', 'WORKER','UNKNOWN']
battery_states = ['CHARGING' ,'FULL', 'NOT_CHARGING','UNCHARGING', 'UNKNOWN']
activity_states = ['IN_VEHICLE','ON_BICYCLE','ON_FOOT','STILL','TILTING','WALKING', 'UNKNOWN']
times_of_day = ['MORNING', 'AFTERNOON', 'NIGHT']

In [84]:
def filterCategoricalFeatures(df, column_name, feature_list, else_value='UNKNOWN'):
    return (
        df.withColumn(column_name, 
            F.when(F.col(column_name).isin(feature_list), F.col(column_name))
            .otherwise(F.lit(else_value))
        )
    )

In [85]:
df_activity = filterCategoricalFeatures(df_duplicates, 'ACTIVITY_STATE', activity_states)
df_lag_activity = filterCategoricalFeatures(df_activity, 'LAG_ACTIVITY', activity_states)
df_lead_activity = filterCategoricalFeatures(df_lag_activity, 'LEAD_ACTIVITY', activity_states)
df_battery = filterCategoricalFeatures(df_lead_activity, 'BATT_CHARGE', battery_states)
df_stypes = filterCategoricalFeatures(df_battery, 'STYPE', stypes)
df_source = filterCategoricalFeatures(df_stypes, 'SOURCE', gps_sources)
df_types = filterCategoricalFeatures(df_source, 'TYPE', types)
df_time_day = filterCategoricalFeatures(df_types, 'TIME_DAY', times_of_day)

In [86]:
df_cast = (
    df_time_day
        .withColumn('WIFI_CONNECTED_INT', F.col('WIFI_CONNECTED').cast('int'))
        .drop('WIFI_CONNECTED')
)

In [87]:
df_fill_categories = (
    df_cast
        .fillna('UNKNOWN', 
                subset=['ACTIVITY_STATE', 'LAG_ACTIVITY', 'LEAD_ACTIVITY', 'BATT_CHARGE', 'STYPE', 'SOURCE', 'TYPE'])
)

df_fill_numerical = (
    df_fill_categories
        .fillna(-1)
)

In [88]:
df_sorted = df_fill_numerical.select(sorted(df_fill_numerical.columns))
df_sorted.columns

['ACCURACY',
 'ACTIVITY_CONFID',
 'ACTIVITY_STATE',
 'BATT_CHARGE',
 'DISTANCE',
 'GPS_DELAY',
 'LAG_ACTIVITY',
 'LAG_LTIME',
 'LAG_SPEED',
 'LEAD_ACTIVITY',
 'LEAD_LTIME',
 'LEAD_SPEED',
 'LEVEL',
 'LEVEL_MW',
 'LIGHT',
 'MAGNET_X',
 'MAGNET_Y',
 'MAGNET_Z',
 'PROXIMITY',
 'QUAL',
 'SOURCE',
 'SPEED',
 'STYPE',
 'TIME_DAY',
 'TYPE',
 'WEEKEND',
 'WIFI_CONNECTED_INT']

In [91]:
string_indexer_input = ['STYPE', 'TYPE','SOURCE','BATT_CHARGE', 'ACTIVITY_STATE', 'LAG_ACTIVITY', 'LEAD_ACTIVITY', 'TIME_DAY']
string_indexer_output = [f'{col}_index' for col in string_indexer_input]

#one_hot_encoder_input = string_indexer_output + ['WIFI_CONNECTED_INT', 'WEEKEND']
one_hot_encoder_input = string_indexer_output
one_hot_encoder_output = [f'{col}_one_hot' for col in one_hot_encoder_input]

# Get all columns which aren't transformed 
remaining_cols = list(set(df_sorted.columns) - set(string_indexer_input))
# Combine with transformed columns
all_columns = remaining_cols +  one_hot_encoder_output
# Remove y
X_columns = [col for col in all_columns if col not in ['y']]

In [100]:
X_columns = ['ACTIVITY_CONFID',
 'WEEKEND',
 'LEVEL_MW',
 'ACCURACY',
 'LEAD_LTIME',
 'LAG_SPEED',
 'LEAD_SPEED',
 'LAG_LTIME',
 'WIFI_CONNECTED_INT',
 'LEVEL',
 'QUAL',
 'GPS_DELAY',
 'SPEED',
 'DISTANCE',
 'LIGHT',
 'MAGNET_X',
 'MAGNET_Y',
 'MAGNET_Z',
 'PROXIMITY',
 'STYPE_index_one_hot',
 'TYPE_index_one_hot',
 'SOURCE_index_one_hot',
 'BATT_CHARGE_index_one_hot',
 'ACTIVITY_STATE_index_one_hot',
 'LAG_ACTIVITY_index_one_hot',
 'LEAD_ACTIVITY_index_one_hot',
 'TIME_DAY_index_one_hot']

In [101]:
string_indexer = (
    ML.StringIndexer(
        inputCols=string_indexer_input,
        outputCols=string_indexer_output,
    )
)

In [102]:
one_hot_encoder = (
    ML.OneHotEncoder(
        inputCols=one_hot_encoder_input,
        outputCols=one_hot_encoder_output,
    )
)

In [103]:
vector_assembler = (
    ML.VectorAssembler(
        inputCols=X_columns,
        outputCol='X'
    )
)

In [104]:
#Create pipeline and pass all stages
pipeline = (
        Pipeline(
                stages=[
                       string_indexer,
                       one_hot_encoder,
                       vector_assembler,
                       rfc_model
                ]
        )
)

In [105]:
df_transformed = pipeline.fit(df_fill_numerical).transform(df_fill_numerical)

In [106]:
df_transformed.groupBy('prediction').count().orderBy('count').show()

+----------+------+
|prediction| count|
+----------+------+
|       1.0|375487|
|       0.0|413176|
+----------+------+



In [39]:
df_transformed.groupBy('LEAD_ACTIVITY_index_one_hot_IN_VEHICLE').count().orderBy('count').show()

AnalysisException: cannot resolve '`LEAD_ACTIVITY_index_one_hot_IN_VEHICLE`' given input columns: [ACCURACY, ACTIVITY_CONFID, ACTIVITY_STATE, ACTIVITY_STATE_index, ACTIVITY_STATE_index_one_hot, BATT_CHARGE, BATT_CHARGE_index, BATT_CHARGE_index_one_hot, DISTANCE, GPS_DELAY, ID_STATE, LAG_ACTIVITY, LAG_ACTIVITY_index, LAG_ACTIVITY_index_one_hot, LAG_LTIME, LAG_SPEED, LEAD_ACTIVITY, LEAD_ACTIVITY_index, LEAD_ACTIVITY_index_one_hot, LEAD_LTIME, LEAD_SPEED, LEVEL, LEVEL_MW, LIGHT, MAGNET_X, MAGNET_Y, MAGNET_Z, PROXIMITY, QUAL, SOURCE, SOURCE_index, SOURCE_index_one_hot, SPEED, STYPE, STYPE_index, STYPE_index_one_hot, TIME_DAY, TIME_DAY_index, TIME_DAY_index_one_hot, TYPE, TYPE_index, TYPE_index_one_hot, WEEKEND, WIFI_CONNECTED_INT, X, prediction, probability, rawPrediction];
'Aggregate ['LEAD_ACTIVITY_index_one_hot_IN_VEHICLE], ['LEAD_ACTIVITY_index_one_hot_IN_VEHICLE, count(1) AS count#2522L]
+- Project [ID_STATE#757, STYPE#694, ACTIVITY_STATE#695, ACTIVITY_CONFID#758L, SOURCE#696, SPEED#759L, ACCURACY#760L, TYPE#697, LEVEL#761L, QUAL#762L, LIGHT#763, MAGNET_X#764, MAGNET_Y#765, MAGNET_Z#766, PROXIMITY#767, BATT_CHARGE#698, LAG_SPEED#768L, LEAD_SPEED#769L, LAG_LTIME#770, LEAD_LTIME#771, LAG_ACTIVITY#699, LEAD_ACTIVITY#700, DISTANCE#772, GPS_DELAY#773, ... 24 more fields]
   +- Project [ID_STATE#757, STYPE#694, ACTIVITY_STATE#695, ACTIVITY_CONFID#758L, SOURCE#696, SPEED#759L, ACCURACY#760L, TYPE#697, LEVEL#761L, QUAL#762L, LIGHT#763, MAGNET_X#764, MAGNET_Y#765, MAGNET_Z#766, PROXIMITY#767, BATT_CHARGE#698, LAG_SPEED#768L, LEAD_SPEED#769L, LAG_LTIME#770, LEAD_LTIME#771, LAG_ACTIVITY#699, LEAD_ACTIVITY#700, DISTANCE#772, GPS_DELAY#773, ... 23 more fields]
      +- Project [ID_STATE#757, STYPE#694, ACTIVITY_STATE#695, ACTIVITY_CONFID#758L, SOURCE#696, SPEED#759L, ACCURACY#760L, TYPE#697, LEVEL#761L, QUAL#762L, LIGHT#763, MAGNET_X#764, MAGNET_Y#765, MAGNET_Z#766, PROXIMITY#767, BATT_CHARGE#698, LAG_SPEED#768L, LEAD_SPEED#769L, LAG_LTIME#770, LEAD_LTIME#771, LAG_ACTIVITY#699, LEAD_ACTIVITY#700, DISTANCE#772, GPS_DELAY#773, ... 22 more fields]
         +- Project [ID_STATE#757, STYPE#694, ACTIVITY_STATE#695, ACTIVITY_CONFID#758L, SOURCE#696, SPEED#759L, ACCURACY#760L, TYPE#697, LEVEL#761L, QUAL#762L, LIGHT#763, MAGNET_X#764, MAGNET_Y#765, MAGNET_Z#766, PROXIMITY#767, BATT_CHARGE#698, LAG_SPEED#768L, LEAD_SPEED#769L, LAG_LTIME#770, LEAD_LTIME#771, LAG_ACTIVITY#699, LEAD_ACTIVITY#700, DISTANCE#772, GPS_DELAY#773, ... 21 more fields]
            +- Project [ID_STATE#757, STYPE#694, ACTIVITY_STATE#695, ACTIVITY_CONFID#758L, SOURCE#696, SPEED#759L, ACCURACY#760L, TYPE#697, LEVEL#761L, QUAL#762L, LIGHT#763, MAGNET_X#764, MAGNET_Y#765, MAGNET_Z#766, PROXIMITY#767, BATT_CHARGE#698, LAG_SPEED#768L, LEAD_SPEED#769L, LAG_LTIME#770, LEAD_LTIME#771, LAG_ACTIVITY#699, LEAD_ACTIVITY#700, DISTANCE#772, GPS_DELAY#773, ... 20 more fields]
               +- Project [ID_STATE#757, STYPE#694, ACTIVITY_STATE#695, ACTIVITY_CONFID#758L, SOURCE#696, SPEED#759L, ACCURACY#760L, TYPE#697, LEVEL#761L, QUAL#762L, LIGHT#763, MAGNET_X#764, MAGNET_Y#765, MAGNET_Z#766, PROXIMITY#767, BATT_CHARGE#698, LAG_SPEED#768L, LEAD_SPEED#769L, LAG_LTIME#770, LEAD_LTIME#771, LAG_ACTIVITY#699, LEAD_ACTIVITY#700, DISTANCE#772, GPS_DELAY#773, ... 12 more fields]
                  +- Project [coalesce(ID_STATE#232, cast(-1.0 as int)) AS ID_STATE#757, STYPE#694, ACTIVITY_STATE#695, coalesce(ACTIVITY_CONFID#239L, cast(-1.0 as bigint)) AS ACTIVITY_CONFID#758L, SOURCE#696, coalesce(SPEED#243L, cast(-1.0 as bigint)) AS SPEED#759L, coalesce(ACCURACY#244L, cast(-1.0 as bigint)) AS ACCURACY#760L, TYPE#697, coalesce(LEVEL#249L, cast(-1.0 as bigint)) AS LEVEL#761L, coalesce(QUAL#250L, cast(-1.0 as bigint)) AS QUAL#762L, coalesce(LIGHT#253, cast(-1.0 as int)) AS LIGHT#763, coalesce(nanvl(MAGNET_X#254, cast(null as double)), cast(-1.0 as double)) AS MAGNET_X#764, coalesce(nanvl(MAGNET_Y#255, cast(null as double)), cast(-1.0 as double)) AS MAGNET_Y#765, coalesce(nanvl(MAGNET_Z#256, cast(null as double)), cast(-1.0 as double)) AS MAGNET_Z#766, coalesce(PROXIMITY#257, cast(-1.0 as int)) AS PROXIMITY#767, BATT_CHARGE#698, coalesce(LAG_SPEED#259L, cast(-1.0 as bigint)) AS LAG_SPEED#768L, coalesce(LEAD_SPEED#260L, cast(-1.0 as bigint)) AS LEAD_SPEED#769L, coalesce(LAG_LTIME#261, cast(-1.0 as int)) AS LAG_LTIME#770, coalesce(LEAD_LTIME#262, cast(-1.0 as int)) AS LEAD_LTIME#771, LAG_ACTIVITY#699, LEAD_ACTIVITY#700, coalesce(DISTANCE#267, cast(-1.0 as int)) AS DISTANCE#772, coalesce(GPS_DELAY#268, cast(-1.0 as int)) AS GPS_DELAY#773, ... 4 more fields]
                     +- Project [ID_STATE#232, coalesce(STYPE#492, cast(UNKNOWN as string)) AS STYPE#694, coalesce(ACTIVITY_STATE#376, cast(UNKNOWN as string)) AS ACTIVITY_STATE#695, ACTIVITY_CONFID#239L, coalesce(SOURCE#521, cast(UNKNOWN as string)) AS SOURCE#696, SPEED#243L, ACCURACY#244L, coalesce(TYPE#550, cast(UNKNOWN as string)) AS TYPE#697, LEVEL#249L, QUAL#250L, LIGHT#253, MAGNET_X#254, MAGNET_Y#255, MAGNET_Z#256, PROXIMITY#257, coalesce(BATT_CHARGE#463, cast(UNKNOWN as string)) AS BATT_CHARGE#698, LAG_SPEED#259L, LEAD_SPEED#260L, LAG_LTIME#261, LEAD_LTIME#262, coalesce(LAG_ACTIVITY#405, cast(UNKNOWN as string)) AS LAG_ACTIVITY#699, coalesce(LEAD_ACTIVITY#434, cast(UNKNOWN as string)) AS LEAD_ACTIVITY#700, DISTANCE#267, GPS_DELAY#268, ... 4 more fields]
                        +- Project [ID_STATE#232, STYPE#492, ACTIVITY_STATE#376, ACTIVITY_CONFID#239L, SOURCE#521, SPEED#243L, ACCURACY#244L, TYPE#550, LEVEL#249L, QUAL#250L, LIGHT#253, MAGNET_X#254, MAGNET_Y#255, MAGNET_Z#256, PROXIMITY#257, BATT_CHARGE#463, LAG_SPEED#259L, LEAD_SPEED#260L, LAG_LTIME#261, LEAD_LTIME#262, LAG_ACTIVITY#405, LEAD_ACTIVITY#434, DISTANCE#267, GPS_DELAY#268, ... 4 more fields]
                           +- Project [ID_STATE#232, STYPE#492, ACTIVITY_STATE#376, ACTIVITY_CONFID#239L, SOURCE#521, SPEED#243L, ACCURACY#244L, TYPE#550, LEVEL#249L, QUAL#250L, WIFI_CONNECTED#252, LIGHT#253, MAGNET_X#254, MAGNET_Y#255, MAGNET_Z#256, PROXIMITY#257, BATT_CHARGE#463, LAG_SPEED#259L, LEAD_SPEED#260L, LAG_LTIME#261, LEAD_LTIME#262, LAG_ACTIVITY#405, LEAD_ACTIVITY#434, DISTANCE#267, ... 5 more fields]
                              +- Project [ID_STATE#232, STYPE#492, ACTIVITY_STATE#376, ACTIVITY_CONFID#239L, SOURCE#521, SPEED#243L, ACCURACY#244L, TYPE#550, LEVEL#249L, QUAL#250L, WIFI_CONNECTED#252, LIGHT#253, MAGNET_X#254, MAGNET_Y#255, MAGNET_Z#256, PROXIMITY#257, BATT_CHARGE#463, LAG_SPEED#259L, LEAD_SPEED#260L, LAG_LTIME#261, LEAD_LTIME#262, LAG_ACTIVITY#405, LEAD_ACTIVITY#434, DISTANCE#267, ... 4 more fields]
                                 +- Project [ID_STATE#232, STYPE#492, ACTIVITY_STATE#376, ACTIVITY_CONFID#239L, SOURCE#521, SPEED#243L, ACCURACY#244L, CASE WHEN TYPE#248 IN (GSM,LTE,UNKNOWN,WCDMA) THEN TYPE#248 ELSE UNKNOWN END AS TYPE#550, LEVEL#249L, QUAL#250L, WIFI_CONNECTED#252, LIGHT#253, MAGNET_X#254, MAGNET_Y#255, MAGNET_Z#256, PROXIMITY#257, BATT_CHARGE#463, LAG_SPEED#259L, LEAD_SPEED#260L, LAG_LTIME#261, LEAD_LTIME#262, LAG_ACTIVITY#405, LEAD_ACTIVITY#434, DISTANCE#267, ... 4 more fields]
                                    +- Project [ID_STATE#232, STYPE#492, ACTIVITY_STATE#376, ACTIVITY_CONFID#239L, CASE WHEN SOURCE#242 IN (FUSED,GPS,NET) THEN SOURCE#242 ELSE UNKNOWN END AS SOURCE#521, SPEED#243L, ACCURACY#244L, TYPE#248, LEVEL#249L, QUAL#250L, WIFI_CONNECTED#252, LIGHT#253, MAGNET_X#254, MAGNET_Y#255, MAGNET_Z#256, PROXIMITY#257, BATT_CHARGE#463, LAG_SPEED#259L, LEAD_SPEED#260L, LAG_LTIME#261, LEAD_LTIME#262, LAG_ACTIVITY#405, LEAD_ACTIVITY#434, DISTANCE#267, ... 4 more fields]
                                       +- Project [ID_STATE#232, CASE WHEN STYPE#237 IN (LOCATION,NF_BOOST_TECHNOLOGY,NF_SERVICE,NF_BOOST_COVERAGE,TIMER,WORKER,UNKNOWN) THEN STYPE#237 ELSE UNKNOWN END AS STYPE#492, ACTIVITY_STATE#376, ACTIVITY_CONFID#239L, SOURCE#242, SPEED#243L, ACCURACY#244L, TYPE#248, LEVEL#249L, QUAL#250L, WIFI_CONNECTED#252, LIGHT#253, MAGNET_X#254, MAGNET_Y#255, MAGNET_Z#256, PROXIMITY#257, BATT_CHARGE#463, LAG_SPEED#259L, LEAD_SPEED#260L, LAG_LTIME#261, LEAD_LTIME#262, LAG_ACTIVITY#405, LEAD_ACTIVITY#434, DISTANCE#267, ... 4 more fields]
                                          +- Project [ID_STATE#232, STYPE#237, ACTIVITY_STATE#376, ACTIVITY_CONFID#239L, SOURCE#242, SPEED#243L, ACCURACY#244L, TYPE#248, LEVEL#249L, QUAL#250L, WIFI_CONNECTED#252, LIGHT#253, MAGNET_X#254, MAGNET_Y#255, MAGNET_Z#256, PROXIMITY#257, CASE WHEN BATT_CHARGE#258 IN (CHARGING,FULL,NOT_CHARGING,UNCHARGING,UNKNOWN) THEN BATT_CHARGE#258 ELSE UNKNOWN END AS BATT_CHARGE#463, LAG_SPEED#259L, LEAD_SPEED#260L, LAG_LTIME#261, LEAD_LTIME#262, LAG_ACTIVITY#405, LEAD_ACTIVITY#434, DISTANCE#267, ... 4 more fields]
                                             +- Project [ID_STATE#232, STYPE#237, ACTIVITY_STATE#376, ACTIVITY_CONFID#239L, SOURCE#242, SPEED#243L, ACCURACY#244L, TYPE#248, LEVEL#249L, QUAL#250L, WIFI_CONNECTED#252, LIGHT#253, MAGNET_X#254, MAGNET_Y#255, MAGNET_Z#256, PROXIMITY#257, BATT_CHARGE#258, LAG_SPEED#259L, LEAD_SPEED#260L, LAG_LTIME#261, LEAD_LTIME#262, LAG_ACTIVITY#405, CASE WHEN LEAD_ACTIVITY#264 IN (IN_VEHICLE,ON_BICYCLE,ON_FOOT,STILL,TILTING,WALKING,UNKNOWN) THEN LEAD_ACTIVITY#264 ELSE UNKNOWN END AS LEAD_ACTIVITY#434, DISTANCE#267, ... 4 more fields]
                                                +- Project [ID_STATE#232, STYPE#237, ACTIVITY_STATE#376, ACTIVITY_CONFID#239L, SOURCE#242, SPEED#243L, ACCURACY#244L, TYPE#248, LEVEL#249L, QUAL#250L, WIFI_CONNECTED#252, LIGHT#253, MAGNET_X#254, MAGNET_Y#255, MAGNET_Z#256, PROXIMITY#257, BATT_CHARGE#258, LAG_SPEED#259L, LEAD_SPEED#260L, LAG_LTIME#261, LEAD_LTIME#262, CASE WHEN LAG_ACTIVITY#263 IN (IN_VEHICLE,ON_BICYCLE,ON_FOOT,STILL,TILTING,WALKING,UNKNOWN) THEN LAG_ACTIVITY#263 ELSE UNKNOWN END AS LAG_ACTIVITY#405, LEAD_ACTIVITY#264, DISTANCE#267, ... 4 more fields]
                                                   +- Project [ID_STATE#232, STYPE#237, CASE WHEN ACTIVITY_STATE#238 IN (IN_VEHICLE,ON_BICYCLE,ON_FOOT,STILL,TILTING,WALKING,UNKNOWN) THEN ACTIVITY_STATE#238 ELSE UNKNOWN END AS ACTIVITY_STATE#376, ACTIVITY_CONFID#239L, SOURCE#242, SPEED#243L, ACCURACY#244L, TYPE#248, LEVEL#249L, QUAL#250L, WIFI_CONNECTED#252, LIGHT#253, MAGNET_X#254, MAGNET_Y#255, MAGNET_Z#256, PROXIMITY#257, BATT_CHARGE#258, LAG_SPEED#259L, LEAD_SPEED#260L, LAG_LTIME#261, LEAD_LTIME#262, LAG_ACTIVITY#263, LEAD_ACTIVITY#264, DISTANCE#267, ... 4 more fields]
                                                      +- Deduplicate [MAGNET_Z#256, WEEKEND#271, ACCURACY#244L, LAG_SPEED#259L, SPEED#243L, SOURCE#242, PROXIMITY#257, ACTIVITY_CONFID#239L, LEVEL_MW#273, MAGNET_X#254, ID_STATE#232, MAGNET_Y#255, QUAL#250L, ACTIVITY_STATE#238, LEVEL#249L, STYPE#237, LEAD_LTIME#262, LEAD_ACTIVITY#264, DISTANCE#267, LAG_LTIME#261, LEAD_SPEED#260L, WIFI_CONNECTED#252, GPS_DELAY#268, LAG_ACTIVITY#263, ... 4 more fields]
                                                         +- Project [ID_STATE#232, STYPE#237, ACTIVITY_STATE#238, ACTIVITY_CONFID#239L, SOURCE#242, SPEED#243L, ACCURACY#244L, TYPE#248, LEVEL#249L, QUAL#250L, WIFI_CONNECTED#252, LIGHT#253, MAGNET_X#254, MAGNET_Y#255, MAGNET_Z#256, PROXIMITY#257, BATT_CHARGE#258, LAG_SPEED#259L, LEAD_SPEED#260L, LAG_LTIME#261, LEAD_LTIME#262, LAG_ACTIVITY#263, LEAD_ACTIVITY#264, DISTANCE#267, ... 4 more fields]
                                                            +- Relation[ID_FILE#231L,ID_STATE#232,DAY#233,MONTH#234,YEAR#235,STIME#236L,STYPE#237,ACTIVITY_STATE#238,ACTIVITY_CONFID#239L,STIME_SEC#240,LTIME#241L,SOURCE#242,SPEED#243L,ACCURACY#244L,LON#245,LAT#246,LTIME_SEC#247,TYPE#248,LEVEL#249L,QUAL#250L,SLOT#251L,WIFI_CONNECTED#252,LIGHT#253,MAGNET_X#254,... 19 more fields] parquet


In [32]:
import pandas as pd
import numpy as np
from pandas.api.types import CategoricalDtype
from joblib import dump, load

In [33]:
rfc = load('RandomForestModel_10_06_2020.joblib') 



In [34]:
df = df_drop.toPandas()

In [37]:
df = df.reindex(sorted(df.columns), axis=1)
df.dtypes

ACCURACY             int64
ACTIVITY_CONFID      int64
ACTIVITY_STATE      object
BATT_CHARGE         object
DISTANCE             int32
GPS_DELAY            int32
ID_STATE             int32
LAG_ACTIVITY        object
LAG_LTIME            int32
LAG_SPEED            int64
LEAD_ACTIVITY       object
LEAD_LTIME           int32
LEAD_SPEED           int64
LEVEL                int64
LEVEL_MW           float64
LIGHT                int32
MAGNET_X           float64
MAGNET_Y           float64
MAGNET_Z           float64
PROXIMITY            int32
QUAL                 int64
SOURCE              object
SPEED                int64
STYPE               object
TIME_DAY            object
TYPE                object
WEEKEND              int32
WIFI_CONNECTED      object
dtype: object

In [15]:
df.SOURCE=df.SOURCE.astype(CategoricalDtype(categories=gps_sources)) 
df.WIFI_CONNECTED=df.WIFI_CONNECTED.astype(CategoricalDtype(categories=wifi)) 
df.WEEKEND=df.WEEKEND.astype(CategoricalDtype(categories=weekends)) 
df.STYPE=df.STYPE.astype(CategoricalDtype(categories=stypes)) 
df.TYPE=df.TYPE.astype(CategoricalDtype(categories=types))
    
df.ACTIVITY_STATE=df.ACTIVITY_STATE.astype(CategoricalDtype(categories=activity_states))
df.LAG_ACTIVITY=df.LAG_ACTIVITY.astype(CategoricalDtype(categories=activity_states))
df.LEAD_ACTIVITY=df.LEAD_ACTIVITY.astype(CategoricalDtype(categories=activity_states))
df.BATT_CHARGE=df.BATT_CHARGE.astype(CategoricalDtype(categories=battery_states))

df.TIME_DAY=df.TIME_DAY.astype(CategoricalDtype(categories=times_of_day))

In [16]:
df = pd.get_dummies(df, columns=['STYPE', 'TYPE','SOURCE',
                                 'WIFI_CONNECTED','BATT_CHARGE',
                                 'ACTIVITY_STATE','LAG_ACTIVITY',
                                 'LEAD_ACTIVITY', 'WEEKEND', 'TIME_DAY'])

In [19]:
df = df[
['ID_STATE',
 'LEVEL',
 'QUAL',
 'GPS_DELAY',
 'SPEED',
 'ACCURACY',
 'LAG_SPEED',
 'LAG_LTIME',
 'LEAD_SPEED',
 'LEAD_LTIME',
 'DISTANCE',
 'LIGHT',
 'MAGNET_X',
 'MAGNET_Y',
 'MAGNET_Z',
 'PROXIMITY',
 'ACTIVITY_CONFID',
 'STYPE_LOCATION',
 'STYPE_NF_BOOST_TECHNOLOGY',
 'STYPE_NF_SERVICE',
 'STYPE_NF_BOOST_COVERAGE',
 'STYPE_TIMER',
 'STYPE_WORKER',
 'STYPE_UNKNOWN',
 'TYPE_GSM',
 'TYPE_LTE',
 'TYPE_UNKNOWN',
 'TYPE_WCDMA',
 'SOURCE_FUSED',
 'SOURCE_GPS',
 'SOURCE_NET',
 'WIFI_CONNECTED_f',
 'WIFI_CONNECTED_t',
 'BATT_CHARGE_CHARGING',
 'BATT_CHARGE_FULL',
 'BATT_CHARGE_NOT_CHARGING',
 'BATT_CHARGE_UNCHARGING',
 'BATT_CHARGE_UNKNOWN',
 'ACTIVITY_STATE_IN_VEHICLE',
 'ACTIVITY_STATE_ON_BICYCLE',
 'ACTIVITY_STATE_ON_FOOT',
 'ACTIVITY_STATE_STILL',
 'ACTIVITY_STATE_TILTING',
 'ACTIVITY_STATE_WALKING',
 'ACTIVITY_STATE_UNKNOWN',
 'LAG_ACTIVITY_IN_VEHICLE',
 'LAG_ACTIVITY_ON_BICYCLE',
 'LAG_ACTIVITY_ON_FOOT',
 'LAG_ACTIVITY_STILL',
 'LAG_ACTIVITY_TILTING',
 'LAG_ACTIVITY_WALKING',
 'LAG_ACTIVITY_UNKNOWN',
 'LEAD_ACTIVITY_IN_VEHICLE',
 'LEAD_ACTIVITY_ON_BICYCLE',
 'LEAD_ACTIVITY_ON_FOOT',
 'LEAD_ACTIVITY_STILL',
 'LEAD_ACTIVITY_TILTING',
 'LEAD_ACTIVITY_WALKING',
 'LEAD_ACTIVITY_UNKNOWN',
 'WEEKEND_0',
 'WEEKEND_1',
 'TIME_DAY_MORNING',
 'TIME_DAY_AFTERNOON',
 'TIME_DAY_NIGHT']]

In [20]:
df = df.reindex(sorted(df.columns), axis=1)
df.columns

Index(['ACCURACY', 'ACTIVITY_CONFID', 'ACTIVITY_STATE_IN_VEHICLE',
       'ACTIVITY_STATE_ON_BICYCLE', 'ACTIVITY_STATE_ON_FOOT',
       'ACTIVITY_STATE_STILL', 'ACTIVITY_STATE_TILTING',
       'ACTIVITY_STATE_UNKNOWN', 'ACTIVITY_STATE_WALKING',
       'BATT_CHARGE_CHARGING', 'BATT_CHARGE_FULL', 'BATT_CHARGE_NOT_CHARGING',
       'BATT_CHARGE_UNCHARGING', 'BATT_CHARGE_UNKNOWN', 'DISTANCE',
       'GPS_DELAY', 'ID_STATE', 'LAG_ACTIVITY_IN_VEHICLE',
       'LAG_ACTIVITY_ON_BICYCLE', 'LAG_ACTIVITY_ON_FOOT', 'LAG_ACTIVITY_STILL',
       'LAG_ACTIVITY_TILTING', 'LAG_ACTIVITY_UNKNOWN', 'LAG_ACTIVITY_WALKING',
       'LAG_LTIME', 'LAG_SPEED', 'LEAD_ACTIVITY_IN_VEHICLE',
       'LEAD_ACTIVITY_ON_BICYCLE', 'LEAD_ACTIVITY_ON_FOOT',
       'LEAD_ACTIVITY_STILL', 'LEAD_ACTIVITY_TILTING', 'LEAD_ACTIVITY_UNKNOWN',
       'LEAD_ACTIVITY_WALKING', 'LEAD_LTIME', 'LEAD_SPEED', 'LEVEL', 'LIGHT',
       'MAGNET_X', 'MAGNET_Y', 'MAGNET_Z', 'PROXIMITY', 'QUAL', 'SOURCE_FUSED',
       'SOURCE_GPS', 'SOURCE_NE

In [22]:
X = df.drop(['ID_STATE'], axis=1).values
sample_id = df['ID_STATE'].values
y = rfc.predict(X)

In [23]:
df['OUTDOOR'] = y

In [25]:
df['OUTDOOR'].value_counts()

0    588463
1    216849
Name: OUTDOOR, dtype: int64