## Dataset

In [None]:
# Import libraries

import os
import sys
import glob
import json
import time
import pathlib

import numpy as np
import pandas as pd


# `display` and `HTML` live in the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import HTML, display

# Widen the notebook container to 95% of the page (the closing tag was
# previously written as `</style` without its `>`).
display(HTML("<style>.container { width:95% !important; }</style>"))

%matplotlib inline

In [None]:
# Global variables
# Presumably the location of the campaign meta-data (going by the name);
# not referenced again in the visible part of this notebook.
meta_data_path = "../../data-campaigns/meta-data/"

# File name of the legs pickle; re-assigned with the same value in the "Legs" section.
legs = "all_legs_merged_no_outlier_0.01.pkl"
# Directory the input pickles are read from.
input_path = pathlib.Path("../../2019-12-16.out/")
# Directory the exported CSV files are written to.
out_path = pathlib.Path("../../2019-12-16.out/dataset/")

In [None]:
# Create the export directory (including missing parents); if it is already
# present, only report that on stderr and carry on.
try:
    os.makedirs(os.path.abspath(out_path), exist_ok=False)
except FileExistsError:
    print(
        "Directory '{}' already exists".format(out_path),
        file=sys.stderr,
    )

## Experience factors

In [None]:
# Load the experience-factor table from the input directory
all_factors = pd.read_pickle(input_path.joinpath("all_factors.pkl"))

In [None]:
# Preview the first three rows
all_factors.head(n=3)

In [None]:
# Keep only the exported columns; the copy leaves `all_factors` untouched.
all_factors_out = all_factors[["legid", "factor", "type", "minus", "plus"]].copy()

# Encode the True/False flags as 1/0.  The result is assigned back to the
# column explicitly: `df.col.replace(..., inplace=True)` modifies a temporary
# Series (chained assignment), which recent pandas warns about and which does
# not update the frame at all under Copy-on-Write.
all_factors_out["minus"] = all_factors_out["minus"].replace({True: 1, False: 0})
all_factors_out["plus"] = all_factors_out["plus"].replace({True: 1, False: 0})

all_factors_out.head(3)

In [None]:
# Target file for the experience-factor export
all_factors_filename = "experience_factors.csv"
all_factors_path = out_path.joinpath(all_factors_filename)

In [None]:
# Write the experience factors with a header row and without the index column
all_factors_out.to_csv(path_or_buf=all_factors_path, index=False, header=True)

## Activity codes

In [None]:
# Load the activity table from all_gen_act.pkl and preview it
all_activities = pd.read_pickle(input_path.joinpath("all_gen_act.pkl"))
all_activities.head(n=3)

In [None]:
# Export view: leg id plus the activity code, published under the name "activity".
# `rename` returns a new frame, so `all_activities` is not modified.
all_activities_out = all_activities[["legid", "code"]].rename(
    columns={"code": "activity"}
)

all_activities_out.head(n=3)

In [None]:
# Target file for the activities export
all_activities_filename = "activities.csv"
all_activities_path = out_path.joinpath(all_activities_filename)

In [None]:
# Write the activities with a header row and without the index column
all_activities_out.to_csv(path_or_buf=all_activities_path, index=False, header=True)

## Worthwhileness components

In [None]:
# Load values_from_trip.pkl and preview it
worthwhileness_values = pd.read_pickle(input_path.joinpath("values_from_trip.pkl"))
worthwhileness_values.head(n=3)

## Trip purposes

In [None]:
# Load trip_objs.pkl and preview it
purposes = pd.read_pickle(input_path.joinpath("trip_objs.pkl"))
purposes.head(n=3)

In [None]:
# Export view: trip id plus the English purpose label, published as "purpose".
# NOTE: an earlier variant that also exported the numeric `tripObjectiveCode`
# as an integer `purposeid` column was disabled and is not part of the output.
purposes_out = purposes[["tripid", "tripObjectiveStringEN"]].rename(
    columns={"tripObjectiveStringEN": "purpose"}
)

purposes_out.head(n=3)

In [None]:
# Target file for the trip-purpose export
purposes_filename = "purposes.csv"
purposes_path = out_path.joinpath(purposes_filename)

In [None]:
# Write the purposes with a header row and without the index column
purposes_out.to_csv(path_or_buf=purposes_path, index=False, header=True)

## Legs

In [None]:
# Pickle file names of the three input tables
legs = "all_legs_merged_no_outlier_0.01.pkl"
trips_users = "trips_users_df.pkl"
trips = "trips_df.pkl"

# Load the tables from the input directory, in the order legs, trips_users, trips
legs_df, trips_users_df, trips_df = (
    pd.read_pickle(input_path / pickle_name)
    for pickle_name in (legs, trips_users, trips)
)

In [None]:
# Preview the first three legs
legs_df.head(n=3)

In [None]:
# Restrict the legs table to the columns that go into the export; the explicit
# copy decouples the result from `legs_df`.
legs_data = legs_df.loc[
    :,
    [
        "legid",
        "class",
        "userid",
        "tripid",
        "correctedModeOfTransport",
        "startDate_formated",
        "endDate_formated",
        "trueDistance",
        "legDistance",
        "legDuration",
        "wastedTime",
        "transp_category",
        "onCampaigns",
        "weekday",
        "we_vs_wd",
    ],
].copy()

In [None]:
# Preview the selected leg columns
legs_data.head(n=3)

In [None]:
# Spot check: leg ids that belong to one specific trip
legs_data.loc[legs_data["tripid"] == "#130:14929", "legid"]

In [None]:
# Map the raw column names onto the names used in the exported CSV.
legs_data.rename(
    columns={
        "correctedModeOfTransport": "motid",
        "startDate_formated": "start_date",
        "endDate_formated": "end_date",
        "trueDistance": "true_distance",
        "legDistance": "leg_distance",
        "legDuration": "leg_duration",
        "wastedTime": "worthwhileness_rating",
        "transp_category": "transport_category",
        "onCampaigns": "campaign",
        "we_vs_wd": "weekday_class",
    },
    inplace=True,
)

# Cast the mode-of-transport id to int and the rating to float.
# `DataFrame.astype` always returns a new frame and has no `inplace` keyword
# (passing one raises TypeError on pandas >= 1.0), so the result is re-assigned.
legs_data = legs_data.astype({"motid": int, "worthwhileness_rating": float})

In [None]:
# Preview the renamed and re-typed leg columns
legs_data.head(n=3)

In [None]:
# Report the (rows, columns) shape of the legs export
print("No. of legs: ", legs_data.shape, sep=" ")

In [None]:
# Legs whose duration lies strictly between 0 and 10
legs_data.loc[legs_data["leg_duration"].gt(0) & legs_data["leg_duration"].lt(10)]

In [None]:
# Target file for the legs export
legs_data_filename = "legs.csv"
legs_data_path = out_path.joinpath(legs_data_filename)

In [None]:
# Write the legs with a header row, no index column, floats with three decimals
legs_data.to_csv(
    path_or_buf=legs_data_path,
    header=True,
    index=False,
    float_format="%.3f",
)

## all_legs_original

In [None]:
# Load all_legs_original.pkl for comparison
all_legs_original = pd.read_pickle(input_path.joinpath("all_legs_original.pkl"))

In [None]:
# Spot check: leg ids of the same trip in all_legs_original
all_legs_original.loc[all_legs_original["tripid"] == "#130:14929", "legid"]

## all_legs_merged_1.pkl

In [None]:
# Load all_legs_merged_1.pkl for comparison
all_legs_merged_1 = pd.read_pickle(input_path.joinpath("all_legs_merged_1.pkl"))

In [None]:
# Spot check: leg ids of the same trip in all_legs_merged_1
all_legs_merged_1.loc[all_legs_merged_1["tripid"] == "#130:14929", "legid"]

## all_legs_merged_no_outlier

In [None]:
# Load all_legs_merged_no_outlier_0.01.pkl for comparison
all_legs_merged_no_outlier = pd.read_pickle(
    input_path.joinpath("all_legs_merged_no_outlier_0.01.pkl")
)

In [None]:
# Spot check: leg ids of the same trip in all_legs_merged_no_outlier
all_legs_merged_no_outlier.loc[
    all_legs_merged_no_outlier["tripid"] == "#130:14929", "legid"
]

## Trips

In [None]:
# Pickle file names of the two trip tables
trips_users = "trips_users_df.pkl"
trips = "trips_df.pkl"

# Load both tables from the input directory (trips_users first, then trips)
trips_users_df, trips_df = (
    pd.read_pickle(input_path / pickle_name) for pickle_name in (trips_users, trips)
)

In [None]:
# Preview the first three rows of trips_users_df
trips_users_df.head(n=3)

In [None]:
# List the column labels of trips_users_df
trips_users_df.columns

In [None]:
# Preview the first three rows of trips_df
trips_df.head(n=3)

In [None]:
# List the column labels of trips_df
trips_df.columns

In [None]:
# Attach the user id to every trip: inner join on "tripid", so trips without a
# matching row in trips_users_df are dropped.
trips_merged_data = pd.merge(
    trips_df,
    trips_users_df[["tripid", "userid"]],
    how="inner",
    on="tripid",
)

In [None]:
# Preview the merged trips
trips_merged_data.head(n=3)

In [None]:
# Working copy of the merged trips, limited to the columns inspected below;
# the explicit copy decouples it from `trips_merged_data`.
trips_data = trips_merged_data.loc[
    :,
    [
        "tripid",
        "userid",
        "tripStartDate",
        "tripEndDate",
        "tripValidationDate",
        "averageSpeed",
        "didYouHaveToArrive",
        "distance",
        "duration",
        "howOften",
        "manualTripEnd",
        "manualTripStart",
        "maxSpeed",
        "model",
        "numDeletes",
        "numMerges",
        "numSplits",
        "oS",
        "oSVersion",
        "overallScore",
        "usetripMoreFor",
    ],
].copy()

In [None]:
# Preview the selected trip columns
trips_data.head(n=3)

In [None]:
# Report the (rows, columns) shape of the trips table
print("No. of trips: ", trips_data.shape, sep=" ")

In [None]:
# Same report, restricted to trips that carry a validation date
xxx = trips_data[trips_data["tripValidationDate"].notna()]
print("No. of trips: ", xxx.shape, sep=" ")

In [None]:
# Render the three timestamp columns as text in one common layout:
# date, time with microseconds, then the UTC offset (`%z`).
for _date_col in ("tripStartDate", "tripEndDate", "tripValidationDate"):
    trips_data[_date_col] = pd.to_datetime(trips_data[_date_col]).dt.strftime(
        "%Y-%m-%d %H:%M:%S.%f%z"
    )

In [None]:
# Preview the trips after the date formatting
trips_data.head(n=3)

In [None]:
# Build the export frame in its published column order and rename the raw
# camelCase columns to snake_case in the same step.  Resulting order:
#   tripid, userid, start_date, end_date, average_speed, max_speed,
#   distance, duration, mood_rating, did_you_have_to_arrive, how_often,
#   use_trip_more_for, manual_start, manual_end, validation_date,
#   os, os_version, model
# `rename` returns a new frame, so `trips_data` is left unchanged.
trips_data_out = trips_data[
    [
        "tripid",
        "userid",
        "tripStartDate",
        "tripEndDate",
        "averageSpeed",
        "maxSpeed",
        "distance",
        "duration",
        "overallScore",
        "didYouHaveToArrive",
        "howOften",
        "usetripMoreFor",
        "manualTripStart",
        "manualTripEnd",
        "tripValidationDate",
        "oS",
        "oSVersion",
        "model",
    ]
].rename(
    columns={
        "tripStartDate": "start_date",
        "tripEndDate": "end_date",
        "averageSpeed": "average_speed",
        "maxSpeed": "max_speed",
        "overallScore": "mood_rating",
        "didYouHaveToArrive": "did_you_have_to_arrive",
        "howOften": "how_often",
        "usetripMoreFor": "use_trip_more_for",
        "manualTripStart": "manual_start",
        "manualTripEnd": "manual_end",
        "tripValidationDate": "validation_date",
        "oS": "os",
        "oSVersion": "os_version",
    }
)

In [None]:
# Preview the trips export
trips_data_out.head(n=3)

In [None]:
# Target file for the trips export
trips_data_out_filename = "trips.csv"
trips_data_out_path = out_path.joinpath(trips_data_out_filename)

In [None]:
# Write the trips with a header row, no index column, floats with three decimals
trips_data_out.to_csv(
    path_or_buf=trips_data_out_path,
    header=True,
    index=False,
    float_format="%.3f",
)

## Values from trip

In [None]:
# Load values_from_trip.pkl (the variable holds the file name, not a full path)
values_from_trip_path = "values_from_trip.pkl"
values_from_trip_df = pd.read_pickle(input_path.joinpath(values_from_trip_path))

In [None]:
# Preview the first three rows
values_from_trip_df.head(n=3)

In [None]:
# Export view: select the four columns, publish "valueFromTrip" as
# "worthwhileness_element" and store "value" as an integer.  Every step
# returns a new frame, so `values_from_trip_df` is not modified.
values_from_trip_out = (
    values_from_trip_df[["tripid", "legid", "valueFromTrip", "value"]]
    .rename(columns={"valueFromTrip": "worthwhileness_element"})
    .astype({"value": int})
)

In [None]:
# Preview the worthwhileness-element export
values_from_trip_out.head(n=3)

In [None]:
# Target file for the worthwhileness-element export; echo it for reference
values_from_trip_out_filename = "worthwhileness_elements_from_trips.csv"
values_from_trip_out_path = out_path.joinpath(values_from_trip_out_filename)

print(values_from_trip_out_path)

In [None]:
# Write the elements with a header row, no index column, floats with three decimals
values_from_trip_out.to_csv(
    path_or_buf=values_from_trip_out_path,
    header=True,
    index=False,
    float_format="%.3f",
)

## Weather data

In [None]:
# Resolve the weather pickle relative to `input_path`, like every other reader
# in this notebook, instead of repeating the directory as a literal; the
# resulting path is the same for the current `input_path`.
weather_df_path = input_path / "weather_final_with_legs_df.pkl"
weather_final_with_legs_df = pd.read_pickle(weather_df_path)

In [None]:
# Preview the first three weather rows
weather_final_with_legs_df.head(n=3)

In [None]:
# Publish the generic "id" column as "weatherid" (modifies the frame in place)
weather_final_with_legs_df.rename(
    {"id": "weatherid"}, axis="columns", inplace=True
)

In [None]:
# Preview the weather rows after the rename
weather_final_with_legs_df.head(n=3)

In [None]:
# Build the weather export: identifiers, request time and location first,
# then the weather, temperature, pressure/humidity, cloud, precipitation and
# wind columns, and finally `sys_pod`.  "request_timestamp" is published as
# "request_date".  `rename` returns a new frame, so the source frame is left
# unchanged.
weather_legs_out = weather_final_with_legs_df[
    [
        "weatherid",
        "legid",
        "request_timestamp",
        "centroid_x",
        "centroid_y",
        "city",
        "country",
        "weather_id",
        "weather_scenario",
        "weather_main",
        "weather_description",
        "weather_icon",
        "temp",
        "temp_min",
        "temp_max",
        "temp_kf",
        "apparent_temperature",
        "net_radiation",
        "temperature_category",
        "temperature_description",
        "pressure",
        "sea_level",
        "grnd_level",
        "humidity",
        "clouds_all",
        "cloud_category",
        "cloud_main",
        "precipitation_category",
        "precipitation_main",
        "wind_speed",
        "wind_deg",
        "wind_beaufort_number",
        "wind_category",
        "wind_description",
        "sys_pod",
    ]
].rename(columns={"request_timestamp": "request_date"})

In [None]:
# List the column labels of the weather export
weather_legs_out.columns

In [None]:
# Preview the weather export
weather_legs_out.head(n=3)

In [None]:
# Target file for the weather export
weather_legs_out_filename = "weather_legs.csv"
weather_legs_out_path = out_path.joinpath(weather_legs_out_filename)

In [None]:
# Write the weather rows with a header row and without the index column
weather_legs_out.to_csv(path_or_buf=weather_legs_out_path, index=False, header=True)