In [None]:
import os
import sys
import csv
import pandas as pd
import numpy as np
import importlib
import itertools
from pandas.io.json import json_normalize
import sklearn.metrics as metrics
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from matplotlib import rcParams
import json
import math

# `IPython.display` is the public home of these helpers; importing them from
# `IPython.core.display` is deprecated.
from IPython.display import HTML, display

# Stretch the notebook's `.container` element to 95% of the window width.
# (The closing tag was previously missing its final ">".)
display(HTML("<style>.container { width:95% !important; }</style>"))

# Read Data

In [None]:
# --- Paths (all relative to the notebook's working directory) ---
input_path = "../../2019-12-16.out/"
out_path = "../../2019-12-16.out/D4.5/"
img_path = out_path  # images share the output directory
meta_data_path = "../../data-campaigns/meta-data/"

# File name of the pickled legs table, read from `input_path`.
legs = "all_legs_merged_no_outlier_0.01.pkl"

# --- Plot defaults ---
rcParams.update(
    {
        "axes.titlepad": 45,
        "font.size": 16,
        "figure.figsize": (12, 8),
    }
)
sns.set_style("whitegrid")

In [None]:
# Create the output directory (including parents). If it is already present,
# say so on stderr and carry on.
try:
    os.makedirs(os.path.abspath(out_path))
except FileExistsError:
    sys.stderr.write("Directory '{}' already exists".format(out_path) + "\n")

In [None]:
# Load the legs and trips tables.
# NOTE: pickle files must only be loaded from trusted locations.
all_legs = pd.read_pickle(input_path + legs)
trips_df = pd.read_pickle(input_path + "trips_df.pkl")
# Disabled alternatives kept for reference:
# trips_users_df = pd.read_pickle(input_path + 'trips_users_df.pkl')
# trips_df = trips_df[trips_df['tripid'].isin(all_legs['tripid'])]  # only trips in all_legs

# Transport categories: {category: [transport modes]} ...
with open(input_path + "category_transp_mode_dict.json", "r") as f:
    category_transp_mode_dict = json.load(f)

# ... and the reverse lookup {transport mode: category}.
inverted_category_transp_mode_dict = {
    mode: category
    for category, modes in category_transp_mode_dict.items()
    for mode in modes
}

# TODO: remove "unknown" as transport category (?)

# Overall size of the legs table.
print("Legs:", len(all_legs))
print("Trips: ", len(all_legs["tripid"].unique()))
print("Users:", len(all_legs["userid"].unique()))
print()

# Split the legs by the gender recorded for the user.
all_legs_M = all_legs.loc[all_legs["gender"] == "Male"]
all_legs_F = all_legs.loc[all_legs["gender"] == "Female"]
all_legs_O = all_legs.loc[all_legs["gender"] == "Other"]

print("Legs of male users:", len(all_legs_M))
print("Male users:", len(all_legs_M["userid"].unique()))
print()
print("Legs of female users:", len(all_legs_F))
print("Female users:", len(all_legs_F["userid"].unique()))
print()
print("Legs of other users:", len(all_legs_O))
print("Other users:", len(all_legs_O["userid"].unique()))

In [None]:
# Location of the two grouping files; the first column of each data row is
# read as a user id.
input_csv_path = "../../data-campaigns/D4.5"
csv1_path = os.path.join(input_csv_path, "grouping1.csv")
csv2_path = os.path.join(input_csv_path, "grouping2.csv")


def _read_first_column(csv_path):
    """Return the stripped first field of every non-empty data row of a CSV file.

    The first row is treated as a header and skipped. An empty file yields an
    empty list instead of raising ``StopIteration``.
    """
    # newline="" leaves line-ending handling to the csv module, as its
    # documentation asks for files passed to csv.reader.
    with open(csv_path, "r", newline="") as infp:
        reader = csv.reader(infp)
        next(reader, None)  # skip header; tolerate an empty file
        return [row[0].strip() for row in reader if row]


users1 = _read_first_column(csv1_path)
users2 = _read_first_column(csv2_path)

In [None]:
# Report how many user ids were read from each grouping file.
for _template, _group in (
    ("* number of users in group 1: {}", users1),
    ("* number of users in group 2: {}", users2),
):
    print(_template.format(len(_group)))

In [None]:
# True when both groupings hold exactly the same user ids
# (order and duplicates are ignored).
not set(users1).symmetric_difference(users2)

# Extract data

Data to extract:
* `userid`
* `Average car (driver) trip duration`
* `Average car (passenger) trip duration`
* `Average car (driver) trip distance`
* `Average car (passenger) trip distance`
* `Home-to-work distance`
* `Average home-to-work duration`


In [None]:
# Peek at the first three rows of the legs table.
all_legs.iloc[:3]

## Grouping 1

In [None]:
# Keep only the legs of users listed in grouping 1. `reset_index()` without
# `drop=True` keeps the previous index as a regular column.
is_group1_user = all_legs["userid"].isin(users1)
all_legs_users = all_legs.loc[is_group1_user].reset_index()

In [None]:
# Size of the grouping-1 subset: legs, distinct trips, distinct users.
for _label, _value in (
    ("Legs:", len(all_legs_users)),
    ("Trips: ", len(all_legs_users["tripid"].unique())),
    ("Users:", len(all_legs_users["userid"].unique())),
):
    print(_label, _value)
print()

In [None]:
# Legs of the grouping-1 subset whose corrected transport mode is "carDriver".
is_cardriver_leg = all_legs_users["correctedModeOfTransport_str"].eq("carDriver")
cardrivers = all_legs_users.loc[is_cardriver_leg]
cardrivers.head(3)

In [None]:
# Every leg (whatever its mode) of a trip that contains at least one
# carDriver leg.
tripids_cardrivers = list(cardrivers["tripid"].unique())
in_cardriver_trip = all_legs["tripid"].isin(tripids_cardrivers)
all_legs_cardrivers = all_legs.loc[in_cardriver_trip].reset_index()
all_legs_cardrivers.head(3)

In [None]:
# Compare the trip-level selection (all legs of trips containing a carDriver
# leg) with the carDriver legs alone: legs, distinct trips, distinct users.
_cardriver_summary = (
    (
        "Legs: {} (all) {} (just carDriver legs)",
        len(all_legs_cardrivers),
        len(cardrivers),
    ),
    (
        "Trips: {} (all) {} (just carDriver legs)",
        len(all_legs_cardrivers["tripid"].unique()),
        len(cardrivers["tripid"].unique()),
    ),
    (
        "Users: {} (all) {} (just carDriver legs)",
        len(all_legs_cardrivers["userid"].unique()),
        len(cardrivers["userid"].unique()),
    ),
)
for _template, _n_all, _n_mode in _cardriver_summary:
    print(_template.format(_n_all, _n_mode))
print()

In [None]:
# Legs of the grouping-1 subset whose corrected transport mode is
# "carPassenger".
is_carpassenger_leg = all_legs_users["correctedModeOfTransport_str"].eq(
    "carPassenger"
)
carpassengers = all_legs_users.loc[is_carpassenger_leg]
carpassengers.head(3)

In [None]:
# Every leg (whatever its mode) of a trip that contains at least one
# carPassenger leg.
tripids_carpassengers = list(carpassengers["tripid"].unique())
in_carpassenger_trip = all_legs["tripid"].isin(tripids_carpassengers)
all_legs_carpassengers = all_legs.loc[in_carpassenger_trip].reset_index()
all_legs_carpassengers.head(3)

In [None]:
# Compare the trip-level selection (all legs of trips containing a
# carPassenger leg) with the carPassenger legs alone.
_carpassenger_summary = (
    (
        "Legs: {} (all) {} (just carPassenger legs)",
        len(all_legs_carpassengers),
        len(carpassengers),
    ),
    (
        "Trips: {} (all) {} (just carPassenger legs)",
        len(all_legs_carpassengers["tripid"].unique()),
        len(carpassengers["tripid"].unique()),
    ),
    (
        "Users: {} (all) {} (just carPassenger legs)",
        len(all_legs_carpassengers["userid"].unique()),
        len(carpassengers["userid"].unique()),
    ),
)
for _template, _n_all, _n_mode in _carpassenger_summary:
    print(_template.format(_n_all, _n_mode))
print()

In [None]:
# Users that own at least one trip containing a carDriver or a carPassenger leg.
carusers = set(all_legs_cardrivers["userid"].unique()) | set(
    all_legs_carpassengers["userid"].unique()
)
_carusers_message = "Users {} (with at least one leg either carDriver or carPassenger)"
print(_carusers_message.format(len(carusers)))

In [None]:
# Trip ids (despite the variable name, not legs) that appear in both the
# carDriver and the carPassenger trip selections.
all_legs_both = set(all_legs_cardrivers["tripid"].unique()) & set(
    all_legs_carpassengers["tripid"].unique()
)
print(len(all_legs_both))

In [None]:
def _tripid_sort_key(tripid):
    """Turn an id such as "#30:2381" into the integer pair (30, 2381)."""
    parts = tripid.split(":")
    return int(parts[0].lstrip("#")), int(parts[-1].lstrip("#"))


# First five trip ids of the driver/passenger overlap, in numeric order.
sorted(all_legs_both, key=_tripid_sort_key)[:5]

In [None]:
# Corrected transport mode of each leg of one hard-coded trip.
all_legs.loc[all_legs["tripid"] == "#30:2381", "correctedModeOfTransport_str"]

In [None]:
# First five carDriver trip ids, ordered numerically by the two "#"-prefixed
# numbers around the ":" separator.
sorted(
    all_legs_cardrivers["tripid"].unique(),
    key=lambda tripid: (
        int(tripid.split(":")[0].lstrip("#")),
        int(tripid.split(":")[-1].lstrip("#")),
    ),
)[:5]

In [None]:
# Show the column labels available in the legs table.
all_legs.columns

In [None]:
# Mode, leg id, timestamps and inferred duration of every leg of one
# hard-coded trip.
leg_detail_columns = [
    "correctedModeOfTransport_str",
    "legid",
    "tripStartDate_formated",
    "startDate_formated",
    "endDate_formated",
    "inferred_leg_duration_min",
]
all_legs.loc[all_legs["tripid"] == "#30:1339", leg_detail_columns]

In [None]:
# Per-trip total of `inferred_leg_duration_min`, summed over all legs that
# share a tripid; result columns are `tripid` and `total_time`.
trips_cardrivers_tottime = (
    all_legs_cardrivers.groupby("tripid")["inferred_leg_duration_min"]
    .agg("sum")
    .rename("total_time")
    .reset_index()
)
trips_cardrivers_tottime.head(3)

In [None]:
# Per-trip total of `trueDistance`, summed over all legs that share a tripid;
# result columns are `tripid` and `total_len`.
trips_cardrivers_totlen = (
    all_legs_cardrivers.groupby("tripid")["trueDistance"]
    .agg("sum")
    .rename("total_len")
    .reset_index()
)
trips_cardrivers_totlen.head(3)

In [None]:
# One row per (user, trip). `all_legs_cardrivers` holds one row per leg, so
# without de-duplication each trip would be repeated once per leg and any
# per-user mean/count over these rows would be weighted by the number of legs.
users_cardrivers = all_legs_cardrivers[["userid", "tripid"]].drop_duplicates()

# Attach the per-trip totals computed for the car-driver trips
# (`trips_cardrivers_tottime` / `trips_cardrivers_totlen`), joining explicitly
# on the trip id.
users_cardrivers_trips = users_cardrivers.merge(
    trips_cardrivers_tottime, on="tripid"
).merge(trips_cardrivers_totlen, on="tripid")
users_cardrivers_trips.head(3)

In [None]:
# Per-user mean and row count of the two per-trip totals. The numeric columns
# are selected explicitly: `tripid` holds strings, and asking for its mean
# raises a TypeError on recent pandas versions instead of being dropped.
results_cardrivers = users_cardrivers_trips.groupby("userid")[
    ["total_time", "total_len"]
].agg(["mean", "count"])

# Flatten the (column, statistic) MultiIndex into "<column>_<statistic>":
# total_time_mean, total_time_count, total_len_mean, total_len_count.
results_cardrivers.columns = [
    "{}_{}".format(column, statistic)
    for column, statistic in results_cardrivers.columns
]
results_cardrivers.head(3)

In [None]:
# Flip the table: user ids become columns, statistics become rows.
results_cardrivers_transposed = results_cardrivers.T
results_cardrivers_transposed.head(3)

In [None]:
# Nested lookup {userid: {statistic name: value}}, built column by column
# from the transposed table ("dict" is the default orientation).
results_cardrivers_dict = results_cardrivers_transposed.to_dict(orient="dict")

In [None]:
# Per-trip total of `inferred_leg_duration_min`, summed over all legs that
# share a tripid; result columns are `tripid` and `total_time`.
trips_carpassengers_tottime = (
    all_legs_carpassengers.groupby("tripid")["inferred_leg_duration_min"]
    .agg("sum")
    .rename("total_time")
    .reset_index()
)
trips_carpassengers_tottime.head(3)

In [None]:
# Per-trip total of `trueDistance`, summed over all legs that share a tripid;
# result columns are `tripid` and `total_len`.
trips_carpassengers_totlen = (
    all_legs_carpassengers.groupby("tripid")["trueDistance"]
    .agg("sum")
    .rename("total_len")
    .reset_index()
)
trips_carpassengers_totlen.head(3)

In [None]:
# One row per (user, trip). `all_legs_carpassengers` holds one row per leg, so
# without de-duplication each trip would be repeated once per leg and any
# per-user mean/count over these rows would be weighted by the number of legs.
users_carpassengers = all_legs_carpassengers[["userid", "tripid"]].drop_duplicates()

# Attach the per-trip totals computed for the car-passenger trips
# (`trips_carpassengers_tottime` / `trips_carpassengers_totlen`), joining
# explicitly on the trip id.
users_carpassengers_trips = users_carpassengers.merge(
    trips_carpassengers_tottime, on="tripid"
).merge(trips_carpassengers_totlen, on="tripid")
users_carpassengers_trips.head(3)

In [None]:
# Per-user mean and row count of the two per-trip totals. The numeric columns
# are selected explicitly: `tripid` holds strings, and asking for its mean
# raises a TypeError on recent pandas versions instead of being dropped.
results_carpassengers = users_carpassengers_trips.groupby("userid")[
    ["total_time", "total_len"]
].agg(["mean", "count"])

# Flatten the (column, statistic) MultiIndex into "<column>_<statistic>":
# total_time_mean, total_time_count, total_len_mean, total_len_count.
results_carpassengers.columns = [
    "{}_{}".format(column, statistic)
    for column, statistic in results_carpassengers.columns
]
results_carpassengers.head(3)

In [None]:
# Flip the table: user ids become columns, statistics become rows.
results_carpassengers_transposed = results_carpassengers.T
results_carpassengers_transposed.head(3)

In [None]:
# Nested lookup {userid: {statistic name: value}}, built column by column
# from the transposed table ("dict" is the default orientation).
results_carpassengers_dict = results_carpassengers_transposed.to_dict(orient="dict")

In [None]:
# Write one CSV row per user of grouping 1 with the car-driver and
# car-passenger statistics (mean trip time, mean trip length, count).
outfile_name = "trip_data_woorti_survey_group1.csv"
outfile_path = os.path.join(out_path, outfile_name)

# newline="" is what the csv module expects for files handed to csv.writer;
# without it, platforms that translate "\n" emit an extra blank line per row.
# Plain "w" is sufficient because the file is never read back here.
with open(outfile_path, "w", newline="") as outfp:
    writer = csv.writer(outfp)

    # write header
    writer.writerow(
        [
            "userid",
            "time_mean_cardriver",
            "len_mean_cardriver",
            "count_cardriver",
            "time_mean_carpassenger",
            "len_mean_carpassenger",
            "count_carpassenger",
        ]
    )

    for user in users1:
        row = [user]
        # Driver statistics first, then passenger statistics, matching the
        # column order of the header.
        for results in (results_cardrivers_dict, results_carpassengers_dict):
            stats = results.get(user)
            if stats is None:
                # No entry for this user: mean time, mean length and count are 0.
                row.extend([0, 0, 0])
            else:
                row.extend(
                    [
                        round(stats["total_time_mean"], 4),
                        round(stats["total_len_mean"], 4),
                        # counts come back as floats after the transpose
                        int(stats["total_time_count"]),
                    ]
                )
        writer.writerow(row)

In [None]:
# Write one CSV row per user of grouping 2 with the car-driver and
# car-passenger statistics (mean trip time, mean trip length, count).
outfile_name = "trip_data_woorti_survey_group2.csv"
outfile_path = os.path.join(out_path, outfile_name)

# The statistics dictionaries are derived from legs filtered by `users1`, so a
# user that appears only in `users2` would silently be written as all zeros.
# Make that situation visible instead of hiding it.
users_without_stats = set(users2) - set(users1)
if users_without_stats:
    print(
        "Warning: {} user(s) of group 2 are not in group 1; "
        "their statistics are written as 0".format(len(users_without_stats)),
        file=sys.stderr,
    )

# newline="" is what the csv module expects for files handed to csv.writer;
# without it, platforms that translate "\n" emit an extra blank line per row.
# Plain "w" is sufficient because the file is never read back here.
with open(outfile_path, "w", newline="") as outfp:
    writer = csv.writer(outfp)

    # write header
    writer.writerow(
        [
            "userid",
            "time_mean_cardriver",
            "len_mean_cardriver",
            "count_cardriver",
            "time_mean_carpassenger",
            "len_mean_carpassenger",
            "count_carpassenger",
        ]
    )

    for user in users2:
        row = [user]
        # Driver statistics first, then passenger statistics, matching the
        # column order of the header.
        for results in (results_cardrivers_dict, results_carpassengers_dict):
            stats = results.get(user)
            if stats is None:
                # No entry for this user: mean time, mean length and count are 0.
                row.extend([0, 0, 0])
            else:
                row.extend(
                    [
                        round(stats["total_time_mean"], 4),
                        round(stats["total_len_mean"], 4),
                        # counts come back as floats after the transpose
                        int(stats["total_time_count"]),
                    ]
                )
        writer.writerow(row)