In [2]:
import pandas as pd
import re
import requests
import random

from tqdm import tqdm
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta

# from tqdm.auto import tqdm  # for notebooks

# Register tqdm's progress-aware pandas methods so that the
# `.progress_apply(...)` calls further down display a progress bar.
# (can use tqdm_gui, optional kwargs, etc.)
tqdm.pandas()

# Remove pandas' display limits (rows, column width, column count) so frames
# shown in this notebook are rendered in full.
for _display_option in ("display.max_rows", "display.max_colwidth", "display.max_columns"):
    pd.set_option(_display_option, None)

In [31]:
# Accumulator for the records parsed from routes.dat (filled by the next cell).
rides = list()

In [32]:
def _parse_route_line(line):
    """Parse one ``{H`` line of routes.dat into a ride record.

    Parameters
    ----------
    line : str
        Raw line, with or without its trailing newline.

    Returns
    -------
    list or None
        ``[trip, ride, short, long]`` as strings (``long`` is ``None`` unless
        the last field contains "google"), or ``None`` when the line has no
        usable ``trip.ride`` identifier or no third field.
    """
    # Strip the newline first so it cannot end up inside the stored URL, then
    # collapse runs of spaces and " : " separators so a plain split works.
    line = re.sub(" +", " ", line.rstrip("\n"))
    line = re.sub(" : ", " ", line)
    parts = line.split(" ")
    if len(parts) < 3:
        # Need at least marker, "trip.ride" id and one link field.
        return None
    try:
        trip, ride = parts[1].split(".")
    except ValueError:
        # Second field is not of the form "<trip>.<ride>".
        return None
    trip = trip.lstrip("0")
    ride = ride.lstrip("0").rstrip(":")
    try:
        # Both halves must be integers; the values are kept as strings.
        int(trip)
        int(ride)
    except ValueError:
        return None
    long_url = parts[-1] if len(parts) > 3 and "google" in parts[-1] else None
    return [trip, ride, parts[2], long_url]


# Collect one record per parsable "{H" line of routes.dat into `rides`.
with open("routes.dat") as f:
    for line in f:
        if not line.startswith("{H"):
            continue
        record = _parse_route_line(line)
        if record is not None:
            rides.append(record)

In [33]:
# Tabulate the parsed records; "long" is None where no long URL was present.
r = pd.DataFrame(
    data=rides,
    columns=["trip", "ride", "short", "long"],
)

In [None]:
# Show every row whose (trip, ride) pair occurs more than once.
r.loc[r.duplicated(subset=["trip", "ride"], keep=False)]

In [35]:
# Rows that only carry a short link (no long URL parsed from the file).
short = r.loc[r["long"].isnull()]

In [36]:
# Keep only links that contain a "/" and whose final path segment is longer
# than five characters.
short = short.loc[
    short["short"].apply(
        lambda link: "/" in link and len(link.rsplit("/", 1)[-1]) > 5
    )
]

In [37]:
def get_long(url, timeout=30):
    """Follow a link and return the URL it was redirected through.

    Parameters
    ----------
    url : str
        Link to request.
    timeout : float, optional
        Seconds to wait for the server before `requests` raises a timeout
        error. Without it a single stalled server would block the whole
        `progress_apply` run indefinitely.

    Returns
    -------
    str or None
        URL of the last entry in the response's redirect history, or ``None``
        when the request was answered without any redirect.
    """
    response = requests.get(url, timeout=timeout)
    if not response.history:
        return None
    # NOTE(review): this is the URL of the last redirect response, not
    # `response.url` (the final page) -- presumably intentional; confirm.
    return response.history[-1].url


# Resolve each short link to its long URL. `assign` returns a new frame rather
# than writing a column into a filtered slice of `r`, which is what triggers
# pandas' SettingWithCopyWarning.
short = short.assign(long=short["short"].progress_apply(get_long))

  0%|          | 0/340 [00:00<?, ?it/s]

100%|██████████| 340/340 [09:46<00:00,  1.73s/it]


In [39]:
# Rows that already had a long URL in routes.dat.
long = r.loc[r["long"].notna()]

In [98]:
# Combine resolved short links with the rows that already had a long URL.
# NOTE: `all` shadows the Python builtin; the name is kept because later cells
# refer to it.
all = pd.concat([short, long])
# Keep only directions URLs. `na=False` treats rows without a URL (get_long
# returns None when there was no redirect) as non-matching instead of
# producing a NaN mask, which boolean indexing rejects.
all = all[all["long"].str.contains("/dir/", na=False)]
all.to_csv("ride_urls.csv")

In [99]:
# Reload the saved URL table (first CSV column is the index) so the slow link
# resolution above does not have to be repeated.
all = pd.read_csv(
    "ride_urls.csv",
    index_col=0,
)

In [102]:
def _looks_like_coordinates(segment):
    """Return True when *segment* starts with a digit, optionally after one sign.

    Accepting a leading "-" / "+" lets negative (southern / western)
    coordinates be recognised; an empty segment yields False.
    """
    unsigned = segment[1:] if segment[:1] in ("+", "-") else segment
    return unsigned[:1].isnumeric()


def get_start(url):
    """Extract the first waypoint of a ``.../dir/...`` URL as a two-item list.

    The path segment right after ``dir/`` is split on "," when it looks like
    a coordinate pair. Otherwise the first ``!1d<a>!2d<b>!`` pair found in the
    URL is returned as ``[a, b]``.

    Raises
    ------
    IndexError
        If the URL has no ``dir/<segment>/`` part, or the fallback pattern is
        needed but absent.
    """
    start = re.findall(r"dir/(.*?)/", url)[0]
    if _looks_like_coordinates(start):
        return start.split(",")
    # NOTE(review): the fallback returns the values in `!1d`, `!2d` order while
    # the path form is split as written; callers treat item 0 as latitude.
    # Confirm that `!1d` really is the latitude in these URLs.
    return list(re.findall(r"!1d(.*?)!2d(.*?)!", url)[0])


def get_end(url):
    """Extract the last waypoint of a ``.../dir/...`` URL as a two-item list.

    The path segment directly before ``/@`` is split on "," when it looks like
    a coordinate pair. Otherwise the last ``!1d<a>!2d<b>!`` pair found in the
    URL is returned as ``[a, b]``.

    Raises
    ------
    IndexError
        If the URL has no ``/<segment>/@`` part, or the fallback pattern is
        needed but absent.
    """
    end = re.findall(r"/.*/(.*)/@", url)[0]
    if _looks_like_coordinates(end):
        return end.split(",")
    # NOTE(review): same ordering caveat as in get_start -- confirm.
    return list(re.findall(r"!1d(.*?)!2d(.*?)!", url)[-1])

In [104]:
# Derive the start and end waypoint of every ride from its long URL.
all = all.assign(
    start=all["long"].progress_apply(get_start),
    end=all["long"].progress_apply(get_end),
)

100%|██████████| 2835/2835 [00:00<00:00, 243819.88it/s]
100%|██████████| 2835/2835 [00:00<00:00, 57500.95it/s]


In [105]:
# Persist the table including the extracted waypoints.
all.to_csv(
    "ride_coords.csv",
)

In [48]:
# Reload the waypoint table; after the CSV round trip the "start"/"end" lists
# come back as their string representation.
all = pd.read_csv(
    "ride_coords.csv",
    index_col=0,
)

In [49]:
# Number the rows inside each (trip, ride) group from 0; "stop" later serves as
# the third merge key against the rides table.
all = all.assign(stop=all.groupby(by=["trip", "ride"]).cumcount())

In [50]:
# Load the rides table. The file is read without a header row, so every column
# is named explicitly; `index_col=False` keeps pandas from using the first
# column as the index. The three "drop_*" columns are discarded further down.
df = pd.read_csv(
    "rides.csv",
    index_col=False,
    names=[
        # ride identification
        "trip", "ride",
        # departure
        "departure_date", "departure_time", "departure_odometer_km",
        # arrival
        "arrival_date", "arrival_time", "arrival_odometer_km",
        # derived figures
        "distance_km", "duration_h", "velocity_kmh", "waiting_time",
        # driver and countries
        "driver_country", "driver_type", "departure_country", "arrival_country",
        # how / where the ride started and ended
        "departure_type_method", "departure_type_location",
        "departure_location", "arrival_location",
        # filler columns
        "drop_1", "drop_2", "drop_3",
    ],
)

  df = pd.read_csv(


In [51]:
def get_time(x):
    """Convert an ``"HH:MM"`` string into a whole number of minutes.

    Parameters
    ----------
    x : object
        Value to convert; expected to be a string in ``%H:%M`` format.

    Returns
    -------
    int or None
        ``hours * 60 + minutes``, or ``None`` when `x` is not a string
        (e.g. NaN) or does not match ``%H:%M``.
    """
    try:
        parsed = datetime.strptime(x, "%H:%M")
    except (TypeError, ValueError):
        # TypeError: non-string input such as NaN; ValueError: wrong format.
        return None
    return parsed.hour * 60 + parsed.minute

# Replace the "HH:MM" waiting times with minutes (None where unparsable).
df = df.assign(waiting_time=df["waiting_time"].apply(get_time))

In [52]:
# Discard the three filler columns.
df = df.drop(["drop_1", "drop_2", "drop_3"], axis="columns")

In [53]:
# Number the rows inside each (trip, ride) group from 0, mirroring the "stop"
# column on the URL table so both frames can be merged on it.
df = df.assign(stop=df.groupby(by=["trip", "ride"]).cumcount())

In [54]:
# Attach the ride details to every URL row; URL rows without a counterpart in
# the rides table are kept with missing values (left join).
full = all.merge(df, how="left", on=["trip", "ride", "stop"])

In [55]:
# drop where no match could be found in rides table
full = full.dropna(subset=["departure_date"])

In [56]:
# Exclude method code "q" (prearranged rides).
full = full.loc[full["departure_type_method"].ne("q")]

In [57]:
# Translate the one-letter method codes into words; codes missing from the
# mapping become NaN. `assign` returns a new frame instead of writing into the
# filtered slice produced above, which avoids pandas' SettingWithCopyWarning.
full = full.assign(
    departure_type_method=full["departure_type_method"].map(
        {"a": "ask", "t": "thumb", "i": "invited"}
    )
)

In [58]:
# Renumber rows 0..n-1 so the index-aligned column assignments in the export
# below line up.
full = full.reset_index(drop=True)

In [59]:
# export to hitchmap format


def _coordinate(pair_text, position):
    """Return item `position` of a stringified pair like "['52.37', '4.89']" as float."""
    return float(pair_text.strip("[]").split(",")[position].strip(" '"))


# Same timestamp string is used for both the "datetime" and "datetime_ride" columns.
ride_timestamp = full.departure_date + " " + full.departure_time + ":00.000000"

hm = pd.DataFrame()
# Random ids. The upper bound must be an int: random.randint no longer accepts
# a float such as 1e20 on Python 3.12+ (TypeError).
# NOTE(review): values above 2**63 - 1 do not fit a signed 64-bit integer --
# confirm the consumer of this file accepts ids that large.
hm["id"] = [random.randint(9219501639295889231, 10**20) for _ in range(len(full))]
hm["lat"] = full.start.apply(lambda x: _coordinate(x, 0))
hm["lon"] = full.start.apply(lambda x: _coordinate(x, 1))
hm["rating"] = None
hm["country"] = full.departure_country
hm["wait"] = full.waiting_time
hm["name"] = "Prino"  # https://hitchwiki.org/en/User:Prino
hm["comment"] = None
hm["datetime"] = ride_timestamp  # using datetime_ride so that one can see age of the ride on the frontend
hm["reviewed"] = 1
hm["banned"] = 0
hm["ip"] = None
hm["dest_lat"] = full.end.apply(lambda x: _coordinate(x, 0))
hm["dest_lon"] = full.end.apply(lambda x: _coordinate(x, 1))
hm["signal"] = full.departure_type_method
hm["datetime_ride"] = ride_timestamp
hm["gender"] = "male"
# Whole years between 1960-01-01 and the departure date.
hm["age"] = full.departure_date.apply(
    lambda x: relativedelta(
        datetime.strptime(x, "%Y-%m-%d").date(), datetime(1960, 1, 1)
    ).years
)  # roughly
# new features
hm["driver_country"] = full.driver_country
# driver_type codes without an entry below map to NaN.
hm["driver_gender"] = full.driver_type.map(
    {
        "-": "male",
        "V": "female",
        "E": "mixed",
        "F": "mixed",
        "V*": "female",
        "-*": "male",
        "Elec": "male",
        "V-El": "female",
        "Por": "male",
        "VPor": "female",
        "T": "male",
    },
    na_action=None,
)
# Only the listed codes name a vehicle; everything else is treated as a car.
hm["vehicle"] = full.driver_type.map(
    {
        "T": "truck",
        "B": "van",
        "MB": "bus",
        "C": "camper",
        "Bus": "bus",
        "M": "motorbike",
    },
    na_action=None,
)
hm["vehicle"] = hm.vehicle.fillna("car")

In [60]:
# Normalise missing comments to None on the actual column. The previous
# `hm.comment.astype(str).replace(..., inplace=True)` modified a temporary
# Series returned by `astype`, so `hm` itself was never changed.
hm["comment"] = [
    None if pd.isna(value) or value == "nan" else value for value in hm["comment"]
]

In [61]:
# Write the export without the DataFrame index column.
hm.to_csv(
    "prino_to_hitchmap.csv",
    index=False,
)