In [1]:
import pandas as pd
import re
import requests
import random

from tqdm import tqdm
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta

# from tqdm.auto import tqdm  # for notebooks

# Create new `pandas` methods which use `tqdm` progress
# (can use tqdm_gui, optional kwargs, etc.)
tqdm.pandas()

pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_columns", None)

In [2]:
# collect all routes from routes.dat and store their related rides
rides = []

In [3]:
# lines with a route start with "{H"
# their format is not consistent, but usually looks like this:
# {H}
with open("routes.dat") as f:
    for line in f:
        if line.startswith("{H"):
            this = []
            # clean
            line = re.sub(" +", " ", line)
            line = re.sub(" : ", " ", line)

            parts = line.split(" ")

            # get trip and ride numbers that should be integers; one trip can consist of multiple rides
            try:
                trip, ride = parts[1].split(".")
            except:
                continue
            trip = trip.lstrip("0")
            ride = ride.lstrip("0").rstrip(":")
            try:
                _ = int(trip)
                _ = int(ride)
            except:
                continue
            this.append(trip)
            this.append(ride)

            # short links do not contain coordinates of the route
            short_link = parts[2].rstrip("\n")
            this.append(short_link)

            # a long link is not present for all entries
            if len(parts) > 3 and "google" in parts[-1]:
                long_link = parts[-1]
                this.append(long_link)
            else:
                this.append(None)

            rides.append(this)

In [4]:
rides = pd.DataFrame(rides, columns=["trip", "ride", "short_link", "long_link"])

In [5]:
# tear rides apart

# rides where long link to google maps route is missing
short = rides[rides.long_link.isna()]
long = rides[~rides.long_link.isna()]

In [6]:
# drop short links that do not work anymore (a google thing)
short = short[short.short_link.apply(lambda x: "/" in x and len(x.split("/")[-1]) > 5)]

In [7]:
def get_long(url):
    """Get the long url from the short url"""
    response = requests.get(url)
    if response.history:
        resp = response.history[-1]
        return resp.url


short["long_link"] = short.short_link.progress_apply(get_long)

100%|██████████| 340/340 [04:58<00:00,  1.14it/s]


In [16]:
long[long[['trip', 'ride']].duplicated(keep=False)]

Unnamed: 0,trip,ride,short_link,long_link
0,11,66,https://goo.gl/maps/rrDA3DvnWGC2,"https://www.google.com/maps/dir/39.9820088,22.6210543/41.1304734,22.5517487/@40.567385,22.0068016,9.25z/data=!4m24!4m23!1m20!3m4!1m2!1d22.5348794!2d40.3050979!3s0x1357ff4fcf4cbc5b:0xea7eecbf03c04d2e!3m4!1m2!1d22.5973625!2d40.3736971!3s0x1357fdea1ee008a9:0x85618d8beaed9e66!3m4!1m2!1d22.4494305!2d40.6146009!3s0x1357e7f253425815:0xfcd171ffccb567e3!3m4!1m2!1d22.7087757!2d40.7473537!3s0x14a828ce9a2c440b:0x6c7763dbd0ed30bd!1m0!3e0\n"
1,11,66,https://goo.gl/maps/jachNyRVGQD2,"https://www.google.com/maps/dir/41.1304734,22.5517487/46.4374774,14.2549411/@43.7036565,16.0577475,7z/data=!4m2!4m1!3e0\n"
2,11,66,https://goo.gl/maps/cZMotNfQx182,"https://www.google.com/maps/dir/46.4374926,14.2549379/47.7684429,12.9432423/@47.1004408,13.0620636,9z/data=!4m9!4m8!1m5!3m4!1m2!1d14.2564598!2d46.624219!3s0x4770740fe7164dc5:0xc59c7944c394129f!1m0!3e0\n"
3,11,66,https://goo.gl/maps/yX5QH6G6ibQ2,"https://www.google.com/maps/dir/47.7684429,12.9432423/51.8985932,6.1657444/@49.7943179,7.1208268,7.25z/data=!4m2!4m1!3e0\n"
4,11,66,https://goo.gl/maps/PmH25y82cr42,"https://www.google.com/maps/dir/51.8985932,6.1657444/52.0649302,5.2614321/@51.9983268,5.455841,10.5z/data=!4m2!4m1!3e0\n"
5,11,66,https://goo.gl/maps/B5Z6Rw5gaZT2,"https://www.google.com/maps/dir/39.9818958,22.6212919/52.0649305,5.2614366/@46.1784179,9.0183421,6z/data=!4m17!4m16!1m10!3m4!1m2!1d22.4484527!2d40.6277868!3s0x1357dd5c15194f69:0x8843d7720d1b68d1!3m4!1m2!1d14.2172655!2d46.6298212!3s0x477076b7a2e78a57:0xccb7900643666921!1m0!2m2!7e2!8j650565000!3e0\n"
14,51,7,https://maps.app.goo.gl/mFMME5RpcH6fmBN58,"https://www.google.com/maps/dir/35.2033999,139.0327588/MISHIMA+STATION,+16-1+Ichibancho,+Mishima,+Shizuoka+411-0036,+Japan/@35.1587012,138.9389742,12.79z/data=!4m9!4m8!1m0!1m5!1m1!1s0x60199a84df94e5c7:0x4359a0641553009f!2m2!1d138.9111261!2d35.1262994!3e0\n"
15,51,8,https://maps.app.goo.gl/eaMTBr1cidWS2aPn6,"https://www.google.com/maps/dir/34.6846331,135.800119/34.7234306,135.7915519/@34.7040955,135.781056,14z/data=!3m1!4b1!4m2!4m1!3e0\n"
26,51,7,http://goo.gl/maps/tGZQj,"https://www.google.com/maps/dir/35.2034745,139.0328928/MISHIMA+STATION,+16-1+Ichibancho,+Mishima,+Shizuoka+411-0036,+Japan/@35.1665198,138.9460223,13z/data=!4m9!4m8!1m0!1m5!1m1!1s0x60199a84df94e5c7:0x4359a0641553009f!2m2!1d138.9111261!2d35.1262994!3e0\n"
27,51,8,http://goo.gl/maps/FXl88,"https://www.google.com/maps/dir/34.687881,135.803036/34.7234306,135.7915519/@34.707932,135.7938233,14.38z/data=!4m2!4m1!3e0\n"


In [9]:
# put rides together again
rides = pd.concat([short, long])

In [18]:
rides.head()

Unnamed: 0,trip,ride,short_link,long_link
56,111,5,https://goo.gl/maps/nPDpQ2hQAEeckcdu7,"https://www.google.com/maps/@48.5947235,2.4420642,3a,75y,113.75h,75.55t/data=!3m6!1e1!3m4!1s-ssX8hMOyuRI_zQqrIbeLA!2e0!7i16384!8i8192?hl=en&shorturl=1"
57,111,6,https://goo.gl/maps/AMBNuHagai7op1kWA,"https://www.google.com/maps/@45.7412172,4.4635113,3a,75y,342.25h,84.72t/data=!3m6!1e1!3m4!1skYdB-2HcGy8P3ZwyD1LsJA!2e0!7i13312!8i6656?hl=en&shorturl=1"
405,156,1,https://goo.gl/maps/KHVVYLk9Zh12,"https://www.google.com/maps/dir/54.763469,25.080691/54.935537,23.870589/@54.7604884,24.2133912,10z/data=!4m6!4m5!1m1!4e1!1m1!4e1!3e0?hl=en&shorturl=1"
514,162,6,https://goo.gl/maps/T1hzytoX3N22,"https://www.google.com/maps/dir/54.3589914,23.1822141/52.3015858,13.0213019/@53.4176585,15.8157329,7z/data=!4m24!4m23!1m20!3m4!1m2!1d20.257172!2d53.7483332!3s0x471d7fceac1ea77b:0x63f879a034e4862d!3m4!1m2!1d19.5403897!2d54.1015715!3s0x46e2b25b2148c057:0x6bc3a212f78c2d55!3m4!1m2!1d16.6583117!2d53.3723023!3s0x470162482dc1302d:0x8fba958abc940158!3m4!1m2!1d14.6228172!2d53.3782926!3s0x4700a7f3c1a3815b:0xed6df1353198568d!1m0!3e0?hl=en&shorturl=1"
516,162,8,https://goo.gl/maps/2DN7KJ5vmn32,"https://www.google.com/maps/dir/52.2520523,12.3086909/Rasthof+Lehrter+See+Nord,+Lehrte,+Germany/@52.149435,10.5941378,9z/data=!4m9!4m8!1m0!1m5!1m1!1s0x47b0058ca1c6d2c3:0x34883c21066e75c6!2m2!1d9.9954637!2d52.3886797!3e0?hl=en&shorturl=1"


In [20]:
# drop instances with links that are no routes (street view)
rides = rides[rides.long_link.str.contains("/dir/")]

Unnamed: 0,trip,ride,short_link,long_link
56,111,5,https://goo.gl/maps/nPDpQ2hQAEeckcdu7,"https://www.google.com/maps/@48.5947235,2.4420642,3a,75y,113.75h,75.55t/data=!3m6!1e1!3m4!1s-ssX8hMOyuRI_zQqrIbeLA!2e0!7i16384!8i8192?hl=en&shorturl=1"
57,111,6,https://goo.gl/maps/AMBNuHagai7op1kWA,"https://www.google.com/maps/@45.7412172,4.4635113,3a,75y,342.25h,84.72t/data=!3m6!1e1!3m4!1skYdB-2HcGy8P3ZwyD1LsJA!2e0!7i13312!8i6656?hl=en&shorturl=1"
60,111,12,https://goo.gl/maps/QWXVtQzGGF4kzRMT9,"https://www.google.com/maps/@45.360087,1.5716114,3a,75y,230.63h,90t/data=!3m6!1e1!3m4!1s5sq37hxjPuZfQsbuNt7enw!2e0!7i13312!8i6656\n"


In [102]:
def get_start(url):
    """Extract the start coordinates from the url"""
    start = re.findall(r"dir/(.*?)/", url)[0]
    if not start[0].isnumeric():
        start = list(re.findall(r"!1d(.*?)!2d(.*?)!", url)[0])
    else:
        start = start.split(",")
    return start


def get_end(url):
    """Extract the end coordinates from the url"""
    end = re.findall(r"/.*/(.*)/@", url)[0]
    if not end[0].isnumeric():
        end = list(re.findall(r"!1d(.*?)!2d(.*?)!", url)[-1])
    else:
        end = end.split(",")

    return end

In [104]:
rides["start"] = rides.long_link.progress_apply(get_start)
rides["end"] = rides.long_link.progress_apply(get_end)

100%|██████████| 2835/2835 [00:00<00:00, 243819.88it/s]
100%|██████████| 2835/2835 [00:00<00:00, 57500.95it/s]


In [49]:
# instances with same trip + ride mark stops of the vehicle or border crossings
# the way this is used is inconsistent, sometimes a second occurende of the same trip + ride actually matches another ride
# we will take the start point of of first route and the coordinates of the destination (from rides.csv) as the route

ides["stop"] = rides.groupby(["trip", "ride"]).cumcount()

In [28]:
df = pd.read_csv(
    "rides.csv",
    names=[
        "trip",
        "ride",
        "departure_date",
        "departure_time",
        "departure_odometer_km",
        "arrival_date",
        "arrival_time",
        "arrival_odometer_km",
        "distance_km",
        "duration_h",
        "velocity_kmh",
        "waiting_time",
        "driver_country",
        "driver_type",
        "departure_country",
        "arrival_country",
        "departure_type_method",
        "departure_type_location",
        "departure_location",
        "arrival_location",
        # irrelevant columns
        "drop_1",
        "drop_2",
        "drop_3",
    ],
    index_col=False,
)

  df = pd.read_csv(


In [29]:
df = df.drop(columns=["drop_1", "drop_2", "drop_3"])

In [30]:
def get_time(x):
    """Retrieve the time in minutes from a string"""
    try:
        t = datetime.strptime(x, "%H:%M")
        d = timedelta(hours=t.hour, minutes=t.minute)
        return int(d.total_seconds() // 60)
    except Exception as e:
        return None

df["waiting_time"] = df["waiting_time"].apply(get_time)

In [33]:
dsjf

NameError: name 'dsjf' is not defined

In [54]:
full = pd.merge(all, df, on=["trip", "ride", "stop"], how="left")

In [55]:
full.dropna(subset=["departure_date"], inplace=True) # drop where no match could be found in rides table

In [56]:
full = full[full.departure_type_method != "q"] # prearranged rides

In [57]:
full["departure_type_method"] = full.departure_type_method.map({"a":"ask", "t":"thumb", "i":"invited"})

In [58]:
full.reset_index(drop=True, inplace=True)

In [59]:
# export to hitchmap format

hm = pd.DataFrame()
hm["id"] = [random.randint(9219501639295889231, 1e20) for _ in range(len(full))]
hm["lat"] = full.start.apply(lambda x: float(x.split(",")[0].lstrip("['").rstrip("'")))
hm["lon"] = full.start.apply(lambda x: float(x.split(",")[1].rstrip("']").lstrip(" '")))
hm["rating"] = None
hm["country"] = full.departure_country
hm["wait"] = full.waiting_time
hm["name"] = "Prino"  # https://hitchwiki.org/en/User:Prino
hm["comment"] = None
hm["datetime"] = full.departure_date + " " + full.departure_time + ":00.000000" # using datatime_ride so that one can see age of the ride on the frontend
hm["reviewed"] = 1
hm["banned"] = 0
hm["ip"] = None
hm["dest_lat"] = full.end.apply(
    lambda x: float(x.split(",")[0].lstrip("['").rstrip("'"))
)
hm["dest_lon"] = full.end.apply(
    lambda x: float(x.split(",")[1].rstrip("']").lstrip(" '"))
)
hm["signal"] = full.departure_type_method
hm["datetime_ride"] = full.departure_date + " " + full.departure_time + ":00.000000"
hm["gender"] = "male"
hm["age"] = full.departure_date.apply(
    lambda x: relativedelta(
        datetime.strptime(x, "%Y-%m-%d").date(), datetime(1960, 1, 1)
    ).years
)  # roughly
# new features
hm["driver_country"] = full.driver_country
hm["driver_gender"] = full.driver_type.map(
    {
        "-": "male",
        "V": "female",
        "E": "mixed",
        "F": "mixed",
        "V*": "female",
        "-*": "male",
        "Elec": "male",
        "V-El": "female",
        "Por":"male",
        "VPor":"female",
        "T":"male",
    },
    na_action=None,
)
hm["vehicle"] = full.driver_type.map(
    {
        "T": "truck",
        "B": "van",
        "MB": "bus",
        "C": "camper",
        "Bus": "bus",
        "M": "motorbike",
    },
    na_action=None,
)
hm["vehicle"] = hm.vehicle.fillna("car")

In [60]:
hm.comment.astype(str).replace("nan", None, inplace=True)

In [61]:
hm.to_csv("prino_to_hitchmap.csv", index=False)