In [None]:
import sqlite3
import pandas as pd
from tqdm import tqdm
import numpy as np
import os
import wget
from datasets import Dataset
from huggingface_hub import login
from dotenv import load_dotenv
load_dotenv()

import sys
sys.path.append("../../../")

# from https://github.com/Hitchwiki/hitchhiking-data-standard/tree/main/python
from migration.python import *

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None) 


  from .autonotebook import tqdm as notebook_tqdm
  import pkg_resources


# Migrate the dataset obtained from hitchmap.com and publish it to huggingface

In [2]:
url = 'https://hitchmap.com/dump.sqlite'
filename = 'dump.sqlite'
if os.path.exists(filename):
        os.remove(filename)
filename = wget.download(url)

In [3]:
fn = 'dump.sqlite'
points = pd.read_sql('select * from points where not banned', sqlite3.connect(fn))
points["datetime"] = points["datetime"].astype("datetime64[ns]")

# cleaning
points["ride_datetime"].replace("0224-10-31T21:30", None, inplace=True)
points["ride_datetime"].replace("0025-03-07T08:00", None, inplace=True)
points["ride_datetime"].replace("1014-11-04T14:30", None, inplace=True)
points["ride_datetime"].replace("0202-04-03T18:50", None, inplace=True)

points["ride_datetime"] = points["ride_datetime"].astype("datetime64[ns]")
len(points)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  points["ride_datetime"].replace("0224-10-31T21:30", None, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  points["ride_datetime"].replace("0025-03-07T08:00", None, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the 

63165

In [4]:
points.head()

Unnamed: 0,id,lat,lon,rating,country,wait,nickname,comment,datetime,reviewed,banned,ip,dest_lat,dest_lon,signal,ride_datetime,user_id,from_hitchwiki
0,0,40.974714,27.511654,3.0,TR,,Tamergem,"If you avoid the mini busses, you can get on a ride within 10 minutes, and Tekirdag city is a bridge between Istanbul and Greece. I always use that city center spot and it is quite good",2011-05-26 10:06:17,1,0,,,,,NaT,,1.0
1,1,32.072756,34.793444,4.0,IL,,,,NaT,1,0,,,,,NaT,,
2,2,41.727928,27.220731,4.0,TR,,,,NaT,1,0,,,,,NaT,,
3,3,41.099858,29.007339,3.0,TR,,Xavierallard,There is a lot of traffic there and little space to stop. I found it hard.,2011-03-15 12:52:11,1,0,,,,,NaT,,1.0
4,4,30.169989,66.999612,3.0,PK,,,,NaT,1,0,,,,,NaT,,


In [5]:
no_date = points[points["datetime"].isna()]
with_date = points[~points["datetime"].isna()]

lift = pd.concat([no_date, with_date[with_date["datetime"] < "2010-08-11"]])

wiki = with_date[(with_date["datetime"] >= "2010-08-11") & (with_date["datetime"] < "2022-10-13")]

map = with_date[with_date["datetime"] >= "2022-10-13"]

In [6]:
len(lift), len(wiki), len(map), len(lift) + len(wiki) + len(map)

(7477, 42466, 13222, 63165)

In [7]:
def map_signal(signal: str) -> Signal:
    if not signal:
        return None

    if signal == "sign":
        return Signal(
            methods=["sign"],
        )
    elif signal == "thumb":
        return Signal(
            methods=["thumb"],
        )
    elif signal == "ask":
        return Signal(
            methods=["asking"],
        )
    elif signal == "ask-sign":
        return Signal(
            methods=["asking", "sign"],
        )
    else:
        return None


def create_record_from_row(row: pd.Series, source: str, license: str, rating_formula= lambda x: x) -> HitchhikingRecord:
    stops = [
        Stop(
            location=Location(latitude=row["lat"], longitude=row["lon"], is_exact=True),
            arrival_time=row["ride_datetime"].strftime("%Y-%m-%dT%H:%M:%S") if pd.notna(row["ride_datetime"]) else None,
            departure_time=(row["ride_datetime"] + pd.to_timedelta(row["wait"], unit="m")).strftime(
                "%Y-%m-%dT%H:%M:%S"
            )
            if pd.notna(row["ride_datetime"]) and pd.notna(row["wait"])
            else None,
            waiting_duration=f"{int(row['wait'])}M" if pd.notna(row["wait"]) else None,
        ),
    ]
    if pd.notna(row["dest_lat"]) and pd.notna(row["dest_lon"]):
        stops.append(Stop(location=Location(latitude=row["dest_lat"], longitude=row["dest_lon"], is_exact=False)))

    entry = HitchhikingRecord(
        stops=stops,
        rating=rating_formula(row["rating"]),
        hitchhikers=[
            Hitchhiker(
                nickname=row["nickname"],
            )
        ],
        comment=row["comment"],
        signals=[map_signal(row["signal"])] if row["signal"] else None,
        occupants=None,
        mode_of_transportation=None,
        ride=None,
        declined_rides=None,
        source=source,  # "hitchmap.com"
        license=license,  # "odbl"
        submission_time=row["datetime"].strftime("%Y-%m-%dT%H:%M:%S") if pd.notna(row["datetime"]) else None,
    )

    return entry

In [8]:
entries = []

for _, row in tqdm(lift.iterrows(), total=len(lift)):
    entries.append(
        create_record_from_row(
            row,
            source="liftershalte.info",
            license="cc-by-sa-4.0",
        )
    )

for _, row in tqdm(wiki.iterrows(), total=len(wiki)):
    entries.append(
        create_record_from_row(
            row,
            source="hitchwiki.org",
            license="cc-by-sa-4.0",
        )
    )

for _, row in tqdm(map.iterrows(), total=len(map)):
    entries.append(
        create_record_from_row(
            row,
            source="hitchmap.com",
            license="odbl",
        )
    )

100%|██████████| 7477/7477 [00:00<00:00, 10651.52it/s]
100%|██████████| 42466/42466 [00:03<00:00, 10738.58it/s]
100%|██████████| 13222/13222 [00:01<00:00, 7164.16it/s]


In [9]:
entries.reverse()

In [10]:
print(entries[0].model_dump_json(indent=2, exclude_none=True))

{
  "stops": [
    {
      "location": {
        "latitude": 41.9308780118252,
        "longitude": 25.605654716491703,
        "is_exact": true
      },
      "waiting_duration": "25M"
    },
    {
      "location": {
        "latitude": 41.71828672552955,
        "longitude": 26.342468261718754,
        "is_exact": false
      }
    }
  ],
  "rating": 5,
  "hitchhikers": [
    {
      "nickname": "kyliann"
    }
  ],
  "comment": "Great spot for the border!",
  "source": "hitchmap.com",
  "license": "odbl",
  "submission_time": "2024-12-09T19:36:11"
}


In [11]:
entries_dicts = [entry.model_dump(exclude_none=False, by_alias=True) for entry in entries]

In [12]:
entries_dicts[:10]

[{'stops': [{'location': {'latitude': 41.9308780118252,
     'longitude': 25.605654716491703,
     'is_exact': True},
    'arrival_time': None,
    'departure_time': None,
    'waiting_duration': '25M'},
   {'location': {'latitude': 41.71828672552955,
     'longitude': 26.342468261718754,
     'is_exact': False},
    'arrival_time': None,
    'departure_time': None,
    'waiting_duration': None}],
  'rating': 5,
  'hitchhikers': [{'origin_location': None,
    'origin_country': None,
    'year_of_birth': None,
    'gender': None,
    'languages': None,
    'was_driver': None,
    'nickname': 'kyliann',
    'hitchhiking_since': None,
    'reasons_to_hitchhike': None}],
  'comment': 'Great spot for the border!',
  'signals': None,
  'occupants': None,
  'mode_of_transportation': None,
  'ride': None,
  'declined_rides': None,
  'source': 'hitchmap.com',
  'license': 'odbl',
  'submission_time': '2024-12-09T19:36:11'},
 {'stops': [{'location': {'latitude': 49.757773078442945,
     'longitu

In [13]:
huggingface_df = pd.DataFrame(entries_dicts)

In [14]:
# TODO MethodEnum.thumb as string

In [15]:


HF_TOKEN = os.getenv("HF_TOKEN")

# 1. Log in to Hugging Face (use your token from https://huggingface.co/settings/tokens)
login(token=HF_TOKEN)

hf_dataset = Dataset.from_pandas(huggingface_df)

# 3. Push to Hugging Face Hub
hf_dataset.push_to_hub("Hitchwiki/hitchhiking_rides_dataset")

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


Creating parquet from Arrow format: 100%|██████████| 64/64 [00:00<00:00, 519.55ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.12it/s]
No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/datasets/Hitchwiki/hitchhiking_rides_dataset/commit/497ef2d9f4f4bd686d29c937a48d4e07a53f87b6', commit_message='Upload dataset', commit_description='', oid='497ef2d9f4f4bd686d29c937a48d4e07a53f87b6', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Hitchwiki/hitchhiking_rides_dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Hitchwiki/hitchhiking_rides_dataset'), pr_revision=None, pr_num=None)