In [1]:
import sqlite3
import pandas as pd
from tqdm import tqdm
import folium
import numpy as np
import os
import wget

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None) 


In [2]:
url = 'https://hitchmap.com/dump.sqlite'
filename = 'dump.sqlite'
if os.path.exists(filename):
        os.remove(filename)
filename = wget.download(url)

In [3]:
fn = 'dump.sqlite'
points = pd.read_sql('select * from points where not banned', sqlite3.connect(fn))
points["datetime"] = points["datetime"].astype("datetime64[ns]")

In [4]:
points.head()

Unnamed: 0,id,lat,lon,rating,country,wait,nickname,comment,datetime,reviewed,banned,ip,dest_lat,dest_lon,signal,ride_datetime,user_id,from_hitchwiki
0,0,40.974714,27.511654,3.0,TR,,Tamergem,"If you avoid the mini busses, you can get on a ride within 10 minutes, and Tekirdag city is a bridge between Istanbul and Greece. I always use that city center spot and it is quite good",2011-05-26 10:06:17,1,0,,,,,,,1.0
1,1,32.072756,34.793444,4.0,IL,,,,NaT,1,0,,,,,,,
2,2,41.727928,27.220731,4.0,TR,,,,NaT,1,0,,,,,,,
3,3,41.099858,29.007339,3.0,TR,,Xavierallard,There is a lot of traffic there and little space to stop. I found it hard.,2011-03-15 12:52:11,1,0,,,,,,,1.0
4,4,30.169989,66.999612,3.0,PK,,,,NaT,1,0,,,,,,,


In [5]:
points.info(), 44345/62768

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63149 entries, 0 to 63148
Data columns (total 18 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   id              63149 non-null  int64         
 1   lat             63149 non-null  float64       
 2   lon             63149 non-null  float64       
 3   rating          63149 non-null  float64       
 4   country         63149 non-null  object        
 5   wait            31940 non-null  float64       
 6   nickname        26691 non-null  object        
 7   comment         44698 non-null  object        
 8   datetime        55909 non-null  datetime64[ns]
 9   reviewed        63149 non-null  int64         
 10  banned          63149 non-null  int64         
 11  ip              63149 non-null  object        
 12  dest_lat        11480 non-null  float64       
 13  dest_lon        11480 non-null  float64       
 14  signal          4843 non-null   object        
 15  ri

(None, 0.7064905684425185)

# activity

In [None]:
df = points
df["datetime"] = df["datetime"].astype("datetime64[ns]")
df = df[df["datetime"] >= "2005-01-01"]

# Create a complete date range for all months between min and max date
all_months = pd.date_range(df["datetime"].min().replace(day=1), df["datetime"].max().replace(day=1), freq='MS')
all_months_index = pd.MultiIndex.from_arrays([all_months.year, all_months.month])

# Group by year and month, and count
grouped = df["datetime"].groupby([df["datetime"].dt.year, df["datetime"].dt.month]).count()
grouped = grouped.reindex(all_months_index, fill_value=0)

# Only show x-ticks for January of each year
january_idx = [i for i, (y, m) in enumerate(grouped.index) if m == 1]
january_years = [str(y) for (y, m) in grouped.index if m == 1]

plot = grouped.plot(kind="bar", title="Hitchhiking rides per month (2005-2025)", figsize=(15, 5))
plot.set_ylabel("Number of recorded rides")
plot.set_xticks(january_idx)
plot.set_xticklabels(january_years, rotation=0)
plot.figure.tight_layout()
plot.figure.savefig("plots/rides_per_month.png")



In [None]:
df = points[['datetime', 'wait']].dropna()
df["datetime"] = df["datetime"].astype("datetime64[ns]")

plot = df['datetime'].groupby([df["datetime"].dt.year, df["datetime"].dt.month]).count().plot(kind="bar", title="Points per month with waiting time recorded", figsize=(50, 5))
# plot.figure.savefig("plots/points_per_month_with_waittime.png")

### seasonality


In [None]:
import seaborn as sns

import matplotlib.pyplot as plt

df = points.copy()
df["datetime"] = df["datetime"].astype("datetime64[ns]")
df["month"] = df["datetime"].dt.month

plt.figure(figsize=(10, 5))
sns.histplot(df["month"], bins=12, discrete=True)
plt.xticks(range(1, 13), ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
plt.xlabel("Month")
plt.ylabel("Number of rides")
plt.title("Accumulated number of recorded hitchhiking rides by month (2005-2025)")
plt.tight_layout()
plt.savefig("plots/number_of_rides_per_month.png")
plt.show()

# history

## determine time of entries with missing timestamp

In [6]:
df = points
df["datetime"] = df["datetime"].astype("datetime64[ns]")

In [7]:
df = points[points["datetime"].isna()]
len(df), len(points)

(7240, 63149)

In [64]:
a = df[(df.datetime.isna()) & (df.comment.notna())]

In [None]:
a[a.comment.str.contains("2005")].head(1)

In [None]:
a[a.comment.str.contains("2006")].head(1)

In [None]:
a[a.comment.str.contains("2010")].head(1)

In [None]:
a[a.comment.str.contains("2011")].head(1)

## waiting time

In [None]:
df = points[points["datetime"].notna()]
df.sort_values(by="datetime", inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.sort_values(by="datetime", inplace=True)


In [None]:
df[df.wait.notna()].head(100)

## nickname introduction -> defines liftershalte to hitchwiki switch

In [None]:
df[df["nickname"].notna()].head()

In [None]:
df.info()

## destination location and signal introduction

In [None]:
df.head()

In [None]:
df[df["dest_lat"].notna()].head()

In [None]:
df[df["signal"].notna()].head(5)

## who contributed before 2010?

In [None]:
df[~df["nickname"].isna()].head(1)

In [None]:
old = points[points["datetime"] < "2010-08-11"].sort_values(by='datetime')
old = old[old["datetime"] > "2005-08-01"]
m = folium.Map(location=[old['lat'].mean(), old['lon'].mean()], zoom_start=2)
for _, row in tqdm(old.iterrows()):
    folium.CircleMarker(location=[row['lat'], row['lon']], radius=2).add_to(m)
m

In [None]:
# there are multiple people who submitted points before 2010
old[old["datetime"] > "2007-03-28"].head(12)

## contributions during malfunction 2017-2020

In [None]:
df[(df["datetime"] > "2017-01-01") & (df["datetime"] < "2020-05-01")]["nickname"].value_counts()
# no-one adding spots consistantly in this period

In [None]:
# there are users who kept adding over 2017 but something caused a heavy drop in activity of other users
df[df["nickname"] == "Sitko"].tail()

In [None]:
df[df["nickname"] == "Thumb-up"].tail()

## contributions during 2010-2017

In [None]:
df[(df["datetime"] < "2017-01-01") & (df["datetime"] > "2005-05-01")]["nickname"].value_counts()


In [None]:
df = points[(points.ride_datetime.notna()) & (points.ride_datetime != "")].sort_values(by='ride_datetime')
df.head(100)

## determine hitchwiki to hitchmap switch

In [None]:
# look at first review from Bob
df[df["datetime"] > "2022-09-01"].head(20)

Unnamed: 0,id,lat,lon,rating,country,wait,nickname,comment,datetime,reviewed,banned,ip,dest_lat,dest_lon,signal,ride_datetime,user_id,from_hitchwiki
13170,13179,53.482935,10.023799,4.0,DE,45.0,,"As anyone else said, good place for Bremen and firther",2022-09-20 21:40:07.000000,1,0,,,,,,,
28025,28039,51.895018,4.531432,4.0,NL,35.0,,Took around 30 minutes on a Saturday morning to get on the highway to Breda.,2022-09-20 21:41:46.000000,1,0,,,,,,,
23214,23226,51.660075,4.681579,4.0,NL,,,Got a ride to Antwerp in 40 minutes on a Saturday. Not much but enough traffic.,2022-09-20 21:44:27.000000,1,0,,,,,,,
16341,16350,51.207963,4.547658,5.0,BE,,,"Good Place, took me 10 minutes to get a ride in direction to Hasselt.",2022-09-20 21:45:22.000000,1,0,,,,,,,
18247,18259,51.069202,5.148084,4.0,BE,,,"After 45 minutes, I got a lift to Aachen. Some german cars, but also lots of traffic towards Hasselt and beyond",2022-09-20 21:47:28.000000,1,0,,,,,,,
4895,4903,50.823365,6.019266,4.0,DE,,,"Good Place for Cologne, as everyone above said.",2022-09-20 21:48:25.000000,1,0,,,,,,,
29801,29815,51.347666,8.299868,5.0,DE,20.0,,ok,2022-10-08 15:09:28.000000,1,0,,,,,,,
49987,4070133487,48.459903,8.40902,4.0,DE,10.0,Bob,Nice spot to get to Kniebis!,2022-10-13 13:07:19.850954,0,0,,,,,,,0.0
49977,3214828793,51.563596,4.732251,5.0,NL,20.0,Bob,"Hitched a ride to Brussels in 20 minutes, stood by the highway sign",2022-11-02 11:11:50.571645,0,0,,50.818835,4.404008,,,,0.0
49989,4180146009,51.82873,5.821075,5.0,NL,10.0,Bob,Getting a direct ride to Breda is easy from this spot (n=2),2022-11-02 11:17:21.996908,0,0,,,,,,,0.0


# top contributors

In [None]:
points[points.nickname != ""].nickname.value_counts().head(20)

In [None]:
import matplotlib.pyplot as plt

user_counts = points.nickname.value_counts().head(100)
plt.figure(figsize=(12, 6))
user_counts.plot(kind='bar')
plt.xlabel('Nickname')
plt.ylabel('Number of Rides')
plt.title('Top 100 Contributors')
plt.tight_layout()
plt.savefig("plots/top_100_contributors.png")
plt.show()

In [None]:
user_counts = points.nickname.value_counts()

bins = list(range(0, 105, 5)) + [np.inf]
labels = [f'{i+1}-{i+5}' for i in range(0, 100, 5)] + ['>100']
grouped = pd.cut(user_counts.values, bins=bins, labels=labels, right=True)
grouped_counts = pd.Series(grouped).value_counts().sort_index()

plt.figure(figsize=(10, 5))
grouped_counts.plot(kind='bar')
plt.xlabel('Number of Rides Submitted (Grouped)')
plt.ylabel('Number of Hitchhikers')
plt.title('Distribution of Hitchhikers by Number of Submitted Rides')
plt.tight_layout()
plt.savefig("plots/distribution_of_hitchhikers_by_number_of_submitted_rides.png")
plt.show()


In [None]:
# recent
points[points.datetime.dt.year == 2024].nickname.value_counts().head(20)