In [7]:
import sqlite3
import pandas as pd
from tqdm import tqdm
import os
import wget

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:.2f}'.format)

In [8]:
url = 'https://hitchmap.com/dump.sqlite'
filename = 'dump.sqlite'
if os.path.exists(filename):
        os.remove(filename)
filename = wget.download(url)
fn = 'dump.sqlite'
points = pd.read_sql('select * from points where not banned', sqlite3.connect(fn))
points["datetime"] = points["datetime"].astype("datetime64[ns]")

In [3]:
points.head()

Unnamed: 0,id,lat,lon,rating,country,wait,nickname,comment,datetime,reviewed,banned,ip,dest_lat,dest_lon,signal,ride_datetime,user_id,from_hitchwiki
0,0,40.97,27.51,3.0,TR,,Tamergem,"If you avoid the mini busses, you can get on a...",2011-05-26 10:06:17.000000,1,0,,,,,,,1.0
1,1,32.07,34.79,4.0,IL,,,,,1,0,,,,,,,
2,2,41.73,27.22,4.0,TR,,,,,1,0,,,,,,,
3,3,41.1,29.01,3.0,TR,,Xavierallard,There is a lot of traffic there and little spa...,2011-03-15 12:52:11.000000,1,0,,,,,,,1.0
4,4,30.17,67.0,3.0,PK,,,,,1,0,,,,,,,


In [4]:
hitchwiki_users = points[points['from_hitchwiki'] == 1.0].nickname.unique()
len(hitchwiki_users)

2562

In [5]:
hitchwiki_users

array(['Tamergem', 'Xavierallard', 'Rozwal', ..., 'Lunasis', 'Francagini',
       'Silverness'], shape=(2562,), dtype=object)

In [None]:
import requests

S = requests.Session()

URL = "https://hitchwiki.org/en/api.php"

def get_wiki_user(name):
    try:
        PARAMS = {
            "action": "query",
            "format": "json",
            "list": "users",
            "ususers": name,
            "usprop": "blockinfo|groups|editcount|registration|emailable|gender"
        }

        R = S.get(url=URL, params=PARAMS)
        DATA = R.json()

        return DATA["query"]["users"][0]["gender"]
    except Exception as e:
        print(f"Error fetching data for user {name}: {e}")
        return None

genders = []

for u in tqdm(hitchwiki_users):
    genders.append((u, get_wiki_user(u)))

In [7]:
users = pd.DataFrame(genders, columns=['nickname', "gender"])
users.gender.value_counts()

gender
unknown    1940
male        224
female       48
Name: count, dtype: int64

In [12]:
users.to_csv('hitchwiki_users.csv', index=False)

In [10]:
users = pd.read_csv('hitchwiki_users.csv')

In [16]:
users.head()

Unnamed: 0,nickname,gender
0,Tamergem,unknown
1,Xavierallard,male
2,Rozwal,unknown
3,Greg Pelka,male
4,Duvik,unknown


In [15]:
points.head()

Unnamed: 0,id,lat,lon,rating,country,wait,nickname,comment,datetime,reviewed,banned,ip,dest_lat,dest_lon,signal,ride_datetime,user_id,from_hitchwiki
0,0,40.97,27.51,3.0,TR,,Tamergem,"If you avoid the mini busses, you can get on a...",2011-05-26 10:06:17,1,0,,,,,,,1.0
1,1,32.07,34.79,4.0,IL,,,,NaT,1,0,,,,,,,
2,2,41.73,27.22,4.0,TR,,,,NaT,1,0,,,,,,,
3,3,41.1,29.01,3.0,TR,,Xavierallard,There is a lot of traffic there and little spa...,2011-03-15 12:52:11,1,0,,,,,,,1.0
4,4,30.17,67.0,3.0,PK,,,,NaT,1,0,,,,,,,


In [19]:
def match_gender(row):
    if row["from_hitchwiki"] == 1.0:
        lookup = users.loc[users['nickname'] == row['nickname']]
        if lookup.empty:
            return None
        gender = lookup["gender"].iloc[0]
        if gender == "unknown":
            return None
        return gender
    return None

In [20]:
points["gender"] = points.apply(match_gender, axis=1)

In [21]:
len(points[points["gender"].notna()])

3359

In [22]:
points.gender.value_counts()

gender
male      2879
female     480
Name: count, dtype: int64

In [28]:
submitters = points[["nickname","gender"]].drop_duplicates()
submitters = submitters[submitters.gender.notna()]
len(submitters)

272

In [32]:
submitters.gender.value_counts(normalize=True)

gender
male     0.82
female   0.18
Name: proportion, dtype: float64