In [None]:
import math
import os
import sqlite3

import numpy as np
import pandas as pd
import pywikibot
import wget
from pywikibot import pagegenerators
from scipy.spatial import cKDTree
from tqdm import tqdm

from coords_extraction import find_coords_and_headings

# Lift pandas' display truncation entirely: show every row, every column and
# the full content of each cell.
# For pandas < 1.0, 'display.max_colwidth' needs -1 instead of None.
for _display_option in ('display.max_rows', 'display.max_columns', 'display.max_colwidth'):
    pd.set_option(_display_option, None)


from articles import articles

# Handle for the 'en' site of the 'hitchwiki' family; every page query below
# goes through it.
lang_wiki = pywikibot.Site(code='en', fam='hitchwiki')
# Only trigger a login when the site does not report a user yet.
if not lang_wiki.user():
    lang_wiki.login()

In [None]:
# Materialise the page generator into a list up front so the pages can be
# iterated with a known total length afterwards.
# NOTE(review): presumably this enumerates every page of the site and is slow — confirm.
pages = list(pagegenerators.AllpagesPageGenerator(site=lang_wiki))

In [None]:
# Keep only the pages whose wikitext contains a "{{Coords" template, storing
# the raw text under the page title. A page that fails to load is reported and
# skipped so that one bad page does not abort the whole crawl.
for page in tqdm(pages, desc="Processing pages"):
    try:
        wiki_text = page.text
        if "{{Coords" in wiki_text:
            articles[page.title()] = {"text": wiki_text}
    except Exception as e:
        print(f"Error processing page: {e}")

In [None]:
# Number of entries in `articles`: pages added by the loop above plus whatever
# the imported mapping already contained, if anything.
len(articles)

In [None]:
# Run the coordinate extractor over every collected article and flatten the
# per-article results into a single list.
coords = [
    coords_result
    for article, items in tqdm(articles.items())
    for coords_result in find_coords_and_headings(raw_wiki_page=items["text"], title=article)
]


In [None]:
# Tabulate the extracted results.
# NOTE(review): presumably each result is a dict-like record with at least a
# "coords" field (it is parsed further below) — confirm against find_coords_and_headings.
coords_df = pd.DataFrame(coords)

In [None]:
# Preview the first rows to sanity-check the extraction output.
coords_df.head()

# Match extracted coordinates to hitchhiking ride data

In [None]:
def haversine(lat1, lon1, lat2, lon2, radius_km=6371):
    """Return the great-circle distance between two points on a sphere.

    Args:
        lat1, lon1: Latitude and longitude of the first point, in decimal degrees.
        lat2, lon2: Latitude and longitude of the second point, in decimal degrees.
        radius_km: Sphere radius. The default of 6371 is the mean Earth radius
            in kilometres; the result is expressed in the same unit.

    Returns:
        The distance along the sphere's surface as a float (kilometres with
        the default radius).
    """
    # Convert decimal degrees to radians
    lat1, lon1, lat2, lon2 = map(math.radians, (lat1, lon1, lat2, lon2))

    # Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = math.sin(dlat / 2) ** 2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2) ** 2

    # Floating-point rounding can push `a` marginally above 1 for (near-)antipodal
    # points, which would make asin() raise ValueError; clamp it to the valid domain.
    a = min(1.0, a)

    c = 2 * math.asin(math.sqrt(a))
    return c * radius_km

In [None]:
# Each "coords" value is a pipe-separated template string: field 1 is the
# latitude and field 2 the longitude. Split once and reuse the fields; the
# longitude may still carry the template's closing braces, which are stripped.
coord_fields = coords_df["coords"].apply(lambda raw: raw.split("|"))
coords_df["lat"] = coord_fields.apply(lambda fields: float(fields[1].strip()))
coords_df["lon"] = coord_fields.apply(lambda fields: float(fields[2].strip().rstrip("}")))

In [None]:
url = 'https://hitchmap.com/dump.sqlite'
filename = 'dump.sqlite'
# Drop any previous copy before downloading so a fresh dump is always used.
# NOTE(review): presumably this also keeps wget from saving under a different,
# de-duplicated file name — confirm against wget's behaviour.
if os.path.exists(filename):
    os.remove(filename)
filename = wget.download(url)

# Close the connection explicitly once the table is loaded; a sqlite3
# connection is not released automatically when it is only passed inline.
conn = sqlite3.connect(filename)
try:
    points = pd.read_sql('select * from points', conn)
finally:
    conn.close()

In [None]:
def _latlon_to_unit_xyz(latlon_deg):
    """Map an (n, 2) array of [lat, lon] in degrees to (n, 3) unit-sphere coordinates."""
    lat_rad, lon_rad = np.radians(latlon_deg).T
    cos_lat = np.cos(lat_rad)
    return np.column_stack((cos_lat * np.cos(lon_rad), cos_lat * np.sin(lon_rad), np.sin(lat_rad)))


# Build the KD-tree on unit-sphere coordinates rather than raw degrees.
# Euclidean distance between raw (lat, lon) pairs over-weights longitude away
# from the equator and breaks across the +/-180 degree meridian, so it can pick
# a node that is not the geographically nearest one. Straight-line (chord)
# distance on the unit sphere grows monotonically with great-circle distance,
# so the nearest neighbour in this space is the true nearest node.
tree = cKDTree(_latlon_to_unit_xyz(points[['lat', 'lon']].to_numpy(dtype=float)))

# Only the indices are needed; the query distances are chord lengths.
_, indices = tree.query(_latlon_to_unit_xyz(coords_df[['lat', 'lon']].to_numpy(dtype=float)))

# Add nearest node info to the coords DataFrame
nearest_points = points.iloc[indices]
coords_df['nearest_node_id'] = nearest_points['id'].values
coords_df['nearest_node_lat'] = nearest_points['lat'].values
coords_df['nearest_node_lon'] = nearest_points['lon'].values

# Keep `distance` in its previous unit (straight-line distance in degree
# space), now measured to the geographically nearest node.
distances = np.hypot(
    coords_df['lat'] - coords_df['nearest_node_lat'],
    coords_df['lon'] - coords_df['nearest_node_lon'],
).to_numpy()
coords_df['distance'] = distances

# haversine() returns kilometres; convert to metres.
coords_df['haversine_distance_in_m'] = coords_df.apply(
    lambda row: haversine(row['lat'], row['lon'], row['nearest_node_lat'], row['nearest_node_lon']) * 1000,
    axis=1,
)

# Closest matches first.
coords_df = coords_df.sort_values(by='haversine_distance_in_m')



In [None]:
# First 100 rows of the frame sorted by haversine_distance_in_m, i.e. the
# closest coordinate/node matches.
coords_df.head(100)

In [None]:
# Last rows of the frame sorted by haversine_distance_in_m, i.e. the end of
# the distance ordering.
coords_df.tail()

In [None]:
# Write the matched table to coords_df.csv in the working directory, without
# the DataFrame index column.
coords_df.to_csv('coords_df.csv', index=False)