<a href="https://www.kaggle.com/code/mattop/geoguessr-user-dataset-api-pull?scriptVersionId=196943405" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import numpy as np
import pandas as pd
import re
import time
from datetime import datetime
from typing import Any, Optional
from tqdm import tqdm
from bs4 import BeautifulSoup
from urllib import parse
from concurrent.futures import ThreadPoolExecutor, as_completed
import requests
import os
import shutil
import json

In [2]:
# Run date formatted as MM-DD-YYYY; it is embedded in the output file names
# and in the dataset version notes further down.
current_date = f"{datetime.now():%m-%d-%Y}"
print(current_date)

09-17-2024


In [3]:
# Descriptor that is later dumped to dataset-metadata.json next to the CSV
# before the new dataset version is uploaded.
metadata = dict(
    title="Geoguessr User Dataset",
    id="mattop/geoguessr-user-dataset",
    licenses=[dict(name="Apache 2.0")],
    resources=[
        dict(
            # Must match the CSV file name written into the upload folder.
            path=f"geoguessr-user-data-{current_date}.csv",
            description="This file contains user data from the Geoguessr platform.",
        )
    ],
    columns=[dict(name="nick", description="nickname")],
)

# Location of the metadata file inside the upload folder.
metadata_file_path = '/kaggle/working/geoguessr-user-dataset/dataset-metadata.json'

In [4]:
# Let wide player frames display up to 150 columns.
pd.set_option('display.max_columns', 150)

# Copy the API token from the attached input dataset to /root/.kaggle/kaggle.json.
os.makedirs('/root/.kaggle/', exist_ok = True)
shutil.copy('/kaggle/input/kaggle-api/kaggle.json', '/root/.kaggle/kaggle.json')
# The mode must be an octal literal: 0o600 is owner read/write only. The decimal
# literal 600 used previously equals 0o1130 (sticky bit, owner execute-only,
# group write/execute), which is not the intended permission set.
os.chmod('/root/.kaggle/kaggle.json', 0o600)

In [5]:
# NOTE(review): these imports sit below the cell that copies kaggle.json into
# /root/.kaggle/, presumably because the kaggle package looks for that file when
# it is imported — confirm before moving them into the top import cell.
import kaggle
from kaggle.api.kaggle_api_extended import KaggleApi

# Authenticated Kaggle API handle; used at the end of the notebook to upload the new dataset version.
api = KaggleApi()
api.authenticate()

# The same kaggle.json also carries a "geo_key" entry; its value is passed to the
# Geoguessr client, which sends it as the `_ncfa` cookie.
with open('/kaggle/input/kaggle-api/kaggle.json', 'r') as file:
    NCFA_COOKIE = json.load(file)["geo_key"]

In [6]:
# Player ids to scrape, read from the `ids` column of the attached id-list dataset.
id_df = pd.read_csv("/kaggle/input/140-000-geoguessr-player-id-list/full_id_list.csv")
full_id_list = list(id_df["ids"])
# Displayed as the cell result: number of ids that will be requested.
len(full_id_list)

143812

In [7]:
def flatten_dict(d: dict, parent_key: str='', separator: str=''):
    """
    Collapse a nested dictionary into one level.

    Top-level keys are kept as they are. A key found inside a nested dict is
    passed through ``str.capitalize`` (first character upper-cased, the rest
    lower-cased) and appended to the path of its parents, with ``separator``
    between the parts. Non-dict values, lists included, are copied unchanged.

    Args:
        d (dict): The dictionary to flatten.
        parent_key (str, optional): Key path accumulated so far; used by the
            recursive calls. Defaults to ''.
        separator (str, optional): Text placed between path parts. Defaults to ''.

    Returns:
        dict: A single-level dictionary keyed by the combined paths.

    """
    flat = {}
    for key, value in d.items():
        # Only nested keys are capitalised; a key with no parent is used verbatim.
        path = key if not parent_key else f"{parent_key}{separator}{key.capitalize()}"
        if not isinstance(value, dict):
            flat[path] = value
            continue
        flat.update(flatten_dict(value, path, separator))
    return flat

In [8]:
class GeoguessrStats:
    """Attribute container built from one flattened API response.

    Each key of the flattened response becomes an instance attribute holding
    the matching value.
    """

    def __init__(self, datas: dict) -> None:
        """Flatten ``datas`` with ``flatten_dict`` and store each entry as an attribute."""
        for name, value in flatten_dict(datas).items():
            setattr(self, name, value)

    @classmethod
    def from_api_response(cls, response_json: dict):
        """Build an instance directly from a decoded JSON response."""
        return cls(response_json)

    def to_dataframe(self) -> pd.DataFrame:
        """Return a one-row DataFrame with one column per stored attribute."""
        columns = {name: [value] for name, value in vars(self).items()}
        return pd.DataFrame(columns)

In [9]:
class Geoguessr:
    """Client for the Geoguessr web API, authenticated through the ``_ncfa`` cookie.

    All requests go through one shared ``requests.Session`` whose headers carry
    the cookie. Every public ``get_*`` method wraps the decoded JSON payload in
    a ``GeoguessrStats`` object.
    """
    def __init__(self, _ncfa) -> None:
        """Store the cookie value and prepare the shared HTTP session.

        Args:
            _ncfa: Value of the ``_ncfa`` cookie sent with every request.
        """
        self._ncfa = _ncfa
        self.headers = {
            "Content-Type": "application/json",
            "cookie": f"_ncfa={self._ncfa}",
        }
        self.session = requests.Session()
        # Assigning (rather than updating) replaces the session's header mapping with this dict.
        self.session.headers = self.headers
        # The attributes below are populated by get_all_my_infos().
        self.id = None
        self.me = None
        self.me_stats = None
        self.friends = None

    def _get_json(self, url):
        """GET ``url`` with the shared session and return the decoded JSON body."""
        with self.session.get(url) as r:
            return r.json()

    def get_all_my_infos(self):
        """
        Load everything about the account that owns the cookie.

        Requests the profile endpoint to learn the account's user id and stores
        it in ``self.id``, then fills ``self.me`` via ``get_geoguessr_infos``,
        ``self.me_stats`` via ``get_geoguessr_stats`` and ``self.friends`` via
        the private friends-list request.

        Returns:
            None
        """
        profile = self._get_json("https://www.geoguessr.com/api/v3/profiles/")
        self.id = profile["user"]["id"]
        # These previously called get_user_infos / get_user_stats, which are not
        # defined on this class, so the method always raised AttributeError.
        self.me = self.get_geoguessr_infos(self.id)
        self.me_stats = self.get_geoguessr_stats(self.id)
        self.friends = self.__get_my_friends_list()

    def __get_my_friends_list(self):
        """Return a ``{nick: userId}`` mapping built from the friends summary endpoint."""
        js = self._get_json(
            "https://www.geoguessr.com/api/v3/social/friends/summary?page=0&fast=true"
        )
        return {friend["nick"]: friend["userId"] for friend in js["friends"]}

    def get_geoguessr_stats(self, userId):
        """Fetch the v4 stats payload for ``userId``."""
        return GeoguessrStats.from_api_response(
            self._get_json(f"https://www.geoguessr.com/api/v4/stats/users/{userId}")
        )

    def get_geoguessr_infos(self, userId):
        """Fetch the v3 user payload for ``userId``."""
        return GeoguessrStats.from_api_response(
            self._get_json(f"https://www.geoguessr.com/api/v3/users/{userId}")
        )

    def get_geoguessr_ranked_progress(self, userId):
        """Fetch the ranked-system progress payload for ``userId``."""
        return GeoguessrStats.from_api_response(
            self._get_json(f"https://www.geoguessr.com/api/v4/ranked-system/progress/{userId}")
        )

    def get_geoguessr_ranked_best(self, userId):
        """Fetch the ranked-system best payload for ``userId``."""
        return GeoguessrStats.from_api_response(
            self._get_json(f"https://www.geoguessr.com/api/v4/ranked-system/best/{userId}")
        )

    def get_random_maps(self, list_n):
        """Request the random popular maps endpoint and wrap entry ``list_n`` of the response.

        Every call issues a new request.
        """
        maps = self._get_json(
            "https://www.geoguessr.com/api/v3/social/maps/browse/popular/random"
        )
        return GeoguessrStats.from_api_response(maps[list_n])

    def get_default_maps(self, list_n):
        """Request the maps endpoint and wrap entry ``list_n`` of the response."""
        maps = self._get_json("https://www.geoguessr.com/api/maps")
        return GeoguessrStats.from_api_response(maps[list_n])

In [10]:
# Shared API client used by the sampling and scraping helpers; it sends the
# cookie value loaded from kaggle.json with every request.
client = Geoguessr(NCFA_COOKIE)

In [11]:
def sample_random_ids(n_samples=48, max_rounds=1_000):
    """Collect distinct map-creator ids from the random popular maps endpoint.

    Each round asks ``client.get_random_maps`` for entries 0, 1 and 2 (one
    request per entry) and keeps the creator ids not seen before.

    Args:
        n_samples (int, optional): Number of distinct ids to collect. Defaults to 48.
        max_rounds (int, optional): Upper bound on sampling rounds. Defaults to 1_000.

    Returns:
        list: At most ``n_samples`` distinct creator ids, in the order first seen.
    """
    ids = []
    for _ in range(max_rounds):
        for j in range(3):
            # Check before every request. The old once-per-round `== 48` test could be
            # skipped (e.g. 47 -> 50 ids in one round), after which all rounds ran.
            if len(ids) >= n_samples:
                return ids
            creator_id = client.get_random_maps(j).creatorId
            if creator_id not in ids:
                ids.append(creator_id)
    return ids

def sample_random_maps(n_samples=48, max_rounds=1_000):
    """Collect distinct map URLs from the random popular maps endpoint.

    Each round asks ``client.get_random_maps`` for entries 0, 1 and 2 (one
    request per entry), prefixes the entry's ``url`` with the site root and
    keeps the URLs not seen before.

    Args:
        n_samples (int, optional): Number of distinct URLs to collect. Defaults to 48.
        max_rounds (int, optional): Upper bound on sampling rounds. Defaults to 1_000.

    Returns:
        list: At most ``n_samples`` distinct absolute URLs, in the order first seen.
    """
    urls = []
    for _ in range(max_rounds):
        for j in range(3):
            # Check before every request. The old once-per-round `== 48` test could be
            # skipped (e.g. 47 -> 50 URLs in one round), after which all rounds ran.
            if len(urls) >= n_samples:
                return urls
            url = "https://www.geoguessr.com" + client.get_random_maps(j).url
            if url not in urls:
                urls.append(url)
    return urls

In [12]:
# Columns removed from the combined player frame after scraping.
drop_cols = ["id", "url"]

# Flattened "competitive*" columns that are kept but tagged as deprecated.
rename_cols = [
    "competitiveElo",
    "competitiveRating",
    "competitiveLastratingchange",
    "competitiveDivisionType",
    "competitiveDivisionStartrating",
    "competitiveDivisionEndrating",
    "competitiveOnleaderboard",
]

# old column name -> old column name + "_deprecated"
rename_dict = dict(zip(rename_cols, (f"{name}_deprecated" for name in rename_cols)))

In [13]:
def fetch_player_data(player_id):
    """Fetch all API data for one player and merge it into a single-row frame.

    Stats and user info are required. The two ranked-system payloads are
    optional: if either request fails, both are left out of the row.

    Args:
        player_id: Geoguessr user id to look up.

    Returns:
        tuple: ``(row, None)`` on success, where ``row`` is a one-row DataFrame
        with the info, stats and (when available) ranked columns side by side;
        ``(None, player_id)`` when a required request fails.
    """
    try:
        user_stats = client.get_geoguessr_stats(player_id).to_dataframe()
        user_info = client.get_geoguessr_infos(player_id).to_dataframe()
        try:
            user_ranked_progress = client.get_geoguessr_ranked_progress(player_id).to_dataframe()
            # Suffix the columns so they do not clash with the progress columns.
            user_ranked_best = client.get_geoguessr_ranked_best(player_id).to_dataframe().rename(lambda x: x + "Best", axis=1)
        except Exception:
            # Was a bare `except:`, which also swallowed KeyboardInterrupt/SystemExit.
            user_ranked_progress = None
            user_ranked_best = None
        dfs = [df for df in [user_info, user_stats, user_ranked_progress, user_ranked_best] if df is not None]
        df_row = pd.concat(dfs, axis=1)
        return df_row, None

    except Exception:
        # Best effort: report the id as missed instead of aborting the whole scrape.
        return None, player_id

def scrape_player_data(player_ids, max_workers=4):
    """Fetch every player concurrently and stack the rows into one DataFrame.

    Args:
        player_ids: Sized collection of Geoguessr user ids.
        max_workers (int, optional): Size of the thread pool. Defaults to 4.

    Returns:
        tuple: ``(df, missed_ids)``. ``df`` holds one row per successfully
        fetched player, with ``drop_cols`` removed and ``rename_dict`` applied;
        it is an empty DataFrame when nothing could be fetched. ``missed_ids``
        lists the ids whose fetch failed. Row order follows completion order,
        not input order.
    """
    results = []
    missed_ids = []

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(fetch_player_data, player_id): player_id for player_id in player_ids}
        for future in tqdm(as_completed(futures), total=len(player_ids)):
            df_row, missed_id = future.result()
            if df_row is not None:
                results.append(df_row)
            if missed_id is not None:
                missed_ids.append(missed_id)

    # pd.concat raises ValueError on an empty list (no ids given, or every fetch failed).
    if not results:
        return pd.DataFrame(), missed_ids

    df = (
        pd.concat(results, axis=0, ignore_index=True)
        .drop(columns=drop_cols)
        .rename(columns=rename_dict)
    )
    return df, missed_ids

In [14]:
# Scrape every id in full_id_list; ids whose fetch failed come back separately in missed_ids.
# The recorded run of this cell took a little over nine hours for 143,812 ids.
player_df, missed_ids = scrape_player_data(full_id_list)

100%|██████████| 143812/143812 [9:10:20<00:00,  4.36it/s]
  df = pd.concat(results, axis=0, ignore_index=True).drop(columns = drop_cols, axis=1).rename(columns = rename_dict)


In [15]:
# Keep the failed ids in a one-column frame so they can be saved next to the data.
missed_ids_df = pd.DataFrame(data={"missed_ids": missed_ids})
# Number of ids that could not be fetched.
print(len(missed_ids_df))

224


In [16]:
# First table on the Wikipedia page listing ISO 3166 country codes.
iso_3166 = pd.read_html("https://en.wikipedia.org/wiki/List_of_ISO_3166_country_codes")[0]
# Join the parts of each column label with "_" so every column has a single string name.
iso_3166.columns = [label.strip() for label in map("_".join, iso_3166.columns.values)]

In [17]:
# Markers after which a country name is cut off by clean_value.
unwanted_chars = ["\u200a", "\xa0", "(the)"]

# Lower-cased alpha-2 code -> country name as scraped; rows whose code cell is
# not a string are skipped.
country_dict = {
    code.lower(): name
    for code, name in zip(
        iso_3166["ISO 3166-1[2]_A-2 [5]"],
        iso_3166["ISO 3166[1] name[5]_ISO 3166[1] name[5]"],
    )
    if isinstance(code, str)
}

In [18]:
def clean_value(value):
    """Cut ``value`` at the first occurrence of each marker in ``unwanted_chars``.

    The markers are applied in list order; each pass keeps only the text
    before the marker. A string containing no marker is returned unchanged.
    """
    cleaned = value
    for marker in unwanted_chars:
        cleaned, _, _ = cleaned.partition(marker)
    return cleaned

# Trim the scraped names, then override the entry for "gb" with a fixed name.
country_dict = {code: clean_value(name) for code, name in country_dict.items()}
country_dict.update({"gb": "United Kingdom"})

In [19]:
# Replace each code in countryCode with the matching name from country_dict (keyed by
# lower-cased alpha-2 code; assumes the API returns codes in that form — TODO confirm).
# Values with no entry in country_dict become NaN, so re-running this cell on the
# already-mapped column turns the names into NaN as well.
player_df["countryCode"] = player_df["countryCode"].map(country_dict)

In [20]:
# Create the upload folder if needed. exist_ok=True replaces the separate
# exists() check and matches how /root/.kaggle/ is created earlier in the notebook.
os.makedirs('/kaggle/working/geoguessr-user-dataset/', exist_ok=True)

# Write the dataset descriptor into the upload folder.
with open(metadata_file_path, 'w') as metadata_file:
    json.dump(metadata, metadata_file, indent=4)

In [21]:
# Save the player data twice: once in the current working directory and once inside
# the upload folder, under the file name declared in the metadata "resources" entry.
player_df.to_csv(f"geoguessr-user-data-{current_date}.csv", index=False)
player_df.to_csv(f"/kaggle/working/geoguessr-user-dataset/geoguessr-user-data-{current_date}.csv", index=False)
# The failed ids are written to the working directory only, not to the upload folder.
missed_ids_df.to_csv(f"missed_ids-{current_date}.csv", index=False)

In [22]:
# Upload the contents of the upload folder as a new version of the dataset,
# keeping the earlier versions.
api.dataset_create_version(
    '/kaggle/working/geoguessr-user-dataset/',
    version_notes=f'{current_date}-data, added new ranked system variables',
    delete_old_versions=False,
)

Starting upload for file geoguessr-user-data-09-17-2024.csv


100%|██████████| 90.3M/90.3M [00:05<00:00, 16.3MB/s]


Upload successful: geoguessr-user-data-09-17-2024.csv (90MB)


https://www.kaggle.com/mattop/geoguessr-user-dataset