<a href="https://www.kaggle.com/code/mattop/geoguessr-user-dataset-stats-visualizations?scriptVersionId=196394193" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Raise the pandas display limit so up to 500 rows are shown before truncation.
pd.set_option('display.max_rows', 500)
# Silence every warning for the rest of the session (this also hides
# deprecation notices raised by pandas, seaborn and matplotlib).
warnings.filterwarnings('ignore')

In [None]:
# Load the raw user export; `master_df` keeps every row, banned accounts included.
DATASET_PATH = "/kaggle/input/geoguessr-user-dataset/geoguessr-user-data-09-11-2024.csv"
master_df = pd.read_csv(DATASET_PATH)

In [None]:
# Division name -> plot colour, written in hue order (Bronze first, Champion
# last). Both public lists are derived from this one mapping so that names
# and colours cannot drift out of alignment.
DIVISION_COLORS = {
    "Bronze": "#9a6a33",
    "Silver": "#9ba6b2",
    "Gold": "#e5c655",
    "Master": "#f77ea6",
    "Champion": "#965ee0",
}
RANKED_HUE_ORDER = list(DIVISION_COLORS)
RANKED_PALETTE = list(DIVISION_COLORS.values())

# Nicknames singled out as pro players (presumably matched against the
# `nick` column -- confirm where this list is consumed).
PRO_PLAYERS = [
    "Consus",
    "rainbolt",
    "zi8gzag",
    "Blinky",
    "Lennli",
    "John Harvey Kellogg",
    "Jamabi",
    "mipt_demetra",
    "Topotic (YT)",
    "Radu C",
    "Debre",
    "maccem",
    "Finbarr",
    "toro3317",
    "OceanMan",
    "Trichter33",
]

In [None]:
# Report how many of the accounts in the full dataset are flagged as banned.
banned_mask = master_df["isBanned"].eq(True)
n_banned = int(banned_mask.sum())
n_players = len(master_df)
banned_pct = round(n_banned / n_players * 100, 3)

print(f"{n_banned} of {n_players} players are banned ({banned_pct}%)")

In [None]:
# Show the row(s) whose nickname is exactly 'rainbolt', transposed so that
# every column becomes its own row in the displayed output.
master_df.loc[master_df["nick"] == "rainbolt"].transpose()

In [None]:
# Banned accounts only, best duels win ratio first, with in-cell green bars
# drawn on the three numeric columns listed in `bar_columns`.
banned_columns = [
    "nick",
    "countryCode",
    "duelsTotalWinratio",
    "duelsTotalNumgamesplayed",
    "duelsTotalAvgguessdistance",
    "competitiveRating",
]
bar_columns = [
    "duelsTotalWinratio",
    "duelsTotalAvgguessdistance",
    "competitiveRating",
]

(
    master_df.query("isBanned == True")[banned_columns]
    .sort_values("duelsTotalWinratio", ascending=False)
    .style.bar(subset=bar_columns, color='green')
)

In [None]:
# Working frame for the analysis below: accounts not flagged as banned,
# re-indexed from 0.
not_banned = master_df["isBanned"].eq(False)
df = master_df.loc[not_banned].reset_index(drop=True)

In [None]:
# Horizontal bar chart of the nicknames shared by more than 15 accounts.
plt.figure(figsize=(14, 9))
value_counts = df['nick'].value_counts(ascending=True)
popular_nicks = value_counts[value_counts > 15]
peak_count = value_counts.max()
popular_nicks.plot.barh(edgecolor="#000000", color="green")

# Print each count just to the right of its bar.
for row, count in enumerate(popular_nicks):
    plt.text(count + peak_count * 0.01, row, str(count), va='center')

# Extend the x axis by 10% of the peak so the labels are not clipped.
plt.xlim(right=peak_count + peak_count * 0.1)

plt.title('Most popular nicknames in Geoguessr')
plt.xlabel('Count')
plt.ylabel('Nick')
plt.show()

In [None]:
# Cut the last five characters off every timestamp string before parsing
# (presumably a fractional-second / zone suffix -- confirm against the CSV).
df["created"] = pd.to_datetime(df["created"].str.slice(stop=-5))

In [None]:
# Line chart of how many accounts were created in each calendar month.
plt.figure(figsize=(14, 8))
monthly_sizes = df.resample('M', on='created').size()
df_counts = monthly_sizes.reset_index(name='count')

ax = sns.lineplot(data=df_counts, x='created', y='count', marker='o', color="green")

# Hand-placed annotation. NOTE(review): the x position is a raw number on the
# date axis (presumably days since 1970-01-01, i.e. spring 2021) and the label
# text is hard-coded -- confirm both if the dataset is refreshed.
ax.text(18_750, 7000, "March 2021 (6,972 accounts)")
ax.set_title('Number of Geoguessr Accounts Created (2015-2025)')
ax.set_xlabel('Date')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Horizontal bar chart of the number of accounts per value of the `type` column.
plt.figure(figsize=(10, 8))
value_counts = df["type"].value_counts(ascending=True)
peak_count = value_counts.max()
value_counts.plot.barh(edgecolor="#000000", color="green")

# Print each count just to the right of its bar.
for row, count in enumerate(value_counts):
    plt.text(count + peak_count * 0.01, row, str(count), va='center')

# Extend the x axis by 10% of the peak so the labels are not clipped.
plt.xlim(right=peak_count + peak_count * 0.1)
plt.show()

In [None]:
# Print, for every distinct value of `consumedTrial`, its share of players
# (percent, two decimals) and its absolute count, most frequent value first.
col = "consumedTrial"
print("Consumed trial?")

# Count once and derive the shares from the same result; looping over the
# index instead of hard-coding positions 0 and 1 also avoids an IndexError
# when the column holds fewer than two distinct values.
counts = df[col].value_counts()
shares = counts / counts.sum()
for value, count, share in zip(counts.index, counts.to_numpy(), shares.to_numpy()):
    print(f"{value} {round(share * 100, 2)}% ({count})")

In [None]:
# Print, for every distinct value of `isVerified`, its share of players
# (percent, two decimals) and its absolute count, most frequent value first.
col = "isVerified"
print("Is verified?")

# Count once and derive the shares from the same result; looping over the
# index instead of hard-coding positions 0 and 1 also avoids an IndexError
# when the column holds fewer than two distinct values.
counts = df[col].value_counts()
shares = counts / counts.sum()
for value, count, share in zip(counts.index, counts.to_numpy(), shares.to_numpy()):
    print(f"{value} {round(share * 100, 2)}% ({count})")

In [None]:
# Horizontal bar chart of the 30 most common country codes.
plt.figure(figsize=(14, 9))
# Counts are sorted ascending, so the last 30 entries are the largest ones.
value_counts = df["countryCode"].value_counts(ascending=True).tail(30)
peak_count = value_counts.max()
value_counts.plot.barh(edgecolor="#000000", color="green")

# Print each count just to the right of its bar.
for row, count in enumerate(value_counts):
    plt.text(count + peak_count * 0.01, row, str(count), va='center')

# Extend the x axis by 10% of the peak so the labels are not clipped.
plt.xlim(right=peak_count + peak_count * 0.1)
plt.ylabel("")
plt.title("Global Geoguessr Community: Top 30 Countries by Player Count")
plt.show()

In [None]:
# Histogram of `progressLevel` in bins of width 5, with a KDE curve overlaid.
plt.figure(figsize=(10, 8))
ax = sns.histplot(data=df, x="progressLevel", kde=True, binwidth=5)
plt.show()

In [None]:
# Total number of games played per game mode, largest first.
#
# Every column whose name contains "Numgamesplayed" is treated as one game
# mode, except the two "...Total..." columns named below (presumably
# aggregates of the other modes -- confirm). Filtering them by name does not
# fail if one of them is absent from the frame.
AGGREGATE_COLUMNS = {
    "duelsTotalNumgamesplayed",
    "rankedTeamDuelsTotalNumgamesplayed",
}
all_gamemodes = [
    column
    for column in df.filter(like="Numgamesplayed").columns
    if column not in AGGREGATE_COLUMNS
]

# The mode name is whatever precedes the "Numgamesplayed" suffix.
gamemode_names = [game.split("Numgamesplayed")[0] for game in all_gamemodes]

# Vectorised per-column sum instead of the builtin sum(), which walks each
# Series element by element in Python. Series.sum() skips missing values
# by default.
total_games = [df[game].sum() for game in all_gamemodes]

# Sort (total, name) pairs by total only, descending, then split them back
# into two parallel tuples.
combined = list(zip(total_games, gamemode_names))
combined_sorted = sorted(combined, key=lambda pair: pair[0], reverse=True)
total_games, gamemode_names = zip(*combined_sorted)

In [None]:
# Bar chart of total multiplayer games played per mode.
total_games = list(total_games)
# The bare "duels" prefix denotes the Moving duels mode; relabel it by name
# rather than by position, so the label stays correct whatever rank that
# mode ends up at after sorting.
gamemode_names = [
    "duelsMoving" if name == "duels" else name for name in gamemode_names
]

plt.figure(figsize = (14, 9))

ax = sns.barplot(x = total_games, y = gamemode_names, edgecolor = "#000000")

# Print each total just to the right of its bar.
for idx, val in enumerate(total_games):
    plt.text(val + 100_000, idx, str(val), va='center')

plt.title("Distribution of Multiplayer Games Played Across Different Modes")
# Fixed headroom so the label of the longest bar is not clipped.
plt.xlim(right = np.max(total_games) + 1.5e6)
plt.show()

In [None]:
# Print the rating range of each competitive division type.
# NOTE(review): pairs the sorted division type values with names positionally,
# i.e. assumes the smallest value is Bronze and the largest Champion -- confirm.
division_names = ["Bronze", "Silver", "Gold", "Master", "Champion"]
division_types = sorted(df["competitiveDivisionType"].unique())

for division, name in zip(division_types, division_names):
    temp_df = df.query(f"competitiveDivisionType == {division}")
    # The bounds are read from the first matching row only; the end rating
    # is shown minus one.
    start_rating = temp_df["competitiveDivisionStartrating"].iloc[0]
    last_rating = temp_df["competitiveDivisionEndrating"].iloc[0] - 1
    print(division, f"Competitive division rating range: {start_rating}-{last_rating} ({name})")

In [None]:
def map_division(score):
    """Return the competitive division name for a rating.

    The bands are half-open, so every real number belongs to exactly one
    division:

        score < 450          -> 'Bronze'
        450 <= score < 675   -> 'Silver'
        675 <= score < 850   -> 'Gold'
        850 <= score < 1100  -> 'Master'
        score >= 1100        -> 'Champion'

    Non-negative integer ratings map exactly as with closed integer bands
    (0-449, 450-674, 675-849, 850-1099, 1100+). Fractional ratings that sit
    between two integer bands (e.g. 449.5) and negative ratings are assigned
    to the band below them rather than falling through to 'Champion'.

    Args:
        score: Numeric competitive rating.

    Returns:
        One of 'Bronze', 'Silver', 'Gold', 'Master' or 'Champion'. NaN
        compares false against every bound and therefore yields 'Champion'.
    """
    if score < 450:
        return 'Bronze'
    if score < 675:
        return 'Silver'
    if score < 850:
        return 'Gold'
    if score < 1100:
        return 'Master'
    return 'Champion'

In [None]:
# Label every player with a division name derived from their rating.
df["competitiveDivision"] = df["competitiveRating"].map(map_division)

In [None]:
# Share of players whose competitive rating is anything other than 0.
ranked_count = int((df["competitiveRating"] != 0).sum())
ranked_share = round(ranked_count / len(df) * 100, 1)
print(f"{ranked_share}% of the community plays ranked")

In [None]:
# Histogram of competitive ratings (rating 0 excluded), coloured by division.
rated_players = df.loc[~df["competitiveRating"].isin([0])]

plt.figure(figsize=(14, 8))
ax = sns.histplot(
    data=rated_players,
    x="competitiveRating",
    hue="competitiveDivision",
    hue_order=RANKED_HUE_ORDER,
    palette=RANKED_PALETTE,
    binwidth=15,
)

ax.set_xlabel("Competitive Rating")
ax.set_title("Competitive Rating Distribution")
plt.show()

In [None]:
# Print, from Champion down to Silver, the cumulative "top X%" share of rated
# players at or above each division, then the remaining "bottom X%" for Bronze.
ranked_df = df.query("competitiveRating not in [0]")
n_ranked = ranked_df.shape[0]
n_players = 0  # running count of players in the divisions handled so far

for division in reversed(RANKED_HUE_ORDER):
    if division == "Bronze":
        # Bronze comes last in the reversed order, so everyone not counted
        # yet is in it. Scale to percent first and round afterwards:
        # multiplying an already rounded fraction by 100 prints float noise
        # such as 45.300000000000004.
        percentage = round((1 - n_players / n_ranked) * 100, 1)
        bottom_percentage = f"{percentage}%"
        print(f"{division} is bottom {bottom_percentage} of players")
    else:
        n_players += len(ranked_df[ranked_df["competitiveDivision"] == division])
        percentage = round(n_players / n_ranked * 100, 1)
        top_percentage = f"{percentage}%"
        print(f"{division} is top {top_percentage} of players")

In [None]:
def plot_stat_by_gamemode(gamemode, stat):
    """Show a horizontal box plot of one duels statistic per competitive division.

    Reads the module-level ``df`` and keeps only rows with more than 50 games
    in the ``duels{gamemode}Numgamesplayed`` column.

    Args:
        gamemode: Text inserted between ``duels`` and the stat name to form
            the column names. The empty string is titled "Moving".
        stat: Column suffix to plot, e.g. ``"Winratio"``. ``"Winratio"`` adds
            a dashed reference line at 0.5; ``"Avgguessdistance"`` gets a
            descriptive x-axis label.
    """
    games_column = f"duels{gamemode}Numgamesplayed"
    stat_column = f"duels{gamemode}{stat}"

    plt.figure(figsize=(14, 8))

    ax = sns.boxplot(
        data=df.query(f"{games_column} > 50"),
        x=stat_column,
        y="competitiveDivision",
        orient="h",
        order=RANKED_HUE_ORDER,
        hue_order=RANKED_HUE_ORDER,
        palette=RANKED_PALETTE,
        width=0.6,
    )

    if stat == "Winratio":
        # Mark the break-even win ratio.
        plt.axvline(0.5, linestyle="--", color="#000000")
    elif stat == "Avgguessdistance":
        plt.xlabel("Average Guess Distance (Meters)")

    title = f"Moving {stat} by Division" if gamemode == "" else gamemode + f" {stat} by Division"
    plt.title(title)

    # Flip the y axis so the last division in the order is drawn at the top.
    ax.invert_yaxis()
    plt.show()

In [None]:
# Games played per division, one figure per duels mode.
# The empty string selects the Moving gamemode.
for gamemode in ("", "NoMove", "Nmpz", "Total"):
    plot_stat_by_gamemode(gamemode, "Numgamesplayed")

In [None]:
# Average guess distance per division, one figure per duels mode.
for gamemode in ("", "NoMove", "Nmpz", "Total"):
    plot_stat_by_gamemode(gamemode, "Avgguessdistance")

In [None]:
# Win ratio per division, one figure per duels mode.
for gamemode in ("", "NoMove", "Nmpz", "Total"):
    plot_stat_by_gamemode(gamemode, "Winratio")

In [None]:
# Derive guesses-per-game for every duels mode, store it as a new
# `duels{mode}Avgnumberofrounds` column, and plot it by division.
for gamemode in ("", "NoMove", "Nmpz", "Total"):
    guesses = df[f"duels{gamemode}Numguesses"]
    games = df[f"duels{gamemode}Numgamesplayed"]
    df[f"duels{gamemode}Avgnumberofrounds"] = guesses.div(games)
    plot_stat_by_gamemode(gamemode, "Avgnumberofrounds")

In [None]:
# Scatter of competitive rating against average guess distance, limited to
# players with more than 100 games in the `duelsNumgamesplayed` column.
plt.figure(figsize=(14, 8))
frequent_players = df.query("duelsNumgamesplayed > 100")
sns.scatterplot(
    data=frequent_players,
    x="competitiveRating",
    y="duelsAvgguessdistance",
    hue="competitiveDivision",
    hue_order=RANKED_HUE_ORDER,
    palette=RANKED_PALETTE,
    s=8,
)

plt.title("Competitive Rating by Average Guess Distance (Meters) in Moving Duels (>100 Games Played)")
plt.xlabel("Competitive Rating")
plt.ylabel("Moving Duels Average Guess Distance (Meters)")
plt.legend(title="Competitive Division")
plt.grid()
plt.show()

In [None]:
# Scatter of win ratio against average guess distance, limited to players
# with more than 100 games in the `duelsNumgamesplayed` column.
plt.figure(figsize=(14, 8))
frequent_players = df.query("duelsNumgamesplayed > 100")
sns.scatterplot(
    data=frequent_players,
    x="duelsWinratio",
    y="duelsAvgguessdistance",
    hue="competitiveDivision",
    hue_order=RANKED_HUE_ORDER,
    palette=RANKED_PALETTE,
    s=8,
)

plt.title("Win Ratio by Average Guess Distance (Meters) in Moving Duels (>100 Games Played)")
plt.xlabel("Win Ratio")
plt.ylabel("Moving Duels Average Guess Distance (Meters)")
plt.legend(title="Competitive Division")
plt.grid()
plt.show()

In [None]:
# Scatter of average rounds per game against average guess distance, limited
# to players with more than 100 games in the `duelsNumgamesplayed` column.
# Requires the `duelsAvgnumberofrounds` column to exist already.
plt.figure(figsize=(14, 8))
frequent_players = df.query("duelsNumgamesplayed > 100")
sns.scatterplot(
    data=frequent_players,
    x="duelsAvgnumberofrounds",
    y="duelsAvgguessdistance",
    hue="competitiveDivision",
    hue_order=RANKED_HUE_ORDER,
    palette=RANKED_PALETTE,
    s=8,
)

plt.title("Average Number of Rounds Played by Average Guess Distance (Meters) in Moving Duels (>100 Games Played)")
plt.xlabel("Average Number of Rounds Played")
plt.ylabel("Moving Duels Average Guess Distance (Meters)")
plt.legend(title="Competitive Division")
plt.grid()
plt.show()