In [1]:
from faker import Faker
import random
from datetime import datetime
from tqdm import tqdm

fake = Faker()
# Seed BOTH random sources used by this notebook. Faker.seed() seeds the
# generator that Faker's providers draw from, but generate_player() also calls
# the stdlib `random` module directly (ages, counts, samples, amounts); without
# seeding it as well, every run would produce a different dataset.
# NOTE: timestamps built from relative dates ("-2y", "-1y", "now") still depend
# on the wall-clock time at which the cell is executed.
Faker.seed(42)
random.seed(42)

# ---------------------------------------------------------------------------
# Static lookup tables used by generate_player()
# ---------------------------------------------------------------------------
GAME_NAME = "Mythic Quest"

# Account tiers a player can be assigned.
MEMBERSHIPS = [
    "Free",
    "Silver",
    "Gold",
    "Platinum",
]

# Platforms a player may own.
DEVICES = [
    "PC",
    "Xbox",
    "PlayStation",
    "Mobile",
    "Switch",
]

# Every achievement that can be unlocked.
ACHIEVEMENTS_POOL = [
    "First Blood",
    "Monster Slayer",
    "Treasure Hunter",
    "Dungeon Master",
    "Speed Runner",
    "Unstoppable",
    "Champion of the Arena",
    "Lore Keeper",
]

# Playable heroes.
HEROES = [
    "Artemis",
    "Zephyr",
    "Titan",
    "Morgana",
    "Aurora",
    "Vulcan",
    "Nyx",
    "Ragnar",
    "Selene",
    "Orion",
    "Echo",
    "Fenrir",
    "Astra",
    "Draco",
]

# Hero name -> skins available for that hero (the number of skins varies).
HERO_SKINS = {
    "Artemis": ["Forest Huntress", "Shadow Stalker", "Celestial Archer"],
    "Zephyr": ["Stormcaller", "Zephyr Prime"],
    "Titan": ["Colossus", "Stoneheart", "Ironclad", "Earthshaker"],
    "Morgana": ["Nightshade", "Hex Weaver", "Soul Binder"],
    "Aurora": ["Frost Maiden", "Solar Flare", "Polar Queen"],
    "Vulcan": ["Forge Master", "Lava Sentinel"],
    "Nyx": ["Midnight Whisper", "Dreambreaker", "Shadowveil"],
    "Ragnar": [
        "Berserker",
        "Warlord",
        "Frostborn",
        "Thunderlord",
        "Ironforge",
    ],
    "Selene": ["Moonlight Dancer", "Lunar Priestess"],
    "Orion": ["Star Hunter", "Galactic Ranger", "Void Stalker"],
    "Echo": ["Soundwave", "Resonance", "Pulse Knight"],
    "Fenrir": ["Bloodmoon", "Spirit Howl", "Feral King", "Glacial Fang"],
    "Astra": ["Cosmic Empress", "Nebula Sorceress", "Void Empress"],
    "Draco": [
        "Sky Terror",
        "Dragonheart",
        "Inferno Wing",
        "Stormscale",
        "Obsidian Drake",
    ],
}

# Currency codes used for purchase records.
CURRENCIES = [
    "USD",
    "EUR",
    "MYR",
    "JPY",
    "INR",
]

def generate_player():
    """Build one synthetic player profile.

    Values are drawn from the module-level ``fake`` Faker instance and from the
    stdlib ``random`` module. Dates are emitted as ISO-8601 strings.

    Returns:
        dict: a flat record with personal details, account timestamps, social
        and device lists, purchases and balances, match statistics, unlocked
        achievements (name -> unlock timestamp), owned heroes and owned skins.
    """
    uid = fake.uuid4()

    # --- Personal details -------------------------------------------------
    years_old = random.randint(13, 60)
    # Pin min and max age to the same value so the birthday agrees with `age`.
    dob = fake.date_of_birth(minimum_age=years_old, maximum_age=years_old)
    email_addr = fake.email()
    phone_no = fake.phone_number()
    sex = random.choice(["Male", "Female", "Other"])
    home_state = fake.state()
    # Faker returns a multi-line address; flatten it to a single line.
    postal_address = ", ".join(fake.address().split("\n"))

    # --- Account timeline -------------------------------------------------
    # Registered between two and one year ago; last login any time since then.
    registered = fake.date_time_between(start_date="-2y", end_date="-1y")
    last_seen = fake.date_time_between(start_date=registered, end_date="now")

    def stamp_while_active():
        """Return an ISO timestamp between registration and last login."""
        moment = fake.date_time_between(start_date=registered, end_date=last_seen)
        return moment.isoformat()

    # --- Social graph & devices -------------------------------------------
    friend_total = random.randint(0, 20)
    friends = []
    for _ in range(friend_total):
        friends.append(fake.uuid4())

    device_total = random.randint(1, 3)
    devices = random.sample(DEVICES, k=device_total)

    # --- Economy ----------------------------------------------------------
    purchase_total = random.randint(0, 8)
    purchase_log = []
    for _ in range(purchase_total):
        entry = {}
        entry["item_id"] = fake.uuid4()
        entry["item_name"] = fake.word().capitalize()
        entry["amount"] = round(random.uniform(1, 300), 2)
        entry["currency"] = random.choice(CURRENCIES)
        entry["purchased_at"] = stamp_while_active()
        purchase_log.append(entry)

    # Wallet balance is the total spent scaled by a random factor in [0.1, 2.0].
    spent = sum(entry["amount"] for entry in purchase_log)
    balance = round(spent * random.uniform(0.1, 2.0), 2)
    gold = random.randint(0, 100000)
    diamonds = random.randint(0, 5000)

    # --- Gameplay statistics ----------------------------------------------
    hours_played = round(random.uniform(1, 2000), 1)
    level = random.randint(1, 100)
    matches = random.randint(10, 5000)
    victories = random.randint(0, matches)
    victory_pct = round(victories / matches * 100, 2)
    kills = random.randint(0, matches * 20)
    deaths = random.randint(0, matches * 20)

    # --- Achievements -----------------------------------------------------
    unlocked_total = random.randint(0, len(ACHIEVEMENTS_POOL))
    unlocked = random.sample(ACHIEVEMENTS_POOL, unlocked_total)
    unlocked_at = {}
    for title in unlocked:
        unlocked_at[title] = stamp_while_active()

    # --- Heroes & skins ---------------------------------------------------
    # Own 1-5 heroes; for each one own anywhere from none to all of its skins.
    hero_total = random.randint(1, 5)
    roster = random.sample(HEROES, k=hero_total)
    owned_skins = []
    for hero in roster:
        catalogue = HERO_SKINS.get(hero, [])
        skin_total = random.randint(0, len(catalogue))
        owned_skins += random.sample(catalogue, k=skin_total)

    # --- Remaining account fields -----------------------------------------
    handle = fake.user_name()
    tier = random.choice(MEMBERSHIPS)
    ip_v4 = fake.ipv4()
    mac = fake.mac_address()

    return {
        "player_id": uid,
        "username": handle,
        "email": email_addr,
        "phone_number": phone_no,
        "gender": sex,
        "age": years_old,
        "birthday": dob.isoformat(),
        "region": home_state,
        "address": postal_address,
        "registration_time": registered.isoformat(),
        "last_login_time": last_seen.isoformat(),
        "membership_level": tier,
        "device_list": devices,
        "friend_list": friends,
        "ip_address": ip_v4,
        "mac_address": mac,
        "purchases": purchase_log,
        "wallet_balance": balance,
        "in_game_gold": gold,
        "in_game_diamonds": diamonds,
        "total_playtime_hours": hours_played,
        "game_level": level,
        "total_matches": matches,
        "wins": victories,
        "win_rate_percent": victory_pct,
        "total_kills": kills,
        "total_deaths": deaths,
        "achievements": unlocked_at,
        "heroes": roster,
        "skins": owned_skins,
    }

def generate_large_dataset(n=100000, save_path="players.jsonl"):
    """Generate `n` player records and write them to `save_path` as NDJSON.

    Each record comes from one call to ``generate_player()`` and is written as
    one JSON object per line, UTF-8 encoded. An existing file at ``save_path``
    is overwritten.

    Args:
        n: Number of records to generate; must not be negative.
        save_path: Destination file. Missing parent directories are created.

    Raises:
        ValueError: If ``n`` is negative.
    """
    import json
    import os

    # A negative count previously produced an empty file and a misleading
    # "Generated -5 records" message; reject it before touching the disk.
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")

    # Allow targets such as "data/players.jsonl" when "data/" does not exist yet.
    parent_dir = os.path.dirname(save_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(save_path, "w", encoding="utf-8") as f:
        for _ in tqdm(range(n), desc="Generating player records"):
            record = generate_player()
            f.write(json.dumps(record) + "\n")
    print(f"Generated {n} records in {save_path}")

# Entry point: build the full 100,000-record dataset when this code is executed
# directly. The recorded run below took roughly five minutes.
if __name__ == "__main__":
    generate_large_dataset(n=100_000)


Generating player records: 100%|██████████| 100000/100000 [04:58<00:00, 335.04it/s]

Generated 100000 records in players.jsonl





In [2]:
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
import os
# Hadoop identity for this session; replace "haus" with your own username.
os.environ["HADOOP_USER_NAME"] = "haus"

# Start (or reuse) the Spark session used by the rest of the notebook.
spark = (
    SparkSession.builder
    .appName("DataFrame Example")
    .getOrCreate()
)

# Load the newline-delimited JSON player records and preview the first rows.
# NOTE(review): the generator cell writes "players.jsonl" to the working
# directory, while this reads "data/players.jsonl" - presumably the file is
# moved or uploaded in between; confirm.
df = spark.read.json("data/players.jsonl")
df.show(n=5)

+--------------------+--------------------+---+----------+--------------------+--------------------+--------------------+----------+------+--------------------+----------------+------------+--------------+-------------------+-----------------+----------------+--------------------+--------------------+--------------------+--------------+-------------------+--------------------+------------+-----------+-------------+--------------------+-----------+--------------+----------------+----+
|        achievements|             address|age|  birthday|         device_list|               email|         friend_list|game_level|gender|              heroes|in_game_diamonds|in_game_gold|    ip_address|    last_login_time|      mac_address|membership_level|        phone_number|           player_id|           purchases|        region|  registration_time|               skins|total_deaths|total_kills|total_matches|total_playtime_hours|   username|wallet_balance|win_rate_percent|wins|
+--------------------+

In [11]:
# Row counts per distinct value (first 10 values in ascending order) for each
# column of interest.
for summary_col in ("age", "game_level"):
    df.groupBy(summary_col).count().orderBy(summary_col).show(10)

+---+-----+
|age|count|
+---+-----+
| 13| 2072|
| 14| 2172|
| 15| 2092|
| 16| 2059|
| 17| 2169|
| 18| 2071|
| 19| 2017|
| 20| 2108|
| 21| 2053|
| 22| 2091|
+---+-----+
only showing top 10 rows

+----------+-----+
|game_level|count|
+----------+-----+
|         1| 1015|
|         2|  944|
|         3|  988|
|         4|  997|
|         5|  985|
|         6|  996|
|         7|  963|
|         8| 1011|
|         9| 1024|
|        10|  992|
+----------+-----+
only showing top 10 rows



In [15]:
# Average, maximum and minimum total playtime (hours) for each age, ordered by age.
(
    df.groupBy("age")
    .agg(
        F.avg("total_playtime_hours").alias("avg_playtime"),
        F.max("total_playtime_hours").alias("max_playtime"),
        F.min("total_playtime_hours").alias("min_playtime"),
    )
    .orderBy("age")
    .show()
)

+---+------------------+------------+------------+
|age|      avg_playtime|max_playtime|min_playtime|
+---+------------------+------------+------------+
| 13| 985.4364864864866|      1998.5|         1.0|
| 14| 996.6730202578268|      1999.6|         2.4|
| 15| 1001.126481835564|      1999.3|         2.0|
| 16| 978.9996114618746|      1999.6|         1.1|
| 17|  997.634854771784|      1999.5|         1.6|
| 18|1004.9695316272336|      1999.1|         2.3|
| 19| 984.8029251363408|      1999.8|         1.8|
| 20| 989.4011859582541|      1999.8|         1.1|
| 21| 1020.902435460302|      1997.3|         4.3|
| 22|1008.4568627450975|      1999.8|         4.7|
| 23|1011.5020833333332|      1997.9|         1.3|
| 24| 992.8988697788697|      1999.5|         2.0|
| 25| 990.6013935607882|      2000.0|         2.4|
| 26|1004.8282608695653|      1999.6|         2.5|
| 27| 999.7208373435997|      1999.4|         1.3|
| 28| 980.7605633802821|      1998.4|         1.6|
| 29|  983.321331997997|      1

In [18]:
# Show five players whose purchase list is empty, without truncating the IDs.
(
    df.select("player_id", "purchases")
    .where(F.size(F.col("purchases")) == 0)
    .show(5, truncate=False)
)

+------------------------------------+---------+
|player_id                           |purchases|
+------------------------------------+---------+
|988c24c9-61b1-4d22-a280-1c4510435a10|[]       |
|ff574e2b-4991-4b9b-abc2-026faf34cf65|[]       |
|6a5e6920-bf5a-47e6-93a3-dd5a4b8c5bdc|[]       |
|5dae1201-673b-48bd-838c-1dec5da39a73|[]       |
|bd10f87c-6d0e-4597-900d-92a7ad409244|[]       |
+------------------------------------+---------+
only showing top 5 rows

