In [1]:
from faker import Faker
import random
from datetime import datetime
from tqdm import tqdm

# Single seed shared by every random source used in this notebook.
SEED = 42

fake = Faker()
# Faker keeps its own random generator, so seeding it does NOT affect the
# standard-library `random` module. The generator code below draws from both,
# so both must be seeded for the dataset to be reproducible across runs.
Faker.seed(SEED)
random.seed(SEED)

# ---------------------------------------------------------------------------
# Static lookup tables used by the player generator
# ---------------------------------------------------------------------------

# Title of the game (not referenced by the generator code visible in this cell).
GAME_NAME = "Mythic Quest"

# Values sampled for the "membership_level" field.
MEMBERSHIPS = [
    "Free",
    "Silver",
    "Gold",
    "Platinum",
]

# Platforms sampled for the "device_list" field.
DEVICES = [
    "PC",
    "Xbox",
    "PlayStation",
    "Mobile",
    "Switch",
]

# Every achievement a player may have unlocked.
ACHIEVEMENTS_POOL = [
    "First Blood",
    "Monster Slayer",
    "Treasure Hunter",
    "Dungeon Master",
    "Speed Runner",
    "Unstoppable",
    "Champion of the Arena",
    "Lore Keeper",
]

# Hero name -> list of skins available for that hero (counts vary per hero).
HERO_SKINS = {
    "Artemis": [
        "Forest Huntress",
        "Shadow Stalker",
        "Celestial Archer",
    ],
    "Zephyr": [
        "Stormcaller",
        "Zephyr Prime",
    ],
    "Titan": [
        "Colossus",
        "Stoneheart",
        "Ironclad",
        "Earthshaker",
    ],
    "Morgana": [
        "Nightshade",
        "Hex Weaver",
        "Soul Binder",
    ],
    "Aurora": [
        "Frost Maiden",
        "Solar Flare",
        "Polar Queen",
    ],
    "Vulcan": [
        "Forge Master",
        "Lava Sentinel",
    ],
    "Nyx": [
        "Midnight Whisper",
        "Dreambreaker",
        "Shadowveil",
    ],
    "Ragnar": [
        "Berserker",
        "Warlord",
        "Frostborn",
        "Thunderlord",
        "Ironforge",
    ],
    "Selene": [
        "Moonlight Dancer",
        "Lunar Priestess",
    ],
    "Orion": [
        "Star Hunter",
        "Galactic Ranger",
        "Void Stalker",
    ],
    "Echo": [
        "Soundwave",
        "Resonance",
        "Pulse Knight",
    ],
    "Fenrir": [
        "Bloodmoon",
        "Spirit Howl",
        "Feral King",
        "Glacial Fang",
    ],
    "Astra": [
        "Cosmic Empress",
        "Nebula Sorceress",
        "Void Empress",
    ],
    "Draco": [
        "Sky Terror",
        "Dragonheart",
        "Inferno Wing",
        "Stormscale",
        "Obsidian Drake",
    ],
}

# Playable heroes, in the same order as the HERO_SKINS keys (dicts preserve
# insertion order), so the roster and the skin table cannot drift apart.
HEROES = list(HERO_SKINS)

# Currency codes sampled for each purchase.
CURRENCIES = [
    "USD",
    "EUR",
    "MYR",
    "JPY",
    "INR",
]

def generate_player():
    """Assemble one synthetic player profile for the game dataset.

    Values are drawn from the module-level ``fake`` Faker instance and from
    the global ``random`` module, so the result differs between calls unless
    both generators have been seeded.

    Returns:
        dict: A flat record of plain Python values. Date and time fields are
        stored as ISO-format strings, ``purchases`` is a list of dicts,
        ``achievements`` maps each unlocked achievement name to its unlock
        timestamp, and ``skins`` is a single flat list covering all picked
        heroes.
    """
    uid = fake.uuid4()

    # --- Personal details ---------------------------------------------------
    years_old = random.randint(13, 60)
    # Pin min and max age to the same value so the birthday matches `age`.
    born_on = fake.date_of_birth(minimum_age=years_old, maximum_age=years_old)
    email_addr = fake.email()
    phone_no = fake.phone_number()
    gender_label = random.choice(["Male", "Female", "Other"])
    state_name = fake.state()
    # Faker addresses are multi-line; flatten them onto one line.
    one_line_address = ", ".join(fake.address().split("\n"))

    # --- Account timeline: registered 1-2 years ago, last login since then ---
    registered_at = fake.date_time_between(start_date="-2y", end_date="-1y")
    last_seen_at = fake.date_time_between(start_date=registered_at, end_date="now")

    # --- Social graph and devices ---------------------------------------------
    friend_count = random.randint(0, 20)
    friends = []
    for _ in range(friend_count):
        friends.append(fake.uuid4())
    device_count = random.randint(1, 3)
    devices = random.sample(DEVICES, k=device_count)

    # --- Purchases, each dated within the player's active window ---------------
    purchase_count = random.randint(0, 8)
    purchase_log = []
    for _ in range(purchase_count):
        item_id = fake.uuid4()
        item_name = fake.word().capitalize()
        price = round(random.uniform(1, 300), 2)
        currency_code = random.choice(CURRENCIES)
        bought_at = fake.date_time_between(start_date=registered_at, end_date=last_seen_at)
        purchase_log.append({
            "item_id": item_id,
            "item_name": item_name,
            "amount": price,
            "currency": currency_code,
            "purchased_at": bought_at.isoformat(),
        })

    # Wallet balance is the total spend scaled by a random factor in [0.1, 2.0].
    total_spent = sum(entry["amount"] for entry in purchase_log)
    balance = round(total_spent * random.uniform(0.1, 2.0), 2)
    gold = random.randint(0, 100000)
    diamonds = random.randint(0, 5000)

    # --- Gameplay statistics ----------------------------------------------------
    hours_played = round(random.uniform(1, 2000), 1)
    level = random.randint(1, 100)
    matches_played = random.randint(10, 5000)  # lower bound 10 keeps the division safe
    matches_won = random.randint(0, matches_played)
    win_pct = round(matches_won / matches_played * 100, 2)
    kills = random.randint(0, matches_played * 20)
    deaths = random.randint(0, matches_played * 20)

    # --- Achievements: a random subset, each with an unlock timestamp -----------
    unlocked_count = random.randint(0, len(ACHIEVEMENTS_POOL))
    unlocked = random.sample(ACHIEVEMENTS_POOL, unlocked_count)
    unlocked_at = {}
    for title in unlocked:
        moment = fake.date_time_between(start_date=registered_at, end_date=last_seen_at)
        unlocked_at[title] = moment.isoformat()

    # --- Heroes and skins: 1-5 heroes, then 0..all skins for each ----------------
    hero_count = random.randint(1, 5)
    roster = random.sample(HEROES, k=hero_count)
    owned_skins = []
    for hero in roster:
        skin_pool = HERO_SKINS.get(hero, [])
        how_many = random.randint(0, len(skin_pool))
        owned_skins += random.sample(skin_pool, k=how_many)

    # Remaining fields, drawn in the order they appear in the record.
    handle = fake.user_name()
    tier = random.choice(MEMBERSHIPS)
    ip_addr = fake.ipv4()
    mac_addr = fake.mac_address()

    profile = {
        "player_id": uid,
        "username": handle,
        "email": email_addr,
        "phone_number": phone_no,
        "gender": gender_label,
        "age": years_old,
        "birthday": born_on.isoformat(),
        "region": state_name,
        "address": one_line_address,
        "registration_time": registered_at.isoformat(),
        "last_login_time": last_seen_at.isoformat(),
        "membership_level": tier,
        "device_list": devices,
        "friend_list": friends,
        "ip_address": ip_addr,
        "mac_address": mac_addr,
        "purchases": purchase_log,
        "wallet_balance": balance,
        "in_game_gold": gold,
        "in_game_diamonds": diamonds,
        "total_playtime_hours": hours_played,
        "game_level": level,
        "total_matches": matches_played,
        "wins": matches_won,
        "win_rate_percent": win_pct,
        "total_kills": kills,
        "total_deaths": deaths,
        "achievements": unlocked_at,
        "heroes": roster,
        "skins": owned_skins,
    }
    return profile

def generate_large_dataset(n=100000, save_path="players.jsonl"):
    """Write `n` synthetic player records to `save_path` as NDJSON.

    Each record comes from ``generate_player()`` and is serialised as one JSON
    object per line. Any existing file at `save_path` is overwritten. A tqdm
    progress bar tracks generation and a summary line is printed at the end.

    Args:
        n: Number of player records to generate.
        save_path: Destination file path (opened in UTF-8 text mode).
    """
    import json

    with open(save_path, "w", encoding="utf-8") as sink:
        ticks = tqdm(range(n), desc="Generating player records")
        # Stream records straight to disk so the full dataset never sits in memory.
        sink.writelines(json.dumps(generate_player()) + "\n" for _ in ticks)
    print(f"Generated {n} records in {save_path}")

if __name__ == "__main__":
    # Entry point: build the 100k-record dataset at the default output path.
    generate_large_dataset(n=100_000)


Generating player records: 100%|██████████| 100000/100000 [04:58<00:00, 335.04it/s]

Generated 100000 records in players.jsonl





In [4]:
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
# Start a Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("DataFrame Example")
    .getOrCreate()
)
# Load the line-delimited JSON player file (no explicit schema is supplied)
# and preview the first five rows.
df = spark.read.json("players.jsonl")
df.show(n=5)

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "d:\brainstack\.venv\lib\site-packages\py4j\java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "d:\brainstack\.venv\lib\site-packages\py4j\clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "C:\Users\oscar\AppData\Local\Programs\Python\Python310\lib\socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 