In [0]:
df=spark.table("online_gaming_behavior_dataset")
df
display(df)


PlayerID,Age,Gender,Location,GameGenre,PlayTimeHours,InGamePurchases,GameDifficulty,SessionsPerWeek,AvgSessionDurationMinutes,PlayerLevel,AchievementsUnlocked,EngagementLevel
9000,43,Male,Other,Strategy,16.271118760553215,0,Medium,6,108,79,25,Medium
9001,29,Female,USA,Strategy,5.525961380570566,0,Medium,5,144,11,10,Medium
9002,22,Female,USA,Sports,8.223755243499511,0,Easy,16,142,35,41,High
9003,35,Male,USA,Action,5.265351277318268,1,Easy,9,85,57,47,Medium
9004,33,Male,Europe,Action,15.53194452113429,0,Medium,2,131,95,37,Medium
9005,37,Male,Europe,RPG,20.561855414112557,0,Easy,2,81,74,22,Low
9006,25,Male,USA,Action,9.752716365932256,0,Hard,1,50,13,2,Low
9007,25,Female,Asia,RPG,4.401729344841462,0,Medium,10,48,27,23,Medium
9008,38,Female,Europe,Simulation,18.15273259575482,0,Easy,5,101,23,41,Medium
9009,38,Female,Other,Sports,23.942771725289525,0,Easy,13,95,99,36,High


Databricks visualization. Run in Databricks to view.

In [0]:
display(df.printSchema())



root
 |-- PlayerID: long (nullable = true)
 |-- Age: long (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- GameGenre: string (nullable = true)
 |-- PlayTimeHours: double (nullable = true)
 |-- InGamePurchases: long (nullable = true)
 |-- GameDifficulty: string (nullable = true)
 |-- SessionsPerWeek: long (nullable = true)
 |-- AvgSessionDurationMinutes: long (nullable = true)
 |-- PlayerLevel: long (nullable = true)
 |-- AchievementsUnlocked: long (nullable = true)
 |-- EngagementLevel: string (nullable = true)



In [0]:
from pyspark.sql.functions import col, sum
display(df.select([sum(col(c).isNull().cast("int")).alias(c) for c in df.columns]))


PlayerID,Age,Gender,Location,GameGenre,PlayTimeHours,InGamePurchases,GameDifficulty,SessionsPerWeek,AvgSessionDurationMinutes,PlayerLevel,AchievementsUnlocked,EngagementLevel
0,0,0,0,0,0,0,0,0,0,0,0,0


In [0]:
display(df.describe())


summary,PlayerID,Age,Gender,Location,GameGenre,PlayTimeHours,InGamePurchases,GameDifficulty,SessionsPerWeek,AvgSessionDurationMinutes,PlayerLevel,AchievementsUnlocked,EngagementLevel
count,40034.0,40034.0,40034,40034,40034,40034.0,40034.0,40034,40034.0,40034.0,40034.0,40034.0,40034
mean,29016.5,31.9925313483539,,,,12.024365373325834,0.2008542738672128,,9.471773992106709,94.79225158615178,49.65556776739771,24.52647749412999,
stddev,11556.964675034704,10.04322679163468,,,,6.914637905333832,0.4006442861498496,,5.763667125348518,49.0113745387013,28.58837914040544,14.430726177623368,
min,9000.0,15.0,Female,Asia,Action,0.0001146866199155383,0.0,Easy,0.0,10.0,1.0,0.0,High
max,49033.0,49.0,Male,USA,Strategy,23.999591633580454,1.0,Medium,19.0,179.0,99.0,49.0,Medium


In [0]:
display(df.select("location"))


location
Other
USA
USA
USA
Europe
Europe
USA
Asia
Europe
Other


In [0]:
from pyspark.sql import functions as F, types as T
from pyspark.sql.window import Window
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StandardScaler, StringIndexer
from pyspark.ml.clustering import KMeans
from pyspark.ml.classification import RandomForestClassifier
import mlflow
mlflow.autolog(log_models=True)




In [0]:
from pyspark.sql import functions as F

# 1. Read the Silver Layer data
df_silver = spark.table("silver_player_events")

# 2. Group by all identifying and static fields (PlayerID, Age, Gender, etc.).
#    This ensures these columns are included in the result automatically.
#    The only aggregation needed is the count of events.
features_sess = (
    df_silver
    .groupBy("PlayerID", "Age", "Gender", "Location", "GameGenre")
    .agg(
        # Counting all rows within the group gives the total event count per player
        F.count("*").alias("event_count") 
    )
)

# 3. Create the Gold Layer View/Table
features_sess.createOrReplaceTempView("gold_player_session_features")

# Display a preview to confirm the new table structure
display(spark.table("gold_player_session_features").limit(5))

PlayerID,Age,Gender,Location,GameGenre,event_count
9000,43,Male,Other,Strategy,1
9001,29,Female,USA,Strategy,1
9002,22,Female,USA,Sports,1
9003,35,Male,USA,Action,1
9004,33,Male,Europe,Action,1


1 WEAPON RECOMMENDATION


In [0]:
from pyspark.sql import functions as F

# 1. Read the Silver Layer data
df_silver = spark.table("silver_player_events")

# 2. Group by ALL static player attributes and calculate aggregates for behavioral metrics.
features_sess = (
    df_silver
    .groupBy("PlayerID", "Age", "Gender", "Location", "GameGenre", "GameDifficulty", "EngagementLevel")
    .agg(
        # General Metrics
        F.count("*").alias("total_event_count"),
        F.sum(F.col("InGamePurchases")).alias("total_purchases"),
        F.max(F.col("AchievementsUnlocked")).alias("max_achievements"),
        F.max(F.col("PlayerLevel")).alias("max_player_level"),
        
        # Play Time & Session Metrics
        F.sum(F.col("PlayTimeHours")).alias("total_play_time_hours"),
        F.avg(F.col("SessionsPerWeek")).alias("avg_sessions_per_week"),
        F.avg(F.col("AvgSessionDurationMinutes")).alias("avg_session_duration_minutes"),
        
        # Financial Metrics (if InGamePurchases represents value/currency)
        F.sum(F.col("InGamePurchases")).alias("total_spending_value")
    )
    # Add a simple calculated metric for potential analysis
    .withColumn("SpendingPerPlayHour", 
                F.col("total_spending_value") / F.col("total_play_time_hours"))
    .fillna(0, subset=["SpendingPerPlayHour"]) # Handle division by zero for players with 0 hours
)

# 3. Create the Gold Layer View/Table
features_sess.createOrReplaceTempView("gold_player_session_features_enhanced")

# Display a preview
print("Created 'gold_player_session_features_enhanced' with aggregated metrics.")
display(spark.table("gold_player_session_features_enhanced").limit(5))

Created 'gold_player_session_features_enhanced' with aggregated metrics.


PlayerID,Age,Gender,Location,GameGenre,GameDifficulty,EngagementLevel,total_event_count,total_purchases,max_achievements,max_player_level,total_play_time_hours,avg_sessions_per_week,avg_session_duration_minutes,total_spending_value,SpendingPerPlayHour
9000,43,Male,Other,Strategy,Medium,Medium,1,0,25,79,16.271118760553215,6.0,108.0,0,0.0
9001,29,Female,USA,Strategy,Medium,Medium,1,0,10,11,5.525961380570566,5.0,144.0,0,0.0
9002,22,Female,USA,Sports,Easy,High,1,0,41,35,8.223755243499511,16.0,142.0,0,0.0
9003,35,Male,USA,Action,Easy,Medium,1,1,47,57,5.265351277318268,9.0,85.0,1,0.1899208518732138
9004,33,Male,Europe,Action,Medium,Medium,1,0,37,95,15.53194452113429,2.0,131.0,0,0.0


In [0]:
from pyspark.sql import functions as F

df_gold = spark.table("workspace.default.gold_player_metrics")

weapon_reco = (
    df_gold.withColumn(
        "Weapon_Recommendation",
        F.when(
            (F.col("event_count") > 100) & (F.col("max_player_level") >= 50),
            "Advanced Sniper Rifle"
        )
        .when(
            (F.col("max_player_level") >= 30),
            "Silenced Pistol"
        )
        .when(
            (F.col("total_play_time_hours") > 200),
            "Grenade Launcher"
        )
        .otherwise("Starter Pack")
    )
)

display(
    weapon_reco.select(
        "PlayerID",
        "max_player_level",
        "event_count",
        "total_play_time_hours",
        "Weapon_Recommendation"
    ).limit(10)
)

PlayerID,max_player_level,event_count,total_play_time_hours,Weapon_Recommendation
9976,44,1,15.747175887506769,Silenced Pistol
10480,47,1,6.659295203449065,Silenced Pistol
12598,16,1,21.66931412295419,Starter Pack
13681,26,1,4.332865992112929,Starter Pack
14671,93,1,15.090670815165488,Silenced Pistol
15835,44,1,22.29806383384735,Silenced Pistol
15977,84,1,9.812176193392553,Silenced Pistol
18675,84,1,23.116015589654744,Silenced Pistol
19460,90,1,19.739164895276907,Silenced Pistol
19903,72,1,12.223017831209642,Silenced Pistol


2. Increasing the engaement level

In [0]:
from pyspark.sql import functions as F

# Calculate the average PlayTimeHours for a dynamic benchmark
avg_playtime = df.agg(F.avg("PlayTimeHours")).collect()[0][0]

reco_by_preference = (
    df
    # Create a feature to flag high-engagement, high-spending players
    .withColumn("is_high_spender", F.col("InGamePurchases") > 0)
    .withColumn(
        "AI_Recommendation",
        F.when(
            (F.col("EngagementLevel") == "High") & F.col("is_high_spender"),
            F.lit("Elite Spender Cosmetic Pack")
        )
        .when(
            (F.col("PlayTimeHours") > avg_playtime) & (F.col("GameGenre") == "Action"),
            F.lit("Advanced Sniper Rifle Blueprint")
        )
        .when(F.col("PlayerLevel") < 20, F.lit("Tutorial Completion Quest"))
        .otherwise(F.lit("Daily Login Reward Crate"))
    )
    .select(
        F.col("PlayerID"),
        F.col("GameGenre"),
        F.col("EngagementLevel"),
        F.col("AI_Recommendation")
    )
)

display(reco_by_preference)

PlayerID,GameGenre,EngagementLevel,AI_Recommendation
9000,Strategy,Medium,Daily Login Reward Crate
9001,Strategy,Medium,Tutorial Completion Quest
9002,Sports,High,Daily Login Reward Crate
9003,Action,Medium,Daily Login Reward Crate
9004,Action,Medium,Advanced Sniper Rifle Blueprint
9005,RPG,Low,Daily Login Reward Crate
9006,Action,Low,Tutorial Completion Quest
9007,RPG,Medium,Daily Login Reward Crate
9008,Simulation,Medium,Daily Login Reward Crate
9009,Sports,High,Daily Login Reward Crate


3. Adjustment of Game difficulty

In [0]:
from pyspark.sql import functions as F

# 1. Load the Gold Layer table containing per-player aggregated metrics
df_gold = spark.table("gold_player_metrics")

# Calculate the average PlayTimeHours for a dynamic benchmark
# NOTE: Using the aggregated column name 'total_play_time_hours'
avg_playtime = df_gold.agg(F.avg("total_play_time_hours")).collect()[0][0]

df_reco_by_preference = (
    df_gold
    # Create a feature to flag high-engagement, high-spending players
    # NOTE: Using the aggregated column name 'total_in_game_purchases'
    .withColumn("is_high_spender", F.col("total_in_game_purchases") > 0) 
    .withColumn(
        "AI_Recommendation",
        F.when(
            # 1. High Engagement & High Spender
            (F.col("EngagementLevel") == "High") & F.col("is_high_spender"),
            F.lit("Elite Spender Cosmetic Pack")
        )
        .when(
            # 2. High Play Time in Action Genre
            (F.col("total_play_time_hours") > avg_playtime) & (F.col("GameGenre") == "Action"),
            F.lit("Advanced Sniper Rifle Blueprint")
        )
        .when(
            # 3. Low Level Player
            # NOTE: Using the aggregated column name 'max_player_level'
            F.col("max_player_level") < 20, 
            F.lit("Tutorial Completion Quest")
        )
        .otherwise(F.lit("Daily Login Reward Crate"))
    )
    .select(
        F.col("PlayerID"),
        F.col("GameGenre"),
        F.col("EngagementLevel"),
        F.col("AI_Recommendation")
    )
)
display(reco_by_preference)



PlayerID,GameGenre,EngagementLevel,AI_Recommendation
9000,Strategy,Medium,Daily Login Reward Crate
9001,Strategy,Medium,Tutorial Completion Quest
9002,Sports,High,Daily Login Reward Crate
9003,Action,Medium,Daily Login Reward Crate
9004,Action,Medium,Advanced Sniper Rifle Blueprint
9005,RPG,Low,Daily Login Reward Crate
9006,Action,Low,Tutorial Completion Quest
9007,RPG,Medium,Daily Login Reward Crate
9008,Simulation,Medium,Daily Login Reward Crate
9009,Sports,High,Daily Login Reward Crate


{"ts": "2025-11-11 17:45:34.731", "level": "ERROR", "logger": "pyspark.sql.connect.logging", "msg": "GRPC Error received", "context": {}, "exception": {"class": "_InactiveRpcError", "msg": "<_InactiveRpcError of RPC that terminated with:\n\tstatus = StatusCode.INTERNAL\n\tdetails = \"[UNRESOLVED_COLUMN.WITH_SUGGESTION] A column, variable, or function parameter with name `EngagementLevel` cannot be resolved. Did you mean one of the following? [`is_high_spender`, `workspace`.`default`.`gold_player_metrics`.`Age`, `workspace`.`default`.`gold_player_metrics`.`Gender`, `workspace`.`default`.`gold_player_metrics`.`PlayerID`, `workspace`.`default`.`gold_player_metrics`.`Location`]. SQLSTATE: 42703;\n'Project ['PlayerID, 'GameGenre, 'EngagementLevel, 'AI_Recommendation]\n+- 'Project [PlayerID#16040, Age#16041, Gender#16042, Location#16043, event_count#16044L, total_play_time_hours#16045, avg_sessions_per_week#16046, avg_session_duration_minutes#16047, max_player_level#16048, total_achievements

4. stuck players and recomendation


In [0]:
from pyspark.sql import functions as F

contextual_offers = (
    df
    .withColumn(
        "Contextual_Offer",
        # Trigger 1: High-Level Player (potential high-value goal) who hasn't spent money (High Conversion Opportunity)
        F.when(
            (F.col("PlayerLevel") > 90) & (F.col("InGamePurchases") == 0),
            F.lit("Special 50% Off Top-Tier Gear Offer")
        )
        # Trigger 2: Players struggling (simulated by low achievements relative to their level)
        .when(
            (F.col("PlayerLevel") > 50) & (F.col("AchievementsUnlocked") < 10),
            F.lit("3-Day Double XP Booster for $1.99")
        )
        .otherwise(F.lit("No Offer Triggered"))
    )
    .select(
        F.col("PlayerID"),
        F.col("PlayerLevel"),
        F.col("InGamePurchases"),
        F.col("Contextual_Offer")
    )
)

display(contextual_offers.limit(10))

PlayerID,PlayerLevel,InGamePurchases,Contextual_Offer
9000,79,0,No Offer Triggered
9001,11,0,No Offer Triggered
9002,35,0,No Offer Triggered
9003,57,1,No Offer Triggered
9004,95,0,Special 50% Off Top-Tier Gear Offer
9005,74,0,No Offer Triggered
9006,13,0,No Offer Triggered
9007,27,0,No Offer Triggered
9008,23,0,No Offer Triggered
9009,99,0,Special 50% Off Top-Tier Gear Offer


5. Churn risk level


In [0]:
from pyspark.sql import functions as F


churn_risk_prediction = (
    df
    .withColumn(
        "Churn_Risk_Level",
        # High Risk: Low sessions, short duration, low engagement
        F.when(
            (F.col("SessionsPerWeek") < 4) &
            (F.col("AvgSessionDurationMinutes") < 60) &
            (F.col("EngagementLevel") == "Low"),
            F.lit("HIGH_RISK_CHURN")
        )
        # Medium Risk: Moderate sessions, but total playtime is low
        .when(
            (F.col("SessionsPerWeek").between(4, 7)) &
            (F.col("PlayTimeHours") < avg_playtime),
            F.lit("MEDIUM_RISK_MONITOR")
        )
        .otherwise(F.lit("LOW_RISK"))
    )
    .withColumn(
        "Retention_Action",
        F.when(F.col("Churn_Risk_Level") == "HIGH_RISK_CHURN", F.lit("Proactive Free Premium Gift"))
        .when(F.col("Churn_Risk_Level") == "MEDIUM_RISK_MONITOR", F.lit("Targeted Engagement Survey"))
        .otherwise(F.lit("Standard Check-in"))
    )
    .select(
        F.col("PlayerID"),
        F.col("SessionsPerWeek"),
        F.col("EngagementLevel"),
        F.col("Churn_Risk_Level"),
        F.col("Retention_Action")
    )
)

display(churn_risk_prediction.limit(10))

PlayerID,SessionsPerWeek,EngagementLevel,Churn_Risk_Level,Retention_Action
9000,6,Medium,LOW_RISK,Standard Check-in
9001,5,Medium,MEDIUM_RISK_MONITOR,Targeted Engagement Survey
9002,16,High,LOW_RISK,Standard Check-in
9003,9,Medium,LOW_RISK,Standard Check-in
9004,2,Medium,LOW_RISK,Standard Check-in
9005,2,Low,LOW_RISK,Standard Check-in
9006,1,Low,HIGH_RISK_CHURN,Proactive Free Premium Gift
9007,10,Medium,LOW_RISK,Standard Check-in
9008,5,Medium,LOW_RISK,Standard Check-in
9009,13,High,LOW_RISK,Standard Check-in
