Initialization

In [1]:
import numpy as np
import pandas as pd

import plotly.express as px
from plotly.subplots import make_subplots

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.inspection import permutation_importance
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

Data Cleaning

In [2]:
#load dataset
df = pd.read_csv('C:/Users/josep/OneDrive/Desktop/small python project/online-game-behaviour/online_gaming_behavior_insights.csv')

#simple check on column data types and distribution
#df.info()
#df.describe()

#check for null
df.isna().sum()         #no nulls
df[df.isna().any(axis=1)]
df.duplicated().sum()   #no duplicates
df[df.duplicated()] 

#if there is duplicate
df.drop_duplicates(subset=['PlayerID'], keep=False)

#ensure every values in column has no extra whitespaces
for col in df.select_dtypes(include='object').columns:
    df[col] = df[col].str.strip()
df.info()

#rounding playtimehours to 2 decimal places
df['PlayTimeHours'] = df['PlayTimeHours'].apply(lambda x: round(x,2))

#changing in-game purchases to 'Yes' and 'No'
df['InGamePurchases'] = df['InGamePurchases'].apply(lambda x: 'Yes' if x == 1 else 'No')
df


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40034 entries, 0 to 40033
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   PlayerID                   40034 non-null  int64  
 1   Age                        40034 non-null  int64  
 2   Gender                     40034 non-null  object 
 3   Location                   40034 non-null  object 
 4   GameGenre                  40034 non-null  object 
 5   PlayTimeHours              40034 non-null  float64
 6   InGamePurchases            40034 non-null  int64  
 7   GameDifficulty             40034 non-null  object 
 8   SessionsPerWeek            40034 non-null  int64  
 9   AvgSessionDurationMinutes  40034 non-null  int64  
 10  PlayerLevel                40034 non-null  int64  
 11  AchievementsUnlocked       40034 non-null  int64  
 12  EngagementLevel            40034 non-null  object 
dtypes: float64(1), int64(7), object(5)
memory usag

Unnamed: 0,PlayerID,Age,Gender,Location,GameGenre,PlayTimeHours,InGamePurchases,GameDifficulty,SessionsPerWeek,AvgSessionDurationMinutes,PlayerLevel,AchievementsUnlocked,EngagementLevel
0,9000,43,Male,Other,Strategy,16.27,No,Medium,6,108,79,25,Medium
1,9001,29,Female,USA,Strategy,5.53,No,Medium,5,144,11,10,Medium
2,9002,22,Female,USA,Sports,8.22,No,Easy,16,142,35,41,High
3,9003,35,Male,USA,Action,5.27,Yes,Easy,9,85,57,47,Medium
4,9004,33,Male,Europe,Action,15.53,No,Medium,2,131,95,37,Medium
...,...,...,...,...,...,...,...,...,...,...,...,...,...
40029,49029,32,Male,USA,Strategy,20.62,No,Easy,4,75,85,14,Medium
40030,49030,44,Female,Other,Simulation,13.54,No,Hard,19,114,71,27,High
40031,49031,15,Female,USA,RPG,0.24,Yes,Easy,10,176,29,1,High
40032,49032,34,Male,USA,Sports,14.02,Yes,Medium,3,128,70,10,Medium


EDA for Features Impacting Engagement

In [3]:
#finding factors that may influence engagement level
#gender and age distribution for each engagement level
def get_age_group(age):

    if age <= 20 :
        return 'Young'
    elif age > 20 and age <= 35 :
        return 'Young Adult'
    elif age > 35 and age <= 45 :
        return 'Middle Age'
    else :
        return 'Old'
    
def sex_age_EDA(df):

    temp_df = pd.DataFrame()
    temp_df[['player_id','age','gender','engagement_level']] = df[['PlayerID','Age','Gender','EngagementLevel']]
    temp_df['age_group'] = temp_df['age'].apply(get_age_group)
    temp_df = (temp_df.groupby(['age_group','gender','engagement_level'])['player_id'].count()
                .reset_index(name='count'))
    
    #facet bar charts to show both gender player counts based on engagment levels in all age groups
    fig = px.bar(temp_df,x='age_group',y='count',color='engagement_level',facet_col='gender', category_orders={'age_group':['Young','Young Adult','Middle Age','Old']},
                 labels={'count':'Number of Players', 'age_group':'Age Group', 'engagement_level':'Engagement Level'},
                 title='Engagement Level by Age Group and Gender')
    fig.for_each_annotation(lambda a: a.update(text=a.text.replace("gender=", "")))
    fig.update_layout(bargap=0.2)

    fig.show()

    #heatmap to show relationship between age and gender on engagement level
    for gender in temp_df['gender'].unique():

        df_gender = temp_df[temp_df['gender']==gender]
        pivot = df_gender.pivot(index='age_group',columns='engagement_level',values='count')
        pivot_pct = pivot.div(pivot.sum(axis=1), axis=0) * 100

        fig2 = px.imshow(pivot_pct, text_auto='.1f', 
                        labels={'x':'Engagement Level', 'y':'Age Group', 'color':'Number of Players'},
                        color_continuous_scale='portland', aspect='auto')
        fig2.update_layout(title=f'Engagement Heatmap for {gender} Players', xaxis_title='Engagement Level',
                           yaxis_title='Age Group')
        fig2.show()

#from the graphs, most of the players are 'Young Adult' from age 21 to 35 in both female and male
#in both female and male, the engagement level are mostly 'Medium' comprising of nearly 50% 
#in both female and male, the engagement level 'High' can be be found more in 'Young' and 'Old' age groups
sex_age_EDA(df)

In [5]:
#game time, avg hours per session, session per week in game genres
#game time. hours per sesion and session per week are factors that can influence engagement level directly
def get_avg_hours(avg_mins):
    avg_hrs = avg_mins/60
    return round(avg_hrs,1)

def genres_EDA(df):
    temp_df = pd.DataFrame()
    temp_df[['player_id','play_time','avg_mins','no_session_week','game_diff','genre','engagement_level']] = (df[['PlayerID',
            'PlayTimeHours','AvgSessionDurationMinutes','SessionsPerWeek','GameDifficulty','GameGenre','EngagementLevel']])
    temp_df['avg_hrs'] = temp_df['avg_mins'].apply(get_avg_hours)
    
    #average play time in each genre
    playtime_df = (temp_df.groupby(['genre','engagement_level'])['play_time'].mean().
                   reset_index(name='avg_playtime').sort_values(by='avg_playtime',ascending=False))
    #playtime_df = temp_df.groupby(['genre','game_diff','engagement_level'])['play_time'].mean().reset_index(name='avg_playtime') 
    print(playtime_df)      #similar total playtime hours 11-12 
                            #game difficulties did not influence the average total playtime 11 - 12 hours

    #average hours per session in each genre
    hrs_df = (temp_df.groupby(['genre','engagement_level'])['avg_hrs'].mean().
              reset_index(name='avg_hours_per_session').sort_values(by='avg_hours_per_session',ascending=False))
    print(hrs_df)           #avg_hours_per_session shows a bigger influence with engagement level
                            #2 hours per session are 'high' engagement and values close to 1 are 'low' level
                            #difference across genres are minimal

    #avg number of session per week in each genre
    week_df = (temp_df.groupby(['genre','engagement_level'])['no_session_week'].mean().
              reset_index(name='avg_session_week').sort_values(by='avg_session_week',ascending=False))
    print(week_df)          #avg number of session per week shows a big influence with engagement level 
                            #avg sessions of 14 times are 'high', avg 9 times are 'medium' and avg 4 times are 'low'
                            #difference across genres are minimal

    #grouped bar chart for average play time in each genre
    fig1 = px.bar(playtime_df, x='genre', y='avg_playtime', color='engagement_level', 
                  barmode='group', category_orders={'engagement_level':['Low','Medium','High']},
                  labels={'avg_playtime':'Average Total Playtime','genre':'Game Genre','engagement_level':'Engagement Level'},
                  title='Average Total Playtime per Genre by Engagement Level')
    fig1.show()

    #faceted bar chart for average hours per session in each genre
    colors = {'Low':'#962B2B','Medium':"#928F2C", 'High':"#389B3D"}
    fig2 = px.bar(hrs_df, x='genre', y='avg_hours_per_session', color='engagement_level', facet_col='engagement_level',
                  text='avg_hours_per_session', color_discrete_map=colors,
                  category_orders={'engagement_level':['Low','Medium','High']},
                  labels={'avg_hours_per_session':'Avg Hours per Session','genre':'Game Genre','engagement_level':'Engagement Level'},
                  title='Average Hours per Session per Genre by Engagement Level')
    fig2.update_traces(texttemplate='%{text:.3f}',textfont_color='white',textfont_weight='bold',textfont_size=14)
    fig2.for_each_annotation(lambda a: a.update(text=a.text.replace('Engagement Level=','')))
    fig2.show()

    #grouped bar chart for average number of sessiosn per week
    fig3 = px.bar(week_df, x='genre', y='avg_session_week', color='engagement_level', 
                  barmode='group', category_orders={'engagement_level':['Low','Medium','High']},
                  labels={'avg_session_week':'Avg Sessions', 'genre':'Game Genre', 'engagement_level':'Engagement Level'},
                  title='Average Number of Sessions per Week per Genre by Engagement Level')
    fig3.show()

    #heatmap to show relationship between genre on engagement level
    count_df = temp_df.groupby(['genre','engagement_level'])['player_id'].count().reset_index(name='count')
    pivot = count_df.pivot_table(index='genre', columns='engagement_level', values='count')
    pivot_pct = pivot.div(pivot.sum(axis=1), axis=0) * 100

    fig4 = px.imshow(pivot_pct, text_auto='.1f', color_continuous_scale='peach', aspect='auto',
                     labels={'x':'Engagement Level', 'y':'Genre', 'color':'Number of Players'},
                     title='Number of Players by Genre and Engagement Level')

    fig4.update_layout(xaxis_title='Engagement Level', yaxis_title='Game Genre')
    fig4.show()

genres_EDA(df)

         genre engagement_level  avg_playtime
1       Action              Low     12.351360
12    Strategy             High     12.311297
9       Sports             High     12.255819
0       Action             High     12.170228
4          RPG              Low     12.169771
10      Sports              Low     12.113505
14    Strategy           Medium     12.085991
2       Action           Medium     12.063624
5          RPG           Medium     12.051621
7   Simulation              Low     12.048640
8   Simulation           Medium     11.845677
6   Simulation             High     11.843161
13    Strategy              Low     11.833913
3          RPG             High     11.752988
11      Sports           Medium     11.737163
         genre engagement_level  avg_hours_per_session
12    Strategy             High               2.210253
6   Simulation             High               2.208175
9       Sports             High               2.202112
0       Action             High             

In [7]:
#geographical and gender influence on in-game purchases 
#it is important to identify which areas attract male or female purchase patterns in order to plan proper marketing strategies
#first, identify how much in-game purchases influence engagement level
#then, identify how much location and gender influence on engagement level
#finally, identify areas that have more number of purchases for male/female

def purchases_EDA(df):
    temp_df = pd.DataFrame()
    temp_df[['player_id','gender','location','in_game_purchase','engagement_level']] = df[['PlayerID','Gender','Location','InGamePurchases','EngagementLevel']]
    purchase_df = temp_df.groupby(['in_game_purchase','engagement_level'])['player_id'].count().reset_index(name='count')
    #print(purchase_df)          #the difference between 'high' and 'low' engagement does not show a big difference from in_game_purchase 

    loc_sex_df = temp_df.groupby(['location','gender','engagement_level'])['player_id'].count().reset_index(name='count')
    #print(loc_sex_df)           #the difference between locations for each gender also does not show much difference on 'high' and 'low' engagement

    locSex_buy_df = temp_df.groupby(['location','gender'])['in_game_purchase'].count().reset_index(name='total_count')
    yes_df = temp_df[temp_df['in_game_purchase']=='Yes'].groupby(['location','gender']).size().reset_index(name='yes_count')
    combined_df = locSex_buy_df.merge(yes_df,on=['location','gender'],how='left')
    combined_df['percentage'] = round(combined_df['yes_count'] / combined_df['total_count'] * 100.0, 2)
    print(combined_df)          #the distribution does not show a big disparity but some locations show 'Male' with more purchase percentage, 
                                #while some 'Female' with majority
    
    #stacked bar chart to show in-game purchase in relation to engagement level
    colors = {'Low':"#C52B2B",'Medium':"#C1BD40", 'High':"#51BD56"}
    fig1 = px.bar(purchase_df, x='in_game_purchase', y='count', color='engagement_level', text='count',
                  category_orders={'engagement_level':['Low','Medium','High']},
                  labels={'in_game_purchase':'In-Game Purchase','count':'Number of Players','engagement_level':'Engagement Level'}, 
                  color_discrete_map=colors, title='Engagement Level by In-Game Purchase')
    fig1.update_traces(textfont_color='white',textfont_weight='bold',textfont_size=12,textposition='inside')
    fig1.update_layout(bargap=0.2)
    fig1.show()

    #grouped facet bar chart to show location and gender influence on engagement level
    fig2 = px.bar(loc_sex_df, x='location', y='count', color='engagement_level', facet_col='gender', barmode='group',
                  labels={'gender':'Gender','location':'Region','count':'Number of Players','engagement_level':'Engagement Level'}, 
                  category_orders={'engagement_level':['Low','Medium','High'],'location':['USA','Europe','Asia','Other']},
                  title='Engagement Level by Location and Gender')
    fig2.for_each_annotation(lambda a: a.update(text=a.text.replace('Gender=','')))
    fig2.show()

    #heatmap to show in-game purchases by location and gender
    pivot = combined_df.pivot(index='location', columns='gender', values='percentage')
    fig3 = px.imshow(pivot, text_auto='.2f', aspect='auto', color_continuous_scale='YlOrRd', 
                     labels={'location':'Region','gender':'Gender','percentage':'Percentage of Players'},
                     title='In-Game Purchases by Location and Gender')
    fig3.update_layout(yaxis_title='Location',xaxis_title='Gender')
    fig3.show()                 #highest 'male' in-game purchase count in 'Asia', highest 'female' in-game purchase count in 'Europe'
                                #lowest for both genders in 'Other'

purchases_EDA(df)

  location  gender  total_count  yes_count  percentage
0     Asia  Female         3251        637       19.59
1     Asia    Male         4844       1023       21.12
2   Europe  Female         4762        990       20.79
3   Europe    Male         7242       1453       20.06
4    Other  Female         1580        304       19.24
5    Other    Male         2355        456       19.36
6      USA  Female         6482       1249       19.27
7      USA    Male         9518       1929       20.27


Machine Learning (Random Forest Model - Engagement Level Prediction)

In [8]:
#although some of the significant and insignificant factors influencing engagement level were studied
#it is important to verify it through feature importance during training

#get all features that influence engagement level into a list
features = ['Age','Gender','Location','GameGenre','PlayTimeHours','InGamePurchases','GameDifficulty',
            'SessionsPerWeek','AvgSessionDurationMinutes','PlayerLevel','AchievementsUnlocked']

#get x-(predictors) and y-(outcome)
raw_X = df[features]
y = df['EngagementLevel']

#convert all features and engagement level into numerical values 
X = pd.get_dummies(raw_X, drop_first=True)
le = LabelEncoder()
y = le.fit_transform(y)

#set test size to 20% and maintain proportion with stratify, random state to always get same result
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

#efine the initial random forest model
#n_estimator -> number of trees (100 to 500) as starting point
#can reduce computational time with GridSearchCV or RandomizedSearchCV
rf = RandomForestClassifier(n_estimators=200, max_depth=None, random_state=42)
rf.fit(X_train,y_train)

#check feature importance with permutation importance
#standard feature importance can be bias towards highly correlated features
#permutation importance checks based on performance degradation
#set the scoring to f1_weighted for a balanced evaluation on all engagement levels
perm = permutation_importance(rf, X_test, y_test, n_repeats=10, random_state=42, scoring='f1_weighted')
perm_sig = pd.Series(perm.importances_mean, index=X.columns).sort_values(ascending=False)
print(perm_sig)

SessionsPerWeek              0.412770
AvgSessionDurationMinutes    0.345432
PlayerLevel                  0.024399
AchievementsUnlocked         0.023740
GameGenre_Simulation         0.000420
PlayTimeHours                0.000337
Location_Europe              0.000277
Location_Other               0.000030
Location_USA                -0.000003
GameGenre_Sports            -0.000034
GameDifficulty_Medium       -0.000041
GameDifficulty_Hard         -0.000077
GameGenre_RPG               -0.000108
Age                         -0.000248
InGamePurchases_Yes         -0.000375
Gender_Male                 -0.000459
GameGenre_Strategy          -0.000662
dtype: float64


In [9]:
#predict and evaluate the model with all features
y_pred = rf.predict(X_test)
all_acc = round(accuracy_score(y_test, y_pred), 2)
print('Accuracy: ', all_acc)
print(classification_report(y_test, y_pred, target_names=le.classes_)) 

#check cross validation score on different sets
cv_scores = cross_val_score(rf, X, y, cv=5, scoring='accuracy')
print('CV Accuracy: ', cv_scores.mean())

#check f1-scores 
print('F1-Score: ',f1_score(y_test, y_pred, average='weighted'))


Accuracy:  0.9
              precision    recall  f1-score   support

        High       0.92      0.86      0.89      2067
         Low       0.91      0.87      0.89      2065
      Medium       0.90      0.94      0.92      3875

    accuracy                           0.90      8007
   macro avg       0.91      0.89      0.90      8007
weighted avg       0.90      0.90      0.90      8007

CV Accuracy:  0.9095267814781476
F1-Score:  0.9041786441621998


In [10]:
#select features that are at least > 0.01 
selected_features = perm_sig[perm_sig > 0.01].index
X_selected = X[selected_features]

#re-train the model with selected features
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42, stratify=y)
rf.fit(X_train,y_train)

#predict and evaluate model
y_pred = rf.predict(X_test)
y_pred_proba = rf.predict_proba(X_test)
print(y_pred_proba)         #probability shows good accuracy 

selected_acc = round(accuracy_score(y_test,y_pred), 2)
print('Accuracy: ', selected_acc)
print(classification_report(y_test, y_pred, target_names=le.classes_))      #92% accuracy score

#check cross validation score on different sets
cv_scores = cross_val_score(rf, X_selected, y, cv=5, scoring='accuracy')
print('CV Accuracy: ', cv_scores.mean())

#check f1-scores 
print('F1-Score: ',f1_score(y_test, y_pred, average='weighted'))


[[0.04  0.02  0.94 ]
 [0.075 0.905 0.02 ]
 [0.985 0.005 0.01 ]
 ...
 [0.035 0.095 0.87 ]
 [0.005 0.    0.995]
 [0.995 0.    0.005]]
Accuracy:  0.92
              precision    recall  f1-score   support

        High       0.92      0.88      0.90      2067
         Low       0.91      0.90      0.91      2065
      Medium       0.92      0.95      0.93      3875

    accuracy                           0.92      8007
   macro avg       0.92      0.91      0.91      8007
weighted avg       0.92      0.92      0.92      8007

CV Accuracy:  0.9211918462177471
F1-Score:  0.91808550706357


Visualization on Performance with Feature Selection

In [11]:
#show accuracy score on all features vs selected features
accuracies = {'All Features': all_acc*100, 'Selected Features': selected_acc*100}
fig1 = px.bar(x=accuracies.keys(), y=accuracies.values(), text=accuracies.values(), color=accuracies.keys(),
              title='Model Accuracy Comparison')
fig1.update_traces(textfont_size=20, textfont_color='white', textfont_weight='bold')
fig1.update_layout(yaxis_title='Accuracy (%)', showlegend=False)
fig1.show()


#probability confusion matrix
feature_matrix = confusion_matrix(y_test, y_pred)
class_labels = le.classes_
prob_matrix = feature_matrix / feature_matrix.sum(axis=1, keepdims=True)
fig2 = px.imshow(prob_matrix, x=class_labels, y=class_labels, title='Confusion Matrix - Engagement Level',
                labels=dict(x='Predicted Label', y='True Label', color='Proportion'), text_auto='.2f')
fig2.update_layout(xaxis_title='Predicted', yaxis_title='Actual')
fig2.show()

#classification metrics chart
report = classification_report(y_test, y_pred, output_dict=True)
report_df = pd.DataFrame(report).T

metrics_df = (report_df.iloc[:-3][['precision','recall','f1-score']].reset_index().rename(columns={'index':'EngagementLevel'}))
metrics_df['EngagementLevel'] = metrics_df['EngagementLevel'].map({'0':'High', '1':'Low', '2':'Medium'})
metrics_df = metrics_df.melt(id_vars='EngagementLevel', var_name='Metric', value_name='Score')
metrics_df['Score'] = round(metrics_df['Score'], 3)

fig3 = px.bar(metrics_df, x='EngagementLevel', y='Score', color='Metric', barmode='group', text='Score',
              title='Classification Metrics by Engagement Level', category_orders={'EngagementLevel':['High','Medium','Low']})
fig3.update_traces(textfont_size=18, textfont_color='white', textfont_weight='bold')
fig3.update_layout(xaxis_title='Engagement Level', yaxis_title='Score', bargap=0.2)
fig3.show()

In [12]:
#predict engagement level on the original dataset
df['PredictedEngagement'] = le.inverse_transform(rf.predict(df[selected_features]))
df

Unnamed: 0,PlayerID,Age,Gender,Location,GameGenre,PlayTimeHours,InGamePurchases,GameDifficulty,SessionsPerWeek,AvgSessionDurationMinutes,PlayerLevel,AchievementsUnlocked,EngagementLevel,PredictedEngagement
0,9000,43,Male,Other,Strategy,16.27,No,Medium,6,108,79,25,Medium,Medium
1,9001,29,Female,USA,Strategy,5.53,No,Medium,5,144,11,10,Medium,Medium
2,9002,22,Female,USA,Sports,8.22,No,Easy,16,142,35,41,High,High
3,9003,35,Male,USA,Action,5.27,Yes,Easy,9,85,57,47,Medium,Medium
4,9004,33,Male,Europe,Action,15.53,No,Medium,2,131,95,37,Medium,Medium
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40029,49029,32,Male,USA,Strategy,20.62,No,Easy,4,75,85,14,Medium,Medium
40030,49030,44,Female,Other,Simulation,13.54,No,Hard,19,114,71,27,High,High
40031,49031,15,Female,USA,RPG,0.24,Yes,Easy,10,176,29,1,High,High
40032,49032,34,Male,USA,Sports,14.02,Yes,Medium,3,128,70,10,Medium,Medium


Machine Learning to Find Hidden Behavioural Patterns (K-Means Clustering)

In [17]:
#use unselected features and predicted engagement to find hidden patterns
#identify the unselected features for engagement level
unselected_features = [feature for feature in features if feature not in selected_features]
cluster_features = unselected_features + ['PredictedEngagement']

#encode object datatype 
categorical_cols = df[cluster_features].select_dtypes(include='object').columns.tolist()
df_encoded = pd.get_dummies(df[cluster_features], columns=categorical_cols, drop_first=True)

#standardize 
scaler = StandardScaler()
X_cluster = scaler.fit_transform(df_encoded)

#identify the best number of clusters (elbow method)
wcss = []
K = range(1,11)
for k in K:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X_cluster)
    wcss.append(kmeans.inertia_)

#elbow plot
elbow_df = pd.DataFrame({'k': list(K), 'wcss' : wcss})
elbow_df['diff'] = elbow_df['wcss'].diff().fillna(0) * -1
print(elbow_df)

fig = px.line(elbow_df, x='k', y='wcss', markers=True, title='Elbow Graph to Find Optimal n_cluster')
fig.update_layout(xaxis_title='Number of clusters (k)', yaxis_title='WCSS')
fig.show()                  #based on the elbow plot, the most optimal is 7


    k           wcss          diff
0   1  600510.000000     -0.000000
1   2  547178.826387  53331.173613
2   3  510317.143349  36861.683038
3   4  480733.528458  29583.614892
4   5  454619.147558  26114.380900
5   6  429958.975066  24660.172492
6   7  401857.750933  28101.224133
7   8  388761.197132  13096.553801
8   9  378706.509367  10054.687765
9  10  371774.099477   6932.409889


In [18]:
#use the most optimal n_cluster
kmeans = KMeans(n_clusters=7, random_state=42)
temp_df = df[cluster_features].copy()
temp_df['cluster'] = kmeans.fit_predict(X_cluster)
temp_df['cluster'] = temp_df['cluster'] + 1         #set from 0 to 6 -> 1 to 7

#cluster summary, use mean for numerical and mode for others
cluster_summary = temp_df.groupby('cluster').agg(
    {**{col: 'mean' for col in temp_df.select_dtypes(include='number')},
     **{col: lambda x: x.mode()[0] for col in temp_df.select_dtypes(exclude='number')}})     
print(cluster_summary)

#based on cluster summary
#cluster 1 -> 'Location' -> 'Europe', 'GameGenre' -> 'Action'
#cluster 2 -> 'Location' -> 'Europe', 'Game Difficulty' -> 'Hard', 'GameGenre' -> 'Sports'
#cluster 3 -> 'Location' -> 'USA', 'PlayTimeHours' -> below 12, 'GameGenre' -> 'Simulation'
#cluster 4 -> 'Location' -> 'USA', 'GameGenre' -> 'Strategy'
#cluster 5 -> 'Location' -> 'USA', 'GameGenre' -> 'Sports'
#cluster 6 -> 'Location' -> 'USA', 'GameGenre' -> 'RPG'
#cluster 7 -> 'Location' -> 'Other', 'GameGenre' -> 'Simulation'

#regional content or events could be tailored based on locations
#popular genre does vary according to clusters, however there is a need to check the distribution for clarification
#cluster 3 shows lowest average playtime, can focus on marketing / campagins to increase engagement
#cluster 2 shows highest frequency for hard game difficulty, potential for competitive or premium content 

               Age  PlayTimeHours  cluster Gender Location   GameGenre  \
cluster                                                                  
1        31.777761      12.126234      1.0   Male   Europe      Action   
2        31.997149      12.006211      2.0   Male   Europe      Sports   
3        32.199344      11.874788      3.0   Male      USA  Simulation   
4        32.050034      12.099938      4.0   Male      USA    Strategy   
5        32.108139      12.025721      5.0   Male      USA      Sports   
6        31.866336      12.001845      6.0   Male      USA         RPG   
7        31.962389      12.061017      7.0   Male    Other  Simulation   

        InGamePurchases GameDifficulty PredictedEngagement  
cluster                                                     
1                    No           Easy              Medium  
2                    No           Hard              Medium  
3                    No           Easy              Medium  
4                    No     

In [19]:
#2D-PCA scatter plot for clusters
#exclude cluster and predicted engagement
categorical_cols = temp_df.select_dtypes(include='object').columns.tolist()
categorical_cols = [col for col in categorical_cols if col not in ['PredictedEngagement', 'cluster']] 
temp_encoded = pd.get_dummies(temp_df, columns=categorical_cols, drop_first=True)   #drop_first to reduce dimensions

#standardize all other features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(temp_encoded.drop(columns=['PredictedEngagement', 'cluster']))

#transform into 2 dimension PCA
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_scaled)

plot_df = pd.DataFrame({'PC1': X_pca[:,0], 'PC2': X_pca[:,1], 'cluster': temp_df['cluster'].astype(str), 
                        'PredictedEngagement': temp_df['PredictedEngagement']})

fig1 = px.scatter(plot_df, x='PC1', y='PC2', color='cluster',  hover_data=['PredictedEngagement'], 
                  category_orders={'cluster':['1','2','3','4','5','6','7']},
                  title='Cluster Visualization (PCA 2D)')
fig1.show()






#Engagement Proportion plot for Each Cluster
colors = {'Low':"#AB2C2C",'Medium':"#2F4794", 'High':"#22982C"}
engagement_pct = temp_df.groupby('cluster')['PredictedEngagement'].value_counts(normalize=True).reset_index(name='percent')
engagement_pct['percent'] = round(engagement_pct['percent'] * 100, 2)
fig2 = px.bar(engagement_pct, x='cluster', y='percent', color='PredictedEngagement',
              title='Predicted Engagement Distribution per Cluster', labels={'percent':'Percentage of Players','cluster':'Cluster'},
              text='percent', barmode='stack', category_orders={'PredictedEngagement':['Low','Medium','High']},
              color_discrete_map=colors)
fig2.update_traces(textfont_size=13, textfont_weight='bold', textfont_color='white')
fig2.show()