In [21]:
# Basic Libraries
import numpy as np
import pandas as pd

In [2]:
game = pd.read_csv('online_gaming_behavior_dataset.csv')
display(game.count())
game.head()

PlayerID                     40034
Age                          40034
Gender                       40034
Location                     40034
GameGenre                    40034
PlayTimeHours                40034
InGamePurchases              40034
GameDifficulty               40034
SessionsPerWeek              40034
AvgSessionDurationMinutes    40034
PlayerLevel                  40034
AchievementsUnlocked         40034
EngagementLevel              40034
dtype: int64

Unnamed: 0,PlayerID,Age,Gender,Location,GameGenre,PlayTimeHours,InGamePurchases,GameDifficulty,SessionsPerWeek,AvgSessionDurationMinutes,PlayerLevel,AchievementsUnlocked,EngagementLevel
0,9000,43,Male,Other,Strategy,16.271119,0,Medium,6,108,79,25,Medium
1,9001,29,Female,USA,Strategy,5.525961,0,Medium,5,144,11,10,Medium
2,9002,22,Female,USA,Sports,8.223755,0,Easy,16,142,35,41,High
3,9003,35,Male,USA,Action,5.265351,1,Easy,9,85,57,47,Medium
4,9004,33,Male,Europe,Action,15.531945,0,Medium,2,131,95,37,Medium


## Engagement Level 

<b>Objective:</b> Find out which factors might correlate the best to engagement level. 

For this dataset, we will looking at each columns to see its correlation to the engagement level.
The columns that we will be looking at are
1. Age
2. Gender
3. Location
4. GameGenre
5. PlayTimeHours
6. InGamePurchases
7. GameDifficulty

<i>Also, the dataset has been checked for duplicate and missing data <br/>
Source: https://www.kaggle.com/datasets/rabieelkharoua/predict-online-gaming-behavior-dataset?resource=download</i>

In [3]:
game[["Age","EngagementLevel"]].head()

Unnamed: 0,Age,EngagementLevel
0,43,Medium
1,29,Medium
2,22,High
3,35,Medium
4,33,Medium


In [4]:
game_lowLvl = game.loc[game["EngagementLevel"] == "Low"]
display(game_lowLvl.head())

game_medLvl = game.loc[game["EngagementLevel"] == "Medium"]
display(game_medLvl.head())

game_highLvl = game.loc[game["EngagementLevel"] == "High"]
display(game_highLvl.head())

Unnamed: 0,PlayerID,Age,Gender,Location,GameGenre,PlayTimeHours,InGamePurchases,GameDifficulty,SessionsPerWeek,AvgSessionDurationMinutes,PlayerLevel,AchievementsUnlocked,EngagementLevel
5,9005,37,Male,Europe,RPG,20.561855,0,Easy,2,81,74,22,Low
6,9006,25,Male,USA,Action,9.752716,0,Hard,1,50,13,2,Low
13,9013,38,Female,USA,Strategy,8.701959,0,Easy,0,156,33,47,Low
14,9014,44,Male,USA,Simulation,17.9752,0,Easy,8,41,98,1,Low
17,9017,47,Male,USA,RPG,17.272113,0,Medium,2,131,13,9,Low


Unnamed: 0,PlayerID,Age,Gender,Location,GameGenre,PlayTimeHours,InGamePurchases,GameDifficulty,SessionsPerWeek,AvgSessionDurationMinutes,PlayerLevel,AchievementsUnlocked,EngagementLevel
0,9000,43,Male,Other,Strategy,16.271119,0,Medium,6,108,79,25,Medium
1,9001,29,Female,USA,Strategy,5.525961,0,Medium,5,144,11,10,Medium
3,9003,35,Male,USA,Action,5.265351,1,Easy,9,85,57,47,Medium
4,9004,33,Male,Europe,Action,15.531945,0,Medium,2,131,95,37,Medium
7,9007,25,Female,Asia,RPG,4.401729,0,Medium,10,48,27,23,Medium


Unnamed: 0,PlayerID,Age,Gender,Location,GameGenre,PlayTimeHours,InGamePurchases,GameDifficulty,SessionsPerWeek,AvgSessionDurationMinutes,PlayerLevel,AchievementsUnlocked,EngagementLevel
2,9002,22,Female,USA,Sports,8.223755,0,Easy,16,142,35,41,High
9,9009,38,Female,Other,Sports,23.942772,0,Easy,13,95,99,36,High
10,9010,17,Male,USA,Strategy,4.829916,0,Hard,8,95,14,12,High
11,9011,36,Female,Asia,Simulation,5.535981,1,Easy,16,124,62,31,High
12,9012,16,Male,USA,Sports,18.776234,1,Easy,9,18,52,32,High


In [5]:
print("Low Engagement Level")
display(game_lowLvl["Location"].value_counts())
print()
print("Medium Engagement Level")
display(game_medLvl["Location"].value_counts())
print()
print("High Engagement Level")
display(game_highLvl["Location"].value_counts())

Low Engagement Level


Location
USA       4087
Europe    3143
Asia      2058
Other     1036
Name: count, dtype: int64


Medium Engagement Level


Location
USA       7786
Europe    5793
Asia      3906
Other     1889
Name: count, dtype: int64


High Engagement Level


Location
USA       4127
Europe    3068
Asia      2131
Other     1010
Name: count, dtype: int64

From a glance, it seems that for <br/>
Low Engagement Level - USA has the most then Europe, Asia then Other <br/>
Medium Engagement Level - USA > Europe > Asia > Other <br/>
High Engagement Level - USA > Europe > Asia > Other <br/>
<br/>
With the data provided, country does not seems to be a major factor and also, there were no mentions of the distrubution of nationalities partaking in the survey. Thus, even if there is a distinction in the engagement levels, country will not be a proper factor to the engagement level. <br/>
Next, we will be checking if gender might play a role.

In [6]:
print("Low Engagement Level")
display(game_lowLvl["Gender"].value_counts())
print()
print("Medium Engagement Level")
display(game_medLvl["Gender"].value_counts())
print()
print("High Engagement Level")
display(game_highLvl["Gender"].value_counts())

Low Engagement Level


Gender
Male      6203
Female    4121
Name: count, dtype: int64


Medium Engagement Level


Gender
Male      11616
Female     7758
Name: count, dtype: int64


High Engagement Level


Gender
Male      6140
Female    4196
Name: count, dtype: int64

It seems that at all level, male takes on the majority of the population. However, similar to the problem presented earlier, there are more males taking the survey than females which could be the case since the general consensus is that gaming is more male dominated. Therefore, gender may not be a good indicator to engagement level. <br/>
Now, we will be looking at GameGenre

In [7]:
print("Low Engagement Level")
display(game_lowLvl["GameGenre"].value_counts())
print()
print("Medium Engagement Level")
display(game_medLvl["GameGenre"].value_counts())
print()
print("High Engagement Level")
display(game_highLvl["GameGenre"].value_counts())

Low Engagement Level


GameGenre
RPG           2097
Simulation    2088
Sports        2074
Action        2059
Strategy      2006
Name: count, dtype: int64


Medium Engagement Level


GameGenre
Action        3916
Strategy      3909
Sports        3891
RPG           3867
Simulation    3791
Name: count, dtype: int64


High Engagement Level


GameGenre
Simulation    2104
Strategy      2097
Sports        2083
Action        2064
RPG           1988
Name: count, dtype: int64

The differences seem to be pronunced in this dataset column.
For Low Engagement, RPG>Simulation>Sports>Action>Strategy <br/> 
For Medium Engagement, Action>Strategy>Sports>RPG>Simulation <br/>
For High Engagement, Simulation>Strategy>Sports>Action>RPG <br/>

However, the differences between the top 3 genres across all engagement level are quite close. <br/>
For Low Engagement Level, RPG seems to have the highest count with lower counts in the other 2 levels.<br/>
For Medium Engagement Level, Action has the highest count with lower counts in other levels.<br/>
For High Engagement Level, Simulation has the highest count but it seeems to be conflicting as its the 2nd highest in Low Engagement Level. 

One consistent trait would be Strategy holding a medium-high engagement level. Sports seem to be equal in all three levels.
From my understanding of these genres, strategy games have been popular for a long time due to games such as Chess, Othello, Checkers and more. In addition, due to the learning curve of strategy games, it is able engage more players and supposingly have higher playtime. This statement could be tested in future sections of this Notebook. 

In [8]:
game_lowLvl_4 = game_lowLvl[(game_lowLvl["PlayTimeHours"] < 4)]
game_lowLvl_12 = game_lowLvl[(game_lowLvl["PlayTimeHours"] >= 4) & (game_lowLvl["PlayTimeHours"] < 12)]
game_lowLvl_13 = game_lowLvl[(game_lowLvl["PlayTimeHours"] >= 12)]

game_medLvl_4 = game_medLvl[(game_medLvl["PlayTimeHours"] < 4)]
game_medLvl_12 = game_medLvl[(game_medLvl["PlayTimeHours"] >= 4) & (game_medLvl["PlayTimeHours"] < 12)]
game_medLvl_13 = game_medLvl[(game_medLvl["PlayTimeHours"] >= 12)]

game_highLvl_4 = game_highLvl[(game_highLvl["PlayTimeHours"] < 4)]
game_highLvl_12 = game_highLvl[(game_highLvl["PlayTimeHours"] >= 4) & (game_highLvl["PlayTimeHours"] < 12)]
game_highLvl_13 = game_highLvl[(game_highLvl["PlayTimeHours"] >= 12)]

In [9]:
print("Low Engagement Level")
print("Less than 4 hours: ",(len(game_lowLvl_4)))
print("Less than 12 hours: ",(len(game_lowLvl_12)))
print("More than 12 hours: ",(len(game_lowLvl_13)))
print()
print("Medium Engagement Level")
print("Less than 4 hours: ",(len(game_medLvl_4)))
print("Less than 12 hours: ",(len(game_medLvl_12)))
print("More than 12 hours: ",(len(game_medLvl_13)))
print()
print("High Engagement Level")
print("Less than 4 hours: ",(len(game_highLvl_4)))
print("Less than 12 hours: ",(len(game_highLvl_12)))
print("More than 12 hours: ",(len(game_highLvl_13)))

Low Engagement Level
Less than 4 hours:  1630
Less than 12 hours:  3505
More than 12 hours:  5189

Medium Engagement Level
Less than 4 hours:  3265
Less than 12 hours:  6431
More than 12 hours:  9678

High Engagement Level
Less than 4 hours:  1654
Less than 12 hours:  3521
More than 12 hours:  5161


From the comparison, it seems that there is no correlation between playtime hours and engagement levels as high playtime hours should be equalivant to a higher engagement level. Therefore, PlayTimeHours is not a significant factor to affecting the engagement levels.

In [10]:
print("Low Engagement Level")
display(game_lowLvl["InGamePurchases"].value_counts())
print()
print("Medium Engagement Level")
display(game_medLvl["InGamePurchases"].value_counts())
print()
print("High Engagement Level")
display(game_highLvl["InGamePurchases"].value_counts())

Low Engagement Level


InGamePurchases
0    8287
1    2037
Name: count, dtype: int64


Medium Engagement Level


InGamePurchases
0    15504
1     3870
Name: count, dtype: int64


High Engagement Level


InGamePurchases
0    8202
1    2134
Name: count, dtype: int64

Same as well, there is not much correlation as the general consensus is usually players who make more ingame purchases are usually those with higher engagement level.

In [11]:
print("Low Engagement Level")
display(game_lowLvl["GameDifficulty"].value_counts())
print()
print("Medium Engagement Level")
display(game_medLvl["GameDifficulty"].value_counts())
print()
print("High Engagement Level")
display(game_highLvl["GameDifficulty"].value_counts())

Low Engagement Level


GameDifficulty
Easy      5206
Medium    3042
Hard      2076
Name: count, dtype: int64


Medium Engagement Level


GameDifficulty
Easy      9677
Medium    5877
Hard      3820
Name: count, dtype: int64


High Engagement Level


GameDifficulty
Easy      5132
Medium    3092
Hard      2112
Name: count, dtype: int64

Similar to the previous column as well.

In [12]:
game["PlayerLevel"].describe()

count    40034.000000
mean        49.655568
std         28.588379
min          1.000000
25%         25.000000
50%         49.000000
75%         74.000000
max         99.000000
Name: PlayerLevel, dtype: float64

In [13]:
game_lowLvl_30 = game_lowLvl[(game_lowLvl["PlayerLevel"] < 30)]
game_lowLvl_60 = game_lowLvl[(game_lowLvl["PlayerLevel"] >= 30) & (game_lowLvl["PlayerLevel"] < 60)]
game_lowLvl_99 = game_lowLvl[(game_lowLvl["PlayerLevel"] >= 60)]

game_medLvl_30 = game_medLvl[(game_medLvl["PlayerLevel"] < 30)]
game_medLvl_60 = game_medLvl[(game_medLvl["PlayerLevel"] >= 30) & (game_medLvl["PlayerLevel"] < 60)]
game_medLvl_99 = game_medLvl[(game_medLvl["PlayerLevel"] >= 60)]

game_highLvl_30 = game_highLvl[(game_highLvl["PlayerLevel"] < 30)]
game_highLvl_60 = game_highLvl[(game_highLvl["PlayerLevel"] >= 30) & (game_highLvl["PlayerLevel"] < 60)]
game_highLvl_99 = game_highLvl[(game_highLvl["PlayerLevel"] >= 60)]

In [14]:
print("Low Engagement Level")
print("Less than level 30: ",(len(game_lowLvl_30)))
print("Less than level 60: ",(len(game_lowLvl_60)))
print("More than level 60: ",(len(game_lowLvl_99)))
print()
print("Medium Engagement Level")
print("Less than level 30: ",(len(game_medLvl_30)))
print("Less than level 60: ",(len(game_medLvl_60)))
print("More than level 60: ",(len(game_medLvl_99)))
print()
print("High Engagement Level")
print("Less than level 30: ",(len(game_highLvl_30)))
print("Less than level 60: ",(len(game_highLvl_60)))
print("More than level 60: ",(len(game_highLvl_99)))

Low Engagement Level
Less than level 30:  3501
Less than level 60:  3272
More than level 60:  3551

Medium Engagement Level
Less than level 30:  5457
Less than level 60:  5884
More than level 60:  8033

High Engagement Level
Less than level 30:  2943
Less than level 60:  3095
More than level 60:  4298


Similar reasonings, however the data could be easily justified.
Depending on what kind of game and genre it is, it could be due to what the game content has to offer. For example, end game content in certain games are scarce which would lead to lower engagement level in gamers. However, there are games that do not require high engagement level as they can be played casually or does not require frequent engagements.

In [15]:
game["AchievementsUnlocked"].describe()

#This factor may not be significant as there are games with hundreds of achievements and some with no achievements. Even if
#the factor has been balanced out to take into account this flaw. It will result in improper results. 
#Therefore, the results achieved would not mean much, and should be taken with a pinch of salt. 

count    40034.000000
mean        24.526477
std         14.430726
min          0.000000
25%         12.000000
50%         25.000000
75%         37.000000
max         49.000000
Name: AchievementsUnlocked, dtype: float64

In [16]:
game_lowLvl_10 = game_lowLvl[(game_lowLvl["AchievementsUnlocked"] < 10)]
game_lowLvl_30 = game_lowLvl[(game_lowLvl["AchievementsUnlocked"] >= 10) & (game_lowLvl["AchievementsUnlocked"] < 30)]
game_lowLvl_49 = game_lowLvl[(game_lowLvl["AchievementsUnlocked"] >= 30)]

game_medLvl_10 = game_medLvl[(game_medLvl["AchievementsUnlocked"] < 10)]
game_medLvl_30 = game_medLvl[(game_medLvl["AchievementsUnlocked"] >= 10) & (game_medLvl["AchievementsUnlocked"] < 30)]
game_medLvl_49 = game_medLvl[(game_medLvl["AchievementsUnlocked"] >= 30)]

game_highLvl_10 = game_highLvl[(game_highLvl["AchievementsUnlocked"] < 10)]
game_highLvl_30 = game_highLvl[(game_highLvl["AchievementsUnlocked"] >= 10) & (game_highLvl["AchievementsUnlocked"] < 30)]
game_highLvl_49 = game_highLvl[(game_highLvl["AchievementsUnlocked"] >= 30)]

In [17]:
print("Low Engagement Level")
print("Less than 10 achievements: ",(len(game_lowLvl_30)))
print("Less than 30 achievements: ",(len(game_lowLvl_60)))
print("More than 30 achievements: ",(len(game_lowLvl_99)))
print()
print("Medium Engagement Level")
print("Less than 10 achievements: ",(len(game_medLvl_30)))
print("Less than 30 achievements: ",(len(game_medLvl_60)))
print("More than 30 achievements: ",(len(game_medLvl_99)))
print()
print("High Engagement Level")
print("Less than 10 achievements: ",(len(game_highLvl_30)))
print("Less than 30 achievements: ",(len(game_highLvl_60)))
print("More than 30 achievements: ",(len(game_highLvl_99)))

Low Engagement Level
Less than 10 achievements:  4382
Less than 30 achievements:  3272
More than 30 achievements:  3551

Medium Engagement Level
Less than 10 achievements:  7514
Less than 30 achievements:  5884
More than 30 achievements:  8033

High Engagement Level
Less than 10 achievements:  4034
Less than 30 achievements:  3095
More than 30 achievements:  4298
