# Handout code
## This section is provided for your convenience. Feel free to run the code locally and explore how adjusting the parameters in different functions impacts the results.

In [None]:
import pandas as pd
# Render floats with one digit after the decimal point in displayed output.
pd.options.display.precision = 1

# Toy dataset: seven users, one entry per user in every column.
# None marks a missing value (one in Game_Score, one in App_Rating).
game_data = {
    "User_ID": [101, 102, 103, 104, 105, 106, 107],
    "Age": [23, 35, 45, 30, 25, 28, 33],
    "Gender": ["F", "M", "M", "F", "F", "M", "F"],  
    "Country": ["US", "CA", "US", "CA", "US", "US", "US"],  
    "Game_Score": [88, 92, 78, 85, None, 95, 82],
    "Sessions": [5, 8, 6, 9, 3, 7, 4],
    "Device": ["Mobile", "Desktop", "Tablet", "Mobile", "Mobile", "Desktop", "Mobile"],
    "App_Rating": [5.0, None, 3.8, 4.2, 4.0, 4.7, 4.1],
    # Each Game_Type cell is itself a list of two game titles.
    "Game_Type": [
        ["Candy Crush", "Warzone"],
        ["Fortnite", "Elden Ring"],
        ["Super Mario", "Elden Ring"],
        ["Minecraft", "Among Us"],
        ["Roblox", "Call of Duty"],
        ["Candy Crush", "PUBG"],
        ["Among Us", "Candy Crush"]
    ]
}

# Build the DataFrame from the column dictionary and render it.
game_df = pd.DataFrame(game_data)
display(game_df)

In [None]:
# Preview the first rows (head() returns five rows by default).
game_df.head()

In [None]:
# Draw three rows at random; no random_state is set, so the rows differ between runs.
game_df.sample(3)

In [None]:
# Column overview: dtype and non-null count per column, plus memory usage.
game_df.info()

In [None]:
# Summary statistics; with no arguments only the numeric columns are described.
game_df.describe()

In [None]:
# include='all' extends the summary to the non-numeric columns as well.
game_df.describe(include='all')

In [None]:
# Boolean mask shaped like game_df: True wherever a value is missing.
game_df.isnull()

In [None]:
# Missing values per column (each True counts as 1 when summed).
game_df.isnull().sum()

In [None]:
# Summing the per-column counts gives the total number of missing values.
game_df.isnull().sum().sum()

In [None]:
## Column-by-column dtype conversion:
##   Age, Sessions -> integers
##   Gender        -> categorical type
##   Game_Score    -> float
for column_name, target_dtype in (
    ("Age", int),
    ("Sessions", int),
    ("Gender", "category"),
    ("Game_Score", float),
):
    game_df[column_name] = game_df[column_name].astype(target_dtype)

In [None]:
## Selecting numerical columns
## "number" matches every numeric dtype, so the integer columns are kept even
## where astype(int) yields int32 instead of int64 (e.g. Windows with NumPy < 2);
## listing only "int64"/"float64" would silently drop them there.
print('numerical columns')
numerical_columns = game_df.select_dtypes(include="number")
display(numerical_columns.head())

## Selecting categorical columns
## "category" picks up columns cast with astype("category"); "object" picks up
## the remaining non-numeric columns.
print('categorical columns')
categorical_columns = game_df.select_dtypes(include=["category", "object"])
display(categorical_columns.head())

In [None]:
# Give every title in a user's Game_Type list its own row; the values of the
# other columns are repeated for each title.
game_df_expanded = game_df.explode("Game_Type")
display(game_df_expanded)

In [None]:
# Count how many rows of the exploded frame mention each game title.
popular_games = game_df_expanded["Game_Type"].value_counts()
display(popular_games)

In [None]:
def categorize_rating(rating):
    """
    Categorize an app rating as "High" or "Low" with a threshold of 4.

    Parameters:
    rating (float or None): The app rating to classify. May be missing
    (None or NaN).

    Returns:
    str or None: "High" when rating >= 4, "Low" when rating < 4, and None
    when the rating is missing, so an absent rating is not reported as a
    low one.
    """
    # NaN compares False against every number, so without this guard a
    # missing rating would fall through to the "Low" branch (and None would
    # raise a TypeError on the comparison).
    if pd.isna(rating):
        return None
    return "High" if rating >= 4 else "Low"

# Apply the function using the 'map' method:
# Series.map calls categorize_rating once for each App_Rating value and the
# results are stored in a new Rating_Category column.
game_df["Rating_Category"] = game_df["App_Rating"].map(categorize_rating)

# Display the rating next to its category to check the result
display(game_df[["App_Rating", "Rating_Category"]])

In [None]:
def calculate_engagement_score(row):
    """
    Compute the engagement score of a single user row.

    Formula:
        (Game_Score * App_Rating) / Sessions

    A row whose Sessions value is not greater than 0 gets a score of 0,
    which avoids dividing by zero.

    Parameters:
    row (pandas.Series): One row of the DataFrame; it must provide the
    "Game_Score", "App_Rating" and "Sessions" entries.

    Returns:
    float: The engagement score of the row (0 when the Sessions guard
    triggers).
    """
    sessions = row["Sessions"]

    # Guard clause: nothing to divide by, so the score is defined as 0.
    if not sessions > 0:
        return 0

    weighted_score = row["Game_Score"] * row["App_Rating"]
    return weighted_score / sessions

# Apply the function row-wise to calculate the Engagement Score.
# axis=1 makes apply() pass each row (as a Series) to the function,
# instead of each column.
game_df["Engagement_Score"] = game_df.apply(calculate_engagement_score, axis=1)

# Display the inputs of the formula next to the resulting score
display(game_df[["Game_Score", "App_Rating", "Sessions", "Engagement_Score"]])

In [None]:
# Number of distinct values per column. Game_Type is dropped first; its cells
# are lists, presumably because nunique() cannot hash list values.
display(game_df.drop(columns=["Game_Type"]).nunique())


In [None]:
# Print the distinct values of Gender, Country and Device, one array per line.
for column_name in ("Gender", "Country", "Device"):
    print(game_df[column_name].unique())

In [None]:
# Distinct countries among the rows whose Device is "Mobile":
# the boolean mask picks the rows and "Country" picks the column in one .loc call.
game_df.loc[game_df["Device"] == "Mobile", "Country"].unique()


In [None]:
# Number of rows (users) for each Device value.
display(game_df["Device"].value_counts())


In [None]:
# normalize=True returns each Device value's share of the rows instead of a raw count.
display(game_df["Device"].value_counts(normalize=True))


In [None]:
# Count rows per (Device, Gender) combination; subset= names the columns whose
# value combinations are counted.
device_gender_counts = game_df.value_counts(subset=["Device", "Gender"])
display(device_gender_counts)

In [None]:
# Inspect the index of the counts: its entries are (Device, Gender) pairs.
display(device_gender_counts.index)

In [None]:
# A full (Device, Gender) tuple looks up the count of that one combination.
display(device_gender_counts.loc[("Mobile", "F")])


In [None]:
# A list of tuples looks up several combinations in one call.
display(device_gender_counts.loc[[("Mobile", "F"), ("Tablet", "M")]])


In [None]:
# Step 1: cast Device to the categorical dtype
game_df["Device"] = game_df["Device"].astype("category")

# Step 2: store each row's integer category code in a new column
game_df["encoded_Device"] = game_df["Device"].cat.codes

In [None]:
# Lookup table from position to category label: enumerate() numbers the
# categories 0, 1, 2, ... in the order they are stored in cat.categories.
cat_dict = dict(enumerate(game_df["Device"].cat.categories))
cat_dict

In [None]:
# Show the whole DataFrame, including the encoded_Device column.
game_df

In [None]:
# Label-encode Device by hand: replace each device name with an integer code.
# The mapping must go name -> code; a code -> name mapping finds nothing to
# replace while the column still holds names, leaving it unchanged.
# NOTE(review): Device is categorical at this point; how replace() treats
# categorical columns varies between pandas versions - confirm on yours.
encoding_dict = {
    "Device": {"Mobile": 1, "Desktop": 2, "Tablet": 3}
}
game_df.replace(encoding_dict, inplace=True)
display(game_df[["Device"]])

In [None]:
# Show the whole DataFrame to check the Device column after replace().
game_df

In [None]:
# Added cell: map the integer device codes 1, 2 and 3 back to the device names.
# replace() only changes cells equal to one of the inner-dict keys, so Device
# values that are already names are left untouched.

encoding_dict = {
    "Device": {1: "Mobile", 2: "Desktop", 3: "Tablet"}
}
game_df.replace(encoding_dict, inplace=True)
display(game_df[["Device"]])

In [None]:
# Show the whole DataFrame to confirm Device holds names again.
game_df

In [None]:
# One-hot encode Device: get_dummies returns a new DataFrame in which Device is
# replaced by one integer (dtype=int) indicator column per device value.
game_df_one_hot = pd.get_dummies(game_df, columns=["Device"], dtype = int)
# game_df is shown here for comparison with the encoded copy.
game_df

In [None]:
# Show the one-hot encoded copy.
game_df_one_hot

In [None]:
# Keep only the rows whose Country equals 'US'; query() takes the filter as a string.
us_users = game_df.query("Country == 'US'")
display(us_users)

In [None]:
# Select rows where Game_Score > 85 and Device is Mobile.
# Both conditions live in one query string and are combined with &.
high_scoring_mobile_users = game_df.query("Game_Score > 85 & Device == 'Mobile'")
display(high_scoring_mobile_users)

In [None]:
# Select rows where Gender is F and Sessions > 5 (both conditions must hold).
active_female_users = game_df.query("Gender == 'F' & Sessions > 5")
display(active_female_users)

In [None]:
# The @ prefix lets the query string refer to the Python variable `threshold`.
threshold = 85
game_df.query("Game_Score > @threshold")

In [None]:
# Mean Game_Score within each Country group.
avg_game_score_by_country = game_df.groupby("Country")["Game_Score"].mean()
display(avg_game_score_by_country)

In [None]:
# Mean Game_Score for each Country/Device pair.
# Device was cast to a categorical dtype in an earlier cell, and the default of
# groupby's `observed` argument is changing between pandas versions (pandas
# 2.1+ warns when it is left implicit). Passing observed=False pins the
# long-standing behaviour so the table looks the same on every version.
avg_game_score_country_device = game_df.groupby(
    ["Country", "Device"], observed=False
)["Game_Score"].mean()
display(avg_game_score_country_device)

In [None]:
# Number of non-missing User_ID values per Device.
# Device was cast to a categorical dtype in an earlier cell; observed=False is
# passed explicitly because the default of `observed` is changing between
# pandas versions (pandas 2.1+ warns when it is left implicit).
user_counts_by_device = game_df.groupby("Device", observed=False)["User_ID"].count()
display(user_counts_by_device)


In [None]:
# Per-country summary: mean and max of Game_Score, mean of Sessions.
# The dict maps each column to the aggregation(s) applied to it.
summary_stats = game_df.groupby("Country").agg(
    {"Game_Score": ["mean", "max"], "Sessions": "mean"}
)
display(summary_stats)

In [None]:
def score_range(input_s):
    """
    Spread of a Series: its largest value minus its smallest value.

    """
    highest = input_s.max()
    lowest = input_s.min()
    return highest - lowest

# Using custom aggregation function in agg: a function object can be listed
# next to built-in aggregation names such as "mean".
# Device was cast to a categorical dtype in an earlier cell; observed=False is
# passed explicitly because the default of `observed` is changing between
# pandas versions (pandas 2.1+ warns when it is left implicit).
score_summary = game_df.groupby("Device", observed=False).agg(
    {"Game_Score": ["mean", score_range]}
)
display(score_summary)

In [None]:
# Mean Game_Score laid out as a grid: one row per Country, one column per
# Device; cells with no data are filled with 0.
pivot_table = game_df.pivot_table(
    values="Game_Score",
    index="Country",
    columns="Device",
    aggfunc="mean",
    fill_value=0,
)
display(pivot_table)

In [None]:
# Creating a pivot table with multiple aggregations:
# the aggfunc dict applies "mean" to Game_Score and "sum" to Sessions,
# fill_value=0 fills empty cells, and margins=True adds the totals
# row and column.
pivot_table_multi = game_df.pivot_table(index="Country", 
                                        columns="Device",
                                        values=["Game_Score", "Sessions"],  
                                        aggfunc={"Game_Score": "mean", "Sessions": "sum"}, 
                                        fill_value=0,
                                        margins = True)
display(pivot_table_multi)

In [None]:
# Redefine game_df from the raw data before calculating correlations, so the
# columns added by earlier cells (Rating_Category, Engagement_Score,
# encoded_Device) and their dtype changes are discarded.
game_data = {
    "User_ID": [101, 102, 103, 104, 105, 106, 107],
    "Age": [23, 35, 45, 30, 25, 28, 33],
    "Gender": ["F", "M", "M", "F", "F", "M", "F"],  
    "Country": ["US", "CA", "US", "CA", "US", "US", "US"],  
    "Game_Score": [88, 92, 78, 85, None, 95, 82],
    "Sessions": [5, 8, 6, 9, 3, 7, 4],
    "Device": ["Mobile", "Desktop", "Tablet", "Mobile", "Mobile", "Desktop", "Mobile"],
    "App_Rating": [5.0, None, 3.8, 4.2, 4.0, 4.7, 4.1],
    "Game_Type": [
        ["Candy Crush", "Warzone"],
        ["Fortnite", "Elden Ring"],
        ["Super Mario", "Elden Ring"],
        ["Minecraft", "Among Us"],
        ["Roblox", "Call of Duty"],
        ["Candy Crush", "PUBG"],
        ["Among Us", "Candy Crush"]
    ]
}

game_df = pd.DataFrame(game_data)
#display(game_df)

In [None]:
# Select only numerical columns, then drop User_ID
# (presumably because it is an identifier rather than a measurement).
numerical_game_df = game_df.select_dtypes(include=['number']).drop(columns = ['User_ID'])

# Compute the pairwise correlation between the remaining columns
correlation_matrix = numerical_game_df.corr()
correlation_matrix


In [None]:
import seaborn as sns

# Draw the correlation matrix as a heatmap; annot=True writes each value inside its cell.
sns.heatmap(correlation_matrix, annot=True)