In [1]:
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt
# from mpl_toolkits.mplot3d import Axes3D
# from scipy.stats import t
from scipy.stats import f_oneway

In [2]:
# Load the raw Candy Crush attempt log and preview the first rows.
csv_path = 'data/candy_crush.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,player_id,dt,level,num_attempts,num_success
0,6dd5af4c7228fa353d505767143f5815,2014-01-04,4,3,1
1,c7ec97c39349ab7e4d39b4f74062ec13,2014-01-01,8,4,1
2,c7ec97c39349ab7e4d39b4f74062ec13,2014-01-05,12,6,0
3,a32c5e9700ed356dc8dd5bb3230c5227,2014-01-03,11,1,1
4,a32c5e9700ed356dc8dd5bb3230c5227,2014-01-07,15,6,0


In [3]:
# Derive a per-row difficulty score: attempts needed per successful completion.
# Rows with num_success == 0 come out as inf (or NaN for 0 / 0) at this point.
df['difficulty'] = df['num_attempts'] / df['num_success']
df.head()

Unnamed: 0,player_id,dt,level,num_attempts,num_success,difficulty
0,6dd5af4c7228fa353d505767143f5815,2014-01-04,4,3,1,3.0
1,c7ec97c39349ab7e4d39b4f74062ec13,2014-01-01,8,4,1,4.0
2,c7ec97c39349ab7e4d39b4f74062ec13,2014-01-05,12,6,0,inf
3,a32c5e9700ed356dc8dd5bb3230c5227,2014-01-03,11,1,1,1.0
4,a32c5e9700ed356dc8dd5bb3230c5227,2014-01-07,15,6,0,inf


In [8]:
# Make the difficulty column finite.
# num_attempts / num_success is +/-inf when num_success == 0 and NaN for 0 / 0;
# both cases are coded as 0 here.
# NOTE(review): coding "never succeeded" as 0 makes those rows look like the
# easiest ones -- confirm this is the intended treatment before interpreting
# any statistic computed from this column.
df['difficulty'] = (
    df['difficulty']
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)
# Only `difficulty` is filled: a frame-wide fillna(0) would also write the
# integer 0 into any missing entries of the string columns (player_id, dt).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16865 entries, 0 to 16864
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   player_id     16865 non-null  object 
 1   dt            16865 non-null  object 
 2   level         16865 non-null  int64  
 3   num_attempts  16865 non-null  int64  
 4   num_success   16865 non-null  int64  
 5   difficulty    16865 non-null  float64
dtypes: float64(1), int64(3), object(2)
memory usage: 790.7+ KB


In [7]:
# Collect the distinct difficulty ratios and show them.
difficulty_column = df['difficulty']
unique_difficulties = difficulty_column.unique()
print(unique_difficulties)

[3.00000000e+00 4.00000000e+00 0.00000000e+00 1.00000000e+00
 8.00000000e+00 1.50000000e+01 2.00000000e+00 5.00000000e+00
 7.00000000e+00 1.40000000e+01 9.00000000e+00 6.00000000e+00
 1.50000000e+00 1.70000000e+01 2.80000000e+01 2.10000000e+01
 1.00000000e+01 3.50000000e+00 1.20000000e+01 9.50000000e+00
            nan 3.00000000e+01 1.30000000e+01 1.10000000e+01
 4.00000000e-01 1.60000000e+01 1.80000000e+01 2.20000000e+01
 5.00000000e-01 1.90000000e+01 2.00000000e+01 1.25000000e+00
 2.60000000e+01 2.00000000e-01 6.66666667e-01 2.70000000e+01
 3.20000000e+01 6.15384615e+00 3.33333333e+00 2.40000000e+01
 2.90000000e+01 3.40000000e+01 2.30000000e+01 1.83333333e+00
 3.60000000e+01 1.80000000e+00 2.50000000e+01 2.33333333e+00
 4.70000000e+01 2.50000000e+00 2.50000000e-01 9.16666667e+00
 4.00000000e+01 3.33333333e-01 5.50000000e+01 9.50000000e+01
 2.66666667e+00 1.26086957e+00 1.33333333e+00 5.50000000e+00
 3.10000000e+01 1.75000000e+00 5.80000000e+01 6.50000000e+00
 8.50000000e+00 1.700000

In [9]:
# Disabled: casting to int would truncate fractional ratios such as 1.5 or 0.4
# that occur in the difficulty column.
# df['difficulty'] = df['difficulty'].astype(int)

In [10]:
# Perform one-way ANOVA: does the mean difficulty differ between game levels?
ALPHA = 0.05  # significance level used for the decision below

# Keep only finite difficulty values. A single NaN/inf in any group makes
# f_oneway return NaN, and `NaN < ALPHA` is False, which would otherwise be
# reported as "not significantly different".
finite_rows = df.loc[np.isfinite(df['difficulty']), ['level', 'difficulty']]

# One list of difficulty values per level, ordered by level.
grouped_data = finite_rows.groupby('level')['difficulty'].apply(list)
if len(grouped_data) < 2:
    raise ValueError(
        "One-way ANOVA needs at least two levels with finite difficulty values."
    )
F, p = f_oneway(*grouped_data)

# Print results
print("F-value:", F)
print("p-value:", p)
if np.isnan(p):
    # e.g. every group is constant or a group is too small for the test
    print("The ANOVA result is undefined (NaN); inspect the difficulty values per level.")
elif p < ALPHA:
    print("The difficulty is significantly different across different game levels.")
else:
    print("The difficulty is not significantly different across different game levels.")

F-value: 11.455219773675706
p-value: 8.262214143225648e-27
The difficulty is significantly different across different game levels.


In this example, we first load the dataset with pandas and define a per-row difficulty score as `num_attempts / num_success`; rows with no successes, where this ratio is undefined, are set to 0. We then group the difficulty values by game level and pass the groups to the `f_oneway()` function from `scipy.stats`, which performs a one-way ANOVA: it tests the null hypothesis that the mean difficulty is the same for every level. Finally, we print the F-value and p-value and compare the p-value with 0.05 (our chosen significance level). A p-value below 0.05 means we reject the null hypothesis and conclude that mean difficulty differs for at least one level; the test does not tell us which levels differ.

