In [1]:
from datascience import *
import numpy as np
import math
import scipy.stats as stats
import pandas as pd
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

In [2]:
# Read Games.csv (path is relative to the notebook's working directory) into `Data`.
# `Table` is presumably the datascience Table made available by the star import.
Data = Table.read_table('Games.csv')

In [3]:
# Count rows per developer, order by that count (largest first), then keep only
# the developers whose name contains 'Supercell'.
(
    Data
    .group("Developer")
    .sort('count', descending=True)
    .where('Developer', are.containing('Supercell'))
)

Developer,count
Supercell,3


# Q1.
How much influence does the popularity of a game have on its user rating, and what causes the influence? (Chi-square test of independence)

To study this question, we use a chi-square test of independence. To do that, we first convert popularity into categorical data. We measure the popularity of a game by its user rating count, since the number of players and the number of ratings are positively related. The games are divided into 7 groups by the order of magnitude of their rating count (fewer than 10 ratings, at least 10, at least 100, and so on), and we then run the independence test on those groups.

In [4]:
# Bucket every game by the order of magnitude of its rating count, which is the
# popularity proxy used for this question.
data1 = Data.select('Average User Rating', 'User Rating Count').sort('User Rating Count', descending=True)

# log10 is undefined for zero or missing counts (it yields -inf / NaN, and casting
# those to an integer gives platform-dependent values), so clamp anything below 1
# to 1. Such rows then fall into the lowest bucket in a well-defined way.
rating_counts = np.asarray(data1.column('User Rating Count'), dtype=float)
safe_counts = np.where(rating_counts >= 1, rating_counts, 1.0)

# Truncating log10 gives the exponent k with 10^k <= count < 10^(k+1).
index = np.log10(safe_counts).astype('int16')

# Exponent 0 (fewer than 10 ratings) gets its own label; every other bucket is
# named after its exponent.
Class = [
    'More than 10^{} ratings'.format(exponent) if exponent > 0 else 'Less than 10 ratings'
    for exponent in index
]

# data11 keeps the raw counts next to the class label; data1 keeps only what the
# contingency table needs (rating and class).
data11 = data1.with_column('Class', np.array(Class))
data1 = data11.select('Average User Rating', 'Class')

In [5]:
# Move to pandas so crosstab can count games for every (popularity class, rating)
# pair; margins=True appends the "All" row and column of totals.
data1_df = data1.to_df()
contigencyTable = pd.crosstab(
    index=data1_df['Class'],
    columns=data1_df['Average User Rating'],
    margins=True,
)
contigencyTable

Average User Rating,1.0,1.5,2.0,2.5,3.0,3.5,4.0,4.5,5.0,All
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Less than 10 ratings,7,32,75,92,133,184,220,257,332,1332
More than 10^1 ratings,5,19,54,158,249,462,800,1021,368,3136
More than 10^2 ratings,0,3,11,34,78,180,443,779,114,1642
More than 10^3 ratings,0,1,1,15,21,41,159,499,70,807
More than 10^4 ratings,0,0,0,3,12,21,33,182,60,311
More than 10^5 ratings,0,0,0,0,0,5,6,19,7,37
More than 10^6 ratings,0,0,0,0,0,0,0,2,0,2
All,12,55,141,302,493,893,1661,2759,951,7267


In [6]:
# Raw count matrix, still including the "All" totals produced by margins=True.
observed_margin = contigencyTable.values

# Drop the final row and final column (the totals) to leave only observed counts.
n_rows, n_cols = observed_margin.shape
observed = observed_margin[:n_rows - 1, :n_cols - 1]
observed

array([[   7,   32,   75,   92,  133,  184,  220,  257,  332],
       [   5,   19,   54,  158,  249,  462,  800, 1021,  368],
       [   0,    3,   11,   34,   78,  180,  443,  779,  114],
       [   0,    1,    1,   15,   21,   41,  159,  499,   70],
       [   0,    0,    0,    3,   12,   21,   33,  182,   60],
       [   0,    0,    0,    0,    0,    5,    6,   19,    7],
       [   0,    0,    0,    0,    0,    0,    0,    2,    0]])

In [7]:
# Pearson chi-square test of independence on the class-by-rating counts, without
# Yates' continuity correction.
# NOTE(review): the observed table contains many zero cells in the sparsest
# classes, so the expected-frequency assumption is worth checking via `ex`.
independence_result = stats.chi2_contingency(observed, correction=False)
chi2, p, dof, ex = independence_result

report_template = 'test statistic {:.4},\ndegree of freedom {:},\np-value {:.4}'
print(report_template.format(chi2, dof, p))

test statistic 1.031e+03,
degree of freedom 48,
p-value 1.345e-184


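The p-value is far below any conventional significance level, so we reject the hypothesis that a game's rating is independent of its rating count. Note that several cells in the sparsest groups contain very few games (or none), so the chi-square approximation is only rough for those cells.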

In [8]:
# Mean of 'Average User Rating' within each popularity class. The aggregated
# column is labelled 'Average User Rating mean' in the resulting table.
data1_grouped = data1.group('Class',np.mean)
data1_grouped

Class,Average User Rating mean
Less than 10 ratings,3.88476
More than 10^1 ratings,3.99713
More than 10^2 ratings,4.1553
More than 10^3 ratings,4.31103
More than 10^4 ratings,4.39871
More than 10^5 ratings,4.37838
More than 10^6 ratings,4.5


In [9]:
# Sample standard deviation (ddof=1) of the ratings within each popularity class.
# Because the aggregator is a lambda, the result column keeps the plain label
# 'Average User Rating' rather than gaining a suffix.
data1.group('Class',lambda x: np.std(x,ddof=1))

Class,Average User Rating
Less than 10 ratings,0.998707
More than 10^1 ratings,0.741345
More than 10^2 ratings,0.569057
More than 10^3 ratings,0.487984
More than 10^4 ratings,0.508189
More than 10^5 ratings,0.462562
More than 10^6 ratings,0.0


In [10]:
# Compare the per-class mean rating, and the per-class spread of ratings, against
# the per-class mean rating count.
mean_counts = data11.group('Class', np.mean).column('User Rating Count mean')
class_means = data1_grouped.column('Average User Rating mean')
class_stds = data1.group('Class', lambda x: np.std(x, ddof=1)).column('Average User Rating')

# Each statistic gets its own 2 x n_classes table, so p1 reflects the means and
# p2 reflects the standard deviations.
observed_mean = np.array((class_means, mean_counts))
observed_std = np.array((class_stds, mean_counts))
# `observed1` is kept for any later cell that reads it; it holds the std table.
observed1 = observed_std

# NOTE(review): chi2_contingency is designed for tables of observed frequencies.
# These rows are summary statistics (means / standard deviations), so the
# p-values below are a rough heuristic at best. A rank correlation on the raw
# rows of data11 would be a sounder check - confirm with the analysis owner.
chi2, p1, dof, ex = stats.chi2_contingency(observed_mean, correction=False)
chi2, p2, dof, ex = stats.chi2_contingency(observed_std, correction=False)
print('p-values {}, {}'.format(p1, p2))
# A p-value is evidence, not proof, so the message avoids claiming proof.
print('This suggests that average rating and its std are related to the mean of rating counts')

p-values 0.0, 0.0
This proves that average rating and its std are related to the mean of rating counts


From the means and standard deviations we can see that, overall, more popular games have higher user ratings with a lower standard deviation. The result suggests that better games tend to be more popular, which makes perfect sense to us. Games that are rated by fewer people tend to receive more scattered ratings, which makes their ratings less reliable. This suggests that when we choose a game, we should not blindly trust 5-star games, but should take the rating count into consideration too.

It is worth mentioning that our method of determining popularity is not 100% fair, because some games use pop-ups to ask their players to rate the game in the store in exchange for free items or points. Also, the iOS App Store clears the rating data upon each update, so the measure is unfair to games that were updated recently.