In [1]:
#importing libraries 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# We decided to configure out settings exactly like we did for A2

# Configure libraries
# The seaborn library makes plots look nicer
sns.set()
sns.set_context('talk')

# Don't display too many rows/cols of DataFrames
pd.options.display.max_rows = 7
pd.options.display.max_columns = 8

# Round decimals when displaying DataFrames
pd.set_option('precision', 2)



We have two main files in our data set: googleplaystore.csv and googleplaystore_user_reviews.csv

For analysis of app names, we can focus on just the first data set.



In [3]:
df = pd.read_csv("gpsa/googleplaystore.csv")

In [4]:
#show the headers of the different columns
print(list(df.columns.values))

['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


In [5]:
#We only need the app names, ratings, reviews (number of) , and rating. 
df = df[['App', 'Rating', 'Reviews']]

In [6]:
#preview the dataframe
df.head

<bound method NDFrame.head of                                                      App  Rating Reviews
0         Photo Editor & Candy Camera & Grid & ScrapBook     4.1     159
1                                    Coloring book moana     3.9     967
2      U Launcher Lite – FREE Live Cool Themes, Hide ...     4.7   87510
...                                                  ...     ...     ...
10838                             Parkinson Exercices FR     NaN       3
10839                      The SCP Foundation DB fr nn5n     4.5     114
10840      iHoroscope - 2018 Daily Horoscope & Astrology     4.5  398307

[10841 rows x 3 columns]>

In [7]:
#It turns out that the reviews (number of) column is a column of strings
#Every time the number of reviews is in the millions, it is denoted with 'M'
#We need to fix all occurences of this to properly typecast

#We replace the instances of 'M' in the original column with E6
df['Reviews'] = df['Reviews'].replace({'M': 'E6'}, regex=True)

In [8]:
#The E6 allows us to convert the column of strings into a column of floats since
        #E6 is evaluated as *1000000
    
df['Reviews'] = df['Reviews'].astype(float)

In [9]:
#We want everything to be within 2 standard deviations of the mean to prevent outliers from
        #skewing our analysis

#store the mean
m = df['Reviews'].mean()

In [10]:
#store the standard deviation
s = df['Reviews'].std()

In [26]:
#We drop everything outside 2 std

((df['Reviews'] < 50)).sum()/((df['Reviews'] > -1)).sum()

0.27573424579412603

10521

0.3200266134397871

33.0