In [2]:
#Dependancies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
#Reading in the csv files

#Creates two variables containing the locations of each csv
csvpath1 = "csvFiles/googleplaystore.csv"
csvpath2 = "csvFiles/googleplaystore_user_reviews.csv"

#Translates the csv files into pandas dataframes
rawGoogleApps = pd.read_csv(csvpath1)
rawGoogleReviews = pd.read_csv(csvpath2)

In [4]:
#Cleaning the datasets

#Pulls every column from rawGoogleApps aside from update versions
googleApps = rawGoogleApps[["App", "Category", "Rating", "Reviews", "Size", "Installs", "Type", "Price",
                            "Content Rating", "Last Updated"]]

#Pulls only the App and Sentiment columns from rawGoogleReviews
googleReviews = rawGoogleReviews[["App", "Sentiment"]]

#Modifies column names
googleApps = googleApps.rename(columns={"Last Updated": "Updated",
                                       "Price": "Price in Dollars",
                                       "Size": "Kilobytes",
                                       "Content Rating": "Maturity"})

#Drops rows that are lacking information
googleReviews = googleReviews.dropna()
googleApps = googleApps.dropna()

#Changes the Category column for googleApps to make it more readable
googleApps["Category"] = googleApps["Category"].replace({"ART_AND_DESIGN": "Art and Design",
                                                        "AUTO_AND_VEHICLES": "Auto and Vehicles",
                                                        "BEAUTY": "Beauty",
                                                        "BOOKS_AND_REFERENCE": "Books and Reference",
                                                        "BUSINESS": "Business",
                                                        "COMICS": "Comics",
                                                        "COMMUNICATION": "Communication",
                                                        "DATING": "Dating",
                                                        "EDUCATION": "Education",
                                                        "ENTERTAINMENT": "Entertainment",
                                                        "EVENTS": "Events",
                                                        "FINANCE": "Finance",
                                                        "FOOD_AND_DRINK": "Food and Drink",
                                                        "HEALTH_AND_FITNESS": "Health and Fitness",
                                                        "HOUSE_AND_HOME": "House and Home",
                                                        "LIBRARIES_AND_DEMO": "Libraries and Demo",
                                                        "LIFESTYLE": "Lifestyle",
                                                        "GAME": "Game",
                                                        "FAMILY": "Family",
                                                        "MEDICAL": "Medical",
                                                        "SOCIAL": "Social",
                                                        "SHOPPING": "Shopping",
                                                        "PHOTOGRAPHY": "Photography",
                                                        "SPORTS": "Sports",
                                                        "TRAVEL_AND_LOCAL": "Travel and Local",
                                                        "TOOLS": "Tools",
                                                        "PERSONALIZATION": "Personalization",
                                                        "PRODUCTIVITY": "Productivity",
                                                        "PARENTING": "Parenting",
                                                        "WEATHER": "Weather",
                                                        "VIDEO_PLAYERS": "Video Players",
                                                        "NEWS_AND_MAGAZINES": "News and Magazines",
                                                        "MAPS_AND_NAVIGATION": "Maps and Navigation"})

#Drops rows with values which won't be applicable to our experiments
googleApps = googleApps[googleApps.Kilobytes != 'Varies with device']
googleApps = googleApps[googleApps.Maturity != 'Unrated']
googleApps = googleApps[googleApps.Maturity != 'Adults only 18+']

#Removes the +, $ and other symbols to allow the values to be calculated in the future
googleApps["Installs"] = googleApps["Installs"].str.replace('+', '', regex=True)
googleApps["Installs"] = googleApps["Installs"].str.replace(',', '', regex=True)
googleApps["Price in Dollars"] = googleApps["Price in Dollars"].str.replace('$', '', regex=True)
googleApps["Kilobytes"] = googleApps["Kilobytes"].str.replace('M', '000', regex=True)
googleApps["Kilobytes"] = googleApps["Kilobytes"].str.replace('.', '', regex=True)
googleApps["Kilobytes"] = googleApps["Kilobytes"].str.replace('k', '', regex=True)

#Sets data to a numeric value to allow values to be calculated in the future
googleApps["Installs"] = pd.to_numeric(googleApps["Installs"])
googleApps["Reviews"] = pd.to_numeric(googleApps["Reviews"])
googleApps["Price in Dollars"] = pd.to_numeric(googleApps["Price in Dollars"])
googleApps["Kilobytes"] = pd.to_numeric(googleApps["Kilobytes"])

#Drops rows with duplicate app names, since they would cause our data to become inaccurate
googleApps = googleApps.drop_duplicates(subset='App',keep='last')

#Displays the dataframe
googleApps.head()

Unnamed: 0,App,Category,Rating,Reviews,Kilobytes,Installs,Type,Price in Dollars,Maturity,Updated
0,Photo Editor & Candy Camera & Grid & ScrapBook,Art and Design,4.1,159,19000,10000,Free,0.0,Everyone,"January 7, 2018"
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",Art and Design,4.7,87510,87000,5000000,Free,0.0,Everyone,"August 1, 2018"
3,Sketch - Draw & Paint,Art and Design,4.5,215644,25000,50000000,Free,0.0,Teen,"June 8, 2018"
4,Pixel Draw - Number Art Coloring Book,Art and Design,4.3,967,28000,100000,Free,0.0,Everyone,"June 20, 2018"
5,Paper flowers instructions,Art and Design,4.4,167,56000,50000,Free,0.0,Everyone,"March 26, 2017"


In [29]:
#Installed maximum value row
maxInstalls=googleApps.loc[googleApps['Installs'].idxmax()]
maxInstalls

App                 Subway Surfers
Category                      Game
Rating                         4.5
Reviews                   27711703
Kilobytes                    76000
Installs                1000000000
Type                          Free
Price in Dollars                 0
Maturity              Everyone 10+
Updated              July 12, 2018
Name: 3896, dtype: object

In [30]:
#Installed minimum value row
minInstalls=googleApps.loc[googleApps['Installs'].idxmin()]
minInstalls

App                 KBA-EZ Health Guide
Category                        Medical
Rating                                5
Reviews                               4
Kilobytes                         25000
Installs                              1
Type                               Free
Price in Dollars                      0
Maturity                       Everyone
Updated                  August 2, 2018
Name: 2454, dtype: object

In [32]:
#Rating maximum value row
maxRating=googleApps.loc[googleApps['Rating'].idxmax()]
maxRating

App                 Hojiboy Tojiboyev Life Hacks
Category                                  Comics
Rating                                         5
Reviews                                       15
Kilobytes                                  37000
Installs                                    1000
Type                                        Free
Price in Dollars                               0
Maturity                                Everyone
Updated                            June 26, 2018
Name: 329, dtype: object

In [33]:
#Rating minimum value row
minRating=googleApps.loc[googleApps['Rating'].idxmin()]
minRating

App                 House party - live chat
Category                             Dating
Rating                                    1
Reviews                                   1
Kilobytes                             92000
Installs                                 10
Type                                   Free
Price in Dollars                          0
Maturity                         Mature 17+
Updated                       July 31, 2018
Name: 625, dtype: object

In [34]:
#Kilobyte size maximum
maxKb=googleApps.loc[googleApps['Kilobytes'].idxmax()]
maxKb

App                     Post Bank
Category                  Finance
Rating                        4.5
Reviews                     60449
Kilobytes                  100000
Installs                  1000000
Type                         Free
Price in Dollars                0
Maturity                 Everyone
Updated             July 23, 2018
Name: 1080, dtype: object

In [35]:
#Kilobyte size minimum
minKb=googleApps.loc[googleApps['Kilobytes'].idxmin()]
minKb

App                 Market Update Helper
Category              Libraries and Demo
Rating                               4.1
Reviews                            20145
Kilobytes                             11
Installs                         1000000
Type                                Free
Price in Dollars                       0
Maturity                        Everyone
Updated                February 12, 2013
Name: 1553, dtype: object

In [14]:
count = googleApps["Rating"].value_counts()
count.head(10)

4.4    723
4.3    717
4.5    692
4.2    673
4.6    562
4.1    536
4.0    448
4.7    388
3.9    312
5.0    266
Name: Rating, dtype: int64

In [15]:
googleApps.describe()

Unnamed: 0,Rating,Reviews,Kilobytes,Installs,Price in Dollars
count,7026.0,7026.0,7026.0,7026.0,7026.0
mean,4.160447,144955.6,39434.121406,4474559.0,1.17189
std,0.559142,1023960.0,25120.007158,27145000.0,18.198351
min,1.0,1.0,11.0,1.0,0.0
25%,4.0,84.0,20000.0,10000.0,0.0
50%,4.3,1545.5,33000.0,100000.0,0.0
75%,4.5,26583.25,56000.0,1000000.0,0.0
max,5.0,44881450.0,100000.0,1000000000.0,400.0


In [10]:
# Find the averages for each category
categoryAverage = googleApps.groupby(['Category'], as_index=False).mean()
categoryAverage.head(10)

Unnamed: 0,Category,Rating,Reviews,Kilobytes,Installs,Price in Dollars
0,Art and Design,4.363158,18915.140351,38789.473684,1723388.0,0.104737
1,Auto and Vehicles,4.147619,15750.571429,39669.857143,694758.7,0.0
2,Beauty,4.291892,5020.243243,37891.891892,362600.0,0.0
3,Books and Reference,4.322695,23142.453901,39328.510638,814072.0,0.148156
4,Business,4.094118,19013.466063,37175.493213,1725262.0,0.23267
5,Comics,4.158696,12810.586957,33218.543478,368067.4,0.0
6,Communication,4.07672,242971.079365,31360.444444,9623892.0,0.220794
7,Dating,3.963934,18009.909836,39377.04918,693380.4,0.122787
8,Education,4.357333,40277.893333,34654.426667,1296013.0,0.1064
9,Entertainment,4.143396,127862.811321,37075.471698,9065283.0,0.056415


In [18]:
# Find the averages for each category
categoryMin = googleApps.groupby(['Category'], as_index=False).min()
categoryMin.head(10)

Unnamed: 0,Category,App,Rating,Reviews,Kilobytes,Installs,Type,Price in Dollars,Maturity,Updated
0,Art and Design,350 Diy Room Decor Ideas,3.2,1,10000,100,Free,0.0,Everyone,"April 15, 2018"
1,Auto and Vehicles,AE Garage,2.1,2,201,100,Free,0.0,Everyone,"April 21, 2018"
2,Beauty,AI Face Beauty Analysis - IntelliFace (Free),3.1,1,12000,100,Free,0.0,Everyone,"April 11, 2018"
3,Books and Reference,"10,000 Quotes DB (Premium)",2.7,2,93,5,Free,0.0,Everyone,"April 10, 2016"
4,Business,"104 Looking for a job - looking for a job, loo...",1.0,1,23,5,Free,0.0,Everyone,"April 11, 2017"
5,Comics,- Free Comics - Comic Apps,2.8,5,444,100,Free,0.0,Everyone,"August 1, 2018"
6,Communication,/u/app,1.0,1,17,10,Free,0.0,Everyone,"April 1, 2016"
7,Dating,"2Date Dating App, Love and matching",1.0,1,10000,10,Free,0.0,Everyone,"April 12, 2017"
8,Education,ABC Preschool Free,3.5,11,526,1000,Free,0.0,Everyone,"April 29, 2015"
9,Entertainment,AMC Theatres,3.0,303,11000,10000,Free,0.0,Everyone,"April 28, 2016"


In [26]:
categoryMin['Installs'].min()

1

In [12]:
# Find the averages for each category
categoryMax = googleApps.groupby(['Category'], as_index=False).max()
categoryMax.head(10)

Unnamed: 0,Category,App,Rating,Reviews,Kilobytes,Installs,Type,Price in Dollars,Maturity,Updated
0,Art and Design,صور حرف H,5.0,224399,94000,50000000,Paid,1.99,Teen,"September 20, 2017"
1,Auto and Vehicles,m.ride - your motorcycle app,4.9,271920,97000,10000000,Free,0.0,Teen,"October 28, 2016"
2,Beauty,"ipsy: Makeup, Beauty, and Tips",4.9,49790,98000,5000000,Free,0.0,Teen,"October 18, 2017"
3,Books and Reference,日本AV历史,5.0,445756,98000,10000000,Paid,4.6,Teen,"September 29, 2017"
4,Business,sABN,5.0,1002859,99000,100000000,Paid,17.99,Teen,"September 6, 2017"
5,Comics,감성학원 BL 첫사랑,5.0,238970,91000,5000000,Free,0.0,Teen,"September 29, 2015"
6,Communication,pretty Easy privacy p≡p,5.0,17712922,99000,500000000,Paid,4.99,Teen,"September 6, 2016"
7,Dating,stranger chat - anonymous chat,5.0,285838,96000,10000000,Paid,7.99,Teen,"October 25, 2017"
8,Education,"play2prep: ACT, SAT prep",4.9,342918,97000,10000000,Paid,3.99,Teen,"September 28, 2017"
9,Entertainment,🔥 Football Wallpapers 4K | Full HD Backgrounds 😍,4.7,1828284,97000,100000000,Paid,2.99,Teen,"September 22, 2015"


In [22]:
# Installs1 = round(googleApps.loc[googleApps["Category"] == "Art and Design"]["Rating"].mean(), 2)
# Installs2 = round(googleApps.loc[googleApps["Category"] == "Auto and Vehicles"]["Rating"].min(), 2)
Installs2

2.1