In [1]:
#Dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
#Loading the csv files

#Locations of the two csv files, relative to the notebook
csvpath1, csvpath2 = (
    "csvFiles/googleplaystore.csv",
    "csvFiles/googleplaystore_user_reviews.csv",
)

#Parses each csv into its own pandas dataframe (apps first, then user reviews)
rawGoogleApps, rawGoogleReviews = (pd.read_csv(csvLocation)
                                   for csvLocation in (csvpath1, csvpath2))

In [3]:
#Cleaning the datasets

#Columns kept from rawGoogleApps (everything aside from update versions)
appColumns = ["App", "Category", "Rating", "Reviews", "Size", "Installs", "Type", "Price",
              "Content Rating", "Genres", "Last Updated"]

#New names for three of the kept columns
columnRenames = {"Last Updated": "Updated",
                 "Price": "Price in Dollars",
                 "Size": "Kilobytes"}

#Selects the wanted app columns, renames them, then discards every row with a missing value
googleApps = rawGoogleApps[appColumns].rename(columns=columnRenames).dropna()

#Keeps only the App and Sentiment columns of the reviews and discards incomplete rows
googleReviews = rawGoogleReviews[["App", "Sentiment"]].dropna()

#Readable label for each raw Category code (listed alphabetically by code)
categoryLabels = {
    "ART_AND_DESIGN": "Art and Design",
    "AUTO_AND_VEHICLES": "Auto and Vehicles",
    "BEAUTY": "Beauty",
    "BOOKS_AND_REFERENCE": "Books and Reference",
    "BUSINESS": "Business",
    "COMICS": "Comics",
    "COMMUNICATION": "Communication",
    "DATING": "Dating",
    "EDUCATION": "Education",
    "ENTERTAINMENT": "Entertainment",
    "EVENTS": "Events",
    "FAMILY": "Family",
    "FINANCE": "Finance",
    "FOOD_AND_DRINK": "Food and Drink",
    "GAME": "Game",
    "HEALTH_AND_FITNESS": "Health and Fitness",
    "HOUSE_AND_HOME": "House and Home",
    "LIBRARIES_AND_DEMO": "Libraries and Demo",
    "LIFESTYLE": "Lifestyle",
    "MAPS_AND_NAVIGATION": "Maps and Navigation",
    "MEDICAL": "Medical",
    "NEWS_AND_MAGAZINES": "News and Magazines",
    "PARENTING": "Parenting",
    "PERSONALIZATION": "Personalization",
    "PHOTOGRAPHY": "Photography",
    "PRODUCTIVITY": "Productivity",
    "SHOPPING": "Shopping",
    "SOCIAL": "Social",
    "SPORTS": "Sports",
    "TOOLS": "Tools",
    "TRAVEL_AND_LOCAL": "Travel and Local",
    "VIDEO_PLAYERS": "Video Players",
    "WEATHER": "Weather",
}

#Swaps each raw code in the Category column for its readable label;
#values that are not in the mapping are left exactly as they were
readableCategories = googleApps["Category"].replace(categoryLabels)
googleApps["Category"] = readableCategories

#Drops rows where the size varies with device, as they won't be applicable to our experiments.
#.copy() makes the result an independent dataframe, so the column assignments below
#do not write into a slice of the previous frame (avoids SettingWithCopyWarning).
googleApps = googleApps[googleApps["Kilobytes"] != 'Varies with device'].copy()

#Removes the + and , from Installs and the $ from Price so the values can be parsed as numbers.
#'+' and '$' are regex metacharacters: as bare patterns with regex=True, '+' is an invalid
#expression and '$' only matches the end of the string. They are therefore matched inside a
#character class or as a literal (regex=False).
googleApps["Installs"] = googleApps["Installs"].str.replace(r'[+,]', '', regex=True)
googleApps["Price in Dollars"] = googleApps["Price in Dollars"].str.replace('$', '', regex=False)

#Converts the size text into a number of kilobytes.
#The trailing unit letter picks the multiplier (k = 1, M = 1000, matching the original
#"M becomes 000" rule); a value without a recognised unit letter is taken as kilobytes.
#The numeric part is parsed with its decimal point intact, so "1.5M" becomes 1500
#(deleting the '.' after appending "000" turned it into 15000) and "8.5k" stays 8.5.
#Because sizes may be fractional, the resulting column can be floating point.
kilobytesPerUnit = {"k": 1, "M": 1000}
sizeText = googleApps["Kilobytes"].str.strip()
sizeMultiplier = sizeText.str[-1].map(kilobytesPerUnit).fillna(1)
sizeNumber = pd.to_numeric(sizeText.str.rstrip("kM"))  #still raises on unparseable text
googleApps["Kilobytes"] = sizeNumber * sizeMultiplier

#Sets the remaining text columns to numeric values so they can be calculated with in the future
googleApps["Installs"] = pd.to_numeric(googleApps["Installs"])
googleApps["Reviews"] = pd.to_numeric(googleApps["Reviews"])
googleApps["Price in Dollars"] = pd.to_numeric(googleApps["Price in Dollars"])

#Keeps one row per app name (the last occurrence), since repeated apps would make our data inaccurate
isRepeatedApp = googleApps.duplicated(subset="App", keep="last")
googleApps = googleApps[~isRepeatedApp]