# Data exploration

## Selecting the data

We start by uploading the data and all necessary libraries

In [1]:
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

app_data = pd.read_csv('googleplaystore.csv')

Function below takes in a data frame and returns percentage/number of missing values for each feature(column). It will be used later, to have a better understanding of the data

In [2]:
def missing_values(test):
    total = test.isnull().sum().sort_values(ascending=False)
    percent_1 = test.isnull().sum() / test.isnull().count() * 100
    percent_2 = (round(percent_1, 1)).sort_values(ascending=False)
    missing_data = pd.concat([total, percent_2], axis=1, keys=['Total', '%'])
    
    return missing_data.head((missing_data['Total'] >= 0).sum())

Display the amount of missing values per column and what percentage of the column contains missing values

In [3]:
missing_values(app_data)

Unnamed: 0,Total,%
Rating,1474,13.6
Current Ver,8,0.1
Android Ver,3,0.0
Content Rating,1,0.0
Type,1,0.0
Last Updated,0,0.0
Genres,0,0.0
Price,0,0.0
Installs,0,0.0
Size,0,0.0


Removing app titled "Life Made Wifi Touchscreen Photo Frame". Reason: many missing and unusable values


app_data = app_data.drop(10472)
Removing rows with missing rating 
I don't think I did anything with this line -Ilia

In [4]:
app_data = app_data[app_data.Rating.notnull()]

In [5]:
# Count unique values in App and see that there are duplicates
unique_apps = app_data["App"].nunique()
all_apps = app_data["App"].size

f"total app count: {all_apps}, unique app count: {unique_apps}"

'total app count: 9367, unique app count: 8197'

In [6]:
# Removing duplicate apps
app_data = app_data.drop_duplicates(subset = "App")

new_all_apps = app_data["App"].size
f"total app count after dropping duplicates: {new_all_apps}"

'total app count after dropping duplicates: 8197'

In [7]:
# Convert Installs to numeric

app_data["Installs"].dtype
app_data["Installs"].describe

# idk why the line below is giving an error -Ilia
# app_data_y["Installs"] =  pd.to_numeric((app_data_y["Installs"].str.replace("+", "")).str.replace(",", ""))



<bound method NDFrame.describe of 0            10,000+
1           500,000+
2         5,000,000+
3        50,000,000+
4           100,000+
            ...     
10834           500+
10836         5,000+
10837           100+
10839         1,000+
10840    10,000,000+
Name: Installs, Length: 8197, dtype: object>

In [8]:
#See all unique values in the Android version column
app_data["Android Ver"].unique()
# Note that most are num and up. Hence, we can split the column into 3 by space

#Take the first part of the android version
app_data["Android Ver"] = app_data["Android Ver"].str.split(expand = True)

In [12]:
# Select numeric columns that can be converted as is
num_cols = ["Reviews"]


In [13]:
app_data.columns

Index(['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type',
       'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver',
       'Android Ver'],
      dtype='object')

Now, we delete the features that are not normally available when the app is about to launch.

In [14]:
not_available_cols = ['Reviews', 'Installs', 'Last Updated', 'Current Ver']
app_data = app_data.drop(not_available_cols, axis = 1)

In [15]:
missing_values(app_data)

Unnamed: 0,Total,%
Android Ver,3,0.0
Content Rating,1,0.0
Genres,0,0.0
Price,0,0.0
Type,0,0.0
Size,0,0.0
Rating,0,0.0
Category,0,0.0
App,0,0.0


In [16]:
app_data[(app_data["Content Rating"].isnull() | app_data["Android Ver"].isnull())]

Unnamed: 0,App,Category,Rating,Size,Type,Price,Content Rating,Genres,Android Ver
4453,[substratum] Vacuum: P,PERSONALIZATION,4.4,11M,2,$1.49,Everyone,Personalization,
4490,Pi Dark [substratum],PERSONALIZATION,4.5,2.1M,1,0,Everyone,Personalization,
10472,Life Made WI-Fi Touchscreen Photo Frame,1.9,19.0,"1,000+",0,Everyone,,"February 11, 2018",


Since the number of missing values in the android ver column is small and it intersects with the single missing value in Content rating, we will delete those rows

In [17]:
inds = app_data[(app_data["Content Rating"].isnull() | app_data["Android Ver"].isnull())].index
#Since the only missing values are in Android version and Content rating, we can use dropna
app_data = app_data.dropna()
app_data.head(5)

Unnamed: 0,App,Category,Rating,Size,Type,Price,Content Rating,Genres,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,19M,1,0,Everyone,Art & Design,4.0.3
1,Coloring book moana,ART_AND_DESIGN,3.9,14M,1,0,Everyone,Art & Design;Pretend Play,4.0.3
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,8.7M,1,0,Everyone,Art & Design,4.0.3
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,25M,1,0,Teen,Art & Design,4.2
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,2.8M,1,0,Everyone,Art & Design;Creativity,4.4


Now, we can separate the data into x data and y data

In [19]:
# Create df for Rating
# the y df
app_data_y = app_data["Rating"]

# Create df for the Rest of the features
# the x df
columns_drop = ["Rating", "App"]
app_data_x = app_data.drop(columns_drop, axis = 1)

## Converting the data into workable shape

In [None]:
# There are 3 values in type column: free, paid and nan. There is only 1 nan so I looked at it and
# since the price is set to 0, I will put free for that cell

# Change price column to free = 0 and paid = 1
paid_types = app_data["Type"].unique()
paid_types
app_data[(app_data["Type"] == "0")]

# creating instance of labelencoder
labelencoder = LabelEncoder()
app_data['Type'] = labelencoder.fit_transform(app_data['Type'])

## Data visualisation