In [1]:
# Importing the libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Importing the dataset

data = pd.read_csv('googleplaystore.csv')
data.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [3]:
data.columns

Index(['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type',
       'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver',
       'Android Ver'],
      dtype='object')

In [4]:
data['Reviews'] = pd.to_numeric(data['Reviews'], errors='coerce').fillna(0).astype(np.int64)
data['Price'] = pd.to_numeric(data['Price'], errors='coerce').fillna(0).astype(np.int64)
print(data.dtypes)

App                object
Category           object
Rating            float64
Reviews             int64
Size               object
Installs           object
Type               object
Price               int64
Content Rating     object
Genres             object
Last Updated       object
Current Ver        object
Android Ver        object
dtype: object


In [5]:
data['Last Updated'] = pd.to_datetime(data['Last Updated'], errors='coerce', infer_datetime_format=True)
data.dtypes

App                       object
Category                  object
Rating                   float64
Reviews                    int64
Size                      object
Installs                  object
Type                      object
Price                      int64
Content Rating            object
Genres                    object
Last Updated      datetime64[ns]
Current Ver               object
Android Ver               object
dtype: object

In [6]:
def datetimecolumns(data):
    data['day'] = data['Last Updated'].dt.day
    data['month'] = data['Last Updated'].dt.month
    data['year'] = data['Last Updated'].dt.year
    return data

data = datetimecolumns(data)
data_updated = data.drop(columns='Last Updated')
data_updated.head(5)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Current Ver,Android Ver,day,month,year
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,1.0.0,4.0.3 and up,7.0,1.0,2018.0
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,2.0.0,4.0.3 and up,15.0,1.0,2018.0
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,1.2.4,4.0.3 and up,1.0,8.0,2018.0
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,Varies with device,4.2 and up,8.0,6.0,2018.0
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,1.1,4.4 and up,20.0,6.0,2018.0


In [8]:
# Fixing the code to handle commas before converting to int
data_updated['Installs'] = data_updated['Installs'].str.replace('+', '')
data_updated['Installs'] = data_updated['Installs'].str.replace('Free', '0')
data_updated['Installs'] = data_updated['Installs'].str.replace(',', '')

data_updated['Installs'] = pd.to_numeric(data_updated['Installs'])
data_updated['Installs'].head()

0       10000
1      500000
2     5000000
3    50000000
4      100000
Name: Installs, dtype: int64

In [10]:
#Profile of data

data_updated.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10841 non-null  object 
 1   Category        10841 non-null  object 
 2   Rating          9367 non-null   float64
 3   Reviews         10841 non-null  int64  
 4   Size            10841 non-null  object 
 5   Installs        10841 non-null  int64  
 6   Type            10840 non-null  object 
 7   Price           10841 non-null  int64  
 8   Content Rating  10840 non-null  object 
 9   Genres          10841 non-null  object 
 10  Current Ver     10833 non-null  object 
 11  Android Ver     10838 non-null  object 
 12  day             10840 non-null  float64
 13  month           10840 non-null  float64
 14  year            10840 non-null  float64
dtypes: float64(4), int64(3), object(8)
memory usage: 1.2+ MB


In [11]:
# Null Values

data_updated.isnull().sum()

App                  0
Category             0
Rating            1474
Reviews              0
Size                 0
Installs             0
Type                 1
Price                0
Content Rating       1
Genres               0
Current Ver          8
Android Ver          3
day                  1
month                1
year                 1
dtype: int64

In [12]:
features_with_na = [features for features in data_updated.columns if data_updated[features].isnull().sum()>1]
for feature in features_with_na:
    print(f"{feature} : {np.round(data_updated[feature].isnull().mean()*100,4)} % missing values")

Rating : 13.5965 % missing values
Current Ver : 0.0738 % missing values
Android Ver : 0.0277 % missing values


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=730c9577-57d6-46b8-a5c2-8259fd2fe2f5' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>