In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# import plotly

# sns.set_style('darkgrid')

%matplotlib inline

# Make some assumptions

## about sources of data
If data comes from authorized sources or API it would be well formated but, \
If data is collected using scraping techniques, we need to cross-check check dtype or format of each features in the dataset.

## about size of data
I mean shape of the dataset.\
decide whether the dataset is near to population data or sample data.

For Example:\
https://www.google.com/search?q=total+apps+on+google+play+store answered 2.7 million apps but,\
I have only 0.4% data of the total apps, so dataset I have is near to the sample data.

In [2]:
# read the dataset
data_dir = '../datasets/googleplaystore'
data = pd.read_csv(f'{data_dir}/1_raw_data.csv')
data

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10836,Sya9a Maroc - FR,FAMILY,4.5,38,53M,"5,000+",Free,0,Everyone,Education,"July 25, 2017",1.48,4.1 and up
10837,Fr. Mike Schmitz Audio Teachings,FAMILY,5.0,4,3.6M,100+,Free,0,Everyone,Education,"July 6, 2018",1.0,4.1 and up
10838,Parkinson Exercices FR,MEDICAL,,3,9.5M,"1,000+",Free,0,Everyone,Medical,"January 20, 2017",1.0,2.2 and up
10839,The SCP Foundation DB fr nn5n,BOOKS_AND_REFERENCE,4.5,114,Varies with device,"1,000+",Free,0,Mature 17+,Books & Reference,"January 19, 2015",Varies with device,Varies with device


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10841 non-null  object 
 1   Category        10841 non-null  object 
 2   Rating          9367 non-null   float64
 3   Reviews         10841 non-null  object 
 4   Size            10841 non-null  object 
 5   Installs        10841 non-null  object 
 6   Type            10840 non-null  object 
 7   Price           10841 non-null  object 
 8   Content Rating  10840 non-null  object 
 9   Genres          10841 non-null  object 
 10  Last Updated    10841 non-null  object 
 11  Current Ver     10833 non-null  object 
 12  Android Ver     10838 non-null  object 
dtypes: float64(1), object(12)
memory usage: 1.1+ MB


In [4]:
inside = pd.DataFrame(data={
    'null_count':data.isnull().sum(),
    'null_count_%':data.isnull().sum()/100,
    'dtypes': data.dtypes,
    'observed_dtypes': {
        'App': 'object',
        'Category': 'object',
        'Rating': 'ordinal',
        'Reviews': 'int',
        'Size': 'float',
        'Installs': 'float',
        'Type': 'bool',
        'Price': 'float',
        'Content Rating': 'object',
        'Genres': 'object',
        'Last Updated': 'datetime',
        'Current Ver': 'ordinal',
        'Android Ver': 'ordinal',
    }
})




In [5]:
data.duplicated().sum()

483

In [6]:
inside['need_to_change_dtypes'] = ~(inside['dtypes']==inside['observed_dtypes'])
inside

Unnamed: 0,null_count,null_count_%,dtypes,observed_dtypes,need_to_change_dtypes
App,0,0.0,object,object,False
Category,0,0.0,object,object,False
Rating,1474,14.74,float64,ordinal,True
Reviews,0,0.0,object,int,True
Size,0,0.0,object,float,True
Installs,0,0.0,object,float,True
Type,1,0.01,object,bool,True
Price,0,0.0,object,float,True
Content Rating,1,0.01,object,object,False
Genres,0,0.0,object,object,False


## Observation

**App**: Application name \
**Category**: Category the app belongs to \
**Rating**: Overall user rating of the app (as when scraped) \
**Reviews**: Number of user reviews for the app (as when scraped) \
**Size**: Size of the app (as when scraped) \
**Installs**: Number of user downloads/installs for the app (as when scraped) \
**Type**: Paid or Free \
**Price**: Price of the app (as when scraped) \
**Content Rating**: Age group the app is targeted at - Children / Mature 21+ / Adult \
**Genres**: An app can belong to multiple genres (apart from its main category). For eg, a musical family game will belong to Music, Game, Family genres \
**Last Updated**: Date when the app was last updated on Play Store (as when scraped) \
**Current Ver**: Current version of the app available on Play Store (as when scraped) \
**Android Ver**: Min required Android version (as when scraped) 


- Dataset has (10841 rows × 13 columns) which means it's sample data with repect to 2.7 million apps data


- Features need to be format
  - Rating, Reviews, Size, Installs, Type, Price, Last Updated, Current Ver, Android Ver
  
  
- Drop 483 duplicated rows

# Exploratory Data Analysis (EDA)

In [7]:
df = data.drop(data[data.duplicated()].index)

In [8]:
df

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10836,Sya9a Maroc - FR,FAMILY,4.5,38,53M,"5,000+",Free,0,Everyone,Education,"July 25, 2017",1.48,4.1 and up
10837,Fr. Mike Schmitz Audio Teachings,FAMILY,5.0,4,3.6M,100+,Free,0,Everyone,Education,"July 6, 2018",1.0,4.1 and up
10838,Parkinson Exercices FR,MEDICAL,,3,9.5M,"1,000+",Free,0,Everyone,Medical,"January 20, 2017",1.0,2.2 and up
10839,The SCP Foundation DB fr nn5n,BOOKS_AND_REFERENCE,4.5,114,Varies with device,"1,000+",Free,0,Mature 17+,Books & Reference,"January 19, 2015",Varies with device,Varies with device


In [9]:
df.rename(dict(zip(df.columns,df.columns.str.replace(' ', '_').str.lower())), axis=1, inplace=True)

In [10]:
df.to_csv(f'{data_dir}/2_unique_data.csv') #, index_label='index')