In [315]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [316]:
data = pd.read_csv("googleplaystore.csv")

## 1. Display Top 5 rows of Data

In [317]:
data.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


## 2. Check the last 3 Rows of the dataset

In [318]:
data.tail(3)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
10838,Parkinson Exercices FR,MEDICAL,,3,9.5M,"1,000+",Free,0,Everyone,Medical,"January 20, 2017",1.0,2.2 and up
10839,The SCP Foundation DB fr nn5n,BOOKS_AND_REFERENCE,4.5,114,Varies with device,"1,000+",Free,0,Mature 17+,Books & Reference,"January 19, 2015",Varies with device,Varies with device
10840,iHoroscope - 2018 Daily Horoscope & Astrology,LIFESTYLE,4.5,398307,19M,"10,000,000+",Free,0,Everyone,Lifestyle,"July 25, 2018",Varies with device,Varies with device


## 3. Finding the shape of the dataset

In [319]:
data.shape

(10841, 13)

## 4. General info about the dataset

In [320]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10841 non-null  object 
 1   Category        10841 non-null  object 
 2   Rating          9367 non-null   float64
 3   Reviews         10841 non-null  object 
 4   Size            10841 non-null  object 
 5   Installs        10841 non-null  object 
 6   Type            10840 non-null  object 
 7   Price           10841 non-null  object 
 8   Content Rating  10840 non-null  object 
 9   Genres          10841 non-null  object 
 10  Last Updated    10841 non-null  object 
 11  Current Ver     10833 non-null  object 
 12  Android Ver     10838 non-null  object 
dtypes: float64(1), object(12)
memory usage: 1.1+ MB


## 5. Overall Statistics

In [321]:
data.describe()

Unnamed: 0,Rating
count,9367.0
mean,4.193338
std,0.537431
min,1.0
25%,4.0
50%,4.3
75%,4.5
max,19.0


In [322]:
data.describe(include='all')

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
count,10841,10841,9367.0,10841.0,10841,10841,10840,10841.0,10840,10841,10841,10833,10838
unique,9660,34,,6002.0,462,22,3,93.0,6,120,1378,2832,33
top,ROBLOX,FAMILY,,0.0,Varies with device,"1,000,000+",Free,0.0,Everyone,Tools,"August 3, 2018",Varies with device,4.1 and up
freq,9,1972,,596.0,1695,1579,10039,10040.0,8714,842,326,1459,2451
mean,,,4.193338,,,,,,,,,,
std,,,0.537431,,,,,,,,,,
min,,,1.0,,,,,,,,,,
25%,,,4.0,,,,,,,,,,
50%,,,4.3,,,,,,,,,,
75%,,,4.5,,,,,,,,,,


## 6. Total Number of App titles containing Astrology

In [323]:
sum(data['App'].str.contains("Astrology", case=False))

3

## 7. Average App rating 

In [324]:
data['Rating'].mean()

np.float64(4.193338315362443)

## 8. Total Number of Unique Category

In [325]:
data['Category'].nunique()

34

## 9. Which category getting the highest average rating?

In [326]:
data.groupby('Category')['Rating'].mean().sort_values(ascending=False)

Category
1.9                    19.000000
EVENTS                  4.435556
EDUCATION               4.389032
ART_AND_DESIGN          4.358065
BOOKS_AND_REFERENCE     4.346067
PERSONALIZATION         4.335987
PARENTING               4.300000
GAME                    4.286326
BEAUTY                  4.278571
HEALTH_AND_FITNESS      4.277104
SHOPPING                4.259664
SOCIAL                  4.255598
WEATHER                 4.244000
SPORTS                  4.223511
PRODUCTIVITY            4.211396
HOUSE_AND_HOME          4.197368
FAMILY                  4.192272
PHOTOGRAPHY             4.192114
AUTO_AND_VEHICLES       4.190411
MEDICAL                 4.189143
LIBRARIES_AND_DEMO      4.178462
FOOD_AND_DRINK          4.166972
COMMUNICATION           4.158537
COMICS                  4.155172
NEWS_AND_MAGAZINES      4.132189
FINANCE                 4.131889
ENTERTAINMENT           4.126174
BUSINESS                4.121452
TRAVEL_AND_LOCAL        4.109292
LIFESTYLE               4.094904
V

## 10. Total number of apps having 5 star rating 

In [327]:
(data['Rating'] == 5).sum()

np.int64(274)

## 11. Finding average value of reviews

In [328]:
data.columns

Index(['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type',
       'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver',
       'Android Ver'],
      dtype='object')

In [329]:
data['Reviews'].dtype

dtype('O')

In [330]:
data['Reviews'].astype(np.float64).mean()

ValueError: could not convert string to float: '3.0M'

In [331]:
data['Reviews'].replace('3.0M', '3.0',inplace=True)

In [332]:
data['Reviews'] = data['Reviews'].astype(np.float64)

In [333]:
data['Reviews'].mean()

np.float64(444111.9265750392)

## 12. Total number of free and paid apps

In [334]:
data.columns

Index(['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type',
       'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver',
       'Android Ver'],
      dtype='object')

In [335]:
data['Type'].unique

<bound method Series.unique of 0        Free
1        Free
2        Free
3        Free
4        Free
         ... 
10836    Free
10837    Free
10838    Free
10839    Free
10840    Free
Name: Type, Length: 10841, dtype: object>

In [336]:
data['Type'].value_counts()

Type
Free    10039
Paid      800
0           1
Name: count, dtype: int64

## 13. Which app has maximum reviews ?

In [337]:
data.columns

Index(['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type',
       'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver',
       'Android Ver'],
      dtype='object')

In [338]:
data[data['Reviews'] == data['Reviews'].max()][['App','Reviews']]

Unnamed: 0,App,Reviews
2544,Facebook,78158306.0


## 14. Top 5 Apps having the highest reviews

In [339]:
data.sort_values(by='Reviews', ascending=False)['App'].head()

2544              Facebook
3943              Facebook
381     WhatsApp Messenger
336     WhatsApp Messenger
3904    WhatsApp Messenger
Name: App, dtype: object

In [340]:
data.loc[data['Reviews'].sort_values(ascending=False).head().index,'App']

2544              Facebook
3943              Facebook
381     WhatsApp Messenger
336     WhatsApp Messenger
3904    WhatsApp Messenger
Name: App, dtype: object

## 15. Average Rating of Free and Paid Apps

In [341]:
data.groupby('Type')['Rating'].mean()

Type
0       19.000000
Free     4.186203
Paid     4.266615
Name: Rating, dtype: float64

## 16. Top 5 Apps having maximum installs

In [342]:
data.columns

Index(['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type',
       'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver',
       'Android Ver'],
      dtype='object')

In [343]:
data['Installs'].dtype

dtype('O')

In [344]:
data['Installs']

0            10,000+
1           500,000+
2         5,000,000+
3        50,000,000+
4           100,000+
            ...     
10836         5,000+
10837           100+
10838         1,000+
10839         1,000+
10840    10,000,000+
Name: Installs, Length: 10841, dtype: object

In [345]:
data['Installs'] = data['Installs'].apply(lambda x:x[:-1])

In [346]:
data['Installs'] = data['Installs'].str.replace(',','')
data['Installs'] = data['Installs'].str.replace('+','')

In [347]:
data['Installs'].value_counts()

Installs
1000000       1579
10000000      1252
100000        1169
10000         1054
1000           907
5000000        752
100            719
500000         539
50000          479
5000           477
100000000      409
10             386
500            330
50000000       289
50             205
5               82
500000000       72
1               67
1000000000      58
0               14
                 1
Fre              1
Name: count, dtype: int64

In [352]:
data.drop(data[data['Installs'] == ''].index, inplace=True)
data.drop(data[data['Installs'] == 'Fre'].index, inplace=True)

In [353]:
data['Installs'].value_counts()

Installs
1000000       1579
10000000      1252
100000        1169
10000         1054
1000           907
5000000        752
100            719
500000         539
50000          479
5000           477
100000000      409
10             386
500            330
50000000       289
50             205
5               82
500000000       72
1               67
1000000000      58
0               14
Name: count, dtype: int64

In [354]:
data['Installs'] = data['Installs'].astype('int')

In [356]:
data.sort_values(by='Installs', ascending=False).head()['App']

5856    Google Play Games
5395        Google Photos
2853        Google Photos
2884        Google Photos
4170         Google Drive
Name: App, dtype: object

In [358]:
data.loc[data['Installs'].sort_values(ascending=False).head().index, 'App']

5856    Google Play Games
5395        Google Photos
2853        Google Photos
2884        Google Photos
4170         Google Drive
Name: App, dtype: object