# Load data and copy to df_changes

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Read the raw CSV export; the path is relative to the notebook's working directory.
df_original = pd.read_csv('googleplaystore.csv')

In [3]:
# Peek at the first two rows to see which columns were loaded.
df_original.head(2)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up


In [4]:
# Inspect the inferred dtypes; columns reported as object still hold text.
df_original.dtypes

App                object
Category           object
Rating            float64
Reviews            object
Size               object
Installs           object
Type               object
Price              object
Content Rating     object
Genres             object
Last Updated       object
Current Ver        object
Android Ver        object
dtype: object

In [5]:
# Work on an independent deep copy so df_original keeps the data exactly as loaded.
df_changes = df_original.copy(deep=True)

## Clean data 

In [6]:
# Boolean mask of rows whose Reviews text contains a literal 'M'.
filtro = df_changes['Reviews'].str.contains('M', regex=False)

In [7]:
# Preview the numeric value for the rows flagged by `filtro`:
# the text before 'M' is scaled by one million.
df_changes.loc[filtro, 'Reviews'].apply(lambda raw: float(raw.split('M')[0]) * 10**6)

10472    3000000.0
Name: Reviews, dtype: float64

In [8]:
def change_M(splited_data):
    """Turn the pieces of a value split on ``'M'`` into a number.

    Parameters
    ----------
    splited_data : list of str
        Result of ``text.split('M')``. One piece means the text had no
        ``'M'``; two pieces mean it had exactly one.

    Returns
    -------
    float or None
        The first piece parsed as a float, multiplied by 10**6 when there
        are two pieces. ``None`` for any other number of pieces.

    Raises
    ------
    ValueError
        If the first piece cannot be parsed by ``float``.
    """
    value = float(splited_data[0])
    # Map the number of pieces to the scale factor; anything else has no factor.
    scale = {1: 1, 2: 10 ** 6}.get(len(splited_data))
    return None if scale is None else value * scale

In [9]:
# Convert every review count to a float, expanding an 'M' suffix to millions.
# Note: this expects Reviews to still hold strings, so the cell cannot be re-run
# once the column has been converted.
df_changes['Reviews'] = df_changes['Reviews'].map(lambda raw: change_M(raw.split('M')))

In [10]:
def installs_float(num):
    """Parse an install-count label such as ``'10,000+'`` into an ``int``.

    Parameters
    ----------
    num : str
        Install label. The literal ``'Free'`` is mapped to 0.

    Returns
    -------
    int
        The digits before the first ``'+'`` with thousands separators
        removed, or 0 for ``'Free'``.
    """
    if num != 'Free':
        # Keep what precedes the first '+', then drop the comma separators.
        digits = num.partition('+')[0].replace(',', '')
        return int(digits)
    return 0

In [11]:
# Replace the install labels with their integer counts.
df_changes['Installs'] = df_changes['Installs'].map(installs_float)

In [12]:
# Rows whose Type is the string '0' are relabelled as 'Free'; everything else is kept.
df_changes['Type'] = df_changes['Type'].mask(df_changes['Type'] == '0', 'Free')

In [13]:
# Store Type as a categorical column.
df_changes['Type']=df_changes['Type'].astype('category')

In [14]:
def price_to_float(num):
    """Parse a price label such as ``'$4.99'`` into a ``float``.

    Parameters
    ----------
    num : str
        Price label. ``'0'`` and ``'Everyone'`` are both mapped to 0.0.

    Returns
    -------
    float
        The text between the first and second ``'$'`` (or the end) as a float.

    Raises
    ------
    IndexError
        If the label is not one of the special values and has no ``'$'``.
    """
    if num in ('Everyone', '0'):
        return 0.0
    pieces = num.split('$')
    # pieces[0] is whatever precedes the '$'; the amount is the piece after it.
    return float(pieces[1])

In [15]:
# Replace the price labels with numeric amounts.
df_changes['Price'] = df_changes['Price'].map(price_to_float)

In [16]:
# Replace missing ratings with 0.
# NOTE(review): these zeros are indistinguishable from a real rating of 0 and are
# counted as their own bucket when ratings are grouped later; confirm this is intended.
df_changes['Rating']=df_changes['Rating'].fillna(0)

In [17]:
# Store Category as a categorical column.
df_changes['Category']=df_changes['Category'].astype('category')

In [18]:
# Check the column dtypes after the conversions.
df_changes.dtypes

App                 object
Category          category
Rating             float64
Reviews            float64
Size                object
Installs             int64
Type              category
Price              float64
Content Rating      object
Genres              object
Last Updated        object
Current Ver         object
Android Ver         object
dtype: object

## Most popular apps

In [19]:
# Order all rows by install count (ascending) and renumber them.
df_changes.sort_values('Installs', inplace=True)
df_changes.reset_index(drop=True, inplace=True)

# Keep one row per app name: the first one in the current order.
# NOTE(review): with an ascending sort, "first" is the row with the fewest
# installs for a duplicated app; confirm that is the row that should be kept.
df_changes_no_dup = df_changes[~df_changes.duplicated(subset='App', keep='first')].copy()

In [20]:
# List the column names of the de-duplicated frame.
df_changes_no_dup.columns

Index(['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type',
       'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver',
       'Android Ver'],
      dtype='object')

In [21]:
# Select the apps whose install count is exactly one billion.
df_most_popular = df_changes_no_dup.loc[df_changes_no_dup['Installs'].eq(10**9)].copy()

In [22]:
# Order the selected apps by review count (ascending) and renumber the rows.
df_most_popular = df_most_popular.sort_values('Reviews').reset_index(drop=True)

In [23]:
# Write the selected apps to CSV, omitting the row index.
df_most_popular.to_csv('most_installed_apps.csv',index=False)

## Most popular category

In [24]:
# Renumber the de-duplicated rows from 0, discarding the old index.
df_changes_no_dup = df_changes_no_dup.reset_index(drop=True)

In [25]:
# Total installs per category.
# Category is a categorical column, so `observed` is passed explicitly: pandas has
# deprecated relying on its default for categorical groupers, and stating
# observed=False keeps the result (every declared category gets a row) stable
# across pandas versions.
df_categories = df_changes_no_dup.groupby(['Category'], observed=False)['Installs'].sum()

In [26]:
# Wrap the per-category result in a DataFrame.
df_categories = pd.DataFrame(df_categories)

In [27]:
# Order categories by total installs (ascending), then move the index into a regular column.
df_categories = df_categories.sort_values(by='Installs').reset_index()

In [28]:
# Write the per-category install totals to CSV, omitting the row index.
df_categories.to_csv('categories.csv', index=False)

## Analysis by Type

In [29]:
# Build a frame ordered by Type in which rows with a missing Type count as 'Free'.
# NOTE(review): the frame is sorted by Type both before and after the fill; the
# first sort looks redundant. It is kept here so the row order is unchanged.
df_by_cost = df_changes_no_dup.sort_values('Type')
df_by_cost = df_by_cost.assign(Type=df_by_cost['Type'].fillna('Free'))
df_by_cost = df_by_cost.sort_values('Type').reset_index(drop=True)
df_by_cost

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Google Play Games,FAMILY,4.3,7168735.0,Varies with device,1000000000,Free,0.00,Teen,Entertainment,"July 16, 2018",Varies with device,Varies with device
1,DS Vision,BUSINESS,0.0,0.0,38M,5,Free,0.00,Everyone,Business,"May 21, 2018",1.2,4.1 and up
2,Skype - free IM & video calls,COMMUNICATION,4.1,10484169.0,Varies with device,1000000000,Free,0.00,Everyone,Communication,"August 3, 2018",Varies with device,Varies with device
3,Test Application DT 02,ART_AND_DESIGN,0.0,0.0,1.2M,0,Free,0.00,Everyone,Art & Design,"March 14, 2017",4.0,4.2 and up
4,Instagram,SOCIAL,4.5,66509917.0,Varies with device,1000000000,Free,0.00,Teen,Social,"July 31, 2018",Varies with device,Varies with device
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9655,Eu Sou Rico,FINANCE,0.0,0.0,1.4M,0,Paid,394.99,Everyone,Finance,"July 11, 2018",1.0,4.0.3 and up
9656,Eu sou Rico,FINANCE,0.0,0.0,2.6M,0,Paid,30.99,Everyone,Finance,"January 9, 2018",1.0,4.0 and up
9657,Popsicle Launcher for Android P 9.0 launcher,PERSONALIZATION,0.0,0.0,5.5M,0,Paid,1.49,Everyone,Personalization,"July 11, 2018",1.1,4.2 and up
9658,Visualmed,MEDICAL,0.0,0.0,3.1M,1,Paid,2.99,Everyone,Medical,"August 1, 2018",1.0,4.1 and up


In [30]:
# Split into free and paid apps; each part is an independent copy numbered from 0.
df_free = df_by_cost.loc[df_by_cost['Type'].eq('Free')].copy().reset_index(drop=True)
df_paid = df_by_cost.loc[df_by_cost['Type'].eq('Paid')].copy().reset_index(drop=True)

In [31]:
# Ratings above 10 are divided by 10; all other ratings are left as they are.
# NOTE(review): presumably this repairs a mis-scaled value from a malformed row; confirm.
df_free['Rating'] = df_free['Rating'].mask(df_free['Rating'] > 10, df_free['Rating'] / 10)

In [32]:
# Number of free apps at each distinct rating value, as a two-column table (Rating, count).
df_free_ratings = df_free.groupby('Rating').size().reset_index(name='count')

In [33]:
# Number of paid apps at each distinct rating value, as a two-column table (Rating, count).
df_paid_ratings = df_paid.groupby('Rating').size().reset_index(name='count')

In [34]:
# Show the columns of both rating-count tables side by side.
df_free_ratings.columns,df_paid_ratings.columns

(Index(['Rating', 'count'], dtype='object'),
 Index(['Rating', 'count'], dtype='object'))

In [35]:
def porcentaje(df_ratings):
    """Return each row's share of the total of the ``count`` column.

    Despite the name, the values are fractions between 0 and 1 (for example
    0.25), not 0-100 percentages.

    Parameters
    ----------
    df_ratings : pandas.DataFrame
        Table with a numeric ``count`` column.

    Returns
    -------
    pandas.Series
        ``count / count.sum()``, with the same index as ``df_ratings``.
    """
    counts = df_ratings['count']
    # A single vectorised division: no per-row Python call and no
    # multiply-by-100 / divide-by-100 round trip that adds rounding error.
    return counts / counts.sum()

In [36]:
# Attach to each rating-count table the share of its own total that every rating represents.
df_free_ratings = df_free_ratings.assign(percentage=porcentaje(df_free_ratings))
df_paid_ratings = df_paid_ratings.assign(percentage=porcentaje(df_paid_ratings))

In [37]:
# Write the free-app rating table to CSV, omitting the row index.
df_free_ratings.to_csv('free_ratings.csv',index=False)

In [38]:
# Write the paid-app rating table to CSV, omitting the row index.
df_paid_ratings.to_csv('paid_ratings.csv',index=False)