In [474]:
## Goal: Understand app on Google Play market

In [475]:
import pandas as pd
import sys

In [476]:
file_path = sys.path[0] + "/../data/"

In [477]:
app_df = pd.read_csv(file_path + "apps.csv", index_col = 0)
app_df.head() 

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


App: The name of app
Category: The category of app
Rating: The average rating of app
Reviews: A number of reviews about app
Size: A capacity of app
Installs:  A number of app users
Type: Free or Paid App
Price: The price of app
Content Rating: Target user
Genres: Genres of app
Last Updated: Recored update time
Current Ver: Current version of app
Android Ver: Version of android that app can be compatible

All of them are Categorical data (For each numerical data, I will divide it into many ranges as categories)

In [478]:
app_df.shape
## So we have 9659 rows & 13 features

(9659, 13)

In [479]:
## Checking null values
app_df.isna().sum()

App                  0
Category             0
Rating            1463
Reviews              0
Size              1227
Installs             0
Type                 1
Price                0
Content Rating       0
Genres               0
Last Updated         0
Current Ver          8
Android Ver          2
dtype: int64

In [480]:
## We see null values in Rating, Size, Type, Current Ver & Android Ver. So we will solve them. Only Current Ver is a minor feature
## so I will ignore it
app_df = app_df[~app_df["Rating"].isna() & ~app_df["Size"].isna() & ~app_df["Android Ver"].isna()].drop("Current Ver", axis = 1)
app_df.shape

(7025, 12)

In [481]:
## Only app we need to check duplicate
app_df[app_df["App"].duplicated() == False]
app_df.shape
## So App doesn't have any duplicate value

(7025, 12)

In [482]:
app_df["Category"].nunique()
## We have 33 app categories

33

In [499]:
## I check that Genres based on Category but they don't match each other so I will solve it
def custom_format(text):
    return " ".join([word.title() for word in text.split()])

In [508]:
type("a".join([i for i in ["b", "c"]]))


str

In [503]:
app_df["Category"] = app_df["Category"].astype("string") ## That's why object type gets on my nerves when working with data
app_df["Category"] = app_df["Category"].str.replace("_", " ", regex = True)
app_df["Category"] = app_df["Category"].str.replace("AND", "&", regex = False)
app_df["Category"] = app_df["Category"].apply(custom_format)
app_df[["Category", "Genres"]].head()
app_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7025 entries, 0 to 10840
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   App             7025 non-null   object        
 1   Category        7025 non-null   object        
 2   Rating          7025 non-null   float64       
 3   Reviews         7025 non-null   int32         
 4   Size            7025 non-null   float64       
 5   Installs        7025 non-null   int32         
 6   Type            7025 non-null   object        
 7   Price           7025 non-null   float64       
 8   Content Rating  7025 non-null   object        
 9   Genres          7025 non-null   object        
 10  Last Updated    7025 non-null   datetime64[ns]
 11  Android Ver     7025 non-null   object        
dtypes: datetime64[ns](1), float64(3), int32(2), object(6)
memory usage: 658.6+ KB


Numerical data: Rating, Reviews, Size, Installs, Price

In [485]:
## I check that Size, Installs, Price have a specical character. It will impede their conversion to a numerical data type. So I will solve 
## them
special_characters = ["M", "$", "+"]
target_numerical_cols = ["Size", "Installs", "Price"]
for col in target_numerical_cols:
    for character in special_characters:
        if character != "M":
            character = "\\" + character
            app_df[col] = app_df[col].str.replace(",", "", regex = True)  
        if app_df[col].str.contains(character).any():
            app_df[col] = app_df[col].str.replace(character, "", regex = True)
app_df[target_numerical_cols].head()            

Unnamed: 0,Size,Installs,Price
0,19.0,10000,0
1,14.0,500000,0
2,8.7,5000000,0
3,25.0,50000000,0
4,2.8,100000,0


In [486]:
## Check data type
app_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7025 entries, 0 to 10840
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             7025 non-null   object 
 1   Category        7025 non-null   object 
 2   Rating          7025 non-null   float64
 3   Reviews         7025 non-null   int64  
 4   Size            7025 non-null   object 
 5   Installs        7025 non-null   object 
 6   Type            7025 non-null   object 
 7   Price           7025 non-null   object 
 8   Content Rating  7025 non-null   object 
 9   Genres          7025 non-null   object 
 10  Last Updated    7025 non-null   object 
 11  Android Ver     7025 non-null   object 
dtypes: float64(1), int64(1), object(10)
memory usage: 713.5+ KB


In [488]:
lst_int_cols = ["Reviews", "Installs"]
lst_float_cols = ["Rating", "Size", "Price"]
for col in app_df.columns.to_list():
    if col in lst_int_cols:
        app_df[col] = app_df[col].astype("int")
    if col in lst_float_cols:
        app_df[col] = app_df[col].astype("float")
    if col == "Last Updated":
        app_df[col] = pd.to_datetime(app_df[col]) 
app_df.dtypes                  

App                       object
Category                  object
Rating                   float64
Reviews                    int32
Size                     float64
Installs                   int32
Type                      object
Price                    float64
Content Rating            object
Genres                    object
Last Updated      datetime64[ns]
Android Ver               object
dtype: object