In [18]:
import pandas as pd
import numpy as np
    
df = pd.read_csv('./assets/googleplaystore.csv')
df.head()  

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [19]:
# Step 1 :- Data Cleaning

# Checking for Removing Duplicates
def remove_duplicates(df):
    print("Found Duplicates:", df[df.duplicated()].__len__())
    print("Removed all duplicates successfully")
    df.drop_duplicates()
    
remove_duplicates(df)
print()
df.info()

Found Duplicates: 483
Removed all duplicates successfully

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10841 non-null  object 
 1   Category        10841 non-null  object 
 2   Rating          9367 non-null   float64
 3   Reviews         10841 non-null  object 
 4   Size            10841 non-null  object 
 5   Installs        10841 non-null  object 
 6   Type            10840 non-null  object 
 7   Price           10841 non-null  object 
 8   Content Rating  10840 non-null  object 
 9   Genres          10841 non-null  object 
 10  Last Updated    10841 non-null  object 
 11  Current Ver     10833 non-null  object 
 12  Android Ver     10838 non-null  object 
dtypes: float64(1), object(12)
memory usage: 1.1+ MB


In [20]:
# Checking for Null Values :- We find Over 1.4K null values in Ratings
print(df.isna().sum(axis = 0))

# Instead of dropping, we fill it with mean
df['Rating'] = df['Rating'].fillna(df['Rating'].mean())
print("\nMissing values in Rating: ", df['Rating'].isna().sum(axis = 0))

App                  0
Category             0
Rating            1474
Reviews              0
Size                 0
Installs             0
Type                 1
Price                0
Content Rating       1
Genres               0
Last Updated         0
Current Ver          8
Android Ver          3
dtype: int64

Missing values in Rating:  0


In [21]:
# Filter other rows with missing values
df = df[df.notna().all(axis = 1)]
print(df.isna().sum(axis = 0))

App               0
Category          0
Rating            0
Reviews           0
Size              0
Installs          0
Type              0
Price             0
Content Rating    0
Genres            0
Last Updated      0
Current Ver       0
Android Ver       0
dtype: int64


In [22]:
# Checking for Outliers in Rating :- No outliers found
print(df['Rating'].describe())

count    10829.000000
mean         4.192041
std          0.479038
min          1.000000
25%          4.100000
50%          4.200000
75%          4.500000
max          5.000000
Name: Rating, dtype: float64


In [23]:
# Cheking for category column 
#       - Verdict :- looks fine
df['Category'].unique()

array(['ART_AND_DESIGN', 'AUTO_AND_VEHICLES', 'BEAUTY',
       'BOOKS_AND_REFERENCE', 'BUSINESS', 'COMICS', 'COMMUNICATION',
       'DATING', 'EDUCATION', 'ENTERTAINMENT', 'EVENTS', 'FINANCE',
       'FOOD_AND_DRINK', 'HEALTH_AND_FITNESS', 'HOUSE_AND_HOME',
       'LIBRARIES_AND_DEMO', 'LIFESTYLE', 'GAME', 'FAMILY', 'MEDICAL',
       'SOCIAL', 'SHOPPING', 'PHOTOGRAPHY', 'SPORTS', 'TRAVEL_AND_LOCAL',
       'TOOLS', 'PERSONALIZATION', 'PRODUCTIVITY', 'PARENTING', 'WEATHER',
       'VIDEO_PLAYERS', 'NEWS_AND_MAGAZINES', 'MAPS_AND_NAVIGATION'],
      dtype=object)

In [24]:
# Checking for Ratings Column 
#       - Verdict :- One value is not rounded off
print(df['Rating'].unique(), df['Rating'].unique().shape[0])

# Round off the column
df['Rating'] = df['Rating'].apply(lambda x: round(x,1))
print()
print(df['Rating'].unique())

[4.1        3.9        4.7        4.5        4.3        4.4
 3.8        4.2        4.6        4.         4.19333832 4.8
 4.9        3.6        3.7        3.2        3.3        3.4
 3.5        3.1        5.         2.6        3.         1.9
 2.5        2.8        2.7        1.         2.9        2.3
 2.2        1.7        2.         1.8        2.4        1.6
 2.1        1.4        1.5        1.2       ] 40

[4.1 3.9 4.7 4.5 4.3 4.4 3.8 4.2 4.6 4.  4.8 4.9 3.6 3.7 3.2 3.3 3.4 3.5
 3.1 5.  2.6 3.  1.9 2.5 2.8 2.7 1.  2.9 2.3 2.2 1.7 2.  1.8 2.4 1.6 2.1
 1.4 1.5 1.2]


In [25]:
# Checking Reviews Column
#       - Verdict :- All values are numeric only

print("Total Records: ", df.shape[0])
print('Reviews with (d+) pattern: ', df[df['Reviews'].str.fullmatch(r'(\d+)')].shape[0])
print()

# Convert into int datatype
df['Reviews'] = df['Reviews'].astype(int)
df.info()

Total Records:  10829
Reviews with (d+) pattern:  10829

<class 'pandas.core.frame.DataFrame'>
Index: 10829 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10829 non-null  object 
 1   Category        10829 non-null  object 
 2   Rating          10829 non-null  float64
 3   Reviews         10829 non-null  int64  
 4   Size            10829 non-null  object 
 5   Installs        10829 non-null  object 
 6   Type            10829 non-null  object 
 7   Price           10829 non-null  object 
 8   Content Rating  10829 non-null  object 
 9   Genres          10829 non-null  object 
 10  Last Updated    10829 non-null  object 
 11  Current Ver     10829 non-null  object 
 12  Android Ver     10829 non-null  object 
dtypes: float64(1), int64(1), object(11)
memory usage: 1.2+ MB


In [26]:
# Checking Size Column

print("Total Records: ", df.shape[0])
print('Total Records having Size with d+(.d+)?k or d+(.d+)?M pattern:',
    df[df['Size'].str.fullmatch(r"((\d+)(\.\d+)?M)|((\d+)(\.\d+)?k)")]['Size'].shape[0]
)

# Finding the other patterns except \d+M and \d+k
print('Other Values:', 
    df[~df['Size'].str.fullmatch(r"((\d+)(\.\d+)?M)|((\d+)(\.\d+)?k)")]['Size'].unique()
)
# Found the value 'Varies with Device'
# Count is significant. Replace with NaN for now

# set Size = NaN for now where Size = 'Varies with device'
df['Size'].replace('Varies with device', np.nan, inplace = True)
print('Replaced:', df.loc[37, 'Size'])
print()

# Conver the units 'M' and k
df['Size'] = df['Size'].str.replace('M', '000000')
df['Size'] = df['Size'].str.replace('k', '000')

# convert into integer
df['Size'] = df['Size'].astype(float)
df.info()

Total Records:  10829


Total Records having Size with d+(.d+)?k or d+(.d+)?M pattern: 9135
Other Values: ['Varies with device']
Replaced: nan

<class 'pandas.core.frame.DataFrame'>
Index: 10829 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10829 non-null  object 
 1   Category        10829 non-null  object 
 2   Rating          10829 non-null  float64
 3   Reviews         10829 non-null  int64  
 4   Size            9135 non-null   float64
 5   Installs        10829 non-null  object 
 6   Type            10829 non-null  object 
 7   Price           10829 non-null  object 
 8   Content Rating  10829 non-null  object 
 9   Genres          10829 non-null  object 
 10  Last Updated    10829 non-null  object 
 11  Current Ver     10829 non-null  object 
 12  Android Ver     10829 non-null  object 
dtypes: float64(2), int64(1), object(10)
memory usage: 1.4+ MB


In [27]:
# Checking Installs Column
#       - Verdict :- All datapoints follow same pattern. Leave as it is
print(df['Installs'].unique())
print()

# Remove , and +
df['Installs'] = df['Installs'].str.replace(',','')
df['Installs'] = df['Installs'].str.replace('+','')

# Convert into int
df['Installs'] = df['Installs'].astype(int)
df.info()


['10,000+' '500,000+' '5,000,000+' '50,000,000+' '100,000+' '50,000+'
 '1,000,000+' '10,000,000+' '5,000+' '100,000,000+' '1,000,000,000+'
 '1,000+' '500,000,000+' '50+' '100+' '500+' '10+' '1+' '5+' '0+']

<class 'pandas.core.frame.DataFrame'>
Index: 10829 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10829 non-null  object 
 1   Category        10829 non-null  object 
 2   Rating          10829 non-null  float64
 3   Reviews         10829 non-null  int64  
 4   Size            9135 non-null   float64
 5   Installs        10829 non-null  int64  
 6   Type            10829 non-null  object 
 7   Price           10829 non-null  object 
 8   Content Rating  10829 non-null  object 
 9   Genres          10829 non-null  object 
 10  Last Updated    10829 non-null  object 
 11  Current Ver     10829 non-null  object 
 12  Android Ver     10829 non-null  object 
dtypes: float6

In [28]:
# Checking Type Column
#       - Low Cardinality Categorical Variable. Leave as it is

print(df['Type'].unique())

['Free' 'Paid']


In [29]:
# Checking Price Column
#       - Price following the pattern $(float)

print(df['Price'].unique())
print()

# Remove dollar symbol and convert into float type
df['Price'] = df['Price'].str.replace('$', '')
df['Price'] = df['Price'].astype(float)
df.info()

['0' '$4.99' '$3.99' '$6.99' '$1.49' '$2.99' '$7.99' '$5.99' '$3.49'
 '$1.99' '$9.99' '$7.49' '$0.99' '$9.00' '$5.49' '$10.00' '$24.99'
 '$11.99' '$79.99' '$16.99' '$14.99' '$1.00' '$29.99' '$12.99' '$2.49'
 '$10.99' '$1.50' '$19.99' '$15.99' '$33.99' '$74.99' '$39.99' '$3.95'
 '$4.49' '$1.70' '$8.99' '$2.00' '$3.88' '$25.99' '$399.99' '$17.99'
 '$400.00' '$3.02' '$1.76' '$4.84' '$4.77' '$1.61' '$2.50' '$1.59' '$6.49'
 '$1.29' '$5.00' '$13.99' '$299.99' '$379.99' '$37.99' '$18.99' '$389.99'
 '$19.90' '$8.49' '$1.75' '$14.00' '$4.85' '$46.99' '$109.99' '$154.99'
 '$3.08' '$2.59' '$4.80' '$1.96' '$19.40' '$3.90' '$4.59' '$15.46' '$3.04'
 '$4.29' '$2.60' '$3.28' '$4.60' '$28.99' '$2.95' '$2.90' '$1.97'
 '$200.00' '$89.99' '$2.56' '$30.99' '$3.61' '$394.99' '$1.26' '$1.20'
 '$1.04']

<class 'pandas.core.frame.DataFrame'>
Index: 10829 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App       

In [30]:
# Checking Content Rating Column
#       - Low Cardinality Categorical Variable. Leave as it is

print(df['Content Rating'].unique())

['Everyone' 'Teen' 'Everyone 10+' 'Mature 17+' 'Adults only 18+' 'Unrated']


In [31]:
# Checking Genres Column
#       - We already have Category Column, so we can drop Genres

df.drop('Genres', axis = 1, inplace = True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10829 entries, 0 to 10840
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10829 non-null  object 
 1   Category        10829 non-null  object 
 2   Rating          10829 non-null  float64
 3   Reviews         10829 non-null  int64  
 4   Size            9135 non-null   float64
 5   Installs        10829 non-null  int64  
 6   Type            10829 non-null  object 
 7   Price           10829 non-null  float64
 8   Content Rating  10829 non-null  object 
 9   Last Updated    10829 non-null  object 
 10  Current Ver     10829 non-null  object 
 11  Android Ver     10829 non-null  object 
dtypes: float64(3), int64(2), object(7)
memory usage: 1.3+ MB


In [32]:
# Checking Last Updated Column
#       - All are datetime. Conver into datetime datatype
print(df['Last Updated'].unique().tolist())

df['Last Updated'] = pd.to_datetime(df['Last Updated'])
print(df['Last Updated'].unique().tolist())
print()

# Make Date, Month and Year Column
df['Last Updated Date'] = df['Last Updated'].dt.day
df['Last Updated Month'] = df['Last Updated'].dt.month
df['Last Updated Year'] = df['Last Updated'].dt.year

# drop the old Last
df.drop('Last Updated', axis = 1, inplace = True)
df.info()

['January 7, 2018', 'January 15, 2018', 'August 1, 2018', 'June 8, 2018', 'June 20, 2018', 'March 26, 2017', 'April 26, 2018', 'June 14, 2018', 'September 20, 2017', 'July 3, 2018', 'October 27, 2017', 'July 31, 2018', 'April 2, 2018', 'June 26, 2018', 'August 3, 2018', 'November 7, 2017', 'July 30, 2018', 'April 20, 2018', 'March 20, 2018', 'July 12, 2018', 'March 7, 2018', 'July 7, 2018', 'April 25, 2018', 'October 11, 2017', 'March 21, 2018', 'August 22, 2017', 'May 31, 2018', 'July 19, 2018', 'January 6, 2018', 'April 27, 2018', 'July 11, 2018', 'August 2, 2018', 'July 14, 2018', 'November 29, 2017', 'March 31, 2018', 'July 20, 2018', 'April 15, 2018', 'January 3, 2018', 'November 14, 2017', 'December 17, 2017', 'May 28, 2018', 'May 10, 2018', 'July 26, 2018', 'July 18, 2018', 'July 29, 2018', 'August 26, 2014', 'July 8, 2018', 'July 28, 2018', 'June 12, 2018', 'October 14, 2016', 'May 25, 2018', 'July 16, 2018', 'July 27, 2018', 'July 4, 2018', 'March 27, 2018', 'July 15, 2018', '

In [33]:
# Checking Current Ver and Android Ver Columns
#       Verdict - No Cleaning Needed
print(df['Android Ver'].unique().tolist())
print(df['Current Ver'].unique().tolist())

['4.0.3 and up', '4.2 and up', '4.4 and up', '2.3 and up', '3.0 and up', '4.1 and up', '4.0 and up', '2.3.3 and up', 'Varies with device', '2.2 and up', '5.0 and up', '6.0 and up', '1.6 and up', '1.5 and up', '2.1 and up', '7.0 and up', '5.1 and up', '4.3 and up', '4.0.3 - 7.1.1', '2.0 and up', '3.2 and up', '4.4W and up', '7.1 and up', '7.0 - 7.1.1', '8.0 and up', '5.0 - 8.0', '3.1 and up', '2.0.1 and up', '4.1 - 7.1.1', '5.0 - 6.0', '1.0 and up', '2.2 - 7.1.1', '5.0 - 7.1.1']
['1.0.0', '2.0.0', '1.2.4', 'Varies with device', '1.1', '1.0', '6.1.61.1', '2.9.2', '2.8', '1.0.4', '1.0.15', '3.8', '1.2.3', '3.1', '2.2.5', '5.5.4', '4.0', '2.2.6.2', '1.1.3', '1.5', '1.0.8', '1.03', '6.0', '6.7.12.2018', '1.2', '2.20', '1.1.0', '1.6', '2.1', '1.0.9', '1.3', '1', '2.0.1', '1.46', '1.6.1', '11.0', '3.0', '1.7.1', '2.5.1', '1.0.1', '2.493', '1.9.1', '1.7', '2.20 Build 02', '1.37', '0.2.1', '4.47.3', '1.9.7', '2.2.21', '2.9', '1.79', '2.3.5.1', '8.31', '1.1.5.0', '10.0.2', '1.10.3', '3.20.1', '1

In [34]:
# Save the Cleaned Dataset

df.to_csv('./assets/googleplaystore_cleaned.csv', index = False)