In [8]:
import pandas as pd
import numpy as np



In [168]:
# making a dataframe off the csv file
df = pd.read_csv('googleplaystore.csv')
df.head(100)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,Free,Everyone,Art & Design,7-Jan-18,1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,15-Jan-18,2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,1-Aug-18,1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,8-Jun-18,Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,20-Jun-18,1.1,4.4 and up
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,All of the parking lot - National Park applica...,AUTO_AND_VEHICLES,4.0,1754,14M,"500,000+",Free,0,Everyone,Auto & Vehicles,2-Jun-18,2.3.4,4.0 and up
96,Inquiry Fines and Debits of Vehicles,AUTO_AND_VEHICLES,4.4,2680,2.2M,"500,000+",Free,0,Everyone,Auto & Vehicles,20-Mar-18,1.03,4.0.3 and up
97,Gas Station,AUTO_AND_VEHICLES,4.0,1288,4.5M,"100,000+",Free,0,Everyone,Auto & Vehicles,21-Apr-18,2.17,4.0 and up
98,Hush - Beauty for Everyone,BEAUTY,4.7,18900,17M,"500,000+",Free,0,Everyone,Beauty,2-Aug-18,6.10.1,5.0 and up


In [14]:
# checking which columns have null values
df.isnull().sum()

App                  0
Category             0
Rating            1474
Reviews              0
Size                 0
Installs             0
Type                 1
Price                0
Content Rating       1
Genres               0
Last Updated         0
Current Ver          8
Android Ver          3
dtype: int64

In [121]:
# cleaning the Rating column. Replacing all the incorrect ratings to None
# and then replacing all the None values with the mean Rating
for x in df.index:
    if df.loc[x, 'Rating'] > 5 or df.loc[x, 'Rating'] < 0:
                df.loc[x, 'Rating'] = None
        
mean = df['Rating'].mean()
df.fillna({'Rating': mean}, inplace=True)

df.dropna(inplace=True)
df.isnull().sum()

App               0
Category          0
Rating            0
Reviews           0
Size              0
Installs          0
Type              0
Price             0
Content Rating    0
Genres            0
Last Updated      0
Current Ver       0
Android Ver       0
dtype: int64

In [169]:
# Alternative method for cleaning Rating column
df.loc[df['Rating'] > 5, 'Rating'] = np.nan
df.fillna({'Rating': df['Rating'].mean()}, inplace=True)

df.dropna(inplace=True)

In [170]:
# cleaning the Reviews column. Converting suffixes to numbers
df.loc[df['Reviews'].str.contains('M'), 'Reviews'] = (pd.to_numeric(df.loc[df['Reviews'].str.contains('M'), 'Reviews'].str.replace('M', '')) * 1_000_000).astype('str')
df['Reviews'] = pd.to_numeric(df['Reviews'])


In [106]:
# count number of duplicate apps
df.duplicated(subset='App', keep=False).sum()

1979

In [171]:
# dropping duplicated apps keeping ones with the greatest reviews
df_sorted = df.sort_values(by=['App', 'Reviews'])

df_sorted.loc[
    df_sorted['App'].duplicated(keep=False) & ~df_sorted.duplicated(keep=False),
    ['App', 'Reviews']
].head(5)

df = df_sorted.drop_duplicates(subset=['App'], keep='last')

In [172]:
# Format the Category column
df.loc[:, 'Category'] = df.loc[:, 'Category'].str.replace('_', ' ').str.capitalize()
df

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
8884,"""i DT"" Fútbol. Todos Somos Técnicos.",Sports,4.197727,27.0,3.6M,500+,Free,0,Everyone,Sports,7-Oct-17,0.22,4.1 and up
324,#NAME?,Comics,3.500000,115.0,9.1M,"10,000+",Free,0,Mature 17+,Comics,13-Jul-18,5.0.12,5.0 and up
8532,+Download 4 Instagram Twitter,Social,4.500000,40467.0,22M,"1,000,000+",Free,0,Everyone,Social,2-Aug-18,5.03,4.1 and up
4541,.R,Tools,4.500000,259.0,203k,"10,000+",Free,0,Everyone,Tools,16-Sep-14,1.1.06,1.5 and up
4636,/u/app,Communication,4.700000,573.0,53M,"10,000+",Free,0,Mature 17+,Communication,3-Jul-18,4.2.4,4.1 and up
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6334,"뽕티비 - 개인방송, 인터넷방송, BJ방송",Video players,4.197727,414.0,59M,"100,000+",Free,0,Mature 17+,Video Players & Editors,18-Jul-18,4.0.7,4.0.3 and up
4362,💎 I'm rich,Lifestyle,3.800000,718.0,26M,"10,000+",Paid,$399.99,Everyone,Lifestyle,11-Mar-18,1.0.0,4.4 and up
2575,"💘 WhatsLov: Smileys of love, stickers and GIF",Social,4.600000,22098.0,18M,"1,000,000+",Free,0,Everyone,Social,24-Jul-18,4.2.4,4.0.3 and up
7559,📏 Smart Ruler ↔️ cm/inch measuring for homework!,Tools,4.000000,19.0,3.2M,"10,000+",Free,0,Everyone,Tools,21-Oct-17,1,4.2 and up


In [173]:
# Clean and convert the Installs column values to numeric
df['Installs'] = df['Installs'].str.replace('+', '').str.replace(',', '')
df['Installs'] = pd.to_numeric(df['Installs'])
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Installs'] = df['Installs'].str.replace('+', '').str.replace(',', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Installs'] = pd.to_numeric(df['Installs'])


Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
8884,"""i DT"" Fútbol. Todos Somos Técnicos.",Sports,4.197727,27.0,3.6M,500,Free,0,Everyone,Sports,7-Oct-17,0.22,4.1 and up
324,#NAME?,Comics,3.500000,115.0,9.1M,10000,Free,0,Mature 17+,Comics,13-Jul-18,5.0.12,5.0 and up
8532,+Download 4 Instagram Twitter,Social,4.500000,40467.0,22M,1000000,Free,0,Everyone,Social,2-Aug-18,5.03,4.1 and up
4541,.R,Tools,4.500000,259.0,203k,10000,Free,0,Everyone,Tools,16-Sep-14,1.1.06,1.5 and up
4636,/u/app,Communication,4.700000,573.0,53M,10000,Free,0,Mature 17+,Communication,3-Jul-18,4.2.4,4.1 and up
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6334,"뽕티비 - 개인방송, 인터넷방송, BJ방송",Video players,4.197727,414.0,59M,100000,Free,0,Mature 17+,Video Players & Editors,18-Jul-18,4.0.7,4.0.3 and up
4362,💎 I'm rich,Lifestyle,3.800000,718.0,26M,10000,Paid,$399.99,Everyone,Lifestyle,11-Mar-18,1.0.0,4.4 and up
2575,"💘 WhatsLov: Smileys of love, stickers and GIF",Social,4.600000,22098.0,18M,1000000,Free,0,Everyone,Social,24-Jul-18,4.2.4,4.0.3 and up
7559,📏 Smart Ruler ↔️ cm/inch measuring for homework!,Tools,4.000000,19.0,3.2M,10000,Free,0,Everyone,Tools,21-Oct-17,1,4.2 and up


In [175]:
# cleaning the size column
df['Size'] = df['Size'].replace('Varies with device', "0").astype(str)
df.loc[:, 'Size'] = df.loc[:, 'Size'].str.replace('+', '').str.replace(',', '')
df.loc[df['Size'].str.contains('M'), 'Size'] = (pd.to_numeric(df.loc[df['Size'].str.contains('M'), 'Size'].str.replace('M', '')) * 1_024 * 1_024).astype('str')
df.loc[df['Size'].str.contains('k'), 'Size'] = (pd.to_numeric(df.loc[df['Size'].str.contains('k'), 'Size'].str.replace('k', '')) * 1_024).astype('str')

df['Size'] = pd.to_numeric(df['Size'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Size'] = df['Size'].replace('Varies with device', "0").astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Size'] = pd.to_numeric(df['Size'])


In [177]:
# converting Price to numeric
df['Price'] = df['Price'].str.replace('Free', "0").astype(str)
df['Price'] = pd.to_numeric(df['Price'].str.replace('$', ''))
df['Price'] = pd.to_numeric(df['Price'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Price'] = df['Price'].str.replace('Free', "0").astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Price'] = pd.to_numeric(df['Price'].str.replace('$', ''))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Price'] = pd.to_numeric(df['Price'])


In [178]:
# Creating a new column for Free/Paid based on Price
df['Distribution'] = df['Price'].apply(lambda x: "Paid" if x > 0 else "Free")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Distribution'] = df['Price'].apply(lambda x: "Paid" if x > 0 else "Free")


In [179]:
df

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Distribution
8884,"""i DT"" Fútbol. Todos Somos Técnicos.",Sports,4.197727,27.0,3774873.6,500,Free,0.00,Everyone,Sports,7-Oct-17,0.22,4.1 and up,Free
324,#NAME?,Comics,3.500000,115.0,9542041.6,10000,Free,0.00,Mature 17+,Comics,13-Jul-18,5.0.12,5.0 and up,Free
8532,+Download 4 Instagram Twitter,Social,4.500000,40467.0,23068672.0,1000000,Free,0.00,Everyone,Social,2-Aug-18,5.03,4.1 and up,Free
4541,.R,Tools,4.500000,259.0,207872.0,10000,Free,0.00,Everyone,Tools,16-Sep-14,1.1.06,1.5 and up,Free
4636,/u/app,Communication,4.700000,573.0,55574528.0,10000,Free,0.00,Mature 17+,Communication,3-Jul-18,4.2.4,4.1 and up,Free
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6334,"뽕티비 - 개인방송, 인터넷방송, BJ방송",Video players,4.197727,414.0,61865984.0,100000,Free,0.00,Mature 17+,Video Players & Editors,18-Jul-18,4.0.7,4.0.3 and up,Free
4362,💎 I'm rich,Lifestyle,3.800000,718.0,27262976.0,10000,Paid,399.99,Everyone,Lifestyle,11-Mar-18,1.0.0,4.4 and up,Paid
2575,"💘 WhatsLov: Smileys of love, stickers and GIF",Social,4.600000,22098.0,18874368.0,1000000,Free,0.00,Everyone,Social,24-Jul-18,4.2.4,4.0.3 and up,Free
7559,📏 Smart Ruler ↔️ cm/inch measuring for homework!,Tools,4.000000,19.0,3355443.2,10000,Free,0.00,Everyone,Tools,21-Oct-17,1,4.2 and up,Free
