## Import Libraries

In [1]:
import pandas as pd

## Read Data 

In [2]:
# Location of the Google Play Store CSV export.
# NOTE(review): this is an absolute, machine-specific path; adjust it before
# running the notebook on another machine.
path = r'D:\projects\Data Analysis\App Store Analysis Dataset\Google-Playstore.csv'
# Load the full CSV into a DataFrame that the rest of the notebook works on.
data = pd.read_csv(filepath_or_buffer=path)

In [3]:
# Preview the loaded DataFrame (the rendered output is truncated to the first and last rows).
data

Unnamed: 0,App Name,App Id,Category,Rating,Rating Count,Installs,Minimum Installs,Maximum Installs,Free,Price,...,Developer Website,Developer Email,Released,Last Updated,Content Rating,Privacy Policy,Ad Supported,In App Purchases,Editors Choice,Scraped Time
0,Gakondo,com.ishakwe.gakondo,Adventure,0.0,0.0,10+,10.0,15,True,0.0,...,https://beniyizibyose.tk/#/,jean21101999@gmail.com,"Feb 26, 2020","Feb 26, 2020",Everyone,https://beniyizibyose.tk/projects/,False,False,False,2021-06-15 20:19:35
1,Ampere Battery Info,com.webserveis.batteryinfo,Tools,4.4,64.0,"5,000+",5000.0,7662,True,0.0,...,https://webserveis.netlify.app/,webserveis@gmail.com,"May 21, 2020","May 06, 2021",Everyone,https://dev4phones.wordpress.com/licencia-de-uso/,True,False,False,2021-06-15 20:19:35
2,Vibook,com.doantiepvien.crm,Productivity,0.0,0.0,50+,50.0,58,True,0.0,...,,vnacrewit@gmail.com,"Aug 9, 2019","Aug 19, 2019",Everyone,https://www.vietnamairlines.com/vn/en/terms-an...,False,False,False,2021-06-15 20:19:35
3,Smart City Trichy Public Service Vehicles 17UC...,cst.stJoseph.ug17ucs548,Communication,5.0,5.0,10+,10.0,19,True,0.0,...,http://www.climatesmarttech.com/,climatesmarttech2@gmail.com,"Sep 10, 2018","Oct 13, 2018",Everyone,,True,False,False,2021-06-15 20:19:35
4,GROW.me,com.horodyski.grower,Tools,0.0,0.0,100+,100.0,478,True,0.0,...,http://www.horodyski.com.pl,rmilekhorodyski@gmail.com,"Feb 21, 2020","Nov 12, 2018",Everyone,http://www.horodyski.com.pl,False,False,False,2021-06-15 20:19:35
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2312939,大俠客—熱血歸來,com.rxsj.ssjj,Role Playing,4.3,16775.0,"100,000+",100000.0,337109,True,0.0,...,http://www.4399sy.com.hk/,ssjjcomhk@gmail.com,,"Jun 01, 2021",Teen,http://a.4399sy.com.hk/user/aggreement,False,False,False,2021-06-16 12:59:18
2312940,ORU Online,com.threedream.oruonline,Education,0.0,0.0,100+,100.0,430,True,0.0,...,http://www.oru.edu/,3DreamDeveloper@gmail.com,"Jan 17, 2018","Feb 02, 2018",Everyone,http://www.oru.edu/about-oru/privacy-policy.php,False,False,False,2021-06-16 12:59:19
2312941,Data Structure,datastructure.appoworld.datastucture,Education,0.0,0.0,100+,100.0,202,True,0.0,...,,appoworld.official@gmail.com,"Aug 19, 2018","Aug 19, 2018",Everyone,https://appoworld.000webhostapp.com/datastruct...,False,False,False,2021-06-16 12:59:19
2312942,Devi Suktam,ishan.devi.suktam,Music & Audio,3.5,8.0,"1,000+",1000.0,2635,True,0.0,...,https://a70f78905.app-ads-txt.com,ruchisono@gmail.com,"Aug 1, 2016","May 05, 2021",Everyone,https://docs.google.com/document/d/1x-9reZuLRX...,True,False,False,2021-06-16 12:59:19


In [4]:
# List the dtype pandas inferred for each column, to spot columns that need conversion.
data.dtypes

App Name              object
App Id                object
Category              object
Rating               float64
Rating Count         float64
Installs              object
Minimum Installs     float64
Maximum Installs       int64
Free                    bool
Price                float64
Currency              object
Size                  object
Minimum Android       object
Developer Id          object
Developer Website     object
Developer Email       object
Released              object
Last Updated          object
Content Rating        object
Privacy Policy        object
Ad Supported            bool
In App Purchases        bool
Editors Choice          bool
Scraped Time          object
dtype: object

In [5]:
# Normalise every column label to lowercase so later cells can refer to
# columns without worrying about capitalisation.
data.columns = data.columns.map(str.lower)

## Data Cleaning

In [6]:
# Count the missing entries in each column (isna is the same check as isnull).
missing_value = data.isna().sum()
missing_value

app name                  5
app id                    0
category                  0
rating                22883
rating count          22883
installs                107
minimum installs        107
maximum installs          0
free                      0
price                     0
currency                135
size                    196
minimum android        6530
developer id             33
developer website    760835
developer email          31
released              71053
last updated              0
content rating            0
privacy policy       420953
ad supported              0
in app purchases          0
editors choice            0
scraped time              0
dtype: int64

In [7]:
# Turn the raw 'installs' labels (e.g. "5,000+") into numbers.
#
# Cast to str first so the .str accessor can be used; missing values become
# the text 'nan' and are converted back to NaN by the numeric parsing below.
installs_text = (
    data['installs']
    .astype(str)
    .str.replace(',', '', regex=False)  # Remove thousands separators
    .str.replace('+', '', regex=False)  # Remove the trailing plus sign
)

# Split each label into its numeric part and an optional magnitude suffix
# ('M' = 1,000,000, 'k' = 1,000). The suffix is applied as a multiplier
# rather than by appending zeros, because appending zeros is wrong for
# decimal labels: "1.5M" would become "1.5000000", which parses as 1.5.
installs_parts = installs_text.str.extract(
    r'^\s*(?P<number>\d+(?:\.\d+)?)\s*(?P<suffix>[Mk]?)\s*$'
)
installs_scale = installs_parts['suffix'].map({'M': 1_000_000, 'k': 1_000}).fillna(1)
installs_scaled = pd.to_numeric(installs_parts['number'], errors='coerce') * installs_scale

# Anything that does not look like "<number><suffix>" falls back to a direct
# numeric parse; values that still cannot be parsed end up as NaN.
data['installs'] = installs_scaled.fillna(pd.to_numeric(installs_text, errors='coerce'))


In [8]:
# Remove the columns judged to have too many missing values to be useful
# (developer website, privacy policy and released).
data = data.drop(['developer website', 'privacy policy', 'released'], axis='columns')
# 'size' is not relevant to the questions this analysis asks, so drop it too.
data = data.drop('size', axis='columns')

In [10]:
# Impute the missing 'rating' entries (22883 rows) with the column mean;
# existing ratings are kept as they are.
data['rating'] = data['rating'].where(data['rating'].notna(), data['rating'].mean())

In [11]:
# Missing 'rating count' entries (22883 rows) are assumed to mean the app
# has no ratings, so they are set to 0.
data['rating count'] = data['rating count'].fillna(value=0)

In [12]:
# Fill the missing 'currency' entries (135 rows) with 'USD'.
# NOTE(review): 'USD' is hard-coded as the presumed most frequent currency;
# confirm against data['currency'].mode() if the dataset changes.
data['currency'] = data['currency'].where(data['currency'].notna(), 'USD')

In [13]:
# Discard the few rows that lack an 'app name' (5) or a 'developer id' (33);
# a row is removed if either of the two fields is missing.
data = data.dropna(axis='index', how='any', subset=['app name', 'developer id'])

In [14]:
# 'minimum installs' appears to be derived from 'installs', so it is dropped
# as redundant.
data = data.drop(columns='minimum installs')

In [15]:
# Replace the missing 'developer email' entries (31 rows) with the
# placeholder text "Undefined".
data['developer email'] = data['developer email'].fillna(value="Undefined")

In [16]:
# Reduce 'minimum android' labels such as "4.1 and up" to a numeric
# major.minor version.
#
# The leading version number is extracted with a regex instead of only
# stripping " and up": any label that is not a plain number after stripping
# (presumably ones with a patch component or a letter suffix, e.g.
# "4.0.3 and up" or "4.4W and up" - TODO confirm against the raw column)
# would otherwise be coerced to NaN and silently replaced by the mode below.
# Casting to str first keeps the cell re-runnable once the column is numeric.
android_text = data['minimum android'].astype(str)
android_version = android_text.str.extract(r'^\s*(\d+(?:\.\d+)?)', expand=False)
# Labels without a leading number (and originally missing values) become NaN.
data['minimum android'] = pd.to_numeric(android_version, errors='coerce')
# Most frequent parsed version, used to impute the remaining gaps.
mode_value = data['minimum android'].mode()[0]
# Fill the entries that could not be parsed with the mode value.
data['minimum android'] = data['minimum android'].fillna(mode_value)

In [18]:
# Count fully duplicated rows; the first occurrence of each row is not
# counted as a duplicate.
data.duplicated(keep='first').sum()

0