# 1. Imports

In [1]:
import pandas as pd
import numpy as np

# Display configuration: raise the row limit pandas uses when rendering frames
pd.set_option('display.max_rows', 12000)

# 2. Data Reading

In [2]:
# Read both raw CSV exports: the app catalogue first, then the user reviews.
apps_df, user_reviews_df = map(
    pd.read_csv,
    (
        './data/googleplaystore.csv',
        './data/googleplaystore_user_reviews.csv',
    ),
)

# 3. Data Exploration I

In [3]:
# Preview five randomly chosen rows of the app catalogue
apps_df.sample(n=5)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
6745,Beauty Rental Shop,FAMILY,4.6,11480,93M,"100,000+",Free,0,Everyone,Simulation,"May 25, 2018",1.0.4,4.4 and up
10644,South Florida MLS,LIFESTYLE,4.4,133,13M,"10,000+",Free,0,Everyone,Lifestyle,"June 2, 2018",2.8.4,4.0.3 and up
5146,Ah! Monster,FAMILY,4.1,1998,9.8M,"100,000+",Free,0,Everyone,Casual,"June 21, 2017",4.0.0,4.0 and up
8849,DS-82 form,BUSINESS,,1,28M,100+,Free,0,Everyone,Business,"April 27, 2018",1.7.7,4.1 and up
1474,Home Security Camera WardenCam - reuse old phones,HOUSE_AND_HOME,4.3,43800,Varies with device,"1,000,000+",Free,0,Everyone,House & Home,"July 6, 2018",Varies with device,Varies with device


In [4]:
# Preview five randomly chosen rows of the user reviews
user_reviews_df.sample(n=5)

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
33202,Dashlane Free Password Manager,"What piece junk. Got email company saying, usi...",Negative,-0.074167,0.426667
37241,Dropbox,"Videos crash halfway, much UI, can't export en...",Positive,0.05,0.55625
7317,Angry Birds 2,The newest update fixed issues had.. totally l...,Positive,0.2,0.342857
8724,Apk Installer,Useless It useful qualities hence waste time,Negative,-0.133333,0.066667
56962,Google Translate,,,,


# 4. Data Cleaning

## I. Handling Noisy Data

### NOTE: Record 10472 has invalid values in `Category`, `Rating`, `Reviews`, `Size`, `Installs`, `Type`, and `Price`, so it needs to be removed.

In [5]:
# Remove the malformed record by index label.
# `errors='ignore'` keeps this cell re-runnable: once label 10472 is gone,
# a plain `drop(10472)` would raise KeyError on a second execution.
apps_df = apps_df.drop(index=10472, errors='ignore')

### A. Removing `M`, `k`, `Varies with device` from `Size` column

In [6]:
def _size_to_kb(value):
    """Normalise one raw `Size` entry to a kilobyte value.

    Parameters
    ----------
    value : str or missing
        Raw cell content, e.g. '93M', a value ending in 'k', or
        'Varies with device'.

    Returns
    -------
    str or float
        * values ending in 'M' -> megabytes times 1024, truncated to an
          integer and returned as a digit string;
        * values ending in 'k' -> the same text without the trailing 'k';
        * 'Varies with device' -> NaN;
        * anything that is not a string (e.g. NaN) -> returned unchanged.
    """
    if not isinstance(value, str):
        # Missing entries have no suffix to strip. Passing them through is
        # also what lets this cell be executed again after it has already
        # written NaN into the column.
        return value
    if value == 'Varies with device':
        return np.nan
    if value.endswith('M'):
        # Removing `M`, Changing Size To KB
        return str(int(float(value.rstrip('M')) * 1024))
    if value.endswith('k'):
        # Removing `k`
        return value.rstrip('k')
    return value


# Single pass over `Size`: strip `M` / `k` and set `Varies with device` to NaN
apps_df['Size'] = apps_df['Size'].map(_size_to_kb)

### B. Removing `+`, `,` from `Installs` column

In [7]:
# Removing `+` and then `,` with the vectorised string accessor.
# Unlike a per-value lambda, the `.str` methods propagate missing values
# as NaN instead of raising on them.
apps_df['Installs'] = (
    apps_df['Installs']
    .str.rstrip('+')                     # Removing `+`
    .str.replace(',', '', regex=False)   # Removing `,` (literal, not a regex)
)

### C. Removing `$` from `Price` column

In [8]:
# Removing `$` with the vectorised string accessor; missing values are
# propagated as NaN rather than raising as a per-value lambda would.
apps_df['Price'] = apps_df['Price'].str.lstrip('$')

### D. Changing Columns To Numeric

In [9]:
# Changing values from string to numeric, one cleaned column at a time
# (same columns and same order as the individual conversions).
for column_name in ('Rating', 'Reviews', 'Size', 'Installs', 'Price'):
    apps_df[column_name] = pd.to_numeric(apps_df[column_name])

### Special Cases

In [10]:
# TODO: Should outliers be handled?
# TODO: There are 2 `Unrated` and 3 `Adults +18` apps (these look like `Content Rating` values rather than `Type` values - confirm). Should they be removed?

## II. Handling Missing Values

### A. Replacing `NaN` with `Free` in `Type` Column

In [11]:
# Missing `Type` entries become 'Free'; every other column is left as-is
apps_df = apps_df.fillna({'Type': 'Free'})

### B. Replacing `NaN` with `1.0` in `Current Ver` Column

In [12]:
# Missing `Current Ver` entries become '1.0'; every other column is left as-is
apps_df = apps_df.fillna({'Current Ver': '1.0'})

### C. Replacing `NaN` with Most Repeated Value in `Android Ver` Column

In [13]:
# Missing `Android Ver` entries take the most frequently occurring value
apps_df = apps_df.fillna(
    {'Android Ver': apps_df['Android Ver'].value_counts().idxmax()}
)

# 5. Data Transformation

# 6. Data Integration

# 7. Data Reduction