# 1. Imports

In [1]:
import pandas as pd
import numpy as np

# Display configuration: raise the cap on how many rows pandas will render
pd.set_option('display.max_rows', 12000)

# 2. Data Reading

In [2]:
# Apps table
apps_df = pd.read_csv(
    filepath_or_buffer='./data/googleplaystore.csv',
)

# User reviews table
user_reviews_df = pd.read_csv(
    filepath_or_buffer='./data/googleplaystore_user_reviews.csv',
)

# 3. Data Exploration

In [3]:
# Peek at five randomly chosen rows of the apps table
apps_df.sample(n=5)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
5890,Graffiti Letters (A-Z),LIFESTYLE,3.9,386,7.9M,"50,000+",Free,0,Everyone,Lifestyle,"February 18, 2017",2.1.5,4.0 and up
3933,Hungry Hearts Diner: A Tale of Star-Crossed Souls,FAMILY,4.9,46253,56M,"500,000+",Free,0,Everyone 10+,Simulation,"February 7, 2018",1.0.1,4.0 and up
9615,Texas Holdem & Omaha Poker: Pokerist,GAME,4.3,187200,28M,"10,000,000+",Free,0,Teen,Card,"July 31, 2018",18.4.0,4.1 and up
970,HISTORY: Watch TV Show Full Episodes & Specials,ENTERTAINMENT,4.1,33387,20M,"1,000,000+",Free,0,Teen,Entertainment,"July 16, 2018",3.1.4,4.4 and up
1429,White Sound Pro,HEALTH_AND_FITNESS,4.7,16570,73M,"500,000+",Free,0,Everyone,Health & Fitness,"July 29, 2018",4.8.0,4.4 and up


In [4]:
# Peek at five randomly chosen rows of the user reviews table
user_reviews_df.sample(n=5)

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
41604,Extreme Car Driving Simulator,,,,
31654,DC Super Hero Girls™,This game much fun loved dress game much fun,Positive,0.1,0.4
40993,"Even - organize your money, get paid early",,,,
920,4 in a Row,,,,
2253,A+ Gallery - Photos & Videos,It Worth pay,Positive,0.3,0.1


# 4. Data Cleaning

## I. Handling Noisy Data

### NOTE: Record 10472 has invalid values in `Category`, `Rating`, `Reviews`, `Size`, `Installs`, `Type`, and `Price`, so it must be removed.

In [5]:
# Drop the malformed record by its index label.
# `errors='ignore'` keeps this cell safe to re-run: without it, a second
# execution raises KeyError because label 10472 has already been removed.
apps_df = apps_df.drop(index=10472, errors='ignore')

### A. Normalizing the `Size` column in `Apps` table: converting `M` values to KB, stripping `k`, and setting `Varies with device` to NaN.

In [6]:
def _size_to_kb(value):
    """Normalize one raw `Size` entry to a kilobyte figure.

    Parameters
    ----------
    value : object
        A raw cell value such as ``'7.9M'``, ``'8.5k'`` or
        ``'Varies with device'``.

    Returns
    -------
    object
        * ``'<n>M'``  -> megabytes converted to whole kilobytes, as a string
          (the fractional kilobyte is truncated by ``int``).
        * ``'<n>k'``  -> the same text without the trailing ``k``.
        * ``'Varies with device'`` -> ``np.nan``.
        * anything else, including non-string values, is returned unchanged.
    """
    # Non-strings (NaN written by an earlier run of this cell, or numbers once
    # the column has been converted) pass through so the cell can be re-run.
    if not isinstance(value, str):
        return value

    # Setting `Varies with device` to NaN
    if value == 'Varies with device':
        return np.nan

    # Removing `M`, Changing Size To KB
    if value.endswith('M'):
        return str(int(float(value.rstrip('M')) * 1024))

    # Removing `k`
    if value.endswith('k'):
        return value.rstrip('k')

    return value


# Single pass over the column instead of three separate maps
apps_df.Size = apps_df.Size.map(_size_to_kb)

### B. Removing `+`, `,` from `Installs` column in `Apps` table.

In [7]:
# Strip the trailing `+` first, then drop every `,` separator, in one pass
apps_df['Installs'] = apps_df['Installs'].map(
    lambda raw_installs: raw_installs.rstrip('+').replace(',', '')
)

### C. Removing `$` from `Price` column in `Apps` table.

In [8]:
# Strip the leading `$` sign from every price
apps_df['Price'] = apps_df['Price'].map(lambda raw_price: raw_price.lstrip('$'))

### Changing Columns To Numeric

In [9]:
# Convert the cleaned `Apps` columns to numeric dtypes, one column at a time
for column in ('Rating', 'Reviews', 'Size', 'Installs', 'Price'):
    apps_df[column] = pd.to_numeric(apps_df[column])

### Special Cases

In [11]:
# List the column labels of the apps table
apps_df.columns

Index(['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type',
       'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver',
       'Android Ver'],
      dtype='object')

## II. Handling Missing Values

In [10]:
# Summary statistics for the numeric columns; a `count` lower than the other
# columns' counts indicates missing values in that column.
apps_df.describe()

Unnamed: 0,Rating,Reviews,Size,Installs,Price
count,9366.0,10840.0,9145.0,10840.0,10840.0
mean,4.191757,444152.9,22032.397212,15464340.0,1.027368
std,0.515219,2927761.0,23131.336868,85029360.0,15.949703
min,1.0,0.0,8.5,0.0,0.0
25%,4.0,38.0,5017.0,1000.0,0.0
50%,4.3,2094.0,13312.0,100000.0,0.0
75%,4.5,54775.5,30720.0,5000000.0,0.0
max,5.0,78158310.0,102400.0,1000000000.0,400.0


# 5. Data Transformation

# 6. Data Integration

# 7. Data Reduction