<div style="text-align:center">
    <img src="../files/monolearn-logo.png" height="150px">
    <h1>ML course</h1>
    <h3>Session 03: EDA, Data Preprocessing, Visualization, Google play apps project</h3>
    <h4><a href="https://amzenterprise.ir/">Ali Momenzadeh</a></h4>
</div>

## Importing the libraries

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
#When using the 'inline' backend, your matplotlib graphs will be included in your notebook, next to the code.

import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

## Load and Prepare Data

In [None]:
# Load the raw Google Play Store export; the relative path is resolved against
# the current working directory.
google_play = pd.read_csv(filepath_or_buffer="googleplaystore.csv")

## EDA (Exploratory Data Analysis)

Exploratory Data Analysis refers to the critical process of performing initial investigations on data in order to discover patterns, spot anomalies, test hypotheses, and check assumptions with the help of summary statistics and graphical representations.

In [None]:
# Peek at the first five rows to get a feel for the columns and raw values.
google_play.head(n=5)

In [None]:
# Print the column names, non-null counts and dtypes of the raw frame.
google_play.info()

In [None]:
# Summary statistics of the raw frame (run before any clean-up).
google_play.describe()

In [None]:
# Confirm what kind of object read_csv returned.
type(google_play)

## Data Preprocessing

In [None]:
# Display the frame as loaded, before any preprocessing step has run.
google_play

#### Convert all ratings to 0-5 scale (replace other values with NaN)

In [None]:
# Keep only ratings inside the closed range [0, 5]; anything else, including
# values that are already missing, ends up as NaN.
google_play["Rating"] = google_play["Rating"].where(
    google_play["Rating"].between(0, 5)
)

In [None]:
# Display the frame to check the result of the rating clean-up above.
google_play

#### Convert all reviews to float (convert M to 1000000)

In [None]:
# Convert the review counts to floats in one vectorized pass. Values that carry
# an "M" (e.g. "100M") are expressed in millions and are scaled accordingly.
#
# The whole column is assigned at once instead of writing element by element
# through chained indexing (google_play["Reviews"][i] = ...): chained writes may
# land on a temporary copy and leave the frame unchanged, they depend on the
# index being 0..n-1, and they leave the column with an object dtype.
reviews_text = google_play["Reviews"].astype(str)
reviews_in_millions = reviews_text.str.contains("M", regex=False, na=False)
reviews_value = reviews_text.str.replace("M", "", regex=False).astype(float)
google_play["Reviews"] = reviews_value.mask(
    reviews_in_millions, reviews_value * 1000000  # Convert to million
)

In [None]:
# Display the frame to check the result of the reviews conversion above.
google_play

#### Remove unwanted values from sizes

In [None]:
# Blank out the size placeholders and strip "," and "+" from the remaining values.
#
# The placeholder pattern is an anchored alternation. Writing it inside square
# brackets would make it a character class that matches any single one of those
# letters, so every value containing e.g. an "a" or a space would be nulled.
# "nan" is listed because astype(str) turns missing values into that literal text.
google_play["Size"] = (
    google_play["Size"]
    .astype(str)
    .replace(r"^\s*(?:Varies with device|Varieswithdevice|nan)\s*$", np.nan, regex=True)
    .replace("[,+]", "", regex=True)
)

In [None]:
# Display the frame to check that the size placeholders were removed above.
google_play

#### Convert all sizes to megabytes (remove "M" and "k" and convert to float)

In [None]:
# Express every size in megabytes as a float, in one vectorized pass:
#   * values with a "k" are kilobytes and are divided by 1024,
#   * values with an "M" are already megabytes, so only the suffix is dropped,
#   * values without a suffix (including missing ones) are converted as they are.
#
# Assigning the whole column avoids element-wise chained indexing
# (google_play["Size"][i] = ...), which may write to a temporary copy and leave
# the frame unchanged, and which depends on the index being 0..n-1.
size_text = google_play["Size"].astype(str)
size_in_kilobytes = size_text.str.contains("k", regex=False, na=False)
size_value = size_text.str.replace("[kM]", "", regex=True).astype(float)
google_play["Size"] = size_value.mask(
    size_in_kilobytes, size_value / 1024  # Convert to megabytes
)

In [None]:
# Display the frame to check the result of the size conversion above.
google_play

#### Remove unwanted values from installs, remove "," , "+" and "Free"

In [None]:
# Turn the install counts into floats, one step at a time:
# stringify, drop the "," and "+" decorations, blank out the stray "Free"
# entry, then convert what is left.
installs_text = google_play["Installs"].astype(str)
installs_digits = installs_text.replace("[,+]", "", regex=True)
installs_clean = installs_digits.replace("Free", np.nan)
google_play["Installs"] = installs_clean.astype(float)

In [None]:
# Display the frame to check the result of the installs clean-up above.
google_play

#### Change type 0 to NAN

In [None]:
# A "Type" of "0" is not a real category; treat it as missing.
google_play["Type"] = google_play["Type"].replace({"0": np.nan})

In [None]:
# Display the frame to check the result of the type clean-up above.
google_play

#### Remove $ sign and unwanted characters from the prices

In [None]:
# Turn the prices into floats: stringify, strip the "$" sign, blank out any
# value that still contains letters, then convert what is left.
price_text = google_play["Price"].astype(str)
price_without_dollar = price_text.replace("[$]", "", regex=True)
price_numeric_only = price_without_dollar.replace("[A-Za-z]", np.nan, regex=True)
google_play["Price"] = price_numeric_only.astype(float)

In [None]:
# Display the frame to check the result of the price clean-up above.
google_play

#### Create 3 new columns to separate each part of the last update dates

In [None]:
# Parse the whole "Last Updated" column with a single vectorized call instead
# of one pd.to_datetime call per row; unparseable entries become NaT.
# NOTE(review): recent pandas versions infer one date format for the whole
# column, so this assumes the dates share a single format - confirm on the data.
last_updated = pd.to_datetime(google_play["Last Updated"], errors="coerce")

# Keep the three lists under their original names; NaT entries yield NaN parts.
years = last_updated.dt.year.tolist()
months = last_updated.dt.month.tolist()
days = last_updated.dt.day.tolist()

# Replace the original text column with its three numeric components.
google_play = google_play.drop(["Last Updated"], axis=1)
google_play["Last Updated Year"] = years
google_play["Last Updated Month"] = months
google_play["Last Updated Day"] = days

In [None]:
# Display the frame to check the three new "Last Updated" columns created above.
google_play

#### Remove unwanted values from Android Ver (the versions are trimmed to the *.* format in a later step)

In [None]:
# First strip the "and up" wording, then treat the "Varies with device"
# placeholder as missing.
android_ver_trimmed = google_play["Android Ver"].replace("and up", "", regex=True)
google_play["Android Ver"] = android_ver_trimmed.replace("Varies with device", np.nan)

In [None]:
# Display the frame to check the result of the Android Ver clean-up above.
google_play

#### This formatting is only for a cleaner display of the data (it is not strictly necessary)

In [None]:
# Shorten every version to its first three characters (the *.* format);
# missing values are passed through untouched.
google_play["Android Ver"] = [
    version if str(version) == "nan" else str(version)[0:3]
    for version in google_play["Android Ver"]
]

In [None]:
# Display the frame to check the shortened Android Ver values produced above.
google_play

## Storytelling - Visualization

#### Count of each type

In [None]:
# Bar chart with the number of apps per type; the tick labels are rotated to vertical.
types_countplot = sns.countplot(data=google_play, x="Type")
type_labels = types_countplot.get_xticklabels()
types_countplot.set_xticklabels(type_labels, rotation=90)

#### Count of each rating separated by type

In [None]:
# Count of apps per rating value, with one bar colour per type.
rating_type_catplot = sns.catplot(
    data=google_play,
    x="Rating",
    hue="Type",
    kind="count",
    palette=sns.color_palette("Set2"),
    height=6,
    aspect=4,
)

#### Count of each Android version

In [None]:
# Count the apps per Android version; the versions are sorted first so the
# bars appear in order, and missing versions are left out.
android_versions = sorted(google_play["Android Ver"].dropna().values)
andriodVers_countplot = sns.countplot(x=android_versions)
version_labels = andriodVers_countplot.get_xticklabels()
andriodVers_countplot.set_xticklabels(version_labels, rotation=90)
andriodVers_countplot.figure.set_size_inches(10, 6)

#### Count of each content rating

In [None]:
# Bar chart with the number of apps per content rating, with vertical tick labels.
contentRating_countplot = sns.countplot(data=google_play, x="Content Rating")
content_rating_labels = contentRating_countplot.get_xticklabels()
contentRating_countplot.set_xticklabels(content_rating_labels, rotation=90)
contentRating_countplot.figure.set_size_inches(10, 6)

#### Relationship between rating and size

In [None]:
# Plot a random subset to keep the scatter readable. The sample is seeded so
# the figure is identical on every run, and its size is capped at the number
# of rows so the cell also works on a frame smaller than 1000 rows.
sample_size = min(1000, len(google_play))
random_sample = google_play.sample(n=sample_size, random_state=42)
rating_size_plot = sns.regplot(x="Rating", y="Size", data=random_sample)

#### Top ten most popular categories based on the number of installs

In [None]:
# Number of apps and mean installs per category; keep the ten categories with
# the highest mean and show both figures as an annotated heatmap.
result = (
    google_play
    .groupby("Category")["Installs"]
    .agg(["count", "mean"])
    .sort_values(by="mean", ascending=False)
    .head(10)
)
sns.heatmap(result, annot=True)

#### Top ten most popular genres based on the number of installs

In [None]:
# Number of apps and mean installs per genre; keep the ten genres with the
# highest mean and show both figures as an annotated heatmap.
result = (
    google_play
    .groupby("Genres")["Installs"]
    .agg(["count", "mean"])
    .sort_values(by="mean", ascending=False)
    .head(10)
)
sns.heatmap(result, annot=True)

#### Relationship between rating and reviews

In [None]:
# Scatter of review count against rating, coloured by app type.
sns.relplot(
    data=google_play,
    x="Rating",
    y="Reviews",
    hue="Type",
    height=5,
    aspect=2,
)

#### Top largest and smallest (by size) installed apps (relationship between mean size and count of installs)

In [None]:
# Aggregate once: number of apps and mean size for every install-count bucket.
size_by_installs = google_play.groupby("Installs")["Size"].agg(["count", "mean"])

# The 15 buckets with the largest and the 15 with the smallest mean size,
# each tagged so they can be drawn in separate facets.
top = (
    size_by_installs
    .sort_values(by="mean", ascending=False)
    .head(15)
    .assign(Size_Type="Top")
)
lowest = (
    size_by_installs
    .sort_values(by="mean", ascending=True)
    .head(15)
    .assign(Size_Type="Lowest")
)
result = pd.concat([top, lowest])

g = sns.FacetGrid(result, col="Size_Type")
g.map(sns.scatterplot, "count", "mean")

#### Relationship between last update date and rating, separated by type

In [None]:
# Rating per last-update year, with one line per app type.
sns.pointplot(
    data=google_play,
    x="Last Updated Year",
    y="Rating",
    hue="Type",
)

####  Top 10 apps by rating

In [None]:
# Ten rows with the highest rating.
(
    google_play
    .sort_values(by="Rating", ascending=False)
    .head(n=10)
)

#### Top 10 most expensive apps

In [None]:
# Ten rows with the highest price.
(
    google_play
    .sort_values(by="Price", ascending=False)
    .head(n=10)
)

#### Top 10 genres by rating mean

In [None]:
# Mean rating per genre, keeping the ten genres with the highest mean.
(
    google_play
    .groupby("Genres")[["Rating"]]
    .mean()
    .sort_values(by="Rating", ascending=False)
    .head(n=10)
)

#### Top 10 free apps with more than 1000 installs

In [None]:
# Free apps installed more than 1000 times, shown from most to least installed.
is_free = google_play["Type"] == "Free"
over_1000_installs = google_play["Installs"] > 1000
top_install_free_apps = google_play[is_free & over_1000_installs]
top_install_free_apps.sort_values(by="Installs", ascending=False).head(10)

#### Top apps with more than 1M reviews and more than 4 rating

In [None]:
# Apps with more than one million reviews and a rating above 4.
google_play[(google_play["Reviews"] > 1000000) & (google_play["Rating"] > 4)]

In [None]:
# Pairwise correlation between the numeric columns, drawn as an annotated heatmap.
# The numeric columns are selected explicitly: the frame still holds text columns,
# and recent pandas versions raise when corr() is asked to correlate those.
correlation = google_play.select_dtypes(include="number").corr()
heatmapcorr = sns.heatmap(correlation, annot=True)