In [89]:
from pyspark.sql.types import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.functions import col, to_date , isnan, when, count, year, month, avg, sum as pyspark_sum , when , lit , regexp_replace , mean, stddev, when

In [37]:
spark=SparkSession.builder.appName("play Store") \
      .getOrCreate()

In [65]:
df = spark.read.csv('Google-Playstore.csv', header=True, inferSchema=False)
df.show()

+----------------------------------+--------------------+-----------------+------+------------+--------+----------------+----------------+----+-----+--------+----+---------------+--------------------+--------------------+--------------------+------------+------------+--------------+--------------------+------------+----------------+--------------+-------------------+
|                          App Name|              App Id|         Category|Rating|Rating Count|Installs|Minimum Installs|Maximum Installs|Free|Price|Currency|Size|Minimum Android|        Developer Id|   Developer Website|     Developer Email|    Released|Last Updated|Content Rating|      Privacy Policy|Ad Supported|In App Purchases|Editors Choice|       Scraped Time|
+----------------------------------+--------------------+-----------------+------+------------+--------+----------------+----------------+----+-----+--------+----+---------------+--------------------+--------------------+--------------------+------------+-----

In [66]:
df.printSchema()

root
 |-- App Name: string (nullable = true)
 |-- App Id: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Rating: string (nullable = true)
 |-- Rating Count: string (nullable = true)
 |-- Installs: string (nullable = true)
 |-- Minimum Installs: string (nullable = true)
 |-- Maximum Installs: string (nullable = true)
 |-- Free: string (nullable = true)
 |-- Price: string (nullable = true)
 |-- Currency: string (nullable = true)
 |-- Size: string (nullable = true)
 |-- Minimum Android: string (nullable = true)
 |-- Developer Id: string (nullable = true)
 |-- Developer Website: string (nullable = true)
 |-- Developer Email: string (nullable = true)
 |-- Released: string (nullable = true)
 |-- Last Updated: string (nullable = true)
 |-- Content Rating: string (nullable = true)
 |-- Privacy Policy: string (nullable = true)
 |-- Ad Supported: string (nullable = true)
 |-- In App Purchases: string (nullable = true)
 |-- Editors Choice: string (nullable = true)
 |-- Scra

# Dropping unnecessary columns

In [67]:


df = df.drop('Developer Website', 'Free','Privacy Policy')
df.show()


+----------------------------------+--------------------+-----------------+------+------------+--------+----------------+----------------+-----+--------+----+---------------+--------------------+--------------------+------------+------------+--------------+------------+----------------+--------------+-------------------+
|                          App Name|              App Id|         Category|Rating|Rating Count|Installs|Minimum Installs|Maximum Installs|Price|Currency|Size|Minimum Android|        Developer Id|     Developer Email|    Released|Last Updated|Content Rating|Ad Supported|In App Purchases|Editors Choice|       Scraped Time|
+----------------------------------+--------------------+-----------------+------+------------+--------+----------------+----------------+-----+--------+----+---------------+--------------------+--------------------+------------+------------+--------------+------------+----------------+--------------+-------------------+
|                           Gak

## Changing the datatype of Scraping time column and reducing it into a unified format 'YYYY-MM-DD'

In [68]:
df = df.withColumn("Scraped Time", col("Scraped Time").substr(1, 10))


df.show(truncate=False)

+-------------------------------------------------------+---------------------------------------------+-----------------+------+------------+--------+----------------+----------------+-----+--------+----+---------------+-----------------------------------+-------------------------------+------------+------------+--------------+------------+----------------+--------------+------------+
|App Name                                               |App Id                                       |Category         |Rating|Rating Count|Installs|Minimum Installs|Maximum Installs|Price|Currency|Size|Minimum Android|Developer Id                       |Developer Email                |Released    |Last Updated|Content Rating|Ad Supported|In App Purchases|Editors Choice|Scraped Time|
+-------------------------------------------------------+---------------------------------------------+-----------------+------+------------+--------+----------------+----------------+-----+--------+----+---------------+----

# there is no any missing value in our data

In [58]:
from pyspark.sql.functions import isnan, when, count, col

df.select([count(when(isnan(c), c)).alias(c) for c in df.columns]).show()

+--------+------+--------+------+------------+--------+----------------+----------------+-----+--------+----+---------------+------------+---------------+--------+------------+--------------+------------+----------------+--------------+------------+
|App Name|App Id|Category|Rating|Rating Count|Installs|Minimum Installs|Maximum Installs|Price|Currency|Size|Minimum Android|Developer Id|Developer Email|Released|Last Updated|Content Rating|Ad Supported|In App Purchases|Editors Choice|Scraped Time|
+--------+------+--------+------+------------+--------+----------------+----------------+-----+--------+----+---------------+------------+---------------+--------+------------+--------------+------------+----------------+--------------+------------+
|       0|     0|       0|     0|           0|       0|               0|               0|    0|       0|   0|              0|           0|              0|       0|           0|             0|           0|               0|             0|           0|


### The content rating column is divided into several categories.
#### Everyone
#### Teenager Adult 17 years of age or older
#### Everyone 10+
#### Not rated
#### adults only (18)
### For easier comprehension, we are now going to reduce these categories to just three: Everyone, including teenagers and adults

#### Mature 17+ ---------> to Adults
#### Adults only 18 -----------> to Adults
#### Everyone 10+ ------------> to Teen
#### Unrated -------------> Everyone

In [69]:
df = df.withColumn("Content Rating", 
                   when(df["Content Rating"] == "Unrated", "Everyone")
                   .when(df["Content Rating"] == "Adults only 18+", "Adults")
                   .when(df["Content Rating"] == "Mature 17+", "Adults")
                   .when(df["Content Rating"] == "Everyone 10+", "Teen")
                   .otherwise(df["Content Rating"]))

df.show()

+----------------------------------+--------------------+-----------------+------+------------+--------+----------------+----------------+-----+--------+----+---------------+--------------------+--------------------+------------+------------+--------------+------------+----------------+--------------+------------+
|                          App Name|              App Id|         Category|Rating|Rating Count|Installs|Minimum Installs|Maximum Installs|Price|Currency|Size|Minimum Android|        Developer Id|     Developer Email|    Released|Last Updated|Content Rating|Ad Supported|In App Purchases|Editors Choice|Scraped Time|
+----------------------------------+--------------------+-----------------+------+------------+--------+----------------+----------------+-----+--------+----+---------------+--------------------+--------------------+------------+------------+--------------+------------+----------------+--------------+------------+
|                           Gakondo| com.ishakwe.gak

# changing the datatype of dates column from string to date

In [70]:
df = df.withColumn("Scraped Time", to_date(col("Scraped Time")))
df.printSchema()

root
 |-- App Name: string (nullable = true)
 |-- App Id: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Rating: string (nullable = true)
 |-- Rating Count: string (nullable = true)
 |-- Installs: string (nullable = true)
 |-- Minimum Installs: string (nullable = true)
 |-- Maximum Installs: string (nullable = true)
 |-- Price: string (nullable = true)
 |-- Currency: string (nullable = true)
 |-- Size: string (nullable = true)
 |-- Minimum Android: string (nullable = true)
 |-- Developer Id: string (nullable = true)
 |-- Developer Email: string (nullable = true)
 |-- Released: string (nullable = true)
 |-- Last Updated: string (nullable = true)
 |-- Content Rating: string (nullable = true)
 |-- Ad Supported: string (nullable = true)
 |-- In App Purchases: string (nullable = true)
 |-- Editors Choice: string (nullable = true)
 |-- Scraped Time: date (nullable = true)



# Changing the data types of columns to be ready for EDA

In [71]:
df=df.withColumn("Rating", df["Rating"].cast(FloatType()))
df=df.withColumn("Rating Count", df["Rating Count"].cast(IntegerType()))

df=df.withColumn("Price", df["Price"].cast(FloatType()))
df=df.withColumn("Minimum Installs", df["Minimum Installs"].cast(IntegerType()))
df=df.withColumn("Maximum Installs", df["Maximum Installs"].cast(IntegerType()))



df.printSchema()

root
 |-- App Name: string (nullable = true)
 |-- App Id: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Rating: float (nullable = true)
 |-- Rating Count: integer (nullable = true)
 |-- Installs: string (nullable = true)
 |-- Minimum Installs: integer (nullable = true)
 |-- Maximum Installs: integer (nullable = true)
 |-- Price: float (nullable = true)
 |-- Currency: string (nullable = true)
 |-- Size: string (nullable = true)
 |-- Minimum Android: string (nullable = true)
 |-- Developer Id: string (nullable = true)
 |-- Developer Email: string (nullable = true)
 |-- Released: string (nullable = true)
 |-- Last Updated: string (nullable = true)
 |-- Content Rating: string (nullable = true)
 |-- Ad Supported: string (nullable = true)
 |-- In App Purchases: string (nullable = true)
 |-- Editors Choice: string (nullable = true)
 |-- Scraped Time: date (nullable = true)



In [76]:
df = df.withColumn("Installs", regexp_replace("Installs", "[^0-9]", "").cast("int"))

df.show()

+----------------------------------+--------------------+-----------------+------+------------+--------+----------------+----------------+-----+--------+----+---------------+--------------------+--------------------+------------+------------+--------------+------------+----------------+--------------+------------+
|                          App Name|              App Id|         Category|Rating|Rating Count|Installs|Minimum Installs|Maximum Installs|Price|Currency|Size|Minimum Android|        Developer Id|     Developer Email|    Released|Last Updated|Content Rating|Ad Supported|In App Purchases|Editors Choice|Scraped Time|
+----------------------------------+--------------------+-----------------+------+------------+--------+----------------+----------------+-----+--------+----+---------------+--------------------+--------------------+------------+------------+--------------+------------+----------------+--------------+------------+
|                           Gakondo| com.ishakwe.gak

# EXPLORATORY DATA ANALYSIS

## what is the percentage of Rated apps ?

In [17]:
# filter data rating equal 0
rate_0 = df.filter(col('Rating') == 0)

rate_0_count = rate_0.count()

no_imputed = df

print(f'\nNo Rated Apps: {rate_0_count / df.count() * 100 :.2f}%\nRated Apps : {(1-rate_0_count/ df.count())* 100 :.2f}%')

#fiter data rating with out 0
df = df.filter(col('Rating') != 0)



No Rated Apps: 47.31%
Rated Apps : 52.69%


## Top 15 apps in Rating in the Google Store?

In [77]:
store_df=df.orderBy(col('Installs').desc()).limit(15)

store_df.select('App Name','Installs','Rating','Rating Count','Price').show()

+--------------------+----------+------+------------+-----+
|            App Name|  Installs|Rating|Rating Count|Price|
+--------------------+----------+------+------------+-----+
|     Files by Google|1000000000|   4.6|     4495688|  0.0|
|            Messages|1000000000|   4.4|     5305758|  0.0|
|Samsung Security ...|1000000000|   4.2|      304658|  0.0|
|SHAREit - Transfe...|1000000000|   4.2|    16023749|  0.0|
|            Snapchat|1000000000|   4.3|    26340056|  0.0|
|    Carrier Services|1000000000|   4.3|      755024|  0.0|
|Samsung Voice Rec...|1000000000|   4.3|      162889|  0.0|
|Google Play Servi...|1000000000|   4.1|      383330|  0.0|
|Spotify: Listen t...|1000000000|   4.4|    22752840|  0.0|
|         Device Care|1000000000|   4.3|      360421|  0.0|
|Samsung Push Service|1000000000|   4.2|     1160802|  0.0|
|      Subway Surfers|1000000000|   4.6|     5304826|  0.0|
|       Secure Folder|1000000000|   4.2|      188987|  0.0|
|Microsoft PowerPo...|1000000000|   4.3|

## The most installed apps

In [24]:
store_df=df.orderBy(col('Rating').desc()).limit(15)

store_df.select('App Name','Installs','Rating','Rating Count','Price').show()

+--------------------+--------+------------+------+-----+
|            App Name|Installs|Rating Count|Rating|Price|
+--------------------+--------+------------+------+-----+
|             Fazzaco| 10,000+|         156|   5.0|    0|
|Gold Rose Live Wa...|  1,000+|         187|   5.0|    0|
|        Duo Cylinder|  1,000+|           5|   5.0|    0|
|  Colour Paino Tiles|     10+|          16|   5.0|    0|
| Inverted Space Ship|     10+|           8|   5.0|    0|
|Niagara Falls Wal...|    500+|          11|   5.0|    0|
|        POOL CAR APP|     50+|           5|   5.0|    0|
|Triple Point Academy|     10+|           5|   5.0|    0|
|Philly's Favor 100.7|  1,000+|          10|   5.0|    0|
|Smart City Trichy...|     10+|           5|   5.0|    0|
|  Football Questions|    100+|          21|   5.0|    0|
|Let's Fact? Let's...|     10+|           5|   5.0|    0|
|     CliQer Merchant|    100+|           9|   5.0|    0|
|           SensorLab|    100+|           6|   5.0|    0|
|Beef Recipes 

## Who are the top developers by the number of apps?

In [23]:
developer_app_counts = df.groupBy("Developer Id").agg(count("*").alias("App Count")).orderBy("App Count", ascending=False)
developer_app_counts.show()

+--------------------+---------+
|        Developer Id|App Count|
+--------------------+---------+
|       Subsplash Inc|     5422|
|          TRAINERIZE|     5153|
|             ChowNow|     4865|
|           OrderYOYO|     2884|
|             Phorest|     2821|
|BH App Developmen...|     2453|
|          Sharefaith|     2077|
|            Flipdish|     1969|
|          J&M Studio|     1942|
|          CyJ Studio|     1741|
|             Apptegy|     1729|
|      +HOME by Ateam|     1609|
|        Magzter Inc.|     1604|
|Branded Apps by M...|     1570|
|Currency Converte...|     1520|
|             echurch|     1463|
|           Skalpelis|     1333|
|           TTMA Apps|     1312|
|Virtuagym Profess...|     1268|
|Branded MINDBODY ...|     1184|
+--------------------+---------+
only showing top 20 rows



## What are the most common app categories?

In [31]:
category_counts = df.groupBy("Category").agg(count("*").alias("App Count"))

popular_categories = category_counts.orderBy("App Count", ascending=False)

popular_categories.show()

+-----------------+---------+
|         Category|App Count|
+-----------------+---------+
|        Education|   241086|
|    Music & Audio|   154905|
|            Tools|   143987|
|         Business|   143770|
|    Entertainment|   138271|
|        Lifestyle|   118331|
|Books & Reference|   116726|
|  Personalization|    89210|
| Health & Fitness|    83510|
|     Productivity|    79695|
|         Shopping|    75253|
|     Food & Drink|    73926|
|   Travel & Local|    67288|
|          Finance|    65465|
|           Arcade|    53792|
|           Puzzle|    51168|
|           Casual|    50813|
|    Communication|    48167|
|           Sports|    47483|
|           Social|    44733|
+-----------------+---------+
only showing top 20 rows



## How are installs distributed among different apps?

In [78]:
install_stats = df.selectExpr(
    "min(Installs) as Min_Installs",
    "percentile_approx(Installs, 0.25) as Q1_Installs",
    "percentile_approx(Installs, 0.5) as Median_Installs",
    "percentile_approx(Installs, 0.75) as Q3_Installs",
    "max(Installs) as Max_Installs"
)

install_stats.show()

+------------+-----------+---------------+-----------+------------+
|Min_Installs|Q1_Installs|Median_Installs|Q3_Installs|Max_Installs|
+------------+-----------+---------------+-----------+------------+
|           0|         50|            500|       5000|  1000000000|
+------------+-----------+---------------+-----------+------------+



## How many apps support ads or in-app purchases

In [79]:
ads_count = df.filter(col("Ad Supported") == "True").count()


in_app_purchases_count = df.filter(col("In App Purchases") == "True").count()

print("Number of apps that support ads:", ads_count)
print("Number of apps that have in-app purchases:", in_app_purchases_count)

Number of apps that support ads: 1150750
Number of apps that have in-app purchases: 195319


## What is the category with the most installs?

In [92]:
category_installs = df.groupBy("Category").agg(sum("Installs").alias("Total Installs"))


category_installs = category_installs.orderBy(col("Total Installs").desc())


category_installs.show()

+--------------------+--------------+
|            Category|Total Installs|
+--------------------+--------------+
|               Tools|   46440471469|
|        Productivity|   23314050628|
|       Communication|   18216594754|
|              Action|   17400377388|
|       Entertainment|   17108498394|
|              Casual|   16837071701|
|              Arcade|   14501333037|
|       Music & Audio|   14239411974|
|         Photography|   13998958963|
|              Social|   12165995316|
|          Simulation|   11690020565|
|              Puzzle|   10796428127|
|     Personalization|    9252977243|
|              Racing|    9218081047|
|Video Players & E...|    8591154209|
|              Sports|    7954127503|
|            Shopping|    7108602885|
|             Finance|    6158488850|
|           Lifestyle|    5997923997|
|           Education|    5983961831|
+--------------------+--------------+
only showing top 20 rows



## The most Downloaded Apps Are Free Or Paid?

In [97]:
category_counts = df.groupBy("Category") \
                    .agg(count(when(col("Price") == "0", True)).alias("Install Free"),
                         count(when(col("Price") != "0", True)).alias("Paid"))

category_counts = category_counts.withColumn("Total Apps", col("Install Free") + col("Paid"))


category_percentages = category_counts.withColumn("% Install Free", (col("Install Free") / col("Total Apps")) * 100) \
                                      .withColumn("% Paid", (col("Paid") / col("Total Apps")) * 100)


category_percentages.show()

+--------------------+------------+----+----------+-----------------+------------------+
|            Category|Install Free|Paid|Total Apps|   % Install Free|            % Paid|
+--------------------+------------+----+----------+-----------------+------------------+
|       Music & Audio|      153381|1524|    154905|99.01617120170427|0.9838287982957297|
|           Education|      234539|6547|    241086|97.28437155206025|2.7156284479397392|
|              Trivia|       11683| 112|     11795|99.05044510385757|0.9495548961424332|
|     Auto & Vehicles|       17998| 282|     18280|98.45733041575492|1.5426695842450766|
|       Entertainment|      136889|1382|    138271|99.00051348438936|0.9994865156106486|
|           Adventure|       22221| 982|     23203|95.76778864801965|4.2322113519803475|
|com.free074a81ba9...|           0|   0|         0|             NULL|              NULL|
|              Arcade|       53029| 763|     53792|98.58157346817372|1.4184265318262939|
|              Sports

In [96]:
total_counts = df.agg(count(when(col("Price") == "0", True)).alias("Total Install Free"),
                      count(when(col("Price") != "0", True)).alias("Total Paid"),
                      count(lit(1)).alias("Total Apps"))


overall_percentages = total_counts.withColumn("% Install Free", (col("Total Install Free") / col("Total Apps")) * 100) \
                                  .withColumn("% Paid", (col("Total Paid") / col("Total Apps")) * 100)


overall_percentages.show()

+------------------+----------+----------+----------------+------------------+
|Total Install Free|Total Paid|Total Apps|  % Install Free|            % Paid|
+------------------+----------+----------+----------------+------------------+
|           2267982|     44943|   2312944|98.0560705317552|1.9431080043442468|
+------------------+----------+----------+----------------+------------------+

