In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
     ------------------------------------ 310.8/310.8 MB 886.9 kB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting py4j==0.10.9.7
  Downloading py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
     -------------------------------------- 200.5/200.5 kB 2.4 MB/s eta 0:00:00
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py): started
  Building wheel for pyspark (setup.py): finished with status 'done'
  Created wheel for pyspark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285411 sha256=cc6d1516c5fa1e5ec81c7d3dfc24680b3823a2c41a88f4214dbce8b80fdd2eda
  Stored in directory: c:\users\saini\appdata\local\pip\cache\wheels\2b\9a\39\d8019ffbfb76a39433455e3d5799e94d3e3cae8f41229f6bf8
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.7 pyspark-3.4.1


# **Importing the Libraries**

In [9]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType,IntegerType
from pyspark.sql.functions import *

In [24]:
from pyspark.sql.window import Window
import pyspark.sql.functions as F

In [32]:
from pyspark.sql.types import FloatType

# **Create a SparkSession**

In [13]:
spark = SparkSession.builder.appName("GooglePlayStoreAnalysis").getOrCreate()

# **Creating the dataframe**

In [14]:
# Load the Google Play Store dataset
data_path = "/content/googleplaystore.csv"
df = spark.read.csv(data_path, header=True, inferSchema=True)

# **Check the schema and the first few rows of the DataFrame**

In [15]:
df.printSchema()
df.show(5)

root
 |-- App: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Rating: string (nullable = true)
 |-- Reviews: string (nullable = true)
 |-- Size: string (nullable = true)
 |-- Installs: string (nullable = true)
 |-- Type: string (nullable = true)
 |-- Price: string (nullable = true)
 |-- Content Rating: string (nullable = true)
 |-- Genres: string (nullable = true)
 |-- Last Updated: string (nullable = true)
 |-- Current Ver: string (nullable = true)
 |-- Android Ver: string (nullable = true)

+--------------------+--------------+------+-------+----+-----------+----+-----+--------------+--------------------+----------------+------------------+------------+
|                 App|      Category|Rating|Reviews|Size|   Installs|Type|Price|Content Rating|              Genres|    Last Updated|       Current Ver| Android Ver|
+--------------------+--------------+------+-------+----+-----------+----+-----+--------------+--------------------+----------------+--------------

# **Data Cleaning**

In [16]:
df= df.drop("size", "Content Rating", "Last Updated", "Android Ver", "Current Ver")

In [17]:
df.show(5)

+--------------------+--------------+------+-------+-----------+----+-----+--------------------+
|                 App|      Category|Rating|Reviews|   Installs|Type|Price|              Genres|
+--------------------+--------------+------+-------+-----------+----+-----+--------------------+
|Photo Editor & Ca...|ART_AND_DESIGN|   4.1|    159|    10,000+|Free|    0|        Art & Design|
| Coloring book moana|ART_AND_DESIGN|   3.9|    967|   500,000+|Free|    0|Art & Design;Pret...|
|U Launcher Lite –...|ART_AND_DESIGN|   4.7|  87510| 5,000,000+|Free|    0|        Art & Design|
|Sketch - Draw & P...|ART_AND_DESIGN|   4.5| 215644|50,000,000+|Free|    0|        Art & Design|
|Pixel Draw - Numb...|ART_AND_DESIGN|   4.3|    967|   100,000+|Free|    0|Art & Design;Crea...|
+--------------------+--------------+------+-------+-----------+----+-----+--------------------+
only showing top 5 rows



In [18]:
df.printSchema()

root
 |-- App: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Rating: string (nullable = true)
 |-- Reviews: string (nullable = true)
 |-- Installs: string (nullable = true)
 |-- Type: string (nullable = true)
 |-- Price: string (nullable = true)
 |-- Genres: string (nullable = true)



# **Coverting the datatype of the columns into required one**

In [19]:
from pyspark.sql.types import IntegerType

# Convert 'Rating' column from string to integer
df = df.withColumn("Rating", df["Rating"].cast(IntegerType()))

# Convert 'Reviews' column from string to integer
df = df.withColumn("Reviews", df["Reviews"].cast(IntegerType()))

In [20]:
df.printSchema()

root
 |-- App: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Rating: integer (nullable = true)
 |-- Reviews: integer (nullable = true)
 |-- Installs: string (nullable = true)
 |-- Type: string (nullable = true)
 |-- Price: string (nullable = true)
 |-- Genres: string (nullable = true)



In [22]:
df.show(5)

+--------------------+--------------+------+-------+-----------+----+-----+--------------------+
|                 App|      Category|Rating|Reviews|   Installs|Type|Price|              Genres|
+--------------------+--------------+------+-------+-----------+----+-----+--------------------+
|Photo Editor & Ca...|ART_AND_DESIGN|     4|    159|    10,000+|Free|    0|        Art & Design|
| Coloring book moana|ART_AND_DESIGN|     3|    967|   500,000+|Free|    0|Art & Design;Pret...|
|U Launcher Lite –...|ART_AND_DESIGN|     4|  87510| 5,000,000+|Free|    0|        Art & Design|
|Sketch - Draw & P...|ART_AND_DESIGN|     4| 215644|50,000,000+|Free|    0|        Art & Design|
|Pixel Draw - Numb...|ART_AND_DESIGN|     4|    967|   100,000+|Free|    0|Art & Design;Crea...|
+--------------------+--------------+------+-------+-----------+----+-----+--------------------+
only showing top 5 rows



# **Find out the Top 10 Reviews given to the apps**

In [25]:
# Use Window function to rank the apps based on reviews
windowSpec = Window.orderBy(col("Reviews").desc())

# Rank the apps based on reviews
df = df.withColumn("Rank", F.rank().over(windowSpec))

# Filter the top 10 reviews
top_10_reviews = df.filter(col("Rank") <= 10)

# Select only the required columns ('App' and 'Reviews')
top_10_reviews = top_10_reviews.select("App", "Reviews")

# Show the top 10 reviews
top_10_reviews.show()

+--------------------+--------+
|                 App| Reviews|
+--------------------+--------+
|            Facebook|78158306|
|            Facebook|78128208|
|  WhatsApp Messenger|69119316|
|  WhatsApp Messenger|69119316|
|  WhatsApp Messenger|69109672|
|           Instagram|66577446|
|           Instagram|66577313|
|           Instagram|66577313|
|           Instagram|66509917|
|Messenger – Text ...|56646578|
+--------------------+--------+



# Conclusions:

1. The code finds the top 10 reviews given to the apps in the Google Play Store dataset.
2. It converts the 'Reviews' column from string to integer and then ranks the apps based on the number of reviews.
3. The result shows the top 10 apps with the highest number of reviews.

# **Top 10 installs apps and distribution of type(free/paid)**

In [37]:
# Convert 'Installs' column from string to integer
df = df.withColumn("Installs", F.regexp_replace(col("Installs"), "[+,]", "").cast(IntegerType()))

# Use Window function to rank the apps based on installs
windowSpec = Window.orderBy(col("Installs").desc())

# Rank the apps based on installs
df = df.withColumn("Rank", F.rank().over(windowSpec))

# Filter the top 10 installed apps
top_10_installed_apps = df.filter(col("Rank") <= 10)

# Select only the required columns ('App' and 'Installs')
top_10_installed_apps = top_10_installed_apps.select("App", "Installs")

# Show the top 10 installed apps
top_10_installed_apps.show(5)

+--------------------+----------+
|                 App|  Installs|
+--------------------+----------+
|   Google Play Books|1000000000|
|Messenger – Text ...|1000000000|
|  WhatsApp Messenger|1000000000|
|Google Chrome: Fa...|1000000000|
|               Gmail|1000000000|
+--------------------+----------+
only showing top 5 rows



In [36]:
# Calculate the distribution of app types (free/paid)
app_type_distribution = df.groupBy("Type").count()

# Show the distribution of app types
app_type_distribution.show(5)

+------+-----+
|  Type|count|
+------+-----+
|     0|    1|
|102248|    1|
|   NaN|    1|
|  Free|10037|
|  Paid|  800|
+------+-----+
only showing top 5 rows



# Comclusions:

1. The code finds the top 10 installed apps in the Google Play Store dataset.
2. It converts the 'Installs' column from string to integer and then ranks the apps based on the number of installs.
3. The result shows the top 10 apps with the highest number of installs.
4. Additionally, the code calculates the distribution of app types (free and paid) in the dataset.

# **Category wise distribution of installed apps**

In [28]:
# Calculate the category-wise distribution of installed apps
category_wise_distribution = df.groupBy("Category").agg(F.sum("Installs").alias("TotalInstalls"))

# Show the category-wise distribution of installed apps
category_wise_distribution.show(5)

+-------------+-------------+
|     Category|TotalInstalls|
+-------------+-------------+
|       EVENTS|     15973161|
|       COMICS|     56086150|
|       SPORTS|   1751174498|
|      WEATHER|    426100520|
|VIDEO_PLAYERS|   6222002720|
+-------------+-------------+
only showing top 5 rows



# Conclusions:

1. The code calculates the category-wise distribution of installed apps in the Google Play Store dataset.
2. It converts the 'Installs' column from string to integer and groups the apps based on their respective categories.
3. The result shows the total number of installs for each category.

# **Top Paid Apps**

In [30]:
# Filter and select the top paid apps
top_paid_apps = df.filter(col("Type") == "Paid").orderBy(col("Price").desc())

# Show the top paid apps
top_paid_apps.show(5)

+--------------------+---------------+------+-------+--------+----+-----+---------------+----+
|                 App|       Category|Rating|Reviews|Installs|Type|Price|         Genres|Rank|
+--------------------+---------------+------+-------+--------+----+-----+---------------+----+
|        ASCCP Mobile|        MEDICAL|     4|     63|   10000|Paid|$9.99|        Medical|6597|
|Critical Care Par...|        MEDICAL|     4|     17|    1000|Paid|$9.99|        Medical|8128|
|        ASCCP Mobile|        MEDICAL|     4|     63|   10000|Paid|$9.99|        Medical|6597|
|Baldur's Gate: En...|         FAMILY|     4|  20101|  100000|Paid|$9.99|   Role Playing|4949|
|        BIG Launcher|PERSONALIZATION|     4|    881|   10000|Paid|$9.99|Personalization|6597|
+--------------------+---------------+------+-------+--------+----+-----+---------------+----+
only showing top 5 rows



# Conclusions:

1. The code finds the top paid apps in the Google Play Store dataset.
2. It filters the DataFrame to include only the apps with the 'Type' column value equal to 'Paid'.
3. The result shows the top paid apps sorted by their prices in descending order.

# **Top Paid Rating Apps**

In [34]:
# Convert 'Rating' column from string to float
df = df.withColumn("Rating", df["Rating"].cast(FloatType()))

# Filter and select the top paid rating apps
top_paid_rating_apps = df.filter((col("Type") == "Paid") & (col("Rating").isNotNull())).orderBy(col("Rating").desc())

# Show the top paid rating apps
top_paid_rating_apps.show(5)

+--------------------+-------------------+------+-------+--------+----+-----+-----------------+----+
|                 App|           Category|Rating|Reviews|Installs|Type|Price|           Genres|Rank|
+--------------------+-------------------+------+-------+--------+----+-----+-----------------+----+
|      FHR 5-Tier 2.0|            MEDICAL|   5.0|      2|     500|Paid|$2.99|          Medical|9035|
|Hey AJ! It's Satu...|BOOKS_AND_REFERENCE|   5.0|     12|     100|Paid|$3.99|Books & Reference|9365|
|Super Hearing Sec...|            MEDICAL|   5.0|      3|     100|Paid|$2.99|          Medical|9365|
|        ADS-B Driver|              TOOLS|   5.0|      2|     100|Paid|$1.99|            Tools|9365|
|     P-Home for KLWP|    PERSONALIZATION|   5.0|      4|     100|Paid|$0.99|  Personalization|9365|
+--------------------+-------------------+------+-------+--------+----+-----+-----------------+----+
only showing top 5 rows



# Conclusions:

1. The code finds the top paid rating apps in the Google Play Store dataset.
2. It filters the DataFrame to include only the apps with the 'Type' column value equal to 'Paid' and a non-null 'Rating'.
3. The result shows the top paid rating apps sorted by their ratings in descending order.