<a href="https://colab.research.google.com/github/Mr-Hackrr/pySpark/blob/main/movies%20pyspark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%writefile movies.csv
user_id,series,season,timestamp,genre,duration_mins
521,”Mirzapur”,3,2024-07-30 15:00:00,action,300
672,”Panchayat”,3,2024-07-30 15:00:00,comedy,200
197,”Family Man”,2,2024-07-30 15:00:00,action,500
521,”Mirzapur”,2,2024-07-29 15:00:00,action,280
211,”Queens Gambit”,1,2024-07-30 15:00:00,drama,170
521,”Mirzapur”,1,2024-07-28 15:00:00,action,230
844,”Westworld”,3,2024-07-30 15:00:00,sci-fi,310
672,”Panchayat”,3,2024-07-29 15:00:00,comedy,210
256,”Homecoming”,2,2024-07-30 15:00:00,thriller,310
489,”Outer Range”,1,2024-07-30 15:00:00,sci-fi,340
200,”Black Mirror”,2,2024-07-30 15:00:00,sci-fi,140
256,”Outer Range”,2,2024-07-30 15:00:00,thriller,250
489,”Outer Range”,2,2024-07-28 15:00:00,sci-fi,170
200,”Black Mirror”,3,2024-07-29 15:00:00,sci-fi,190
672,”Panchayat”,2,2024-07-28 15:00:00,comedy,160
672,”Outer Range”,1,2024-07-25 15:00:00,sci-fi,250
200,”Black Mirror”,4,2024-07-28 15:00:00,sci-fi,200
844,”Westworld”,2,2024-07-29 15:00:00,sci-fi,300
672,”Black Mirror”,5,2024-07-28 15:00:00,sci-fi,150
672,”Panchayat”,1,2024-07-27 15:00:00,comedy,190

Writing movies.csv


In [2]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488490 sha256=e8e569ca186de506029c5c9646b0409981a93a3df83444580f359c5720a39707
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [3]:
#Import Libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [4]:
#Create SparkSession for app (Streaming Analysis)
spark = SparkSession.builder.appName('Movie analysis').getOrCreate()

In [5]:
#Create Dataframe series_df
moviesdf = spark.read.csv("movies.csv",header=True)

In [6]:
moviesdf.show()

+-------+---------------+------+-------------------+--------+-------------+
|user_id|         series|season|          timestamp|   genre|duration_mins|
+-------+---------------+------+-------------------+--------+-------------+
|    521|     ”Mirzapur”|     3|2024-07-30 15:00:00|  action|          300|
|    672|    ”Panchayat”|     3|2024-07-30 15:00:00|  comedy|          200|
|    197|   ”Family Man”|     2|2024-07-30 15:00:00|  action|          500|
|    521|     ”Mirzapur”|     2|2024-07-29 15:00:00|  action|          280|
|    211|”Queens Gambit”|     1|2024-07-30 15:00:00|   drama|          170|
|    521|     ”Mirzapur”|     1|2024-07-28 15:00:00|  action|          230|
|    844|    ”Westworld”|     3|2024-07-30 15:00:00|  sci-fi|          310|
|    672|    ”Panchayat”|     3|2024-07-29 15:00:00|  comedy|          210|
|    256|   ”Homecoming”|     2|2024-07-30 15:00:00|thriller|          310|
|    489|  ”Outer Range”|     1|2024-07-30 15:00:00|  sci-fi|          340|
|    200| ”B

In [26]:
#Find the user with maximum watchtime
#moviesdf.filter(moviesdf.duration_mins == moviesdf.duration_mins.max())
moviesdf.groupBy('user_id').agg(sum('duration_mins')).sort(col('sum(duration_mins)').desc()).first()[0]

'672'

In [16]:
#Calculate overall total Watchtime
moviesdf.agg({'duration_mins':'sum'}).collect()[0][0]

4850.0

In [19]:
#Find most popular shows (based on watchtime)
popular = moviesdf.groupBy('series').agg({'duration_mins':'sum'}).orderBy(col('sum(duration_mins)').desc()).limit(3)
popular.rdd.flatMap(lambda x : [x[0]]).collect()

['”Outer Range”', '”Mirzapur”', '”Panchayat”']

In [23]:
#Find most popular shows (based on user popularity)
user_popularity = moviesdf.groupBy('series').agg({'user_id':'count'}).orderBy(col('count(user_id)').desc()).limit(3)
user_popularity.rdd.flatMap(lambda x : [x[0]]).collect()

['”Outer Range”', '”Panchayat”', '”Black Mirror”']

In [27]:
#Find the most popular genre
genre = moviesdf.groupBy('genre').agg(count('user_id'))
genre.rdd.flatMap(lambda x : [x[0]]).collect()[0]

'action'

In [35]:
#Calculate total watchtime per user
moviesdf.groupBy('user_id').agg(sum('duration_mins')).select(col("user_id"),col("sum(duration_mins)").alias('total_watch_time')).show()

+-------+----------------+
|user_id|total_watch_time|
+-------+----------------+
|    521|           810.0|
|    200|           530.0|
|    672|          1160.0|
|    256|           560.0|
|    197|           500.0|
|    211|           170.0|
|    844|           610.0|
|    489|           510.0|
+-------+----------------+



In [None]:
#Find most popular genre (based on engagement count)


In [None]:
#Find average watchtime per genre


In [None]:
#Find peak traffic days
#(Output 1 = Full Date)


#(Output 2 = Only Day)



In [None]:
#Find the user with most diverse show preference


In [None]:
#Find the binge-watchers


In [None]:
#Find the user with longest watching streak


In [None]:
#Total Seasons available


In [None]:
#Fetch a list of all series
