In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
!pip install findspark



In [None]:
# Install PySpark and Java
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.0.1/spark-3.0.1-bin-hadoop2.7.tgz
!tar xf spark-3.0.1-bin-hadoop2.7.tgz
!pip install -q findspark


0% [Working]            Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
0% [Connecting to archive.ubuntu.com (185.125.190.81)] [1 InRelease 14.2 kB/129 kB 11%] [Connected t                                                                                                    Hit:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
0% [Connecting to archive.ubuntu.com (185.125.190.81)] [1 InRelease 129 kB/129 kB 100%] [Connected t0% [Connecting to archive.ubuntu.com (185.125.190.81)] [Connected to r2u.stat.illinois.edu (192.17.1                                                                                                    Hit:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Ign:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Get:6 https://r2u.stat.illinois.edu/ubuntu jammy Release [5,713 B]
Get:7 http://archive.ubuntu.com/ubuntu jammy

In [None]:
# Export the JDK and Spark distribution locations as environment variables.
import os

for env_name, env_value in (
    ("JAVA_HOME", "/usr/lib/jvm/java-8-openjdk-amd64"),
    ("SPARK_HOME", "/content/spark-3.0.1-bin-hadoop2.7"),
):
    os.environ[env_name] = env_value

In [None]:
import findspark
# NOTE(review): init() is called before the pyspark imports — presumably it uses SPARK_HOME
# to make the downloaded pyspark importable, so keep this order.
findspark.init()
from pyspark.sql import SparkSession
# Wildcard import: supplies col, from_json, ltrim, explode, split, regexp_extract and when used below.
# NOTE(review): it likely also shadows Python built-ins of the same name (e.g. sum, min, max).
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, DateType

In [None]:
# Create (or reuse) the session; spark.jars.packages names the spark-sql-kafka connector artifact.
spark = (
    SparkSession.builder
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.1")
    .getOrCreate()
)

In [None]:
# Display the session object.
spark

In [None]:
# Streaming source: subscribe to the "media_titles" Kafka topic, starting from the earliest offsets.
kafka_reader = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "60d13a9ee7896326846ff53638bdb4c1.serveo.net:80")
    .option("subscribe", "media_titles")
    .option("startingOffsets", "earliest")
)
df = kafka_reader.load()

In [None]:
# Show the columns of the raw Kafka source frame.
df.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [None]:
# Every field of the incoming JSON record is declared as a nullable string.
schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in (
        "show_id",
        "type",
        "title",
        "director",
        "cast",
        "country",
        "date_added",
        "release_year",
        "rating",
        "duration",
        "listed_in",
        "description",
        "source",
    )
])

In [None]:
# Read the Kafka value bytes as a string, parse it as JSON with `schema`, then flatten the struct into columns.
parsed_value = from_json(col("value").cast("string"), schema)
json_df = df.select(parsed_value.alias("data")).select("data.*")

In [None]:
# Show the columns produced by parsing the JSON payload.
json_df.printSchema()

root
 |-- show_id: string (nullable = true)
 |-- type: string (nullable = true)
 |-- title: string (nullable = true)
 |-- director: string (nullable = true)
 |-- cast: string (nullable = true)
 |-- country: string (nullable = true)
 |-- date_added: string (nullable = true)
 |-- release_year: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- listed_in: string (nullable = true)
 |-- description: string (nullable = true)
 |-- source: string (nullable = true)



In [None]:
# Turn empty strings into nulls (no subset is given, so the replacement is not limited to particular columns).
Updateemptystring = json_df.replace(to_replace="", value=None)


In [None]:
# Strip leading whitespace from every column while keeping the column names.
# NOTE(review): ltrim leaves trailing whitespace in place — confirm that `trim` is not what was intended.
left_trimmed_columns = [ltrim(name).alias(name) for name in Updateemptystring.columns]
removewhitespace = Updateemptystring.select(left_trimmed_columns)

In [None]:
# Remove the date_added column.
columns_to_drop = ["date_added"]
drop_date_added = removewhitespace.drop(*columns_to_drop)

In [None]:
# One output row per category: split the comma-separated listed_in string and explode it.
# The separator pattern also consumes whitespace around each comma; splitting on a bare ","
# left a leading space on every category after the first ("a, b" produced " b").
# NOTE(review): explode() emits no row when listed_in is null — use explode_outer if titles
# without a category must be kept.
explode_listed_in = (
    drop_date_added
    .withColumn("Category", explode(split(col("listed_in"), r"\s*,\s*")))
    .drop("listed_in")
)

In [None]:
# Replace literal "null" strings, then fill genuinely missing directors.
# replace() only matches string values equal to "null"; real nulls (for example the ones
# created from empty strings earlier in the notebook) are only reached by fillna().
director_nulls = (
    explode_listed_in
    .replace("null", "unknown")
    .fillna("unknown", subset=["director"])
)

In [None]:
# director_nulls already had its "null" strings replaced, so repeating that replace changed nothing.
# Fill the missing cast values instead, which is what this step is named for.
cast_nulls = director_nulls.fillna("unknown", subset=["cast"])

In [None]:
# Remove rows whose director value is fully numeric (digits only)
# Keep rows whose director is not made up solely of digits.
# A null director makes the regex test evaluate to null, and filter() discards rows whose
# predicate is null, so nulls are let through explicitly instead of being dropped by accident.
# The pattern is a raw string: "\d" in a normal literal is an invalid escape sequence.
director_is_numeric = col("director").rlike(r"^\d+$")
director_df = cast_nulls.filter(col("director").isNull() | ~director_is_numeric)

In [None]:
# Display the filtered frame. The previous drop() call named no columns, so it removed nothing.
director_df

DataFrame[show_id: string, type: string, title: string, director: string, cast: string, country: string, release_year: string, rating: string, duration: string, description: string, source: string, Category: string]

In [None]:
# Remove rows whose cast value is fully numeric (digits only)
# Keep rows whose cast is not made up solely of digits.
# Null cast values are let through explicitly: a null regex result would otherwise make
# filter() drop the row. The pattern is a raw string so "\d" is not an invalid escape.
# NOTE(review): this starts again from cast_nulls, so the director filter is not included —
# confirm whether it should chain from director_df.
cast_is_numeric = col("cast").rlike(r"^\d+$")
cast_df = cast_nulls.filter(col("cast").isNull() | ~cast_is_numeric)

In [None]:
# Display the filtered frame. The previous drop() call named no columns, so it removed nothing.
cast_df

DataFrame[show_id: string, type: string, title: string, director: string, cast: string, country: string, release_year: string, rating: string, duration: string, description: string, source: string, Category: string]

In [None]:
# cast_nulls already had its "null" strings replaced, so repeating that replace changed nothing.
# Fill the missing description values instead, which is what this step is named for.
# NOTE(review): the input is cast_nulls, so the numeric director/cast filters are not part of
# this frame — confirm whether it should build on cast_df.
description_nulls = cast_nulls.fillna("unknown", subset=["description"])

In [None]:
# Convert release_year from string to integer.
# NOTE(review): the input is drop_date_added, so the Category explode, the "unknown"
# substitutions and the numeric-name filters defined in earlier cells are not part of
# casting_year (its printed schema still lists listed_in) — confirm this is intended.
release_year_as_int = col("release_year").cast("int")
casting_year = drop_date_added.withColumn("release_year", release_year_as_int)

In [None]:
# Print the schema to check the release_year column type after the cast.
casting_year.printSchema()

root
 |-- show_id: string (nullable = true)
 |-- type: string (nullable = true)
 |-- title: string (nullable = true)
 |-- director: string (nullable = true)
 |-- cast: string (nullable = true)
 |-- country: string (nullable = true)
 |-- release_year: integer (nullable = true)
 |-- rating: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- listed_in: string (nullable = true)
 |-- description: string (nullable = true)
 |-- source: string (nullable = true)



In [None]:
# Duration cut-offs in minutes used for classification: short first, medium second.
short_threshold, medium_threshold = 90, 150

In [None]:
# Bucket each row by the first run of digits found in its duration string.
# The number is extracted once and reused by every branch. Rows with no parseable number
# used to fail every comparison (null) and fall into otherwise(), which labelled them "long";
# they are now labelled "unknown".
# NOTE(review): the digits are taken regardless of unit or title type — confirm that only
# minute-based movie durations reach this step.
duration_value = regexp_extract(col("duration"), r"(\d+)", 1).cast("integer")
bins_movies = casting_year.withColumn(
    "movie_duration_category",
    when(duration_value.isNull(), "unknown")
    .when(duration_value <= short_threshold, "short")
    # Reaching this branch already implies duration_value > short_threshold.
    .when(duration_value <= medium_threshold, "medium")
    .otherwise("long"),
)

In [None]:
# Print the schema to check that movie_duration_category was added.
bins_movies.printSchema()

root
 |-- show_id: string (nullable = true)
 |-- type: string (nullable = true)
 |-- title: string (nullable = true)
 |-- director: string (nullable = true)
 |-- cast: string (nullable = true)
 |-- country: string (nullable = true)
 |-- release_year: integer (nullable = true)
 |-- rating: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- listed_in: string (nullable = true)
 |-- description: string (nullable = true)
 |-- source: string (nullable = true)
 |-- movie_duration_category: string (nullable = false)



In [None]:
# Start the streaming query: append-mode output written to the console sink.
console_writer = bins_movies.writeStream.outputMode("append").format("console")
query = console_writer.start()

In [None]:
# Check whether bins_movies is a streaming DataFrame.
bins_movies.isStreaming

True