In [None]:
!pip -q install pyspark==3.5.1
!apt-get -q install openjdk-11-jre-headless

Reading package lists...
Building dependency tree...
Reading state information...
openjdk-11-jre-headless is already the newest version (11.0.28+6-1ubuntu1~22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 41 not upgraded.


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, trim, length

In [None]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("IMDB")
    .getOrCreate()
)

# Bare last expression: shows the session object as the cell output.
spark

In [None]:
# EXPLORE: load the raw CSV and take a first look at it.
csv_path = "/content/IMDB Dataset.csv"

# The double quote serves as both quote and escape character, and multiLine
# allows a quoted field to continue across line breaks.
read_options = {
    "header": True,
    "inferSchema": True,
    "multiLine": True,
    "escape": '"',
    "quote": '"',
}
df = spark.read.csv(csv_path, **read_options)

df.printSchema()
df.show(5)

root
 |-- review: string (nullable = true)
 |-- sentiment: string (nullable = true)

+--------------------+---------+
|              review|sentiment|
+--------------------+---------+
|One of the other ...| positive|
|A wonderful littl...| positive|
|I thought this wa...| positive|
|Basically there's...| negative|
|Petter Mattei's "...| positive|
+--------------------+---------+
only showing top 5 rows



In [None]:
# CLEAN: keep only rows whose review is present and not just whitespace.
has_text = length(trim(col("review"))) > 0
df_clean = (
    df.na.drop(subset=["review"])  # discard rows with a null review
    .filter(has_text)              # discard empty / whitespace-only reviews
)

In [None]:
# ANALYSE: overall row count, then the number of reviews per sentiment label.
total_reviews = df_clean.count()
print("Total reviews:", total_reviews)

sentiment_counts = df_clean.groupBy("sentiment").count()
sentiment_counts.show()

Total reviews: 50000
+---------+-----+
|sentiment|count|
+---------+-----+
| positive|25000|
| negative|25000|
+---------+-----+



In [None]:
# TRANSFORM: add a review-length column and inspect the longer reviews.
long_review_min_chars = 500  # a review counts as "long" when strictly longer than this

review_length = length(col("review"))
df_transformed = df_clean.withColumn("review_length", review_length)
long_reviews = df_transformed.filter(col("review_length") > long_review_min_chars)

print("Number of long reviews:", long_reviews.count())
long_reviews.select("review_length", "sentiment").show(5)

Number of long reviews: 45017
+-------------+---------+
|review_length|sentiment|
+-------------+---------+
|         1761| positive|
|          998| positive|
|          926| positive|
|          748| negative|
|         1317| positive|
+-------------+---------+
only showing top 5 rows



In [None]:
# SAVE: write the cleaned, length-annotated reviews out as CSV with a header.
output_dir = "/content/imdb_cleaned_transformed"

# "multiLine" is a read-side CSV option and has no effect on the writer, so it
# is not set here. Quote and escape use the same double-quote convention as
# the load step. If any review contains a line break, reading these files
# back will need multiLine=True on the reader.
(
    df_transformed.write
    .mode("overwrite")  # replace any output left by a previous run
    .option("header", True)
    .option("escape", '"')
    .option("quote", '"')
    .csv(output_dir)
)

print("✅ Saved results to:", output_dir)

✅ Saved results to: /content/imdb_cleaned_transformed
