In [1]:
!pip -q install pyspark==3.5.1
!apt-get -q install openjdk-11-jre-headless

Reading package lists...
Building dependency tree...
Reading state information...
The following additional packages will be installed:
  ca-certificates-java java-common libpcsclite1
Suggested packages:
  default-jre pcscd libnss-mdns fonts-dejavu-extra fonts-ipafont-gothic
  fonts-ipafont-mincho fonts-wqy-microhei | fonts-wqy-zenhei fonts-indic
The following NEW packages will be installed:
  ca-certificates-java java-common libpcsclite1 openjdk-11-jre-headless
0 upgraded, 4 newly installed, 0 to remove and 41 not upgraded.
Need to get 42.6 MB of archives.
After this operation, 176 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/main amd64 java-common all 0.72build2 [6,782 B]
Get:2 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 libpcsclite1 amd64 1.9.5-3ubuntu1 [19.8 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 openjdk-11-jre-headless amd64 11.0.28+6-1ubuntu1~22.04.1 [42.6 MB]
Get:4 http://archive.ubuntu.com/ubunt

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, trim, length

In [3]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("IMDB")
    .getOrCreate()
)

# Bare last expression so the notebook renders the session as the cell output.
spark

In [4]:
# EXPLORE: load the raw IMDB reviews CSV and take a first look at it.
csv_path = "/content/IMDB Dataset.csv"

# The review column is quoted free text: multiLine lets a quoted value span
# line breaks, and using '"' as both quote and escape character means a quote
# inside a quoted value is escaped by another quote.
df = (
    spark.read
    .options(
        header=True,
        inferSchema=True,
        multiLine=True,
        escape='"',
        quote='"',
    )
    .csv(csv_path)
)

# Column names/types first, then a small sample of rows.
df.printSchema()
df.show(5)

root
 |-- review: string (nullable = true)
 |-- sentiment: string (nullable = true)

+--------------------+---------+
|              review|sentiment|
+--------------------+---------+
|One of the other ...| positive|
|A wonderful littl...| positive|
|I thought this wa...| positive|
|Basically there's...| negative|
|Petter Mattei's "...| positive|
+--------------------+---------+
only showing top 5 rows



In [5]:
# CLEAN: keep only rows that carry actual review text.
df_clean = (
    df
    .dropna(subset=["review"])                # remove rows whose review is null
    .where(length(trim(col("review"))) > 0)   # remove reviews that are empty once trimmed
)

In [6]:
# ANALYSE: overall row count after cleaning, then the number of rows per
# sentiment label.
total_reviews = df_clean.count()
print("Total reviews:", total_reviews)

(
    df_clean
    .groupBy("sentiment")
    .count()
    .show()
)

Total reviews: 50000
+---------+-----+
|sentiment|count|
+---------+-----+
| positive|25000|
| negative|25000|
+---------+-----+



In [7]:
# TRANSFORM: add a review_length column (length of the review string) and
# inspect the long reviews. Nothing is written to disk in this cell.
LONG_REVIEW_MIN_CHARS = 500  # a review counts as "long" when strictly longer than this

df_transformed = df_clean.withColumn("review_length", length(col("review")))
long_reviews = df_transformed.filter(col("review_length") > LONG_REVIEW_MIN_CHARS)

# count() and show() are separate Spark actions, each evaluated on demand.
print("Number of long reviews:", long_reviews.count())
long_reviews.select("review_length", "sentiment").show(5)

Number of long reviews: 45017
+-------------+---------+
|review_length|sentiment|
+-------------+---------+
|         1761| positive|
|          998| positive|
|          926| positive|
|          748| negative|
|         1317| positive|
+-------------+---------+
only showing top 5 rows



In [8]:
# SAVE: write the cleaned + transformed reviews as a CSV output directory,
# replacing any previous output at the same path (mode "overwrite").
output_dir = "/content/imdb_cleaned_transformed"

# quote/escape are both '"', the same settings used when the source CSV was
# read, so quotes embedded in a review are escaped with another quote.
# "multiLine" is deliberately not set here: it is a read-side CSV option and
# the writer ignores it. Set multiLine=True on the reader when loading this
# output back if review text may contain line breaks.
(
    df_transformed.write
    .mode("overwrite")
    .option("header", True)
    .option("quote", '"')
    .option("escape", '"')
    .csv(output_dir)
)

print("✅ Saved results to:", output_dir)

✅ Saved results to: /content/imdb_cleaned_transformed
