In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, explode, lit
import pyspark.sql.functions as F

spark = SparkSession.builder.appName("ETL").getOrCreate()

df = spark.read.text("WordData.txt")
df.show()

+--------------------+
|               value|
+--------------------+
|This is a Japanes...|
|The team members ...|
|As the years pass...|
|If you don't like...|
|He was disappoint...|
|When he encounter...|
|Situps are a terr...|
|Toddlers feeding ...|
|Edith could decid...|
|Her daily goal wa...|
|Tomorrow will bri...|
|His son quipped t...|
|He wondered why a...|
|If my calculator ...|
|The hummingbird's...|
|He went on a whis...|
|This is the last ...|
|I come from a tri...|
|The delicious aro...|
|Weather is not tr...|
+--------------------+
only showing top 20 rows


In [4]:
# transformations
df2 = df.withColumn("splitedData", F.split(F.col("value"), " "))
df3 = df2.withColumn("words", F.explode(F.col("splitedData")))
wordsDF = df3.select("words")
wordCount = wordsDF.groupBy("words").count().orderBy(col("count").desc())
wordCount.show()

+-----+-----+
|words|count|
+-----+-----+
|   to|   88|
|  the|   76|
|    a|   76|
|  was|   44|
|   is|   40|
|   he|   40|
|   it|   36|
|  The|   32|
|   He|   28|
|  and|   28|
| that|   28|
|  you|   24|
|   in|   24|
|   be|   24|
|   of|   24|
|    I|   24|
|   so|   24|
|  her|   24|
|   if|   20|
| from|   16|
+-----+-----+
only showing top 20 rows


In [9]:
# load
# NOTE: To write to PostgreSQL, you need the PostgreSQL JDBC driver JAR file
# Download postgresql-42.7.3.jar from https://jdbc.postgresql.org/download/
# and place it in your Spark jars directory or use --jars when submitting

# For now, save to CSV instead (using pandas to avoid Windows Hadoop issues)
try:
    import os
    from datetime import datetime
    # Create unique filename with timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_file = f"wordcount_output_{timestamp}.csv"
    wordCount.toPandas().to_csv(output_file, index=False)
    print(f"Word count data saved to {output_file}")
except Exception as e:
    print(f"Error saving file: {e}")
    print("Please close any open CSV files and try again.")

# Uncomment the JDBC code below once you have the PostgreSQL driver:
# driver = "org.postgresql.Driver"
# url = "jdbc:postgresql://database-1.c0sanhw4ymut.us-west-2.rds.amazonaws.com/"
# table = "ahmad_schema_pyspark.WordCount"
# user = "postgres"
# password = ""
# wordCount.write.format("jdbc").option("driver", driver).option("url",url).option("dbtable", table).option("mode", "append").option("user",user).option("password", password).save()

Word count data saved to wordcount_output_20251028_170032.csv
