Read in a single file and make duplicates

In [2]:
# Window spec that orders rows by the "value" column; there is no partitionBy,
# so the ordering (and any row_number over it) spans the whole DataFrame.
window = Window.orderBy("value")

In [5]:
# Load the text file and tag every row with a sequential number "rn",
# assigned in the order defined by `window`.
lines_raw = spark.read.text("textfile.txt")
df_1 = lines_raw.withColumn("rn", F.row_number().over(window))

# df_nodup is the frame as loaded (each rn appears once);
# df_dup stacks the frame on top of itself so every rn appears twice.
df_nodup = df_1
df_dup = df_1.union(df_1).orderBy("rn")
df_dup.show(10)

+-----+---+
|value| rn|
+-----+---+
|     |  1|
|     |  1|
|     |  2|
|     |  2|
|     |  3|
|     |  3|
|     |  4|
|     |  4|
|     |  5|
|     |  5|
+-----+---+
only showing top 10 rows



# validate that PKs are unique

Any method from the dedup notebook will work, but the approach shown here is the easiest and computationally least expensive.

In [9]:
# The key column is unique exactly when its distinct count equals its row count.
# df_dup repeats every rn, so this should be false.
rn_dup = df_dup.select("rn")
rn_dup.distinct().count() == rn_dup.count()

False

In [10]:
# Same uniqueness check on the frame without duplicates: distinct count of rn
# versus total row count. This should be true.
rn_nodup = df_nodup.select("rn")
rn_nodup.distinct().count() == rn_nodup.count()

True

# validate existence of a value in a specified column

In [15]:
# Existence check: is there at least one row with rn == 20?
# limit(1) caps the filtered result at a single row, so the count only has to
# distinguish 0 from 1 instead of counting every matching row.
df_nodup.where(F.col("rn") == 20).limit(1).count() > 0

True

# casting to datetime

In [17]:
# Cast a string literal to a date in Spark SQL using an explicit format pattern.
date_query = """select to_date("2025-01-02", "yyyy-MM-dd") as dt"""
spark.sql(date_query).show()

+----------+
|        dt|
+----------+
|2025-01-02|
+----------+



In [27]:
# Two single-field rows of date strings, loaded into a one-column DataFrame "a".
date_strings = [("2025-01-02",), ("2025-02-02",)]
df_dates = spark.sparkContext.parallelize(date_strings).toDF(["a"])
df_dates.show()

+----------+
|         a|
+----------+
|2025-01-02|
|2025-02-02|
+----------+



In [29]:
# Keep the string column "a" and add "b": the same value parsed as a date
# using the "yyyy-MM-dd" pattern.
df_dates_1 = df_dates.select(
    "*",
    F.to_date("a", "yyyy-MM-dd").alias("b"),
)
df_dates_1.printSchema()
df_dates_1.show()

root
 |-- a: string (nullable = true)
 |-- b: date (nullable = true)

+----------+----------+
|         a|         b|
+----------+----------+
|2025-01-02|2025-01-02|
|2025-02-02|2025-02-02|
+----------+----------+



In [33]:
# Add "c": the string column "a" parsed as a timestamp with the same pattern.
df_dates_2 = df_dates_1.select(
    "*",
    F.to_timestamp("a", "yyyy-MM-dd").alias("c"),
)
df_dates_2.printSchema()
df_dates_2.show()

root
 |-- a: string (nullable = true)
 |-- b: date (nullable = true)
 |-- c: timestamp (nullable = true)

+----------+----------+-------------------+
|         a|         b|                  c|
+----------+----------+-------------------+
|2025-01-02|2025-01-02|2025-01-02 00:00:00|
|2025-02-02|2025-02-02|2025-02-02 00:00:00|
+----------+----------+-------------------+



In [38]:
# Derive two more columns from the timestamp column "c":
#   d - the timestamp converted to a date
#   e - the timestamp rendered as a string in "MM-dd-yyyy" form
df_dates_3 = df_dates_2.select(
    "*",
    F.to_date("c").alias("d"),
    F.date_format("c", "MM-dd-yyyy").alias("e"),
)
df_dates_3.printSchema()
df_dates_3.show()

root
 |-- a: string (nullable = true)
 |-- b: date (nullable = true)
 |-- c: timestamp (nullable = true)
 |-- d: date (nullable = true)
 |-- e: string (nullable = true)

+----------+----------+-------------------+----------+----------+
|         a|         b|                  c|         d|         e|
+----------+----------+-------------------+----------+----------+
|2025-01-02|2025-01-02|2025-01-02 00:00:00|2025-01-02|01-02-2025|
|2025-02-02|2025-02-02|2025-02-02 00:00:00|2025-02-02|02-02-2025|
+----------+----------+-------------------+----------+----------+



# extracting a specific pattern from incoming data

In [45]:
# Pull the regex match for "the" (group 0 = the whole match) out of each line,
# then drop the rows where the extracted column is an empty string.
the_match = F.regexp_extract(F.col("value"), "the", 0)
df_ext_1 = (
    df_nodup
    .withColumn("extracted 1", the_match)
    .filter(F.col("extracted 1") != "")
)
df_ext_1.show()

+--------------------+---+-----------+
|               value| rn|extracted 1|
+--------------------+---+-----------+
|At last, she foun...| 13|        the|
|Disclaimer: While...| 14|        the|
|Elara, though fri...| 15|        the|
|Finally, Elara re...| 16|        the|
|In the days of yo...| 17|        the|
|Note: This is jus...| 18|        the|
|One fateful morn,...| 19|        the|
|The kingdom was s...| 22|        the|
|With the sword in...| 23|        the|
+--------------------+---+-----------+



In [49]:
# Keep only the rows whose text contains the substring "she".
# Column.startswith and Column.endswith can be used the same way.
has_she = df_ext_1["value"].contains("she")
df_ext_2 = df_ext_1.filter(has_she)
df_ext_2.show()

+--------------------+---+-----------+
|               value| rn|extracted 1|
+--------------------+---+-----------+
|At last, she foun...| 13|        the|
|Elara, though fri...| 15|        the|
|In the days of yo...| 17|        the|
|The kingdom was s...| 22|        the|
|With the sword in...| 23|        the|
+--------------------+---+-----------+



# extracting from nested fields

In [55]:
# Split each line on single spaces into an array column "a" and keep only the
# rows whose array has more than one element.
# The DataFrame is assigned first and displayed separately: show() only prints
# and returns None, so chaining it onto the assignment would leave df_array
# bound to None instead of the DataFrame.
df_array = (
    df_nodup
    .select(F.split(F.col("value"), " ").alias("a"))
    .where(F.size(F.col("a")) > 1)
)
df_array.show()

+--------------------+
|                   a|
+--------------------+
|[Disclaimer:, Whi...|
|[Prompt:, Generat...|
|[A, Tale, of, Woe...|
|[In, the, days, o...|
|[One, fateful, mo...|
|[Elara,, though, ...|
|[With, the, sword...|
|[Finally,, Elara,...|
|[At, last,, she, ...|
|[The, kingdom, wa...|
|[Note:, This, is,...|
+--------------------+

