Read in a single file and make duplicates

In [1]:
import pyspark
from delta import *
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.window import Window
import json
spark = (SparkSession.builder.appName("SparkSample").getOrCreate())
spark.sparkContext.setLogLevel('ERROR')

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/13 03:17:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/02/13 03:17:42 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/02/13 03:17:42 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
25/02/13 03:17:42 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
25/02/13 03:17:42 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.


In [2]:
window = Window.orderBy("value")

df_1 = (spark.read.text("textfile.txt")
        .withColumn("rn", F.row_number().over(window))
     )
df_dup = df_1.unionAll(df_1).orderBy(F.col("rn"))
df_nodup = df_1
df_dup.show(10)

+-----+---+
|value| rn|
+-----+---+
|     |  1|
|     |  1|
|     |  2|
|     |  2|
|     |  3|
|     |  3|
|     |  4|
|     |  4|
|     |  5|
|     |  5|
+-----+---+
only showing top 10 rows



# validate that PKs are unique

Any method of the dedup notebook will work but the above is the easiest and computationally least expensive

In [3]:
df_dup.select("rn").count() == df_dup.select("rn").distinct().count() # should be false

False

In [4]:
df_nodup.select("rn").count() == df_nodup.select("rn").distinct().count() # should be true

True

# validate existince of a value in a specified column

In [5]:
df_nodup.where(F.col("rn") == 20).count() > 0

True

# casting to datetime

In [6]:
spark.sql("""select to_date("2025-01-02", "yyyy-MM-dd") as dt""").show()

+----------+
|        dt|
+----------+
|2025-01-02|
+----------+



In [7]:
df_dates = spark.sparkContext.parallelize([("2025-01-02",),("2025-02-02",)]).toDF(["a"])
df_dates.show()

+----------+
|         a|
+----------+
|2025-01-02|
|2025-02-02|
+----------+



In [8]:
df_dates_1 = df_dates.withColumn("b", F.to_date(F.col("a"), "yyyy-MM-dd"))
df_dates_1.printSchema()
df_dates_1.show()

root
 |-- a: string (nullable = true)
 |-- b: date (nullable = true)

+----------+----------+
|         a|         b|
+----------+----------+
|2025-01-02|2025-01-02|
|2025-02-02|2025-02-02|
+----------+----------+



In [9]:
df_dates_2 = df_dates_1.withColumn("c", F.to_timestamp(F.col("a"), "yyyy-MM-dd"))
df_dates_2.printSchema()
df_dates_2.show()

root
 |-- a: string (nullable = true)
 |-- b: date (nullable = true)
 |-- c: timestamp (nullable = true)

+----------+----------+-------------------+
|         a|         b|                  c|
+----------+----------+-------------------+
|2025-01-02|2025-01-02|2025-01-02 00:00:00|
|2025-02-02|2025-02-02|2025-02-02 00:00:00|
+----------+----------+-------------------+



In [10]:
df_dates_3 = (df_dates_2
              .withColumn("d", F.to_date(F.col("c")))
              .withColumn("e", F.date_format(F.col("c"), "MM-dd-yyyy")) 
             )
df_dates_3.printSchema()
df_dates_3.show()

root
 |-- a: string (nullable = true)
 |-- b: date (nullable = true)
 |-- c: timestamp (nullable = true)
 |-- d: date (nullable = true)
 |-- e: string (nullable = true)

+----------+----------+-------------------+----------+----------+
|         a|         b|                  c|         d|         e|
+----------+----------+-------------------+----------+----------+
|2025-01-02|2025-01-02|2025-01-02 00:00:00|2025-01-02|01-02-2025|
|2025-02-02|2025-02-02|2025-02-02 00:00:00|2025-02-02|02-02-2025|
+----------+----------+-------------------+----------+----------+



# extracting a specific pattern from incoming data

In [11]:
df_ext_1 = df_nodup.withColumn("extracted 1", F.regexp_extract("value", "the", 0)).where(F.col("extracted 1") != "")
df_ext_1.show()

+--------------------+---+-----------+
|               value| rn|extracted 1|
+--------------------+---+-----------+
|At last, she foun...| 13|        the|
|Disclaimer: While...| 14|        the|
|Elara, though fri...| 15|        the|
|Finally, Elara re...| 16|        the|
|In the days of yo...| 17|        the|
|Note: This is jus...| 18|        the|
|One fateful morn,...| 19|        the|
|The kingdom was s...| 22|        the|
|With the sword in...| 23|        the|
+--------------------+---+-----------+



In [12]:
df_ext_2 = df_ext_1.where(F.col("value").contains("she")) # startswith and endswith too
df_ext_2.show()

+--------------------+---+-----------+
|               value| rn|extracted 1|
+--------------------+---+-----------+
|At last, she foun...| 13|        the|
|Elara, though fri...| 15|        the|
|In the days of yo...| 17|        the|
|The kingdom was s...| 22|        the|
|With the sword in...| 23|        the|
+--------------------+---+-----------+



# extracting from nested fields

In [13]:
df_array = df_nodup.select(F.split(F.col("value")," ").alias("a")).where(F.size(F.col("a"))>1)
df_array.show()

+--------------------+
|                   a|
+--------------------+
|[Disclaimer:, Whi...|
|[Prompt:, Generat...|
|[A, Tale, of, Woe...|
|[In, the, days, o...|
|[One, fateful, mo...|
|[Elara,, though, ...|
|[With, the, sword...|
|[Finally,, Elara,...|
|[At, last,, she, ...|
|[The, kingdom, wa...|
|[Note:, This, is,...|
+--------------------+



# explode

In [14]:
df_array.withColumn("b", F.explode("a")).show()

+--------------------+-----------+
|                   a|          b|
+--------------------+-----------+
|[Disclaimer:, Whi...|Disclaimer:|
|[Disclaimer:, Whi...|      While|
|[Disclaimer:, Whi...|          I|
|[Disclaimer:, Whi...|        can|
|[Disclaimer:, Whi...|   generate|
|[Disclaimer:, Whi...|       text|
|[Disclaimer:, Whi...|         in|
|[Disclaimer:, Whi...|          a|
|[Disclaimer:, Whi...|      style|
|[Disclaimer:, Whi...|reminiscent|
|[Disclaimer:, Whi...|         of|
|[Disclaimer:, Whi...|    classic|
|[Disclaimer:, Whi...|    archaic|
|[Disclaimer:, Whi...|   English,|
|[Disclaimer:, Whi...|       it's|
|[Disclaimer:, Whi...|  important|
|[Disclaimer:, Whi...|         to|
|[Disclaimer:, Whi...|       note|
|[Disclaimer:, Whi...|       that|
|[Disclaimer:, Whi...|       true|
+--------------------+-----------+
only showing top 20 rows



# flatten

In [15]:
df_array = ["""{"id": "2111", "name": "OLIVIA", "age": "37", "metadata":[["a",1], ["b",2]]}"""]
df_array = spark.sparkContext.parallelize(df_array)
df_array = spark.read.json(df_array)
df_array.show()

+---+----+----------------+------+
|age|  id|        metadata|  name|
+---+----+----------------+------+
| 37|2111|[[a, 1], [b, 2]]|OLIVIA|
+---+----+----------------+------+



In [16]:
df_array.withColumn("b", F.flatten("metadata")).show()

+---+----+----------------+------+------------+
|age|  id|        metadata|  name|           b|
+---+----+----------------+------+------------+
| 37|2111|[[a, 1], [b, 2]]|OLIVIA|[a, 1, b, 2]|
+---+----+----------------+------+------------+



# pivot

In [17]:
df_pivot = spark.sparkContext.parallelize( [(100, 'John', 30, 1, 'Street 1'),
    (200, 'Rodger', None, 1, 'Street 2'),
    (300, 'Tim', 80, 3, 'Street 3'),
    (400, 'Dan', 50, 4, 'Street 4')] ).toDF(("id", "name", "value", "pos", "address"))
df_pivot.show()

+---+------+-----+---+--------+
| id|  name|value|pos| address|
+---+------+-----+---+--------+
|100|  John|   30|  1|Street 1|
|200|Rodger| NULL|  1|Street 2|
|300|   Tim|   80|  3|Street 3|
|400|   Dan|   50|  4|Street 4|
+---+------+-----+---+--------+



In [18]:
(df_pivot
     .groupBy("name")
     .pivot("pos")
     .agg(F.first("value"))
     .show())

+------+----+----+----+
|  name|   1|   3|   4|
+------+----+----+----+
|Rodger|NULL|NULL|NULL|
|  John|  30|NULL|NULL|
|   Tim|NULL|  80|NULL|
|   Dan|NULL|NULL|  50|
+------+----+----+----+



SQL implementation is more restrictive than dot notation

In [19]:
df_pivot.createOrReplaceTempView("df_pivot")
sql = """
SELECT * FROM df_pivot
    PIVOT (
        FIRST(value) AS v
        FOR name IN ('John' AS john, 'Tim' AS mike)
    );
"""
spark.sql(sql).show()

+---+---+--------+----+----+
| id|pos| address|john|mike|
+---+---+--------+----+----+
|300|  3|Street 3|NULL|  80|
|100|  1|Street 1|  30|NULL|
|400|  4|Street 4|NULL|NULL|
|200|  1|Street 2|NULL|NULL|
+---+---+--------+----+----+



# case ... when

In [26]:
df_nodup.withColumn("a",
    F.when(F.col("rn")%3 == 0, F.concat(F.lit("!"), F.col("value")))
    .otherwise(F.col("value"))).show()

+--------------------+---+--------------------+
|               value| rn|                   a|
+--------------------+---+--------------------+
|                    |  1|                    |
|                    |  2|                    |
|                    |  3|                   !|
|                    |  4|                    |
|                    |  5|                    |
|                    |  6|                   !|
|                    |  7|                    |
|                    |  8|                    |
|                    |  9|                   !|
|                    | 10|                    |
|                    | 11|                    |
|A Tale of Woe and...| 12|!A Tale of Woe an...|
|At last, she foun...| 13|At last, she foun...|
|Disclaimer: While...| 14|Disclaimer: While...|
|Elara, though fri...| 15|!Elara, though fr...|
|Finally, Elara re...| 16|Finally, Elara re...|
|In the days of yo...| 17|In the days of yo...|
|Note: This is jus...| 18|!Note: This is