In [1]:
import pyspark
from delta import *
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.window import Window
# Get the active Spark session, or create one named "SparkSample" if none exists.
spark = (
    SparkSession.builder
    .appName("SparkSample")
    .getOrCreate()
)
# Restrict Spark's own logging to ERROR-level messages from here on.
spark.sparkContext.setLogLevel('ERROR')

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/12 16:17:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Read a single text file, number its rows, and union the result with itself so that every row appears twice

In [2]:
# Window spec ordered by the line text only. There is no partitionBy, so a
# row_number() evaluated over it numbers the rows of the whole DataFrame.
window = Window.orderBy(F.col("value"))

In [3]:
# Read the text file (the rows land in a "value" column) and attach a row
# number "rn" computed over the `window` spec defined in an earlier cell.
df_1 = spark.read.text("textfile.txt").withColumn(
    "rn", F.row_number().over(window)
)

# Union the frame with itself so every (value, rn) row is present twice,
# then sort by rn so the two copies of each row are displayed together.
df = df_1.union(df_1).orderBy("rn")
df.show(10)

+-----+---+
|value| rn|
+-----+---+
|     |  1|
|     |  1|
|     |  2|
|     |  2|
|     |  3|
|     |  3|
|     |  4|
|     |  4|
|     |  5|
|     |  5|
+-----+---+
only showing top 10 rows



# method 1 - distinct

In [4]:
# distinct() compares whole rows, so two rows collapse only when both
# "value" and "rn" match. Rows with rn <= 10 are filtered out before display.
(
    df
    .filter(F.col("rn") > 10)
    .distinct()
    .orderBy("rn")
    .show(10)
)

+--------------------+---+
|               value| rn|
+--------------------+---+
|                    | 11|
|A Tale of Woe and...| 12|
|At last, she foun...| 13|
|Disclaimer: While...| 14|
|Elara, though fri...| 15|
|Finally, Elara re...| 16|
|In the days of yo...| 17|
|Note: This is jus...| 18|
|One fateful morn,...| 19|
|Prompt: Generate ...| 20|
+--------------------+---+
only showing top 10 rows



# method 2 - group by

In [5]:
# Grouping on every column collapses identical rows into one group each.
# agg() is what turns the grouped data back into a DataFrame; the resulting
# first(value) column simply repeats the "value" grouping key.
(
    df
    .where(F.col("rn") > 10)
    .groupBy("value", "rn")
    .agg(F.first("value"))
    # Row order after a group-by is not guaranteed; sort explicitly so the
    # displayed result is stable from run to run.
    .orderBy("rn")
    .show(20)
)

+--------------------+---+--------------------+
|               value| rn|        first(value)|
+--------------------+---+--------------------+
|                    | 11|                    |
|A Tale of Woe and...| 12|A Tale of Woe and...|
|At last, she foun...| 13|At last, she foun...|
|Disclaimer: While...| 14|Disclaimer: While...|
|Elara, though fri...| 15|Elara, though fri...|
|Finally, Elara re...| 16|Finally, Elara re...|
|In the days of yo...| 17|In the days of yo...|
|Note: This is jus...| 18|Note: This is jus...|
|One fateful morn,...| 19|One fateful morn,...|
|Prompt: Generate ...| 20|Prompt: Generate ...|
|           Response:| 21|           Response:|
|The kingdom was s...| 22|The kingdom was s...|
|With the sword in...| 23|With the sword in...|
+--------------------+---+--------------------+



# method 3 - window

Deduplicate by specific fields (here `value`), keeping the row with the lowest `rn` in each group

In [6]:
# Use a dedicated name for this spec: the global `window` is what the cell
# that builds `df` relies on, and rebinding it here would change that cell's
# result if it were re-run afterwards.
dedup_window = Window.partitionBy("value").orderBy("rn")
(
    df
    # Number the rows inside each group of equal "value", lowest rn first ...
    .withColumn("_rn", F.row_number().over(dedup_window))
    # ... and keep only the first row of every group.
    .where(F.col("_rn") == 1)
    .where(F.col("rn") > 10)
    # Sort explicitly; the row order coming out of the window step is not guaranteed.
    .orderBy("rn")
    .show()
)

+--------------------+---+---+
|               value| rn|_rn|
+--------------------+---+---+
|A Tale of Woe and...| 12|  1|
|At last, she foun...| 13|  1|
|Disclaimer: While...| 14|  1|
|Elara, though fri...| 15|  1|
|Finally, Elara re...| 16|  1|
|In the days of yo...| 17|  1|
|Note: This is jus...| 18|  1|
|One fateful morn,...| 19|  1|
|Prompt: Generate ...| 20|  1|
|           Response:| 21|  1|
|The kingdom was s...| 22|  1|
|With the sword in...| 23|  1|
+--------------------+---+---+



# method 4 - dropDuplicates

In [7]:
# Pass the subset as a list of column names, which is the documented form.
# NOTE: dropDuplicates keeps an arbitrary row per distinct "value", so when
# rows sharing a value carry different rn values, the rn that survives (and
# therefore whether it passes the rn > 10 filter) is not guaranteed.
(
    df
    .dropDuplicates(["value"])
    .where(F.col("rn") > 10)
    .orderBy("rn")
    .show()
)

+--------------------+---+
|               value| rn|
+--------------------+---+
|A Tale of Woe and...| 12|
|At last, she foun...| 13|
|Disclaimer: While...| 14|
|Elara, though fri...| 15|
|Finally, Elara re...| 16|
|In the days of yo...| 17|
|Note: This is jus...| 18|
|One fateful morn,...| 19|
|Prompt: Generate ...| 20|
|           Response:| 21|
|The kingdom was s...| 22|
|With the sword in...| 23|
+--------------------+---+

