In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, concat, row_number
from pyspark.sql.window import Window

# Initialize Spark Session
spark = SparkSession.builder.appName("Data Augmentation").getOrCreate()

# Sample data (replace this with your DataFrame)
data = [
    ("DT240123999", "FX2312178", 1, "ivB", "owners", "RightToPurchaseOwnersTitlePolicy_1"),
    ("DT240123999", "FX2312178", 2, "ivB", "owners", "RightToPurchaseOwnersTitlePolicy_0"),
    ("DT240218948", "FX24012799", 1, "ivB", "owners", "RightToPurchaseOwnersTitlePolicy_1"),
    ("DT240218948", "FX24012799", 2, "ivB", "owners", "RightToPurchaseOwnersTitlePolicy_0"),
]

columns = ["documenttypeid", "folderid", "pagenumber", "image", "text", "label"]

# Create the base DataFrame
df = spark.createDataFrame(data, columns)

# Define a multiplier for the number of duplicates you want
n = 3  # Change this to create more duplicates (n = 3 means 3 copies per row)

# Step 1: Cross join the data with a range to duplicate rows
range_df = spark.range(1, n + 1).withColumnRenamed("id", "suffix")
df_exploded = df.crossJoin(range_df)

# Step 2: Append the unique suffix to documenttypeid and folderid
df_augmented = df_exploded.withColumn("documenttypeid", concat(col("documenttypeid"), lit("_"), col("suffix"))) \
                          .withColumn("folderid", concat(col("folderid"), lit("_"), col("suffix")))

# Step 3: Select the required columns
df_final = df_augmented.select("documenttypeid", "folderid", "pagenumber", "image", "text", "label")

# Show the result
df_final.show(truncate=False)


25/02/06 23:56:21 WARN Utils: Your hostname, Sameers-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.29.18 instead (on interface en0)
25/02/06 23:56:21 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/06 23:56:21 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


+--------------+------------+----------+-----+------+----------------------------------+
|documenttypeid|folderid    |pagenumber|image|text  |label                             |
+--------------+------------+----------+-----+------+----------------------------------+
|DT240123999_1 |FX2312178_1 |1         |ivB  |owners|RightToPurchaseOwnersTitlePolicy_1|
|DT240123999_2 |FX2312178_2 |1         |ivB  |owners|RightToPurchaseOwnersTitlePolicy_1|
|DT240123999_3 |FX2312178_3 |1         |ivB  |owners|RightToPurchaseOwnersTitlePolicy_1|
|DT240123999_1 |FX2312178_1 |2         |ivB  |owners|RightToPurchaseOwnersTitlePolicy_0|
|DT240123999_2 |FX2312178_2 |2         |ivB  |owners|RightToPurchaseOwnersTitlePolicy_0|
|DT240123999_3 |FX2312178_3 |2         |ivB  |owners|RightToPurchaseOwnersTitlePolicy_0|
|DT240218948_1 |FX24012799_1|1         |ivB  |owners|RightToPurchaseOwnersTitlePolicy_1|
|DT240218948_2 |FX24012799_2|1         |ivB  |owners|RightToPurchaseOwnersTitlePolicy_1|
|DT240218948_3 |FX240

In [3]:
display(df_final)

DataFrame[documenttypeid: string, folderid: string, pagenumber: bigint, image: string, text: string, label: string]