# Write PySpark code to perform the following:
- Check for duplicates in the primary-key columns (1-10) and drop any that are found.
- Filter out rows with null or empty primary keys.
- Replace nulls in the non-primary-key columns with 0.
- Reject rows where all values in columns 11 to 20 (the non-primary keys) are null.

In [0]:
from pyspark.sql import SparkSession, types as T, functions as F
from pyspark.sql.window import Window as W
# Obtain a Spark session through the builder's getOrCreate(), with the
# application name set to 'spark'.
spark = (
    SparkSession.builder
    .appName('spark')
    .getOrCreate()
)

In [0]:
# Sample rows: ten string values (col1-col10) followed by ten integer values
# (col11-col20). The set contains two rows sharing the same first ten values,
# a row whose first value is None, a row whose first value is an empty string,
# a row whose last ten values are all None, a row with some None values, and
# a fully populated row.
data = [
    ("A1", "B1", "C1", "D1", "E1", "F1", "G1", "H1", "I1", "J1",
     10, 20, 30, 40, 50, 60, 70, 80, 90, 100),
    ("A1", "B1", "C1", "D1", "E1", "F1", "G1", "H1", "I1", "J1",
     1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
    (None, "B2", "C2", "D2", "E2", "F2", "G2", "H2", "I2", "J2",
     5, 5, 5, 5, 5, 5, 5, 5, 5, 5),
    ("", "B3", "C3", "D3", "E3", "F3", "G3", "H3", "I3", "J3",
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1),
    ("A4", "B4", "C4", "D4", "E4", "F4", "G4", "H4", "I4", "J4",
     None, None, None, None, None, None, None, None, None, None),
    ("A5", "B5", "C5", "D5", "E5", "F5", "G5", "H5", "I5", "J5",
     None, 2, None, 4, None, 6, None, 8, None, 10),
    ("A6", "B6", "C6", "D6", "E6", "F6", "G6", "H6", "I6", "J6",
     11, 22, 33, 44, 55, 66, 77, 88, 99, 111),
]
columns = ['col{}'.format(n) for n in range(1, 21)]

# Build the DataFrame and append an 'id' column holding a row number.
# The window has no partitionBy and orders by a constant literal, so the
# numbering order among rows is not pinned down by the query itself.
# NOTE: F.monotonically_increasing_id() was the alternative considered for
# 'id' and is left unused here.
df = spark.createDataFrame(data, schema=columns).withColumn(
    'id', F.row_number().over(W.orderBy(F.lit('1')))
)
df.show(truncate=False)

In [0]:
# col1-col10 are treated as the primary-key columns, col11-col20 as the
# non-primary-key columns.
pk_columns = ['col' + str(n) for n in range(1, 11)]
npk_columns = ['col' + str(n) for n in range(11, 21)]

# Print each list on its own line.
print(pk_columns)
print(npk_columns)

### 1. Check for duplicates in the primary-key columns (1-10) and drop them if found.

In [0]:
# Collapse rows that share the same values across all primary-key columns
# into a single row.
# NOTE(review): which of the duplicate rows survives is presumably not
# specified by the API - confirm that any survivor is acceptable.
df = df.drop_duplicates(pk_columns)
df.show(truncate=False)

### 2. Filter out rows with null or empty primary keys.

In [0]:
# One check per primary-key column: the value must be non-null and must not
# be an empty string once trimmed.
pk_checks = [
    F.col(name).isNotNull() & (F.trim(name) != '')
    for name in pk_columns
]

# AND the per-column checks together so that every key column has to pass.
pk_condition = pk_checks[0]
for check in pk_checks[1:]:
    pk_condition = pk_condition & check

df = df.filter(pk_condition)
df.show(truncate=False)

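### 4. Reject rows where all values in columns 11 to 20 (non-primary keys) are null. This is done before step 3, because replacing nulls with 0 first would make these rows undetectable.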

In [0]:
# One null-check per non-primary-key column.
null_checks = [F.col(name).isNull() for name in npk_columns]

# AND them together: the combined condition is true only when every
# non-primary-key column in the row is null.
npk_condition = null_checks[0]
for check in null_checks[1:]:
    npk_condition = npk_condition & check

# Keep the rows that do NOT match, i.e. rows with at least one non-null value.
# This has to happen while the nulls are still present in the data.
df = df.filter(~npk_condition)
df.show(truncate=False)

### 3. Replace nulls in non-primary keys with 0.

In [0]:
# Substitute 0 for nulls, restricted to the non-primary-key columns.
df = df.na.fill(0, subset=npk_columns)
df.show(truncate=False)