In [0]:
# Goal: generate a sequence / surrogate key (SK) column in PySpark,
# e.g. row numbers 1, 2, 3, ..., 10.
# Evaluating `spark` as the last expression of the cell displays the session object.
# NOTE(review): `spark` is not created anywhere in the visible cells -- presumably it is the
# SparkSession injected by the notebook runtime; confirm before running elsewhere.
spark

In [0]:
# List the files uploaded under /FileStore/tables (same listing the `%fs ls` shorthand gives).
# NOTE(review): relies on the notebook-provided `dbutils` and `display` helpers.
display(dbutils.fs.ls("/FileStore/tables"))

path,name,size,modificationTime
dbfs:/FileStore/tables/2011_summary-1.csv,2011_summary-1.csv,7069,1698831038000
dbfs:/FileStore/tables/2011_summary.csv,2011_summary.csv,7069,1698830281000
dbfs:/FileStore/tables/Cricket_data_set_odi.csv,Cricket_data_set_odi.csv,1535,1701856689000
dbfs:/FileStore/tables/Dataset.txt,Dataset.txt,405,1700558040000
dbfs:/FileStore/tables/Employee_Salary_Dataset.csv,Employee_Salary_Dataset.csv,809,1699373636000
dbfs:/FileStore/tables/HR_Employee_Attrition.csv,HR_Employee_Attrition.csv,227977,1701176079000
dbfs:/FileStore/tables/Matches.csv,Matches.csv,1275423,1700642533000
dbfs:/FileStore/tables/Multi_line_correct.json,Multi_line_correct.json,310,1698917967000
dbfs:/FileStore/tables/Multi_line_incorrect.json,Multi_line_incorrect.json,304,1698917935000
dbfs:/FileStore/tables/Sample_Dataset-1.txt,Sample_Dataset-1.txt,405,1700558796000


In [0]:
# Load the employee CSV: first line is the header, column types are inferred,
# and the literal text "null" is read as a NULL value.
sk_df = (
    spark.read
    .option("nullValue", "null")
    .option("header", True)
    .option("inferSchema", True)
    .csv("/FileStore/tables/emp_data.csv")
)

# Preview only the first 10 rows.
sk_df.limit(10).display()

EMPNO,ENAME,JOB,MGR,HIREDATE,SAL,COMM,DEPTNO
7369,SMITH,CLERK,7902.0,1980-12-17,800,,20
7499,ALLEN,SALESMAN,7698.0,1981-02-20,1600,300.0,30
7521,WARD,SALESMAN,7698.0,1981-02-22,1250,500.0,30
7566,JONES,MANAGER,7839.0,1981-02-04,2975,,20
7654,MARTIN,SALESMAN,7698.0,1981-09-21,1250,1400.0,30
7698,SGR,MANAGER,7839.0,1981-01-05,2850,,30
7782,RAVI,MANAGER,7839.0,1981-09-06,2450,,10
7788,SCOTT,ANALYST,7566.0,1987-04-19,3000,,20
7839,KING,PRESIDENT,,1981-11-01,5000,,10
7844,TURNER,SALESMAN,7698.0,1981-08-09,1500,0.0,30


In [0]:
# Method 1 - monotonically_increasing_id() (hash functions are another way to build a surrogate key).
# The generated ids are unique and increasing, but NOT guaranteed to be consecutive:
# they only look like 0, 1, 2, ... when the data sits in a single partition, as in this small file.
from pyspark.sql.functions import monotonically_increasing_id

(
    sk_df
    .withColumn("monotonically_increasing_id", monotonically_increasing_id())
    .limit(10)
    .display()
)

EMPNO,ENAME,JOB,MGR,HIREDATE,SAL,COMM,DEPTNO,monotonically_increasing_id
7369,SMITH,CLERK,7902.0,1980-12-17,800,,20,0
7499,ALLEN,SALESMAN,7698.0,1981-02-20,1600,300.0,30,1
7521,WARD,SALESMAN,7698.0,1981-02-22,1250,500.0,30,2
7566,JONES,MANAGER,7839.0,1981-02-04,2975,,20,3
7654,MARTIN,SALESMAN,7698.0,1981-09-21,1250,1400.0,30,4
7698,SGR,MANAGER,7839.0,1981-01-05,2850,,30,5
7782,RAVI,MANAGER,7839.0,1981-09-06,2450,,10,6
7788,SCOTT,ANALYST,7566.0,1987-04-19,3000,,20,7
7839,KING,PRESIDENT,,1981-11-01,5000,,10,8
7844,TURNER,SALESMAN,7698.0,1981-08-09,1500,0.0,30,9


In [0]:
# To start the ids at 1 (or at any other value), add the desired offset to the generated id.
# Uses monotonically_increasing_id imported in the Method 1 cell.
(
    sk_df
    .withColumn("monotonically_increasing_id", monotonically_increasing_id() + 1)
    .limit(10)
    .display()
)

EMPNO,ENAME,JOB,MGR,HIREDATE,SAL,COMM,DEPTNO,monotonically_increasing_id
7369,SMITH,CLERK,7902.0,1980-12-17,800,,20,1
7499,ALLEN,SALESMAN,7698.0,1981-02-20,1600,300.0,30,2
7521,WARD,SALESMAN,7698.0,1981-02-22,1250,500.0,30,3
7566,JONES,MANAGER,7839.0,1981-02-04,2975,,20,4
7654,MARTIN,SALESMAN,7698.0,1981-09-21,1250,1400.0,30,5
7698,SGR,MANAGER,7839.0,1981-01-05,2850,,30,6
7782,RAVI,MANAGER,7839.0,1981-09-06,2450,,10,7
7788,SCOTT,ANALYST,7566.0,1987-04-19,3000,,20,8
7839,KING,PRESIDENT,,1981-11-01,5000,,10,9
7844,TURNER,SALESMAN,7698.0,1981-08-09,1500,0.0,30,10


In [0]:
# Method 2 - window function row_number()
# row_number() hands out consecutive integers 1..N following the ORDER BY of the window.
# lit() wraps a constant as a column; partitioning by lit('') puts every row into one
# window partition, so the numbering runs over the whole DataFrame.
# Caveat: a single window partition means all rows are processed together -- fine for a
# small file like this one, expensive for large data.
from pyspark.sql.functions import row_number, lit
from pyspark.sql.window import Window

# Order by a real column instead of a constant: with orderBy(lit('')) every row ties, so
# the numbers assigned are arbitrary and may change from run to run.
# Assumes EMPNO is unique per row -- TODO confirm; add a tiebreaker column otherwise.
sk_window = Window.partitionBy(lit('')).orderBy("EMPNO")

sk_df.withColumn("row_number()", row_number().over(sk_window)).limit(10).display()

EMPNO,ENAME,JOB,MGR,HIREDATE,SAL,COMM,DEPTNO,row_number()
7369,SMITH,CLERK,7902.0,1980-12-17,800,,20,1
7499,ALLEN,SALESMAN,7698.0,1981-02-20,1600,300.0,30,2
7521,WARD,SALESMAN,7698.0,1981-02-22,1250,500.0,30,3
7566,JONES,MANAGER,7839.0,1981-02-04,2975,,20,4
7654,MARTIN,SALESMAN,7698.0,1981-09-21,1250,1400.0,30,5
7698,SGR,MANAGER,7839.0,1981-01-05,2850,,30,6
7782,RAVI,MANAGER,7839.0,1981-09-06,2450,,10,7
7788,SCOTT,ANALYST,7566.0,1987-04-19,3000,,20,8
7839,KING,PRESIDENT,,1981-11-01,5000,,10,9
7844,TURNER,SALESMAN,7698.0,1981-08-09,1500,0.0,30,10


In [0]:
# There are other methods too, based on hashing the key column(s).
# These produce surrogate keys, not consecutive sequence numbers:
# crc32
# sha2
# md5