In [0]:
from pyspark.sql.types import StructType,StructField,StringType,IntegerType, ArrayType
from pyspark.sql.functions import *
data = [(1,['Manoj','Mandhapati'],3000),
        (2,['Megha','Mandhapati'],4000),
        (3,['Manasa','Mandhapati'],3800),
        (4,['Naveen','Jaggavarapu'],4000)]

structName = StructType().add(field = 'FirstName', data_type = StringType())\
                        .add(field = 'LastName',data_type = StringType())
schema = StructType().add(field = 'Id', data_type = IntegerType())\
                  .add(field= 'Name', data_type = structName)\
                    .add(field= 'Salary',data_type = IntegerType())

df = spark.createDataFrame(data,schema)
display(df)
df.printSchema()

Id,Name,Salary
1,"List(Manoj, Mandhapati)",3000
2,"List(Megha, Mandhapati)",4000
3,"List(Manasa, Mandhapati)",3800
4,"List(Naveen, Jaggavarapu)",4000


root
 |-- Id: integer (nullable = true)
 |-- Name: struct (nullable = true)
 |    |-- FirstName: string (nullable = true)
 |    |-- LastName: string (nullable = true)
 |-- Salary: integer (nullable = true)



In [0]:
from pyspark.sql.types import StructType,StructField,StringType,IntegerType, ArrayType
from pyspark.sql.functions import *
data = [(1,['Manoj','Mandhapati'],3000),
        (2,['Megha','Mandhapati'],4000),
        (3,['Manasa','Mandhapati'],3800),
        (4,['Naveen','Jaggavarapu'],4000)]
schema = ['id','name','salary']
df = spark.createDataFrame(data,schema)

In [0]:
df1 = df.withColumn('First_Last',explode(col('name')))
df1.show()

+---+--------------------+------+-----------+
| id|                name|salary| First_Last|
+---+--------------------+------+-----------+
|  1| [Manoj, Mandhapati]|  3000|      Manoj|
|  1| [Manoj, Mandhapati]|  3000| Mandhapati|
|  2| [Megha, Mandhapati]|  4000|      Megha|
|  2| [Megha, Mandhapati]|  4000| Mandhapati|
|  3|[Manasa, Mandhapati]|  3800|     Manasa|
|  3|[Manasa, Mandhapati]|  3800| Mandhapati|
|  4|[Naveen, Jaggavar...|  4000|     Naveen|
|  4|[Naveen, Jaggavar...|  4000|Jaggavarapu|
+---+--------------------+------+-----------+



In [0]:
from pyspark.sql.types import StructType,StructField,StringType,IntegerType, ArrayType
from pyspark.sql.functions import *
data = [(1,'Manoj','Azure,Python,SQL',3000),
        (2,'Megha','HPLC, Dissolution',4000),
        (3,'Manasa','Manual, Automation',3800),
        (4,'Naveen','Jira, Agile',4000)]
schema = ['Id','name','skills','Salary']
df = spark.createDataFrame(data,schema)
df.show()

+---+------+------------------+------+
| Id|  name|            skills|Salary|
+---+------+------------------+------+
|  1| Manoj|  Azure,Python,SQL|  3000|
|  2| Megha| HPLC, Dissolution|  4000|
|  3|Manasa|Manual, Automation|  3800|
|  4|Naveen|       Jira, Agile|  4000|
+---+------+------------------+------+



In [0]:
skills_df = df.withColumn('skillsArray',split(col('skills'),','))
skills_df.show()

+---+------+------------------+------+--------------------+
| Id|  name|            skills|Salary|         skillsArray|
+---+------+------------------+------+--------------------+
|  1| Manoj|  Azure,Python,SQL|  3000|[Azure, Python, SQL]|
|  2| Megha| HPLC, Dissolution|  4000|[HPLC,  Dissolution]|
|  3|Manasa|Manual, Automation|  3800|[Manual,  Automat...|
|  4|Naveen|       Jira, Agile|  4000|      [Jira,  Agile]|
+---+------+------------------+------+--------------------+



In [0]:
from pyspark.sql.types import StructType,StructField,StringType,IntegerType, ArrayType
from pyspark.sql.functions import *
data = [(1,'Manoj','Azure','Python',3000),
        (2,'Megha','HPLC', 'Dissolution',4000),
        (3,'Manasa','Manual', 'Automation',3800),
        (4,'Naveen','Jira', 'Agile',4000)]
schema = ['Id','name','Primaryskill','Secondaryskill','Salary']
df = spark.createDataFrame(data,schema)
df.show()

+---+------+------------+--------------+------+
| Id|  name|Primaryskill|Secondaryskill|Salary|
+---+------+------------+--------------+------+
|  1| Manoj|       Azure|        Python|  3000|
|  2| Megha|        HPLC|   Dissolution|  4000|
|  3|Manasa|      Manual|    Automation|  3800|
|  4|Naveen|        Jira|         Agile|  4000|
+---+------+------------+--------------+------+



In [0]:
skills = df.withColumn('Skills',array(col('Primaryskill'),col('Secondaryskill')))
skills.show()

+---+------+------------+--------------+------+--------------------+
| Id|  name|Primaryskill|Secondaryskill|Salary|              Skills|
+---+------+------------+--------------+------+--------------------+
|  1| Manoj|       Azure|        Python|  3000|     [Azure, Python]|
|  2| Megha|        HPLC|   Dissolution|  4000| [HPLC, Dissolution]|
|  3|Manasa|      Manual|    Automation|  3800|[Manual, Automation]|
|  4|Naveen|        Jira|         Agile|  4000|       [Jira, Agile]|
+---+------+------------+--------------+------+--------------------+



In [0]:
from pyspark.sql.types import StructType,StructField,StringType,IntegerType, ArrayType
from pyspark.sql.functions import *
data = [(1,'Manoj',['Azure','Python','SQL'],3000),
        (2,'Megha',['HPLC', 'Dissolution'],4000),
        (3,'Manasa',['Manual', 'Automation'],3800),
        (4,'Naveen',['Jira', 'Agile','Python'],4000)]
schema = ['Id','name','skills','Salary']
df = spark.createDataFrame(data,schema)
df.show()

+---+------+--------------------+------+
| Id|  name|              skills|Salary|
+---+------+--------------------+------+
|  1| Manoj|[Azure, Python, SQL]|  3000|
|  2| Megha| [HPLC, Dissolution]|  4000|
|  3|Manasa|[Manual, Automation]|  3800|
|  4|Naveen|[Jira, Agile, Pyt...|  4000|
+---+------+--------------------+------+



In [0]:
df.withColumn('HasSkillPython', array_contains(col('skills'),'Python')).show()

+---+------+--------------------+------+--------------+
| Id|  name|              skills|Salary|HasSkillPython|
+---+------+--------------------+------+--------------+
|  1| Manoj|[Azure, Python, SQL]|  3000|          true|
|  2| Megha| [HPLC, Dissolution]|  4000|         false|
|  3|Manasa|[Manual, Automation]|  3800|         false|
|  4|Naveen|[Jira, Agile, Pyt...|  4000|          true|
+---+------+--------------------+------+--------------+

