In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType,ArrayType
from pyspark.sql.functions import col,array

In [0]:
data = [('Manoj',[30,30000]), ('Megha',[27,40000]),('Manasa',[32,35000]),('Naveen',[34,38500]),('Vikky',[3,3000])]
#schema = ['Name','Age_Salary']

schema = StructType().add(field = 'Name',data_type = StringType())\
                    .add(field = 'Age_salary', data_type = ArrayType(IntegerType()))

In [0]:
df = spark.createDataFrame(data, schema)
df.show()
df.printSchema()

+------+-----------+
|  Name| Age_salary|
+------+-----------+
| Manoj|[30, 30000]|
| Megha|[27, 40000]|
|Manasa|[32, 35000]|
|Naveen|[34, 38500]|
| Vikky|  [3, 3000]|
+------+-----------+

root
 |-- Name: string (nullable = true)
 |-- Age_salary: array (nullable = true)
 |    |-- element: integer (containsNull = true)



In [0]:
df1 = df.withColumn('Age',col('Age_salary')[0])
df1.show()

+------+-----------+---+
|  Name| Age_salary|Age|
+------+-----------+---+
| Manoj|[30, 30000]| 30|
| Megha|[27, 40000]| 27|
|Manasa|[32, 35000]| 32|
|Naveen|[34, 38500]| 34|
| Vikky|  [3, 3000]|  3|
+------+-----------+---+



In [0]:
from pyspark.sql.types import StructType,StructField,StringType,IntegerType, ArrayType
from pyspark.sql.functions import *
data = [(1,['Manoj','Mandhapati'],3000),
        (2,['Megha','Mandhapati'],4000),
        (3,['Manasa','Mandhapati'],3800),
        (4,['Naveen','Jaggavarapu'],4000)]
schema = ['id','name','salary']
df = spark.createDataFrame(data,schema)

In [0]:
df1 = df.withColumn('First_Last',explode(col('name')))
df1.show()

+---+--------------------+------+-----------+
| id|                name|salary| First_Last|
+---+--------------------+------+-----------+
|  1| [Manoj, Mandhapati]|  3000|      Manoj|
|  1| [Manoj, Mandhapati]|  3000| Mandhapati|
|  2| [Megha, Mandhapati]|  4000|      Megha|
|  2| [Megha, Mandhapati]|  4000| Mandhapati|
|  3|[Manasa, Mandhapati]|  3800|     Manasa|
|  3|[Manasa, Mandhapati]|  3800| Mandhapati|
|  4|[Naveen, Jaggavar...|  4000|     Naveen|
|  4|[Naveen, Jaggavar...|  4000|Jaggavarapu|
+---+--------------------+------+-----------+



In [0]:
from pyspark.sql.types import StructType,StructField,StringType,IntegerType, ArrayType
from pyspark.sql.functions import *
data = [(1,'Manoj','Azure,Python,SQL',3000),
        (2,'Megha','HPLC, Dissolution',4000),
        (3,'Manasa','Manual, Automation',3800),
        (4,'Naveen','Jira, Agile',4000)]
schema = ['Id','name','skills','Salary']
df = spark.createDataFrame(data,schema)
df.show()

+---+------+------------------+------+
| Id|  name|            skills|Salary|
+---+------+------------------+------+
|  1| Manoj|  Azure,Python,SQL|  3000|
|  2| Megha| HPLC, Dissolution|  4000|
|  3|Manasa|Manual, Automation|  3800|
|  4|Naveen|       Jira, Agile|  4000|
+---+------+------------------+------+



In [0]:
skills_df = df.withColumn('skillsArray',split(col('skills'),','))
skills_df.show()

+---+------+------------------+------+--------------------+
| Id|  name|            skills|Salary|         skillsArray|
+---+------+------------------+------+--------------------+
|  1| Manoj|  Azure,Python,SQL|  3000|[Azure, Python, SQL]|
|  2| Megha| HPLC, Dissolution|  4000|[HPLC,  Dissolution]|
|  3|Manasa|Manual, Automation|  3800|[Manual,  Automat...|
|  4|Naveen|       Jira, Agile|  4000|      [Jira,  Agile]|
+---+------+------------------+------+--------------------+



In [0]:
from pyspark.sql.types import StructType,StructField,StringType,IntegerType, ArrayType
from pyspark.sql.functions import *
data = [(1,'Manoj','Azure','Python',3000),
        (2,'Megha','HPLC', 'Dissolution',4000),
        (3,'Manasa','Manual', 'Automation',3800),
        (4,'Naveen','Jira', 'Agile',4000)]
schema = ['Id','name','Primaryskill','Secondaryskill','Salary']
df = spark.createDataFrame(data,schema)
df.show()

+---+------+------------+--------------+------+
| Id|  name|Primaryskill|Secondaryskill|Salary|
+---+------+------------+--------------+------+
|  1| Manoj|       Azure|        Python|  3000|
|  2| Megha|        HPLC|   Dissolution|  4000|
|  3|Manasa|      Manual|    Automation|  3800|
|  4|Naveen|        Jira|         Agile|  4000|
+---+------+------------+--------------+------+



In [0]:
skills = df.withColumn('Skills',array(col('Primaryskill'),col('Secondaryskill')))
skills.show()

+---+------+------------+--------------+------+--------------------+
| Id|  name|Primaryskill|Secondaryskill|Salary|              Skills|
+---+------+------------+--------------+------+--------------------+
|  1| Manoj|       Azure|        Python|  3000|     [Azure, Python]|
|  2| Megha|        HPLC|   Dissolution|  4000| [HPLC, Dissolution]|
|  3|Manasa|      Manual|    Automation|  3800|[Manual, Automation]|
|  4|Naveen|        Jira|         Agile|  4000|       [Jira, Agile]|
+---+------+------------+--------------+------+--------------------+



In [0]:
from pyspark.sql.types import StructType,StructField,StringType,IntegerType, ArrayType
from pyspark.sql.functions import *
data = [(1,'Manoj',['Azure','Python','SQL'],3000),
        (2,'Megha',['HPLC', 'Dissolution'],4000),
        (3,'Manasa',['Manual', 'Automation'],3800),
        (4,'Naveen',['Jira', 'Agile','Python'],4000)]
schema = ['Id','name','skills','Salary']
df = spark.createDataFrame(data,schema)
df.show()

+---+------+--------------------+------+
| Id|  name|              skills|Salary|
+---+------+--------------------+------+
|  1| Manoj|[Azure, Python, SQL]|  3000|
|  2| Megha| [HPLC, Dissolution]|  4000|
|  3|Manasa|[Manual, Automation]|  3800|
|  4|Naveen|[Jira, Agile, Pyt...|  4000|
+---+------+--------------------+------+

