In [0]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit
from pyspark.sql.types import StructType, StructField, StringType,IntegerType

# Obtain (or create) the SparkSession that every DataFrame below is built from.
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()

# Sample rows; each tuple lists its values in the same order as `columns`.
data = [
    ('James', '', 'Smith', '1991-04-01', 'M', 3000),
    ('Michael', 'Rose', '', '2000-05-19', 'M', 4000),
    ('Robert', '', 'Williams', '1978-09-05', 'M', 4000),
    ('Maria', 'Anne', 'Jones', '1967-12-01', 'F', 4000),
    ('Jen', 'Mary', 'Brown', '1980-02-17', 'F', -1),
]
columns = ["firstname", "middlename", "lastname", "dob", "gender", "salary"]

# Only column names are given, so the column types come from the data itself.
df = spark.createDataFrame(data=data, schema=columns)
df.printSchema()
df.show(truncate=False)

# 1. Change the type of an existing column (same name => column is replaced).
df2 = df.withColumn("salary", col("salary").cast("Integer"))
df2.printSchema()
df2.show(truncate=False)

# 2. Overwrite an existing column with a value computed from itself.
df3 = df.withColumn("salary", col("salary") * 100)
df3.printSchema()
df3.show(truncate=False)

# 3. Derive a brand-new column from an existing one.
df4 = df.withColumn("CopiedColumn", col("salary") * -1)
df4.printSchema()

# 4. Add a column holding the same literal value in every row.
df5 = df.withColumn("Country", lit("USA"))
df5.printSchema()

# 5. Add several columns by chaining withColumn() calls.
df6 = (
    df.withColumn("Country", lit("USA"))
      .withColumn("anotherColumn", lit("anotherValue"))
)
df6.printSchema()

# 6. Rename a column; the result is only displayed, `df` itself is unchanged.
df.withColumnRenamed("gender", "sex").show(truncate=False)

# 7. Drop the derived column again; the result is only displayed, not kept.
df4.drop("CopiedColumn").show(truncate=False)

# Rows whose first field is itself a (firstname, middlename, lastname) tuple,
# so that it can be loaded into a nested struct column.
dataStruct = [
    (("James", "", "Smith"), "36636", "M", "3000"),
    (("Michael", "Rose", ""), "40288", "M", "4000"),
    (("Robert", "", "Williams"), "42114", "M", "4000"),
    (("Maria", "Anne", "Jones"), "39192", "F", "4000"),
    (("Jen", "Mary", "Brown"), "", "F", "-1"),
]

# Explicit schema: `name` is a struct of three strings; every other column,
# including dob and salary, is declared as a plain string.
schemaStruct = StructType([
    StructField('name', StructType([
        StructField('firstname', StringType(), True),
        StructField('middlename', StringType(), True),
        StructField('lastname', StringType(), True),
    ])),
    StructField('dob', StringType(), True),
    StructField('gender', StringType(), True),
    StructField('salary', StringType(), True),
])

df7 = spark.createDataFrame(data=dataStruct, schema=schemaStruct)
df7.printSchema()
df7.show(truncate=False)


# NOTE(review): the string literal below is never assigned to a name and none
# of its contents are executed. The snippet inside is not valid Python
# (`f=>{ ... }`, `getAs[String](0)`, `show(false)` look like Scala) and appears
# to be an un-ported example that splits comma-separated "name" and "address"
# values into six separate columns. It also passes `schema` to
# createDataFrame(), while the snippet itself only defines `columns`.
# TODO: port it to PySpark or remove it.
"""
columns = ["name","address"]
data = [("Robert, Smith", "1 Main st, Newark, NJ, 92537"), \
        ("Maria, Garcia","3456 Walnut st, Newark, NJ, 94732")]

dfFromData = spark.createDataFrame(data=data, schema = schema)

newDF = dfFromData.map(f=>{
nameSplit = f.getAs[String](0).split(",")
addSplit = f.getAs[String](1).split(",")
      (nameSplit(0),nameSplit(1),addSplit(0),addSplit(1),addSplit(2),addSplit(3))
    })
finalDF = newDF.toDF("First Name","Last Name",
             "Address Line1","City","State","zipCode")
finalDF.printSchema()
finalDF.show(false)
"""


root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|dob       |gender|salary|
+---------+----------+--------+----------+------+------+
|James    |          |Smith   |1991-04-01|M     |3000  |
|Michael  |Rose      |        |2000-05-19|M     |4000  |
|Robert   |          |Williams|1978-09-05|M     |4000  |
|Maria    |Anne      |Jones   |1967-12-01|F     |4000  |
|Jen      |Mary      |Brown   |1980-02-17|F     |-1    |
+---------+----------+--------+----------+------+------+

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)

+------

In [0]:
#The code starts by importing the necessary libraries for working with PySpark and Spark DataFrames.

#A SparkSession is created using SparkSession.builder.appName('SparkByExamples.com').getOrCreate(). This session serves as the entry point for interacting with Spark.

#The code defines a list of tuples named data, which represents the data to be loaded into the DataFrame. Each tuple corresponds to a row in the DataFrame and contains values for different columns.

#The columns variable is a list that specifies the names of the columns in the DataFrame.

#The DataFrame df is created using spark.createDataFrame(data=data, schema=columns). It uses the createDataFrame() method of the SparkSession to create a DataFrame by passing the data and schema.

#The printSchema() method is called on df to display the schema of the DataFrame. It shows the names and data types of each column.

#The show() method is used to display the contents of the DataFrame. The parameter truncate=False ensures that the complete values of each column are displayed without truncation.

#The code then performs several operations on the DataFrame to demonstrate different transformations:

# - Changing the data type of the "salary" column from long to integer using withColumn() and cast(), resulting in a new DataFrame df2.
# - Multiplying the values in the "salary" column by 100 using withColumn(), creating DataFrame df3.
# - Creating a new column "CopiedColumn" that is the negation of the "salary" column using withColumn(), resulting in DataFrame df4.
# - Adding a new column "Country" with a constant value "USA" using withColumn() and lit(), creating DataFrame df5.
# - Adding multiple new columns "Country" and "anotherColumn" with constant values "USA" and "anotherValue" respectively using chained withColumn() calls, creating DataFrame df6.
# - Renaming the "gender" column to "sex" using withColumnRenamed(), resulting in a new DataFrame that is displayed but not stored.
# - Dropping the "CopiedColumn" from df4 using drop(), creating a new DataFrame that is displayed but not stored.
# - Creating a DataFrame df7 with a nested structure using a StructType schema.
# - Finally, displaying the schema and contents of the df7 DataFrame using printSchema() and show() respectively.

#The code demonstrates various operations like data type conversion, column manipulation, adding new columns, renaming columns, and working with nested structures in Spark DataFrames.
