In [0]:
import pyspark
from pyspark.sql import SparkSession

# Obtain the Spark session used by everything below; getOrCreate() returns an
# already-running session when there is one instead of building a second one.
spark = (
    SparkSession.builder
    .appName('SparkByExamples.com')
    .getOrCreate()
)

# Sample rows, one tuple per row: (firstname, lastname, country, state).
data = [
    ("James", "Smith", "USA", "CA"),
    ("Michael", "Rose", "USA", "NY"),
    ("Robert", "Williams", "USA", "CA"),
    ("Maria", "Jones", "USA", "FL"),
]

# Column names, listed in the same order as the fields of each tuple in `data`.
columns = ["firstname", "lastname", "country", "state"]

# The schema is given only as a list of column names.
df = spark.createDataFrame(data, columns)

# truncate=False prints the cell values in full rather than shortening them.
df.show(truncate=False)


from pyspark.sql.functions import col

# Select columns by passing their names as strings.
df.select("firstname", "lastname").show()

# Select columns through the DataFrame object: attribute access, then indexing.
df.select(df.firstname, df.lastname).show()
df.select(df["firstname"], df["lastname"]).show()

# Select columns with the col() function; "firstname" is shown as "fname".
df.select(col("firstname").alias("fname"), col("lastname")).show()

# Show all columns, three equivalent ways.
df.select("*").show()
# The loop variable is intentionally not named `col`, so it cannot be confused
# with (or shadow) the col() function imported above.
df.select([name for name in df.columns]).show()
df.select(*columns).show()

# Select columns by position by slicing the list of column names;
# show(3) prints at most three rows.
df.select(df.columns[:3]).show(3)
df.select(df.columns[2:4]).show(3)

# Select columns by regular expression. The pattern is written between
# backticks inside the string. `^.*name$` selects the columns whose name ends in
# "name" (firstname and lastname here). The earlier form `^.*name*` applied the
# `*` quantifier only to the final "e", so it would also have accepted names
# ending in "nam" or "namee".
df.select(df.colRegex("`^.*name$`")).show()

from pyspark.sql.types import StructType, StructField, StringType

# Rows for the nested example: the first field of every row is itself a
# (firstname, middlename, lastname) tuple, followed by state and gender.
# Note that this rebinds `data`, replacing the flat sample rows used earlier.
data = [
    (("James", None, "Smith"), "OH", "M"),
    (("Anna", "Rose", ""), "NY", "F"),
    (("Julia", "", "Williams"), "OH", "F"),
    (("Maria", "Anne", "Jones"), "NY", "M"),
    (("Jen", "Mary", "Brown"), "NY", "M"),
    (("Mike", "Mary", "Williams"), "OH", "M"),
]

# Explicit schema: "name" is a struct of three nullable string fields, while
# "state" and "gender" are plain nullable string columns.
schema = StructType([
    StructField(
        'name',
        StructType([
            StructField(part, StringType(), True)
            for part in ('firstname', 'middlename', 'lastname')
        ]),
    ),
    StructField('state', StringType(), True),
    StructField('gender', StringType(), True),
])

df2 = spark.createDataFrame(data, schema)

# Print the schema, including the nested fields.
df2.printSchema()

# Whole DataFrame (all columns), values printed in full.
df2.show(truncate=False)

# The struct column on its own.
df2.select("name").show(truncate=False)

# Individual nested fields, addressed with dotted paths.
df2.select("name.firstname", "name.lastname").show(truncate=False)

# Every field of the struct.
df2.select("name.*").show(truncate=False)



+---------+--------+-------+-----+
|firstname|lastname|country|state|
+---------+--------+-------+-----+
|James    |Smith   |USA    |CA   |
|Michael  |Rose    |USA    |NY   |
|Robert   |Williams|USA    |CA   |
|Maria    |Jones   |USA    |FL   |
+---------+--------+-------+-----+

+---------+--------+
|firstname|lastname|
+---------+--------+
|    James|   Smith|
|  Michael|    Rose|
|   Robert|Williams|
|    Maria|   Jones|
+---------+--------+

+---------+--------+
|firstname|lastname|
+---------+--------+
|    James|   Smith|
|  Michael|    Rose|
|   Robert|Williams|
|    Maria|   Jones|
+---------+--------+

+---------+--------+
|firstname|lastname|
+---------+--------+
|    James|   Smith|
|  Michael|    Rose|
|   Robert|Williams|
|    Maria|   Jones|
+---------+--------+

+-------+--------+
|  fname|lastname|
+-------+--------+
|  James|   Smith|
|Michael|    Rose|
| Robert|Williams|
|  Maria|   Jones|
+-------+--------+

+---------+--------+-------+-----+
|firstname|lastname|coun

In [0]:
#The code starts by importing the necessary modules: pyspark and SparkSession from pyspark.sql. These modules are required for working with Spark DataFrames.

#Next, a SparkSession is created using the SparkSession.builder API. The appName parameter sets the name of the Spark application, and getOrCreate() either retrieves an existing SparkSession or creates a new one if none exists.

#The code defines sample data as a list of tuples, where each tuple represents one row of the DataFrame. The data list contains four rows, each with four fields: firstname, lastname, country, and state.

#The columns list contains the column names for the DataFrame.

#The spark.createDataFrame() method is used to create a DataFrame from the data and columns. The data parameter is set to the input data, and the schema parameter is set to the list of columns.

#The df.show(truncate=False) statement is used to display the DataFrame. show() is a method in Spark DataFrame that prints the contents of the DataFrame to the console. The truncate=False parameter ensures that the output does not truncate the displayed data.

#Various operations on the DataFrame are demonstrated:

#df.select("firstname", "lastname").show() selects the "firstname" and "lastname" columns and displays them.
#df.select(df.firstname, df.lastname).show() selects the columns by accessing them as attributes of the DataFrame object and displays them.
#df.select(df["firstname"], df["lastname"]).show() selects the columns using indexing syntax and displays them.
#df.select(col("firstname").alias("fname"), col("lastname")).show() uses the col() function from pyspark.sql.functions to select columns and rename the "firstname" column to "fname".
#df.select("*").show() selects all columns and displays them.
#df.select([col for col in df.columns]).show() selects all columns using a list comprehension and displays them.
#df.select(*columns).show() selects all columns using the * operator and displays them.
#df.select(df.columns[:3]).show(3) slices the list of column names to select the first three columns (firstname, lastname, country) and displays the first three rows.
#df.select(df.columns[2:4]).show(3) slices the list of column names to select the columns at positions 2 and 3 (country and state) and displays the first three rows.
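#df.select(df.colRegex(...)).show() selects the columns whose names match a regular expression and displays them. The pattern is written between backticks inside the string; here it matches the firstname and lastname columns.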
#The code then defines a more complex schema using the StructType and StructField classes from pyspark.sql.types. This schema represents a nested structure where the "name" field contains a structure with "firstname", "middlename", and "lastname" fields.

#Another DataFrame df2 is created using the more complex schema and a list of data tuples. The spark.createDataFrame() method is used with the data and schema parameters.

#The df2.printSchema() statement prints the schema of the DataFrame, showing the structure of nested fields.

#The df2.show(truncate=False) statement displays the contents of the DataFrame.

#Various select() operations are performed on df2 to demonstrate selecting columns and nested fields. For example, df2.select("name") selects the "name" column, df2.select("name.firstname", "name.lastname") selects the "firstname" and "lastname" columns within the "name" field, and `df2.select
