# PYSPARK TOOLKIT

## Import from PySpark

In [2]:
# Import required libraries
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField, StringType, IntegerType  

## Initialize Spark Session

In [3]:
# Build (or fetch the already-running) session under the app name "PySparkSQL"
spark = (
    SparkSession.builder
    .appName("PySparkSQL")
    .getOrCreate()
)

# Confirm what kind of object we got back
print(type(spark))

# Report the Spark version followed by the application name
for session_detail in (spark.version, spark.sparkContext.appName):
    print(session_detail)

<class 'pyspark.sql.session.SparkSession'>
3.5.3
PySparkSQL


## Create DataFrame
Calling `type()` on the resulting object returns `pyspark.sql.dataframe.DataFrame`.

In [None]:
# Sample rows shared by the DataFrame examples in the following cells.
# Tuple layout: (firstname, middlename, lastname, id, gender, salary);
# id is kept as a string, salary as an int, and a missing middle name is "".
data = [
    ("Jack", "", "Eldridge", "36636", "M", 90000),
    ("Matthew", "J", "Munro", "28832", "M", 45400),
    ("Sheila", "Oway", "Roberts", "12114", "F", 64000),
    ("Anne", "", "Dushane", "32192", "F", 141000),
    ("Jane", "Rebecca", "Jones", "99482", "F", 56000),
]

### CREATE: Using Spark

In [None]:
# Explicit schema: one (column name, Spark type) pair per column, in row order
column_specs = [
    ("firstname", StringType()),
    ("middlename", StringType()),
    ("lastname", StringType()),
    ("id", StringType()),
    ("gender", StringType()),
    ("salary", IntegerType()),
]

# Every column is declared nullable (third StructField argument is True)
schema_spark = StructType(
    [StructField(col_name, col_type, True) for col_name, col_type in column_specs]
)

# Build the DataFrame from the in-memory rows using the StructType schema
df_using_spark = spark.createDataFrame(data, schema=schema_spark)

# Print the schema tree, then the rows
df_using_spark.printSchema()
# truncate=False displays the full content of every column instead of cutting values short
df_using_spark.show(truncate=False)

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)

+---------+----------+--------+-----+------+------+
|firstname|middlename|lastname|id   |gender|salary|
+---------+----------+--------+-----+------+------+
|Jack     |          |Eldridge|36636|M     |90000 |
|Matthew  |J         |Munro   |28832|M     |45400 |
|Sheila   |Oway      |Roberts |12114|F     |64000 |
|Anne     |          |Dushane |32192|F     |141000|
|Jane     |Rebecca   |Jones   |99482|F     |56000 |
+---------+----------+--------+-----+------+------+



### CREATE: Using DDL SQL

In [None]:
# Same schema as a DDL-formatted string: comma-separated "<column> <TYPE>" pairs
schema_ddlsql = "firstname STRING, middlename STRING, lastname STRING, id STRING, gender STRING, salary INT " 

# Build the DataFrame from the in-memory rows, letting Spark parse the DDL string
df_using_ddlsql = spark.createDataFrame(data, schema=schema_ddlsql)

# Print the schema tree, then the rows
df_using_ddlsql.printSchema()
# truncate=False displays the full content of every column instead of cutting values short
df_using_ddlsql.show(truncate=False)


root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)

+---------+----------+--------+-----+------+------+
|firstname|middlename|lastname|id   |gender|salary|
+---------+----------+--------+-----+------+------+
|Jack     |          |Eldridge|36636|M     |90000 |
|Matthew  |J         |Munro   |28832|M     |45400 |
|Sheila   |Oway      |Roberts |12114|F     |64000 |
|Anne     |          |Dushane |32192|F     |141000|
|Jane     |Rebecca   |Jones   |99482|F     |56000 |
+---------+----------+--------+-----+------+------+

