In [8]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType,FloatType

# Entry point: reuse the running SparkSession if there is one, otherwise start one.
spark = (
    SparkSession.builder
    .appName("DataFrameExample")
    .getOrCreate()
)

# Sample rows, one (ID, Name, Marks) tuple per row.
tuples_list = [(1, "AAA", 95.0), (2, "BBB", 98.5), (3, "CCC", 88.6)]

# Explicit schema assembled field by field; every column is nullable.
# It is defined here but not passed to createDataFrame in this cell.
schema = (
    StructType()
    .add("ID", IntegerType(), True)
    .add("Name", StringType(), True)
    .add("Marks", FloatType(), True)
)

# The same column layout written as a SQL DDL-style string.
ddl_schema = """
ID INT,
Name STRING,
Marks float
"""

# Build the DataFrame from the list of tuples using the DDL-string schema.
df = spark.createDataFrame(tuples_list, schema=ddl_schema)

# Print the column names/types first, then the rows.
df.printSchema()
df.show()

root
 |-- ID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Marks: float (nullable = true)

+---+----+-----+
| ID|Name|Marks|
+---+----+-----+
|  1| AAA| 95.0|
|  2| BBB| 98.5|
|  3| CCC| 88.6|
+---+----+-----+



In [9]:
# Rebuild the DataFrame from the same tuples, this time passing the StructType
# object as the schema; the printed schema below is the same as with the DDL string.
df = spark.createDataFrame(tuples_list, schema=schema)
df.printSchema()
df.show()


root
 |-- ID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Marks: float (nullable = true)

+---+----+-----+
| ID|Name|Marks|
+---+----+-----+
|  1| AAA| 95.0|
|  2| BBB| 98.5|
|  3| CCC| 88.6|
+---+----+-----+



In [11]:
# No schema is supplied, so the column types are inferred (long, string, double)
# and the columns get the default names _1, _2, _3 -- see the output below.
# Note: this rebinds `df`; the cells that follow operate on this inferred-schema frame.
df = spark.createDataFrame(tuples_list)
df.printSchema()
df.show()

root
 |-- _1: long (nullable = true)
 |-- _2: string (nullable = true)
 |-- _3: double (nullable = true)

+---+---+----+
| _1| _2|  _3|
+---+---+----+
|  1|AAA|95.0|
|  2|BBB|98.5|
|  3|CCC|88.6|
+---+---+----+



Key Learning :-

Purpose of SparkSession :- It is the entry point to Spark functionality such as creating DataFrames and running SQL.

key concepts of that spark session are :
1. builder -> Starts building the SparkSession (accessed as `SparkSession.builder`).
2. appName("example")->Assigns a name to your Spark job
3. getOrCreate():-
  Reuse: Returns an existing SparkSession if one exists.

  Create: Builds a new one if none exists.

PySpark has two primary distributed data structures for processing data:

1. RDD (Resilient Distributed Dataset)
2. DataFrame

RDD->
Schema ->No schema (unstructured) ,

Optimizations -> None ,

Performance	-> Slower (in PySpark, data is serialized between the JVM and Python workers),

Use Case	-> Custom algorithms, raw data

DataFrame:

Schema -> Always has a schema (structured); it can be given explicitly or inferred ,

Optimizations -> Catalyst + Tungsten optimizations ,

Performance	-> Faster (columnar storage),

Use Case	-> SQL analytics, structured pipelines


Schema:
There are three commonly used ways to define a schema:
1. Explicit Schema (StructType) : Defined manually using StructType and StructField.
2. Inferred Schema : Spark automatically guesses the schema from data.
3. DDL Schema (SQL-like String) : Define schema using a SQL DDL-formatted string.


 Data Display Methods:

1. show() -> Shows the first 20 rows and truncates strings longer than 20 characters
2. show(n) -> Shows first n rows
3. show(truncate=False) -> Displays full cell content
4. show(vertical=True) -> Displays rows vertically
5. printSchema() -> Shows column names and types
6. head() -> Returns the first row as a single Row object; head(n) returns a list of the first n Row objects
7. take(n) -> Returns the first n rows as a list of Row objects
8. collect() -> Returns all rows as a list of Row objects (brings the entire dataset to the driver, so avoid it on large data)

In [13]:
# limit(2) produces a DataFrame with at most two rows, which show() then prints.
df.limit(2).show()

+---+---+----+
| _1| _2|  _3|
+---+---+----+
|  1|AAA|95.0|
|  2|BBB|98.5|
+---+---+----+



In [14]:
# Print the DataFrame with the default settings; all three rows are displayed.
df.show()

+---+---+----+
| _1| _2|  _3|
+---+---+----+
|  1|AAA|95.0|
|  2|BBB|98.5|
|  3|CCC|88.6|
+---+---+----+



In [15]:
# Print only the first row; the output ends with an "only showing top 1 row" footer.
df.show(1)

+---+---+----+
| _1| _2|  _3|
+---+---+----+
|  1|AAA|95.0|
+---+---+----+
only showing top 1 row



In [16]:
# Print each row as its own RECORD block with one column per line.
df.show(vertical=True)

-RECORD 0---
 _1  | 1    
 _2  | AAA  
 _3  | 95.0 
-RECORD 1---
 _1  | 2    
 _2  | BBB  
 _3  | 98.5 
-RECORD 2---
 _1  | 3    
 _2  | CCC  
 _3  | 88.6 



In [17]:
# Print the rows without truncating cell contents; values are left-aligned in this mode.
df.show(truncate=False)

+---+---+----+
|_1 |_2 |_3  |
+---+---+----+
|1  |AAA|95.0|
|2  |BBB|98.5|
|3  |CCC|88.6|
+---+---+----+



In [22]:
# Return the first two rows as a Python list of Row objects.
df.take(2)

[Row(_1=1, _2='AAA', _3=95.0), Row(_1=2, _2='BBB', _3=98.5)]

In [23]:
# Return every row of the DataFrame as a Python list of Row objects.
df.collect()

[Row(_1=1, _2='AAA', _3=95.0),
 Row(_1=2, _2='BBB', _3=98.5),
 Row(_1=3, _2='CCC', _3=88.6)]

In [34]:
# Input rows given as a tuple of lists, one list per row.
# Row 3's name is the literal string 'None' (not a null); row 4's mark is a real None.
list_tuple = (
    [1, 'AAA', 33.0],
    [2, 'BBB', 58.0],
    [3, 'None', 90.0],
    [4, 'DDD', None],
)

# Name is declared non-nullable here; ID and Marks accept nulls.
# Note: this rebinds the notebook-level name `schema`.
schema = (
    StructType()
    .add("ID", IntegerType(), True)
    .add("Name", StringType(), False)
    .add("Marks", FloatType(), True)
)

# Build the DataFrame from the tuple of lists and display it.
tuple_of_list_df = spark.createDataFrame(list_tuple, schema=schema)
tuple_of_list_df.show()

+---+----+-----+
| ID|Name|Marks|
+---+----+-----+
|  1| AAA| 33.0|
|  2| BBB| 58.0|
|  3|None| 90.0|
|  4| DDD| NULL|
+---+----+-----+

