# The objective of this worksheet is to explore basic operations on Spark Data Frames, including:
## - Creating data frame using toDF() method
## - Creating data frame using createDataFrame() method
## - Creating data frame using csv() method

In [1]:
from pyspark.sql import SparkSession

# Obtain the SparkSession for this worksheet (an existing session is reused if present)
spark = (
    SparkSession.builder
    .appName('DataFrames-Worksheet-1')
    .getOrCreate()
)

## Create a list and then convert into an RDD

In [2]:
# Sample records: one (name, marks) tuple per student
students = [
    ('David W.', 85),
    ('Gabriel S.', 83),
    ('Georgina D', 80),
    ('Rory J', 86),
    ('David C.', 81),
]

# Distribute the Python list as an RDD through the session's SparkContext
rdd = spark.sparkContext.parallelize(students)

## Create Spark DataFrame from RDD

In [3]:
# Convert the RDD of (name, marks) tuples to a DataFrame. No column names are
# supplied, so the columns receive the default names _1 and _2.
df = rdd.toDF()

## Let's check if DataFrame is created

In [4]:
# Check the Python type of the object returned by toDF()
type(df)

pyspark.sql.dataframe.DataFrame

In [5]:
# Print the DataFrame contents as a text table
df.show()

+----------+---+
|        _1| _2|
+----------+---+
|  David W.| 85|
|Gabriel S.| 83|
|Georgina D| 80|
|    Rory J| 86|
|  David C.| 81|
+----------+---+



In [6]:
# Print the schema: column names, data types and nullability
df.printSchema()

root
 |-- _1: string (nullable = true)
 |-- _2: long (nullable = true)



In [7]:
# dtypes gives (column name, type) pairs; the long column is reported as 'bigint'
df.dtypes

[('_1', 'string'), ('_2', 'bigint')]

## Creating DataFrame from RDD with Explicit Heading

In [8]:
# Column names to apply, in the same order as the fields of each tuple
rowHeading = ["Name", "Marks"]

# Build the DataFrame again, this time naming the columns explicitly
df1 = rdd.toDF(schema=rowHeading)

In [9]:
# Same rows as before, now displayed under the Name and Marks headings
df1.show()

+----------+-----+
|      Name|Marks|
+----------+-----+
|  David W.|   85|
|Gabriel S.|   83|
|Georgina D|   80|
|    Rory J|   86|
|  David C.|   81|
+----------+-----+



In [10]:
# Alternative to rdd.toDF(): build the DataFrame through the SparkSession.
# No schema is passed, so the default column names _1 and _2 are used.
df2 = spark.createDataFrame(rdd)

In [11]:
# Print the contents of the DataFrame built with createDataFrame()
df2.show()

+----------+---+
|        _1| _2|
+----------+---+
|  David W.| 85|
|Gabriel S.| 83|
|Georgina D| 80|
|    Rory J| 86|
|  David C.| 81|
+----------+---+



In [12]:
# Print the schema of df2 (column names, types, nullability)
df2.printSchema()

root
 |-- _1: string (nullable = true)
 |-- _2: long (nullable = true)



In [13]:
# Pass the list of column names as the schema so the columns are called Name and Marks
df3 = spark.createDataFrame(rdd, schema=rowHeading)

In [14]:
# Print df3; the columns now carry the names given in rowHeading
df3.show()

+----------+-----+
|      Name|Marks|
+----------+-----+
|  David W.|   85|
|Gabriel S.|   83|
|Georgina D|   80|
|    Rory J|   86|
|  David C.|   81|
+----------+-----+



In [15]:
# Print the schema of df3 to see the explicit column names alongside their types
df3.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Marks: long (nullable = true)



## Create DataFrame from CSV File

In [17]:
# Read the CSV with default options: the header line is loaded as an ordinary
# data row, the columns are named _c0.._c3, and every column is typed as string.
s_df = spark.read.csv('StudentData-1.csv')

In [18]:
# Print the rows as read from the file
s_df.show()

+-----------+-----+----------+---------+
|        _c0|  _c1|       _c2|      _c3|
+-----------+-----+----------+---------+
|       Name|Marks|Attendance| Comments|
|   David W.|   85|       100|Excellent|
|   David C.|   83|        96|       NA|
| Gabriel S.|   76|        90|     NULL|
|Georgina D.|   85|        85|Excellent|
|    Rory J.|   78|      NULL|     NULL|
|     Jim C.| NULL|      NULL|     NULL|
|Williams R.| NULL|      NULL|     NULL|
|   David W.|   85|        60|Excellent|
| Gabriel S.|   76|        70|     NULL|
+-----------+-----+----------+---------+



In [19]:
# Print the schema Spark assigned when reading the CSV with default options
s_df.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)



In [None]:
# What is your observation about the above output?

# Column titles are included as first row
# All columns are read as string type

## Create DataFrame from CSV with Explicit Header

In [20]:
# header=True takes the column names from the first line of the file;
# inferSchema=True lets Spark work out each column's type from the data
# instead of reading everything as string.
s_df1 = spark.read.csv(
    'StudentData-1.csv',
    header=True,
    inferSchema=True,
)

In [21]:
# Print the rows; the header line is no longer part of the data
s_df1.show()

+-----------+-----+----------+---------+
|       Name|Marks|Attendance| Comments|
+-----------+-----+----------+---------+
|   David W.|   85|       100|Excellent|
|   David C.|   83|        96|       NA|
| Gabriel S.|   76|        90|     NULL|
|Georgina D.|   85|        85|Excellent|
|    Rory J.|   78|      NULL|     NULL|
|     Jim C.| NULL|      NULL|     NULL|
|Williams R.| NULL|      NULL|     NULL|
|   David W.|   85|        60|Excellent|
| Gabriel S.|   76|        70|     NULL|
+-----------+-----+----------+---------+



In [22]:
# Print the schema to see the column names and the inferred types
s_df1.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Marks: integer (nullable = true)
 |-- Attendance: integer (nullable = true)
 |-- Comments: string (nullable = true)



In [None]:
# What is your observation about the above output, now?

# Column titles are recognized as column names
# Data types are inferred correctly

In [23]:
# Evaluating describe() on its own only displays the summary DataFrame's
# column list; the statistics are not printed until show() is called on it.
s_df1.describe()

DataFrame[summary: string, Name: string, Marks: string, Attendance: string, Comments: string]

In [24]:
# Print the summary statistics (count, mean, stddev, min, max) for each column
s_df1.describe().show()

+-------+-----------+-----------------+------------------+---------+
|summary|       Name|            Marks|        Attendance| Comments|
+-------+-----------+-----------------+------------------+---------+
|  count|          9|                7|                 6|        4|
|   mean|       NULL|81.14285714285714|              83.5|     NULL|
| stddev|       NULL| 4.29839394148448|15.540270267920054|     NULL|
|    min|   David C.|               76|                60|Excellent|
|    max|Williams R.|               85|               100|       NA|
+-------+-----------+-----------------+------------------+---------+



In [None]:
# What is your observation about the above output?

## Selecting required columns

In [None]:
# Select specific columns by name
s_df1.select('Name','Marks').show()

In [None]:
# "*" selects every column
s_df1.select("*").show()

In [None]:
# Select the first column by position via the columns list; show(5) limits the output to 5 rows
s_df1.select(s_df1.columns[0]).show(5)

In [None]:
# Slice the columns list to select the first two columns
s_df1.select(s_df1.columns[0:2]).show()

## Explore more about the select method at:
#### https://sparkbyexamples.com/pyspark/select-columns-from-pyspark-dataframe/

## Using the withColumn method

In [None]:
# Add a derived column holding each mark raised by 5 points; the DataFrame
# returned by withColumn is bound back to the same name.
s_df1 = s_df1.withColumn('Curved Marks', s_df1['Marks'] + 5)

In [None]:
# Print the DataFrame, now including the 'Curved Marks' column
s_df1.show()

## Explore more about the withColumn method at:
### https://sparkbyexamples.com/pyspark/pyspark-withcolumn/

## Dropping a column

In [None]:
# Drop the 'Comments' column. The name is spelled exactly as it appears in the
# schema: drop() silently does nothing when no column matches, and a lower-case
# 'comments' only matches while Spark's case-insensitive name resolution is on.
new_df = s_df1.drop('Comments')

In [None]:
# Print the DataFrame returned by drop()
new_df.show()

## Dropping duplicate rows

In [None]:
# Remove duplicate rows and collect the result as a list of rows for display.
# The result is not assigned to any variable.
s_df1.dropDuplicates().collect()

In [None]:
# Print s_df1 again to compare it with the de-duplicated result above
s_df1.show()

In [None]:
# What is your observation about the above output?