# Part 1: Learning PySpark Basics
## The following will be covered:
* PySpark Dataframe
* Reading the Dataset
* Checking the Data Types of the Columns (Schema)
* Selecting Columns and Indexing
* Checking the `describe()` option, similar to pandas
* Adding and Dropping Columns

In [8]:
import pyspark 
from pyspark.sql import SparkSession

In [9]:
# Build the SparkSession for this notebook under the app name 'Dataframe';
# getOrCreate() hands back an already-running session if there is one.
spark = (
    SparkSession.builder
    .appName('Dataframe')
    .getOrCreate()
)

In [10]:
# Load the CSV: the first row supplies the column names, and inferSchema
# asks Spark to work out each column's type instead of reading all strings.
df_pyspark = (
    spark.read
    .option('header', 'true')
    .csv('pysparktest.csv', inferSchema=True)
)

In [11]:
# Check the schema: prints one line per column with its inferred type
# and whether the column is nullable
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [12]:
# Read the same file again, this time passing the header / inferSchema
# settings directly to csv() as keyword arguments.
df_pyspark = spark.read.csv(
    'pysparktest.csv',
    header=True,
    inferSchema=True,
)

# Print the rows as a text table.
df_pyspark.show()

+------+----+----------+------+
|  Name| Age|Experience|Salary|
+------+----+----------+------+
|Jayden|  23|         5| 80000|
| Shawn|  27|         7| 65000|
|   Bob|null|         4| 12345|
|Jeremy|  40|         8| 69000|
|Joseph|  23|         4|  null|
|  Mary|  24|      null|100000|
|  null|null|      null|  null|
|  null|  34|         2| 40000|
|  null| 121|      null| 90000|
+------+----+----------+------+



In [13]:
# Project only the Name and Age columns. select() reads like a SQL SELECT,
# which is why the DataFrame API lives under pyspark.sql.
df_pyspark.select(['Name', 'Age']).show()

+------+----+
|  Name| Age|
+------+----+
|Jayden|  23|
| Shawn|  27|
|   Bob|null|
|Jeremy|  40|
|Joseph|  23|
|  Mary|  24|
|  null|null|
|  null|  34|
|  null| 121|
+------+----+



In [14]:
# Check the column types: dtypes is a list of (column name, type name) pairs
df_pyspark.dtypes

[('Name', 'string'), ('Age', 'int'), ('Experience', 'int'), ('Salary', 'int')]

In [15]:
# See simple stats of the dataframe: count, mean, stddev, min and max per column.
# count only includes non-null values, so it differs between columns.
df_pyspark.describe().show()

+-------+-----+------------------+------------------+------------------+
|summary| Name|               Age|        Experience|            Salary|
+-------+-----+------------------+------------------+------------------+
|  count|    6|                 7|                 6|                 7|
|   mean| null|41.714285714285715|               5.0|65192.142857142855|
| stddev| null| 35.54206093121353|2.1908902300206643|30244.257286458927|
|    min|  Bob|                23|                 2|             12345|
|    max|Shawn|               121|                 8|            100000|
+-------+-----+------------------+------------------+------------------+



In [16]:
# Add a derived column. withColumn() returns a new DataFrame rather than
# modifying the existing one, so the result is bound back to df_pyspark.
df_pyspark = df_pyspark.withColumn(
    'Experience after 2 years',
    df_pyspark['Experience'] + 2,  # a null Experience stays null
)

In [17]:
# Display the DataFrame, which now carries the 'Experience after 2 years' column
df_pyspark.show()

+------+----+----------+------+------------------------+
|  Name| Age|Experience|Salary|Experience after 2 years|
+------+----+----------+------+------------------------+
|Jayden|  23|         5| 80000|                       7|
| Shawn|  27|         7| 65000|                       9|
|   Bob|null|         4| 12345|                       6|
|Jeremy|  40|         8| 69000|                      10|
|Joseph|  23|         4|  null|                       6|
|  Mary|  24|      null|100000|                    null|
|  null|null|      null|  null|                    null|
|  null|  34|         2| 40000|                       4|
|  null| 121|      null| 90000|                    null|
+------+----+----------+------+------------------------+



In [18]:
# Dropping columns in a dataframe
# drop() returns a new DataFrame; the result is bound to df_spark, so
# df_pyspark itself still contains 'Experience after 2 years' afterwards.
# NOTE(review): df_spark is not referenced again in the visible cells --
# confirm whether rebinding df_pyspark was intended here.
df_spark=df_pyspark.drop('Experience after 2 years')

In [19]:
# Rename the column
# The renamed DataFrame is only displayed, not assigned, so df_pyspark
# keeps the original 'Name' column in the cells that follow.
df_pyspark.withColumnRenamed('Name','First Name')

DataFrame[First Name: string, Age: int, Experience: int, Salary: int, Experience after 2 years: int]

# Part 2: Filter Functions
* Filter Operation
* &, |, ==
* ~


In [20]:
# Load a fresh copy of the CSV into df_spark1 and display it.
# NOTE(review): the filter cells below operate on df_pyspark, not df_spark1.
df_spark1 = spark.read.csv(
    'pysparktest.csv',
    header=True,
    inferSchema=True,
)
df_spark1.show()

+------+----+----------+------+
|  Name| Age|Experience|Salary|
+------+----+----------+------+
|Jayden|  23|         5| 80000|
| Shawn|  27|         7| 65000|
|   Bob|null|         4| 12345|
|Jeremy|  40|         8| 69000|
|Joseph|  23|         4|  null|
|  Mary|  24|      null|100000|
|  null|null|      null|  null|
|  null|  34|         2| 40000|
|  null| 121|      null| 90000|
+------+----+----------+------+



### Filter Operation

In [23]:
# Salary of the people less than or equal to 20000
# The condition is given as a SQL expression string. Without .show() the
# cell only displays the resulting DataFrame's column summary, not its rows.
df_pyspark.filter("Salary<=20000")

DataFrame[Name: string, Age: int, Experience: int, Salary: int, Experience after 2 years: int]

In [25]:
# Filter on salary first, then keep only Name and Age for the matching rows.
(
    df_pyspark
    .filter("Salary<=20000")
    .select('Name', 'Age')
    .show()
)

+----+----+
|Name| Age|
+----+----+
| Bob|null|
+----+----+



In [29]:
# More than one condition: keep rows whose Salary lies in [15000, 80000].
# Each comparison needs its own parentheses because & binds more tightly
# than <= / >= in Python. Rows with a null Salary do not match.
df_pyspark.filter(
    (df_pyspark['Salary'] >= 15000)
    & (df_pyspark['Salary'] <= 80000)
).show()

+------+---+----------+------+------------------------+
|  Name|Age|Experience|Salary|Experience after 2 years|
+------+---+----------+------+------------------------+
|Jayden| 23|         5| 80000|                       7|
| Shawn| 27|         7| 65000|                       9|
|Jeremy| 40|         8| 69000|                      10|
|  null| 34|         2| 40000|                       4|
+------+---+----------+------+------------------------+

