## In this tutorial we will cover
* PySpark DataFrame
* Reading the Dataset 
* Checking the data types of the columns (schema)
* Selecting Columns and Indexing
* Using `describe()` for summary statistics, similar to pandas
* Adding Columns
* Dropping Columns
* Renaming Columns

In [2]:
import pyspark

In [3]:
from pyspark.sql import SparkSession

In [6]:
# Build a session named 'Dataframe', or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('Dataframe')
    .getOrCreate()
)

In [7]:
# Evaluate the session object so the notebook renders it as the cell output.
spark

In [17]:
## Reading the dataset

# The 'header' option makes the first CSV row supply the column names.
# inferSchema=True lets Spark work out each column's type from the values;
# the same call without it, i.e.
#   spark.read.option('header','true').csv('spark_tutorial.csv')
# would load every column as a string.
df_pyspark = (
    spark.read
    .option('header', 'true')
    .csv('spark_tutorial.csv', inferSchema=True)
)


In [18]:
# Display the DataFrame's rows as a text table.

df_pyspark.show()

+-----+---+----------+
| Name|Age|Experience|
+-----+---+----------+
|Harry| 27|         2|
|Jerry| 27|         4|
|param| 65|        40|
|Laksh| 54|        30|
|Babji| 33|        10|
| Anni| 32|        10|
+-----+---+----------+



In [19]:
## Check the schema: prints each column's name, data type and nullability.

df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [20]:
## Another way of reading the dataset: pass the reader settings as keyword
## arguments to csv() instead of chaining .option() calls.
df_pyspark = spark.read.csv(
    'spark_tutorial.csv',
    header=True,
    inferSchema=True,
)

In [21]:
# Display the re-read DataFrame to check that it holds the same rows as before.
df_pyspark.show()

+-----+---+----------+
| Name|Age|Experience|
+-----+---+----------+
|Harry| 27|         2|
|Jerry| 27|         4|
|param| 65|        40|
|Laksh| 54|        30|
|Babji| 33|        10|
| Anni| 32|        10|
+-----+---+----------+



In [22]:
# Check the column types of the re-read DataFrame.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [23]:
# Inspect the Python type of the object the reader returned.
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

In [31]:
# head(n) returns the first n rows as a list of Row objects.

# Getting the top 2 rows
df_pyspark.head(2)

[Row(Name='Harry', Age=27, Experience=2),
 Row(Name='Jerry', Age=27, Experience=4)]

In [32]:


# Getting the column names as a list of strings:
df_pyspark.columns


['Name', 'Age', 'Experience']

In [38]:
# Selecting a column: select() returns a new DataFrame, so printing it shows
# only its column summary (e.g. DataFrame[Name: string]), not any rows.
print(df_pyspark.select('Name'))

# show() prints the rows itself and returns None, so it is called directly;
# wrapping it in print() would emit a stray "None" after the table.
df_pyspark.select('Name').show()

# The result of select() is still a DataFrame.
print(type(df_pyspark.select('Name')))

# Selecting more than one column and displaying only the first 2 rows.
df_pyspark.select(['Name', 'Experience']).show(2)

DataFrame[Name: string]
+-----+
| Name|
+-----+
|Harry|
|Jerry|
|param|
|Laksh|
|Babji|
| Anni|
+-----+

None
<class 'pyspark.sql.dataframe.DataFrame'>
+-----+----------+
| Name|Experience|
+-----+----------+
|Harry|         2|
|Jerry|         4|
+-----+----------+
only showing top 2 rows

None


In [42]:
# Indexing with a column name gives a Column object, not a DataFrame of values.
df_pyspark['Name']


Column<'Name'>

In [43]:
# Similar to pandas: dtypes lists (column name, type) pairs, a compact view of
# the column types that printSchema() also reports.

df_pyspark.dtypes

[('Name', 'string'), ('Age', 'int'), ('Experience', 'int')]

In [45]:
# describe() builds a summary table for the DataFrame's columns with the rows
# count, mean, stddev, min and max.

df_pyspark.describe().show()

+-------+-----+------------------+------------------+
|summary| Name|               Age|        Experience|
+-------+-----+------------------+------------------+
|  count|    6|                 6|                 6|
|   mean| null|39.666666666666664|              16.0|
| stddev| null|15.945741333242134|15.388307249337076|
|    min| Anni|                27|                 2|
|    max|param|                65|                40|
+-------+-----+------------------+------------------+



In [48]:
### Adding Columns in dataframe

# The new column is computed from the existing 'Experience' column plus 2;
# the result is assigned back so later cells see the extra column.
df_pyspark = df_pyspark.withColumn(
    'Experience After 2yrs',
    df_pyspark['Experience'] + 2,
)

In [49]:
df_pyspark.show()

+-----+---+----------+---------------------+
| Name|Age|Experience|Experience After 2yrs|
+-----+---+----------+---------------------+
|Harry| 27|         2|                    4|
|Jerry| 27|         4|                    6|
|param| 65|        40|                   42|
|Laksh| 54|        30|                   32|
|Babji| 33|        10|                   12|
| Anni| 32|        10|                   12|
+-----+---+----------+---------------------+



In [51]:
### Drop the columns
# The result of drop() is assigned back to df_pyspark so that later cells see the DataFrame without the column; df_pyspark.drop('Experience After 2yrs').show() alone would only display it.
df_pyspark = df_pyspark.drop('Experience After 2yrs')

In [52]:

# Verify that 'Experience After 2yrs' is no longer among the columns.
df_pyspark.show()

+-----+---+----------+
| Name|Age|Experience|
+-----+---+----------+
|Harry| 27|         2|
|Jerry| 27|         4|
|param| 65|        40|
|Laksh| 54|        30|
|Babji| 33|        10|
| Anni| 32|        10|
+-----+---+----------+



In [53]:
### How to rename column in spark dataframe

# withColumnRenamed() returns a new DataFrame and leaves df_pyspark unchanged,
# so a bare call whose result is discarded has no effect. Capture the result
# once under its own name and display that.
df_renamed = df_pyspark.withColumnRenamed('Name', 'FirstName')
df_renamed.show()

+---------+---+----------+
|FirstName|Age|Experience|
+---------+---+----------+
|    Harry| 27|         2|
|    Jerry| 27|         4|
|    param| 65|        40|
|    Laksh| 54|        30|
|    Babji| 33|        10|
|     Anni| 32|        10|
+---------+---+----------+

