# Part - 1

* PySpark Dataframe
* Reading the Dataset
* Checking the Datatypes of the Columns (Schema)
* Selecting the Columns and Indexing
* Check Describe options (similar to pandas)
* Adding Columns
* Dropping Columns
* Renaming Columns

In [1]:
import os

# Spark needs a JDK. Keep a JAVA_HOME that is already configured on this
# machine; only fall back to the author's local Windows install when the
# variable is not set at all.
os.environ.setdefault("JAVA_HOME", "C:\\Program Files\\Java\\jdk-21")

In [2]:
from pyspark.sql import SparkSession

In [3]:
# Build a session for the app named 'Dataframe', reusing one if it already exists.
spark = (
    SparkSession.builder.appName('Dataframe')  # type: ignore
    .getOrCreate()
)

In [4]:
# Bare expression: the notebook displays the session object as the cell output.
spark

In [8]:
## Read the CSV, treating the first line as the header row
df_pyspark = (
    spark.read
    .option('header', 'true')
    .csv('Test2.csv')
)

In [9]:
# show() prints the DataFrame rows as a text table.
df_pyspark.show()

+--------+---+----------+
|    Name|age|Experience|
+--------+---+----------+
|   Nehal| 21|         3|
|  Chirag| 20|         2|
|Devanshu| 19|         1|
|   Aryan| 25|         5|
|   Daksh| 24|         4|
|   Dhyey| 30|         9|
+--------+---+----------+



In [10]:
### Check the Schema
# printSchema() prints every column's name, data type and nullability.
# Read without inferSchema, all three columns come back as string.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- Experience: string (nullable = true)



In [12]:
# inferSchema=True makes Spark infer each column's data type from the data
# instead of reading every column as string.
df_pyspark = (
    spark.read
    .option('header', 'true')
    .csv('Test2.csv', inferSchema=True)
)

In [13]:
# Re-check the schema: age and Experience are now integer columns.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [14]:
# Confirm the object is a PySpark DataFrame (pyspark.sql.dataframe.DataFrame).
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

In [15]:
# columns returns the column names as a Python list of strings.
df_pyspark.columns

['Name', 'age', 'Experience']

In [16]:
# head() with no argument returns the first row as a single Row object
# (not a list); pass a count, e.g. head(3), to get a list of Rows.
df_pyspark.head()

Row(Name='Nehal', age=21, Experience=3)

In [17]:
# head(3) returns the first three rows as a list of Row objects.
df_pyspark.head(3)

[Row(Name='Nehal', age=21, Experience=3),
 Row(Name='Chirag', age=20, Experience=2),
 Row(Name='Devanshu', age=19, Experience=1)]

In [18]:
# Print the DataFrame that was re-read with inferSchema=True.
df_pyspark.show()

+--------+---+----------+
|    Name|age|Experience|
+--------+---+----------+
|   Nehal| 21|         3|
|  Chirag| 20|         2|
|Devanshu| 19|         1|
|   Aryan| 25|         5|
|   Daksh| 24|         4|
|   Dhyey| 30|         9|
+--------+---+----------+



In [19]:
# select() picks the named column(s) and returns a new DataFrame. The cell
# output is only that DataFrame's repr (column names and types), not its rows;
# call show() on the result to print them.
df_pyspark.select('Name')

DataFrame[Name: string]

In [20]:
# Chain show() to print the rows of the selected column.
df_pyspark.select('Name').show()

+--------+
|    Name|
+--------+
|   Nehal|
|  Chirag|
|Devanshu|
|   Aryan|
|   Daksh|
|   Dhyey|
+--------+



In [21]:
# Select several columns at once and print them.
df_pyspark.select('Name', 'Experience').show()

+--------+----------+
|    Name|Experience|
+--------+----------+
|   Nehal|         3|
|  Chirag|         2|
|Devanshu|         1|
|   Aryan|         5|
|   Daksh|         4|
|   Dhyey|         9|
+--------+----------+



In [22]:
# dtypes returns a list of (column name, type name) tuples.
df_pyspark.dtypes

[('Name', 'string'), ('age', 'int'), ('Experience', 'int')]

In [23]:
## describe() computes summary statistics and returns them as a new DataFrame;
## call show() on the result to see the values.
df_pyspark.describe()

DataFrame[summary: string, Name: string, age: string, Experience: string]

In [26]:
# Summary rows are count, mean, stddev, min and max.
# For the string column Name, min/max are determined alphabetically and
# mean/stddev are NULL.

df_pyspark.describe().show()

+-------+-----+------------------+------------------+
|summary| Name|               age|        Experience|
+-------+-----+------------------+------------------+
|  count|    6|                 6|                 6|
|   mean| NULL|23.166666666666668|               4.0|
| stddev| NULL| 4.070217029430577|2.8284271247461903|
|    min|Aryan|                19|                 1|
|    max|Nehal|                30|                 9|
+-------+-----+------------------+------------------+



In [28]:
# Adding columns
## withColumn(colName, col) returns a new DataFrame with the column added, or
## replaced when a column with that name already exists.

df_pyspark.withColumn(
    'Experiencde After 2 year',
    df_pyspark['Experience'] + 2,
)

DataFrame[Name: string, age: int, Experience: int, Experiencde After 2 year: int]

In [30]:
# Print the DataFrame with the derived column; the result is not assigned,
# so df_pyspark itself is not modified.
df_pyspark.withColumn(
    'Experiencde After 2 year',
    df_pyspark['Experience'] + 2,
).show()

+--------+---+----------+------------------------+
|    Name|age|Experience|Experiencde After 2 year|
+--------+---+----------+------------------------+
|   Nehal| 21|         3|                       5|
|  Chirag| 20|         2|                       4|
|Devanshu| 19|         1|                       3|
|   Aryan| 25|         5|                       7|
|   Daksh| 24|         4|                       6|
|   Dhyey| 30|         9|                      11|
+--------+---+----------+------------------------+



In [34]:
### withColumn() does not modify df_pyspark; assign the result back to keep the new column.
# NOTE(review): the column name contains a typo ('Experiencde'). It is left as-is
# because the later drop() cells refer to the column by this exact name.

df_pyspark = df_pyspark.withColumn(
    'Experiencde After 2 year',
    df_pyspark['Experience'] + 2,
)

In [35]:
# The reassigned DataFrame now carries the extra column.
df_pyspark.show()

+--------+---+----------+------------------------+
|    Name|age|Experience|Experiencde After 2 year|
+--------+---+----------+------------------------+
|   Nehal| 21|         3|                       5|
|  Chirag| 20|         2|                       4|
|Devanshu| 19|         1|                       3|
|   Aryan| 25|         5|                       7|
|   Daksh| 24|         4|                       6|
|   Dhyey| 30|         9|                      11|
+--------+---+----------+------------------------+



In [37]:
# Drop the columns
## drop(*cols) returns a new DataFrame without the specified column(s).

df_pyspark.drop('Experiencde After 2 year').show()

+--------+---+----------+
|    Name|age|Experience|
+--------+---+----------+
|   Nehal| 21|         3|
|  Chirag| 20|         2|
|Devanshu| 19|         1|
|   Aryan| 25|         5|
|   Daksh| 24|         4|
|   Dhyey| 30|         9|
+--------+---+----------+



In [39]:
# drop() accepts several column names at once; the result is not assigned,
# so df_pyspark is unchanged.
df_pyspark.drop('age','Experiencde After 2 year').show()

+--------+----------+
|    Name|Experience|
+--------+----------+
|   Nehal|         3|
|  Chirag|         2|
|Devanshu|         1|
|   Aryan|         5|
|   Daksh|         4|
|   Dhyey|         9|
+--------+----------+



In [40]:
# To make the drop permanent, assign the result back to the variable.

df_pyspark = df_pyspark.drop('Experiencde After 2 year')
df_pyspark.show()

+--------+---+----------+
|    Name|age|Experience|
+--------+---+----------+
|   Nehal| 21|         3|
|  Chirag| 20|         2|
|Devanshu| 19|         1|
|   Aryan| 25|         5|
|   Daksh| 24|         4|
|   Dhyey| 30|         9|
+--------+---+----------+



In [43]:
# Rename the columns
## withColumnRenamed(existing, new) returns a new DataFrame with the existing column renamed.

df_pyspark.withColumnRenamed('Name','New Name')

DataFrame[New Name: string, age: int, Experience: int]

In [42]:
# Print the renamed view; the result is not assigned, so df_pyspark still
# has the column 'Name'.
df_pyspark.withColumnRenamed('Name','New Name').show()

+--------+---+----------+
|New Name|age|Experience|
+--------+---+----------+
|   Nehal| 21|         3|
|  Chirag| 20|         2|
|Devanshu| 19|         1|
|   Aryan| 25|         5|
|   Daksh| 24|         4|
|   Dhyey| 30|         9|
+--------+---+----------+

