- PySpark DataFrame
- Reading Dataset
- Checking the Datatypes of Columns (Schema)
- Selecting Columns and Indexing
- Using describe() for Summary Statistics (similar to Pandas)
- Adding Columns
- Dropping Columns
- Renaming Columns

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('Dataframe')
    .getOrCreate()
)

In [3]:
# Evaluate the session as the cell's last expression so the notebook displays it.
spark

In [4]:
# Read the dataset, treating the first row of the CSV as the header.
# No schema inference is requested here, so every column comes back as a string.

spark.read.option('header', 'true').csv('test1.csv')

DataFrame[Name: string, Age: string, Experience: string]

In [5]:
# show() prints the rows as a text table rather than just the DataFrame's column summary.
spark.read.option('header', 'true').csv('test1.csv').show()

+-----+---+----------+
| Name|Age|Experience|
+-----+---+----------+
|Limon| 31|        10|
|Dalya| 30|         8|
| Viko| 29|         4|
+-----+---+----------+



In [8]:
# Ask Spark to infer the column types while reading; without inferSchema
# every column is loaded as a string.

df_pyspark = (
    spark.read
    .option('header', 'true')
    .option('inferSchema', 'true')
    .csv('test1.csv')
)

In [9]:
# Check the schema: with inferSchema enabled, Age and Experience are read as integers.

df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [10]:
# Same read expressed with keyword arguments to csv() instead of chained option() calls.
df_pyspark = spark.read.csv('test1.csv', header=True, inferSchema=True)
df_pyspark.show()

+-----+---+----------+
| Name|Age|Experience|
+-----+---+----------+
|Limon| 31|        10|
|Dalya| 30|         8|
| Viko| 29|         4|
+-----+---+----------+



In [11]:
# Check the schema again: the keyword-argument read yields the same inferred types.

df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [12]:
# Confirm that the reader returned a PySpark DataFrame.
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

In [13]:
# Get the column names as a plain Python list of strings.

df_pyspark.columns

['Name', 'Age', 'Experience']

In [14]:
# head(n) returns the first n rows as a list of Row objects, not as a DataFrame.
df_pyspark.head(3)

[Row(Name='Limon', Age=31, Experience=10),
 Row(Name='Dalya', Age=30, Experience=8),
 Row(Name='Viko', Age=29, Experience=4)]

In [16]:
# Print the DataFrame as a table for reference before selecting columns.
df_pyspark.show()

+-----+---+----------+
| Name|Age|Experience|
+-----+---+----------+
|Limon| 31|        10|
|Dalya| 30|         8|
| Viko| 29|         4|
+-----+---+----------+



In [17]:
# Pick out the Name column; select() returns a new DataFrame, so only its column summary is displayed.

df_pyspark.select('Name')

DataFrame[Name: string]

In [18]:
# Call show() on the selection to print the actual values.
df_pyspark.select('Name').show()

+-----+
| Name|
+-----+
|Limon|
|Dalya|
| Viko|
+-----+



In [19]:
# A single-column selection is still a DataFrame.
type(df_pyspark.select('Name'))

pyspark.sql.dataframe.DataFrame

In [20]:
# Select several columns at once by passing a list of column names.
df_pyspark.select(['Name', 'Experience'])

DataFrame[Name: string, Experience: int]

In [21]:
# Print the values of the two selected columns.
df_pyspark.select(['Name', 'Experience']).show()

+-----+----------+
| Name|Experience|
+-----+----------+
|Limon|        10|
|Dalya|         8|
| Viko|         4|
+-----+----------+



In [22]:
# Bracket indexing returns a Column expression, not a DataFrame.
df_pyspark['Name']

Column<'Name'>

In [23]:
# A Column is only an expression, not a DataFrame, so it cannot be shown directly.
# The TypeError is the point of this cell: catch and print it so the demonstration
# stays visible without aborting a "Restart Kernel -> Run All".
# To display the values, use df_pyspark.select('Name').show() instead.
try:
    df_pyspark['Name'].show()
except TypeError as err:
    print(f"TypeError: {err}")

TypeError: 'Column' object is not callable

In [25]:
# Check data types: dtypes gives a list of (column name, type name) pairs.

df_pyspark.dtypes

[('Name', 'string'), ('Age', 'int'), ('Experience', 'int')]

In [26]:
# describe() returns another DataFrame holding summary statistics; all of its columns are strings.
df_pyspark.describe()

DataFrame[summary: string, Name: string, Age: string, Experience: string]

In [27]:
# Print the summary statistics; for the string column Name, mean and stddev are null.
df_pyspark.describe().show()

+-------+-----+----+-----------------+
|summary| Name| Age|       Experience|
+-------+-----+----+-----------------+
|  count|    3|   3|                3|
|   mean| null|30.0|7.333333333333333|
| stddev| null| 1.0|3.055050463303893|
|    min|Dalya|  29|                4|
|    max| Viko|  31|               10|
+-------+-----+----+-----------------+



In [28]:
# Adding columns in dataframe: withColumn() returns a new DataFrame that includes
# the derived column; the result is not assigned here, so df_pyspark is unchanged.

df_pyspark.withColumn('Experience after 2 years', df_pyspark['Experience'] + 2)

DataFrame[Name: string, Age: int, Experience: int, Experience after 2 years: int]

In [29]:
# Print the result with the derived column included.
df_pyspark.withColumn('Experience after 2 years', df_pyspark['Experience'] + 2).show()

+-----+---+----------+------------------------+
| Name|Age|Experience|Experience after 2 years|
+-----+---+----------+------------------------+
|Limon| 31|        10|                      12|
|Dalya| 30|         8|                      10|
| Viko| 29|         4|                       6|
+-----+---+----------+------------------------+



In [30]:
# Reassign the result so that df_pyspark keeps the new column.
df_pyspark = df_pyspark.withColumn('Experience after 2 years', df_pyspark['Experience'] + 2)

In [31]:
# Verify that the new column is now part of df_pyspark.
df_pyspark.show()

+-----+---+----------+------------------------+
| Name|Age|Experience|Experience after 2 years|
+-----+---+----------+------------------------+
|Limon| 31|        10|                      12|
|Dalya| 30|         8|                      10|
| Viko| 29|         4|                       6|
+-----+---+----------+------------------------+



In [32]:
# Drop the columns: drop() returns a new DataFrame without the named column.

df_pyspark.drop('Experience after 2 years')

DataFrame[Name: string, Age: int, Experience: int]

In [33]:
# Print the DataFrame returned by drop().
df_pyspark.drop('Experience after 2 years').show()

+-----+---+----------+
| Name|Age|Experience|
+-----+---+----------+
|Limon| 31|        10|
|Dalya| 30|         8|
| Viko| 29|         4|
+-----+---+----------+



In [34]:
# There is no in-place drop, so the result has to be assigned back to the variable.

df_pyspark = df_pyspark.drop('Experience after 2 years')

In [35]:
# Verify that the column is gone from df_pyspark.
df_pyspark.show()

+-----+---+----------+
| Name|Age|Experience|
+-----+---+----------+
|Limon| 31|        10|
|Dalya| 30|         8|
| Viko| 29|         4|
+-----+---+----------+



In [36]:
# Rename the columns: withColumnRenamed(existing, new) returns a new DataFrame with the column renamed.

df_pyspark.withColumnRenamed('Name', 'New Name')

DataFrame[New Name: string, Age: int, Experience: int]

In [37]:
# Print the renamed result; it is not assigned, so df_pyspark keeps the original column name.
df_pyspark.withColumnRenamed('Name', 'New Name').show()

+--------+---+----------+
|New Name|Age|Experience|
+--------+---+----------+
|   Limon| 31|        10|
|   Dalya| 30|         8|
|    Viko| 29|         4|
+--------+---+----------+

