# PySpark DataFrame

In [33]:
from pyspark.sql import SparkSession

In [3]:
# Create a Spark session named "DataFrame", or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("DataFrame")
    .getOrCreate()
)

In [4]:
# Display the session object to confirm it was created.
spark

# Reading the Dataset : Method 1

In [5]:
## Read the dataset, using the first line of the CSV as the column names.
## The result is not stored; the cell only displays the DataFrame's summary.
(
    spark.read
    .option('header', 'true')
    .csv('pyspark_test_data.csv')
)

DataFrame[Name: string, age: string, Experience: string]

In [7]:
# Same read as before, followed by show() to print the rows as a table.
(
    spark.read
    .option('header', 'true')
    .csv('pyspark_test_data.csv')
    .show()
)

+--------+---+----------+
|    Name|age|Experience|
+--------+---+----------+
| Shayema| 22|        10|
|Ehtasham| 19|         8|
| Mosahid| 24|         9|
+--------+---+----------+



In [9]:
# Keep the loaded DataFrame in a variable so that later cells can reuse it.
df_pyspark = (
    spark.read
    .option('header', 'true')
    .csv('pyspark_test_data.csv')
)

In [10]:
## check the schema
## (without inferSchema, every column is read as a string)
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- Experience: string (nullable = true)



In [12]:
## To avoid reading every column as a string, ask Spark to infer the column types.
df_pyspark = (
    spark.read
    .option('header', 'true')
    .csv('pyspark_test_data.csv', inferSchema=True)
)

## Check the schema again: age and Experience are now integers.
df_pyspark.printSchema()


root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



# Reading the Dataset : Method 2

In [90]:
# Same load in a single call: header and inferSchema are passed to csv() as keyword arguments.
df_pyspark = spark.read.csv(
    'pyspark_test_data.csv',
    header=True,
    inferSchema=True,
)


In [70]:
# Print the DataFrame as a table.
df_pyspark.show()

+--------+---+----------+
|    Name|age|Experience|
+--------+---+----------+
| Shayema| 22|        10|
|Ehtasham| 19|         8|
| Mosahid| 24|         9|
+--------+---+----------+



In [15]:
# Confirm that the loaded object is a PySpark DataFrame.
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

# Selecting columns and indexing

In [16]:
# List the column names.
df_pyspark.columns

['Name', 'age', 'Experience']

In [18]:
# head(3) returns the first 3 rows (not columns) as a Python list of Row objects.
df_pyspark.head(3)

[Row(Name='Shayema', age=22, Experience=10),
 Row(Name='Ehtasham', age=19, Experience=8),
 Row(Name='Mosahid', age=24, Experience=9)]

In [42]:
# Print the full table again for reference before selecting columns.
df_pyspark.show()

+--------+---+----------+
|    Name|age|Experience|
+--------+---+----------+
| Shayema| 22|        10|
|Ehtasham| 19|         8|
| Mosahid| 24|         9|
+--------+---+----------+



In [23]:
# Select a single column by name; the result is still a DataFrame.
df_pyspark.select('Name')

DataFrame[Name: string]

In [24]:
# show() prints the selected column as a table.
df_pyspark.select('Name').show()

+--------+
|    Name|
+--------+
| Shayema|
|Ehtasham|
| Mosahid|
+--------+



In [25]:
# select() returns a DataFrame even when only one column is selected.
type(df_pyspark.select('Name'))

pyspark.sql.dataframe.DataFrame

In [27]:
# Pass a list of names to select several columns at once (here, 2 columns).
df_pyspark.select(['Name','age'])

DataFrame[Name: string, age: int]

In [28]:
# Print the two selected columns as a table.
df_pyspark.select(['Name','age']).show()

+--------+---+
|    Name|age|
+--------+---+
| Shayema| 22|
|Ehtasham| 19|
| Mosahid| 24|
+--------+---+



In [30]:
# Indexing by name returns a Column expression, not a DataFrame, so no data is shown.
df_pyspark['Name']

Column<'Name'>

In [31]:
# A Column is an expression, not a DataFrame: calling .show() on it raises
# TypeError. Catch and print the error so the demonstration stays visible while
# the notebook can still run from top to bottom.
try:
    df_pyspark['Name'].show()
except TypeError as exc:
    print(f"{type(exc).__name__}: {exc}")

TypeError: 'Column' object is not callable

In [32]:
# dtypes is a list of (column name, type name) pairs.
df_pyspark.dtypes

[('Name', 'string'), ('age', 'int'), ('Experience', 'int')]

# Summary statistics with describe(), similar to pandas

In [34]:
# describe() returns a new DataFrame of summary statistics; call .show() to print it.
df_pyspark.describe()

DataFrame[summary: string, Name: string, age: string, Experience: string]

In [35]:
# mean and stddev are null for Name because it is a string column.
df_pyspark.describe().show()

+-------+--------+------------------+----------+
|summary|    Name|               age|Experience|
+-------+--------+------------------+----------+
|  count|       3|                 3|         3|
|   mean|    null|21.666666666666668|       9.0|
| stddev|    null| 2.516611478423583|       1.0|
|    min|Ehtasham|                19|         8|
|    max| Shayema|                24|        10|
+-------+--------+------------------+----------+



# Adding columns

In [79]:
# Requires df_pyspark to have been loaded from the original dataset first.
# withColumn returns a new DataFrame; the result is only displayed here, not stored.
df_pyspark.withColumn(
    'Experience After 2 years',
    df_pyspark['Experience'] + 2,
)

DataFrame[Name: string, age: int, Experience: int, Experience After 2 years: int]

In [80]:
# Requires df_pyspark to have been loaded from the original dataset first.
# Reassign so that the DataFrame with the added column is kept for later cells.
df_pyspark = df_pyspark.withColumn(
    'Experience After 2 years',
    df_pyspark['Experience'] + 2,
)

In [81]:
# Confirm that the new column appears in the table.
df_pyspark.show()

+--------+---+----------+------------------------+
|    Name|age|Experience|Experience After 2 years|
+--------+---+----------+------------------------+
| Shayema| 22|        10|                      12|
|Ehtasham| 19|         8|                      10|
| Mosahid| 24|         9|                      11|
+--------+---+----------+------------------------+



# Dropping the columns

In [86]:
# Run this after the cell that adds 'Experience After 2 years'.
# The name must match the existing column exactly: drop() silently ignores
# column names that are not in the schema. The result is only displayed here,
# not assigned back to df_pyspark.
df_pyspark.drop('Experience After 2 years').show()

+--------+---+----------+
|    Name|age|Experience|
+--------+---+----------+
| Shayema| 22|        10|
|Ehtasham| 19|         8|
| Mosahid| 24|         9|
+--------+---+----------+



In [92]:
# Reassign to keep the result, since drop() returns a new DataFrame.
# The name must match the existing column exactly: drop() silently ignores
# column names that are not in the schema.
df_pyspark=df_pyspark.drop('Experience After 2 years')

In [93]:
# Confirm that only the original three columns remain.
df_pyspark.show()

+--------+---+----------+
|    Name|age|Experience|
+--------+---+----------+
| Shayema| 22|        10|
|Ehtasham| 19|         8|
| Mosahid| 24|         9|
+--------+---+----------+



In [95]:
## Rename a column. The renamed DataFrame is only displayed here, not assigned
## back to df_pyspark.
(
    df_pyspark
    .withColumnRenamed('Name', 'New Name')
    .show()
)

+--------+---+----------+
|New Name|age|Experience|
+--------+---+----------+
| Shayema| 22|        10|
|Ehtasham| 19|         8|
| Mosahid| 24|         9|
+--------+---+----------+

