#### Topics
       a. PySpark DataFrame
       b. Reading the Dataset
       c. Checking the Data Types of the Columns (Schema)
       d. Selecting Columns and Indexing
       e. Using describe(), Similar to Pandas
       f. Adding Columns
       g. Dropping Columns
       h. Renaming Columns

In [1]:
from pyspark.sql import SparkSession

In [3]:
## Create the Spark session (or reuse one that is already running) for this notebook
spark = (
    SparkSession.builder
    .appName('DataFrame')
    .getOrCreate()
)

In [4]:
## Display the session object to confirm it was created
spark

In [6]:
## Read the CSV, treating its first line as the column names
df_pyspark = (
    spark.read
    .option('header', 'true')
    .csv('2_NCGE.csv')
)

In [8]:
## Print the rows of the DataFrame as a text table
df_pyspark.show()

+---------+----+----------+
|     Name|CGPA|Experience|
+---------+----+----------+
|  Koustav|9.03|         2|
|  Bishyan|9.16|         1|
|Dwaipayan|8.73|         3|
|   Mainak|8.98|         4|
+---------+----+----------+



In [9]:
### Check the schema: the name, type and nullability of each column
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- CGPA: string (nullable = true)
 |-- Experience: string (nullable = true)



In [10]:
### The schema printed by the previous cell lists CGPA and Experience as string,
### because every column is read as a string unless told otherwise. Passing
### inferSchema=True while reading lets Spark work out the column types from the data.

df_pyspark1 = (
    spark.read
    .option('header', 'true')
    .csv('2_NCGE.csv', inferSchema=True)
)


In [11]:
## Check the schema again now that the file was read with inferSchema=True
df_pyspark1.printSchema()

root
 |-- Name: string (nullable = true)
 |-- CGPA: double (nullable = true)
 |-- Experience: integer (nullable = true)



In [12]:
## Another way to read the dataset: pass the options to csv() as keyword arguments.
## header=True is needed here as well; without it the first line of the file is
## read as a data row and the columns do not get the names Name/CGPA/Experience.
df_pyspark1 = spark.read.csv('2_NCGE.csv', header=True, inferSchema=True)
## Show the DataFrame that was just read (not the earlier all-string df_pyspark)
df_pyspark1.show()

+---------+----+----------+
|     Name|CGPA|Experience|
+---------+----+----------+
|  Koustav|9.03|         2|
|  Bishyan|9.16|         1|
|Dwaipayan|8.73|         3|
|   Mainak|8.98|         4|
+---------+----+----------+



In [13]:
## The object returned by the reader is a PySpark DataFrame, itself a data structure
type(df_pyspark1)

pyspark.sql.dataframe.DataFrame

In [15]:
## List the column names of the DataFrame
df_pyspark.columns

['Name', 'CGPA', 'Experience']

In [29]:
## Read the file again with the header row and inferred column types
df_pyspark1 = (
    spark.read
    .option('header', 'true')
    .csv('2_NCGE.csv', inferSchema=True)
)

In [20]:
## Show the names of the columns
df_pyspark1.columns

['Name', 'CGPA', 'Experience']

In [21]:
## Show the first 3 rows (head returns a list of Row objects).
## df_pyspark was read without inferSchema, so every value is a string.
df_pyspark.head(3)

[Row(Name='Koustav', CGPA='9.03', Experience='2'),
 Row(Name='Bishyan', CGPA='9.16', Experience='1'),
 Row(Name='Dwaipayan', CGPA='8.73', Experience='3')]

In [22]:
## Print the full table again for comparison with the rows returned by head()
df_pyspark.show()

+---------+----+----------+
|     Name|CGPA|Experience|
+---------+----+----------+
|  Koustav|9.03|         2|
|  Bishyan|9.16|         1|
|Dwaipayan|8.73|         3|
|   Mainak|8.98|         4|
+---------+----+----------+



In [23]:
## Show only the Name column
df_pyspark1.select('Name').show()

+---------+
|     Name|
+---------+
|  Koustav|
|  Bishyan|
|Dwaipayan|
|   Mainak|
+---------+



In [25]:
## select() returns a DataFrame; without show() only its schema summary is displayed
df_pyspark1.select('Name')

DataFrame[Name: string]

In [27]:
type(df_pyspark1.select('Name'))  ## the result is a PySpark DataFrame, not a pandas DataFrame

pyspark.sql.dataframe.DataFrame

In [30]:
## Select multiple columns by passing a list of column names
df_pyspark1.select(['Name','Experience'])

DataFrame[Name: string, Experience: int]

In [31]:
## Display the rows of the two selected columns
df_pyspark1.select(['Name','Experience']).show()

+---------+----------+
|     Name|Experience|
+---------+----------+
|  Koustav|         2|
|  Bishyan|         1|
|Dwaipayan|         3|
|   Mainak|         4|
+---------+----------+



In [33]:
## Indexing with a column name returns a Column object, not a DataFrame
df_pyspark1['Name']

Column<'Name'>

In [36]:
## List (column name, data type) pairs for every column
df_pyspark1.dtypes

[('Name', 'string'), ('CGPA', 'double'), ('Experience', 'int')]

In [37]:
## describe() returns a new DataFrame of summary statistics (similar to pandas describe)
df_pyspark1.describe()

DataFrame[summary: string, Name: string, CGPA: string, Experience: string]

In [38]:
## Display the summary statistics: count, mean, stddev, min and max per column
df_pyspark1.describe().show()

+-------+-------+-------------------+------------------+
|summary|   Name|               CGPA|        Experience|
+-------+-------+-------------------+------------------+
|  count|      4|                  4|                 4|
|   mean|   null|              8.975|               2.5|
| stddev|   null|0.18009256878986737|1.2909944487358056|
|    min|Bishyan|               8.73|                 1|
|    max| Mainak|               9.16|                 4|
+-------+-------+-------------------+------------------+



In [41]:
## Add a derived column to the PySpark DataFrame: Experience plus 3
df_pyspark1 = df_pyspark1.withColumn(
    'Experience after 3 years',
    df_pyspark1['Experience'] + 3,
)

In [42]:
## The new column appears as the last column of the table
df_pyspark1.show()

+---------+----+----------+------------------------+
|     Name|CGPA|Experience|Experience after 3 years|
+---------+----+----------+------------------------+
|  Koustav|9.03|         2|                       5|
|  Bishyan|9.16|         1|                       4|
|Dwaipayan|8.73|         3|                       6|
|   Mainak|8.98|         4|                       7|
+---------+----+----------+------------------------+



In [50]:
## Drop a column. drop() returns a new DataFrame, so this only displays the
## result; df_pyspark1 itself still contains the column.
df_pyspark1.drop('Experience after 3 years').show()

+---------+----+----------+
|     Name|CGPA|Experience|
+---------+----+----------+
|  Koustav|9.03|         2|
|  Bishyan|9.16|         1|
|Dwaipayan|8.73|         3|
|   Mainak|8.98|         4|
+---------+----+----------+



In [51]:
## df_pyspark1 is unchanged because the result of drop() was not assigned
df_pyspark1.show()

+---------+----+----------+------------------------+
|     Name|CGPA|Experience|Experience after 3 years|
+---------+----+----------+------------------------+
|  Koustav|9.03|         2|                       5|
|  Bishyan|9.16|         1|                       4|
|Dwaipayan|8.73|         3|                       6|
|   Mainak|8.98|         4|                       7|
+---------+----+----------+------------------------+



In [52]:
## Assign the result back so the column is actually removed from df_pyspark1
df_pyspark1=df_pyspark1.drop('Experience after 3 years')

In [53]:
## Confirm the column is gone
df_pyspark1.show()

+---------+----+----------+
|     Name|CGPA|Experience|
+---------+----+----------+
|  Koustav|9.03|         2|
|  Bishyan|9.16|         1|
|Dwaipayan|8.73|         3|
|   Mainak|8.98|         4|
+---------+----+----------+



In [56]:
### Rename a column: the existing name comes first, the new name second
df_pyspark1 = df_pyspark1.withColumnRenamed(
    'Name',
    'Rename',
)

In [57]:
## The first column is now called Rename
df_pyspark1.show()

+---------+----+----------+
|   Rename|CGPA|Experience|
+---------+----+----------+
|  Koustav|9.03|         2|
|  Bishyan|9.16|         1|
|Dwaipayan|8.73|         3|
|   Mainak|8.98|         4|
+---------+----+----------+

