## This notebook covers:
* PySpark DataFrame
* Reading The Dataset
* Checking the data types of the columns (schema)
* Selecting columns and indexing
* Using `describe()` for summary statistics, similar to pandas
* Adding columns
* Dropping columns
* Renaming columns

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Build the Spark session used by every cell below; getOrCreate() reuses a
# session that is already running instead of starting a second one.
spark = (
    SparkSession.builder
    .appName('Dataframe')
    .getOrCreate()
)

In [3]:
# Echo the session object as the cell output to confirm it exists.
spark

In [11]:
## Read the dataset
# option('header', 'true') treats the first CSV line as the column names.
# inferSchema=True asks Spark to infer each column's type from the data
# (numeric columns become integers) instead of leaving every column as a string.
df_pyspark = (
    spark.read
    .option('header', 'true')
    .csv('test1.csv', inferSchema=True)
)

In [9]:
# Print the DataFrame's rows as a text table.
# (Alternative read without type inference, kept for reference:
#  df_pyspark=spark.read.option('header', 'true').csv('test1.csv'))
df_pyspark.show()

+-----+---+----------+
| name|age|experience|
+-----+---+----------+
|  ola| 25|         8|
|kasia| 44|        10|
| kuba| 55|         7|
| bolo| 11|         8|
+-----+---+----------+



In [12]:
### Check the schema
# printSchema() prints each column's name, data type and nullability as a tree.
df_pyspark.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- experience: integer (nullable = true)



In [36]:
# Read the same file again, this time passing the header and schema-inference
# settings as keyword arguments of csv() rather than through option().
df_pyspark = spark.read.csv(
    'test1.csv',
    header=True,
    inferSchema=True,
)
df_pyspark.show()

+-----+---+----------+
| name|age|experience|
+-----+---+----------+
|  ola| 25|         8|
|kasia| 44|        10|
| kuba| 55|         7|
| bolo| 11|         8|
+-----+---+----------+



In [16]:
### Selecting columns and indexing
# .columns gives the column names as a plain Python list of strings.
df_pyspark.columns

['name', 'age', 'experience']

In [17]:
# head(3) returns the first three rows as a list of Row objects (not a DataFrame).
df_pyspark.head(3)

[Row(name='ola', age=25, experience=8),
 Row(name='kasia', age=44, experience=10),
 Row(name='kuba', age=55, experience=7)]

In [20]:
# Show the full DataFrame for comparison with the column selections that follow.
df_pyspark.show()

+-----+---+----------+
| name|age|experience|
+-----+---+----------+
|  ola| 25|         8|
|kasia| 44|        10|
| kuba| 55|         7|
| bolo| 11|         8|
+-----+---+----------+



In [22]:
# select() returns a new DataFrame; without .show() the cell only displays
# its schema summary, not the rows.
df_pyspark.select(['name', 'experience'])

DataFrame[name: string, experience: int]

In [23]:
# Select two columns (names passed as separate arguments) and print the rows.
df_pyspark.select('name', 'experience').show()

+-----+----------+
| name|experience|
+-----+----------+
|  ola|         8|
|kasia|        10|
| kuba|         7|
| bolo|         8|
+-----+----------+



In [25]:
# Pandas-style bracket indexing also works on a PySpark DataFrame, but it
# returns a Column expression object rather than the column's values.
# Use select(...).show() to see the data.
df_pyspark['name']

Column<b'name'>

In [26]:
### Check Describe option similar to Pandas
# .dtypes lists (column name, type name) pairs for every column.
df_pyspark.dtypes

[('name', 'string'), ('age', 'int'), ('experience', 'int')]

In [27]:
# describe() returns a new DataFrame of summary statistics; note that every
# statistic column is typed as string, even for numeric source columns.
df_pyspark.describe()

DataFrame[summary: string, name: string, age: string, experience: string]

In [28]:
# Print the summary rows: count, mean, stddev, min and max for each column.
# mean and stddev are null for the string column 'name'.
df_pyspark.describe().show()

+-------+----+------------------+------------------+
|summary|name|               age|        experience|
+-------+----+------------------+------------------+
|  count|   4|                 4|                 4|
|   mean|null|             33.75|              8.25|
| stddev|null|19.585283590832514|1.2583057392117918|
|    min|bolo|                11|                 7|
|    max| ola|                55|                10|
+-------+----+------------------+------------------+



In [29]:
# Show the DataFrame once more as the starting point for adding a column.
df_pyspark.show()

+-----+---+----------+
| name|age|experience|
+-----+---+----------+
|  ola| 25|         8|
|kasia| 44|        10|
| kuba| 55|         7|
| bolo| 11|         8|
+-----+---+----------+



In [37]:
### Adding columns in data frame
# The new column is computed from the existing 'experience' column; the result
# of withColumn() is assigned back so that later cells see the extra column.
df_pyspark = df_pyspark.withColumn(
    'experience after 2 years',
    df_pyspark['experience'] + 2,
)
df_pyspark.show()

+-----+---+----------+------------------------+
| name|age|experience|experience after 2 years|
+-----+---+----------+------------------------+
|  ola| 25|         8|                      10|
|kasia| 44|        10|                      12|
| kuba| 55|         7|                       9|
| bolo| 11|         8|                      10|
+-----+---+----------+------------------------+



In [38]:
# Echo the DataFrame object to see its schema summary, including the added column.
df_pyspark

DataFrame[name: string, age: int, experience: int, experience after 2 years: int]

In [40]:
### Drop the columns
# The result of drop() is assigned back, so df_pyspark no longer carries the
# 'experience after 2 years' column from here on.
df_pyspark=df_pyspark.drop('experience after 2 years')

In [41]:
# Confirm that the dropped column is gone.
df_pyspark.show()

+-----+---+----------+
| name|age|experience|
+-----+---+----------+
|  ola| 25|         8|
|kasia| 44|        10|
| kuba| 55|         7|
| bolo| 11|         8|
+-----+---+----------+



In [42]:
### Rename the columns
# The renamed DataFrame is only displayed here, not assigned, so df_pyspark
# itself still has the original 'name' column afterwards.
df_pyspark.withColumnRenamed('name', 'new name').show()

+--------+---+----------+
|new name|age|experience|
+--------+---+----------+
|     ola| 25|         8|
|   kasia| 44|        10|
|    kuba| 55|         7|
|    bolo| 11|         8|
+--------+---+----------+

