# Content
- PySpark Dataframe
- Reading The Dataset
- Checking the Data Types of the Columns (Schema)
- Selecting Columns And Indexing
- Using describe() for Summary Statistics (similar to Pandas)
- Adding Columns
- Dropping columns
- Renaming Columns

In [None]:
!pip install pyspark



In [1]:
import pyspark

In [2]:
from pyspark.sql import SparkSession

In [3]:
# Obtain the SparkSession (named 'Dataframe') that all later cells read through.
spark = (
    SparkSession.builder
    .appName('Dataframe')
    .getOrCreate()
)

In [None]:
spark

In [4]:
# Dummy CSV fixtures used by the rest of the notebook.
# Maps each file name to its lines, header row first.
# test4.csv uses the same columns as test1.csv.
sample_files = {
    'test1.csv': (
        'Name,age,Experience\n',
        'Krish,31,10\n',
        'Sudhanshu,30,8\n',
        'Sunny,29,4\n',
    ),
    'test2.csv': (
        'ID,City\n',
        '1,New York\n',
        '2,London\n',
        '3,Paris\n',
    ),
    'test3.csv': (
        'Product,Price\n',
        'Laptop,1200\n',
        'Keyboard,75\n',
        'Mouse,25\n',
    ),
    'test4.csv': (
        'Name,age,Experience\n',
        'Alice,25,3\n',
        'Bob,35,9\n',
        'Charlie,28,6\n',
    ),
}

# Write every fixture to the current working directory, overwriting any existing file.
for csv_name, rows in sample_files.items():
    with open(csv_name, 'w') as f:
        f.writelines(rows)

In [9]:
# Prerequisite: test1.csv must exist in the working directory — upload it,
# or run the dummy-data cell first.

## Read the dataset, treating the first row as the header and inferring column types
df_pyspark = (
    spark.read
    .option('header', 'true')
    .option('inferSchema', 'true')
    .csv('test1.csv')
)

In [10]:
### Check the schema
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [11]:
# Re-read test1.csv with the reader options passed as keyword arguments,
# then print the rows.
df_pyspark = spark.read.csv(
    'test1.csv',
    header=True,
    inferSchema=True,
)
df_pyspark.show()

+---------+---+----------+
|     Name|age|Experience|
+---------+---+----------+
|    Krish| 31|        10|
|Sudhanshu| 30|         8|
|    Sunny| 29|         4|
+---------+---+----------+



In [12]:
### Check the schema
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [13]:
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

In [18]:
# head(n) returns the first n rows as a list of Row objects; n picks how many.
df_pyspark.head(n=3)

[Row(Name='Krish', age=31, Experience=10),
 Row(Name='Sudhanshu', age=30, Experience=8),
 Row(Name='Sunny', age=29, Experience=4)]

In [19]:
df_pyspark.show()

+---------+---+----------+
|     Name|age|Experience|
+---------+---+----------+
|    Krish| 31|        10|
|Sudhanshu| 30|         8|
|    Sunny| 29|         4|
+---------+---+----------+



In [20]:
# Project only the 'Name' and 'Experience' columns and print them.
df_pyspark.select('Name', 'Experience').show()

+---------+----------+
|     Name|Experience|
+---------+----------+
|    Krish|        10|
|Sudhanshu|         8|
|    Sunny|         4|
+---------+----------+



In [21]:
df_pyspark['Name']

Column<'Name'>

In [22]:
df_pyspark.dtypes

[('Name', 'string'), ('age', 'int'), ('Experience', 'int')]

In [23]:
### describe() builds a summary table with count, mean, stddev, min and max rows
### for each column; the string column shows NULL for mean and stddev.

df_pyspark.describe().show()

+-------+-----+----+-----------------+
|summary| Name| age|       Experience|
+-------+-----+----+-----------------+
|  count|    3|   3|                3|
|   mean| NULL|30.0|7.333333333333333|
| stddev| NULL| 1.0|3.055050463303893|
|    min|Krish|  29|                4|
|    max|Sunny|  31|               10|
+-------+-----+----+-----------------+



In [24]:
### Adding a column to the DataFrame
# The new column is 'Experience' + 2 for every row. The result of withColumn
# is assigned back to df_pyspark so later cells see the extra column.
df_pyspark = df_pyspark.withColumn(
    'Experience After 2 year',
    df_pyspark['Experience'] + 2,
)

In [25]:
df_pyspark.show()

+---------+---+----------+-----------------------+
|     Name|age|Experience|Experience After 2 year|
+---------+---+----------+-----------------------+
|    Krish| 31|        10|                     12|
|Sudhanshu| 30|         8|                     10|
|    Sunny| 29|         4|                      6|
+---------+---+----------+-----------------------+



In [26]:
### Drop the column added earlier; the result of drop() is assigned back to
### df_pyspark so later cells no longer see 'Experience After 2 year'.
df_pyspark=df_pyspark.drop('Experience After 2 year')

In [None]:
df_pyspark.show()

+---------+---+----------+
|     Name|age|Experience|
+---------+---+----------+
|    Krish| 31|        10|
|Sudhanshu| 30|         8|
|    Sunny| 29|         4|
+---------+---+----------+



In [30]:
### Rename a column: display the table with 'Name' relabelled as 'New Name'.
### The renamed result is only shown, not assigned, so df_pyspark is not rebound here.
df_pyspark.withColumnRenamed('Name','New Name').show()

+---------+---+----------+
| New Name|age|Experience|
+---------+---+----------+
|    Krish| 31|        10|
|Sudhanshu| 30|         8|
|    Sunny| 29|         4|
+---------+---+----------+

