# Most PySpark Functionality Is Similar to Pandas
1. PySpark Dataframe
2. Reading The Dataset
3. Checking the Data Types of the Columns (Schema)
4. Selecting Columns And Indexing
5. Check Describe option similar to Pandas
6. Adding Columns
7. Dropping columns 
8. Rename The Column

# 1. PySpark Dataframe 
**A DataFrame is a data structure like a table in Excel or Google Sheets:**

- **Rows:** Each row is a record (e.g., information about one person).
- **Columns:** Each column is a feature or attribute (e.g., "Name," "Age").
- **Pandas DataFrame:** Works on small data (fits in memory).
- **PySpark DataFrame:** Works on huge data (distributed across a cluster).



In [1]:
from pyspark.sql import SparkSession

In [2]:
# Entry point for the DataFrame API: get the session for the application
# named 'DataFrame', creating it if necessary.
spark = (
    SparkSession.builder
    .appName('DataFrame')
    .getOrCreate()
)

In [3]:
# Bare expression: the notebook displays the session object's representation.
spark

# 2. Reading The Dataset



In [4]:
## Read the dataset into a PySpark DataFrame.
#   header=True      -> use the first line of the file as the column names.
#   inferSchema=True -> let Spark infer each column's type from the data;
#                       without it every column is read as a string, so
#                       numeric columns such as Age would not be integers.
df_pyspark = spark.read.csv('test1.csv', header=True, inferSchema=True)
df_pyspark.show()

+---------+---+----------+
|     Name|Age|experience|
+---------+---+----------+
|  Muhamed| 25|        10|
|    Ahmed| 30|         7|
|Abdulaziz| 50|        25|
+---------+---+----------+



# 3. Checking the Data Types of the Columns (Schema)


In [5]:
## Check the schema, i.e. each column's name, data type and nullability.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- experience: integer (nullable = true)



In [6]:
# Confirm that the object is a PySpark DataFrame (pyspark.sql.dataframe.DataFrame).
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

In [7]:
# Column names as a plain Python list of strings.
df_pyspark.columns

['Name', 'Age', 'experience']

In [8]:
# With no argument, head() returns the first row as a single Row object
# (not a list).
df_pyspark.head()

Row(Name='Muhamed', Age=25, experience=10)

In [9]:
# With an explicit count, head(n) returns a list of Row objects, even for n=1.
df_pyspark.head(1)

[Row(Name='Muhamed', Age=25, experience=10)]

In [10]:
# First two rows, as a list of Row objects.
df_pyspark.head(2)

[Row(Name='Muhamed', Age=25, experience=10),
 Row(Name='Ahmed', Age=30, experience=7)]

In [11]:
# First three rows, which is the whole dataset here.
df_pyspark.head(3)

[Row(Name='Muhamed', Age=25, experience=10),
 Row(Name='Ahmed', Age=30, experience=7),
 Row(Name='Abdulaziz', Age=50, experience=25)]

In [12]:
# Asking for more rows than exist is not an error: only the three available
# rows are returned.
df_pyspark.head(4)

[Row(Name='Muhamed', Age=25, experience=10),
 Row(Name='Ahmed', Age=30, experience=7),
 Row(Name='Abdulaziz', Age=50, experience=25)]

In [13]:
# Print the rows as a formatted text table.
df_pyspark.show()

+---------+---+----------+
|     Name|Age|experience|
+---------+---+----------+
|  Muhamed| 25|        10|
|    Ahmed| 30|         7|
|Abdulaziz| 50|        25|
+---------+---+----------+



# 4. Selecting Columns And Indexing


In [14]:
# select() returns a new DataFrame with only the requested column; without
# show() the notebook displays just its schema summary, not the data.
df_pyspark.select('Name')

DataFrame[Name: string]

In [15]:
# Selecting a single column still yields a DataFrame.
type(df_pyspark.select('Name'))

pyspark.sql.dataframe.DataFrame

In [16]:
# Closest pandas analogue: df[['Name']] (a one-column DataFrame).
df_pyspark.select('Name').show()

+---------+
|     Name|
+---------+
|  Muhamed|
|    Ahmed|
|Abdulaziz|
+---------+



In [17]:
# 'age' resolves to the 'Age' column here (the name lookup is not
# case-sensitive in this session); the output header shows the name as typed.
df_pyspark.select('age').show()

+---+
|age|
+---+
| 25|
| 30|
| 50|
+---+



In [18]:
# Show only the 'experience' column.
df_pyspark.select('experience').show()

+----------+
|experience|
+----------+
|        10|
|         7|
|        25|
+----------+



In [19]:
# Pass a list of names to select several columns at once. As above, 'age'
# matches the 'Age' column and the header shows the name as typed.
df_pyspark.select(['Name','age']).show()

+---------+---+
|     Name|age|
+---------+---+
|  Muhamed| 25|
|    Ahmed| 30|
|Abdulaziz| 50|
+---------+---+



In [20]:
# List of (column name, type name) tuples, one per column.
df_pyspark.dtypes

[('Name', 'string'), ('Age', 'int'), ('experience', 'int')]

# 5. Check Describe option similar to Pandas


In [21]:
# describe() returns another DataFrame holding the summary statistics; all of
# its columns are strings. Nothing is printed until show() is called.
df_pyspark.describe()

DataFrame[summary: string, Name: string, Age: string, experience: string]

In [22]:
# Print count, mean, stddev, min and max for every column. For the string
# column 'Name', mean and stddev are NULL.
df_pyspark.describe().show()

+-------+---------+------------------+-----------------+
|summary|     Name|               Age|       experience|
+-------+---------+------------------+-----------------+
|  count|        3|                 3|                3|
|   mean|     NULL|              35.0|             14.0|
| stddev|     NULL|13.228756555322953|9.643650760992955|
|    min|Abdulaziz|                25|                7|
|    max|  Muhamed|                50|               25|
+-------+---------+------------------+-----------------+



# 6. Adding Columns


In [30]:
# Add a derived column holding 'experience' plus 2. The result of withColumn()
# is assigned back to df_pyspark so later cells see the new column.
df_pyspark = df_pyspark.withColumn(
    'Experience After 2 Years',
    df_pyspark['experience'] + 2,
)


In [31]:
# Display the DataFrame, now including 'Experience After 2 Years'.
df_pyspark.show()

+---------+---+----------+------------------------+
|     Name|Age|experience|Experience After 2 Years|
+---------+---+----------+------------------------+
|  Muhamed| 25|        10|                      12|
|    Ahmed| 30|         7|                       9|
|Abdulaziz| 50|        25|                      27|
+---------+---+----------+------------------------+



# 7. Dropping columns 

In [32]:
# Remove the derived 'Experience After 2 Years' column; the result of drop()
# is assigned back to df_pyspark.
df_pyspark = df_pyspark.drop('Experience After 2 Years')

In [33]:
# Display the DataFrame: back to the three original columns.
df_pyspark.show()

+---------+---+----------+
|     Name|Age|experience|
+---------+---+----------+
|  Muhamed| 25|        10|
|    Ahmed| 30|         7|
|Abdulaziz| 50|        25|
+---------+---+----------+



In [34]:
# Drop the 'experience' column. The exact column name is used on purpose:
# drop() silently ignores names it cannot find, so a case mismatch would turn
# this into a no-op if case-sensitive column resolution were enabled.
df_pyspark = df_pyspark.drop('experience')

In [35]:
# Display the DataFrame: only 'Name' and 'Age' remain.
df_pyspark.show()

+---------+---+
|     Name|Age|
+---------+---+
|  Muhamed| 25|
|    Ahmed| 30|
|Abdulaziz| 50|
+---------+---+



# 8. Rename The Column

In [36]:
# Rename the 'Name' column to 'First Name'. The exact existing name is used on
# purpose: withColumnRenamed() does nothing when the existing name is not
# found, so a case mismatch would be silently ignored if case-sensitive column
# resolution were enabled.
df_pyspark = df_pyspark.withColumnRenamed('Name', 'First Name')

In [38]:
# Display the DataFrame with the renamed 'First Name' column.
df_pyspark.show()

+----------+---+
|First Name|Age|
+----------+---+
|   Muhamed| 25|
|     Ahmed| 30|
| Abdulaziz| 50|
+----------+---+

