<a href="https://colab.research.google.com/github/Musaveer39/PySpark/blob/main/PySpark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Linux Basics

In [1]:
!ls

sample_data


In [2]:
!cat /etc/os-release

PRETTY_NAME="Ubuntu 22.04.4 LTS"
NAME="Ubuntu"
VERSION_ID="22.04"
VERSION="22.04.4 LTS (Jammy Jellyfish)"
VERSION_CODENAME=jammy
ID=ubuntu
ID_LIKE=debian
HOME_URL="https://www.ubuntu.com/"
SUPPORT_URL="https://help.ubuntu.com/"
BUG_REPORT_URL="https://bugs.launchpad.net/ubuntu/"
PRIVACY_POLICY_URL="https://www.ubuntu.com/legal/terms-and-policies/privacy-policy"
UBUNTU_CODENAME=jammy


In [3]:
!uname -a

Linux bcce0d5f3498 6.1.123+ #1 SMP PREEMPT_DYNAMIC Sun Mar 30 16:01:29 UTC 2025 x86_64 x86_64 x86_64 GNU/Linux


## PySpark Basic

In [4]:
pip install pyspark



In [5]:
from pyspark.sql import SparkSession

In [6]:
spark =  SparkSession.builder.appName('Basics').getOrCreate()

In [7]:
data = [("Hello","World")]
columns = ["Word1","Word2"]
df = spark.createDataFrame(data,columns)

In [8]:
df.show()

+-----+-----+
|Word1|Word2|
+-----+-----+
|Hello|World|
+-----+-----+



## Basic Transformation and Actions

In [9]:
data = [
    ("John", "Sales", 3000),
    ("Jane", "Finance", 4000),
    ("Mike", "Sales", 3500),
    ("Alice", "Finance", 3800),
    ("Bob", "IT", 4500)
]
columns = ["Name", "Department", "Salary"]
df = spark.createDataFrame(data, columns)
df.show()

+-----+----------+------+
| Name|Department|Salary|
+-----+----------+------+
| John|     Sales|  3000|
| Jane|   Finance|  4000|
| Mike|     Sales|  3500|
|Alice|   Finance|  3800|
|  Bob|        IT|  4500|
+-----+----------+------+



In [10]:
df_filtered = df.filter(df.Salary > 3500)
df_filtered.show()

+-----+----------+------+
| Name|Department|Salary|
+-----+----------+------+
| Jane|   Finance|  4000|
|Alice|   Finance|  3800|
|  Bob|        IT|  4500|
+-----+----------+------+



In [11]:
#Grouped and Aggregates
df_grouped = df.groupBy("Department").avg("Salary")
df_grouped.show()

+----------+-----------+
|Department|avg(Salary)|
+----------+-----------+
|     Sales|     3250.0|
|   Finance|     3900.0|
|        IT|     4500.0|
+----------+-----------+



In [12]:
# Add new column: Salary with bonus (10%)
df_bonus = df.withColumn("Salary_With_Bonus", df.Salary * 1.1)
df_bonus.show()

+-----+----------+------+------------------+
| Name|Department|Salary| Salary_With_Bonus|
+-----+----------+------+------------------+
| John|     Sales|  3000|3300.0000000000005|
| Jane|   Finance|  4000|            4400.0|
| Mike|     Sales|  3500|3850.0000000000005|
|Alice|   Finance|  3800|            4180.0|
|  Bob|        IT|  4500|            4950.0|
+-----+----------+------+------------------+



In [13]:
from pyspark.sql.functions import col,upper,lower,concat_ws,when,length
df.show()

+-----+----------+------+
| Name|Department|Salary|
+-----+----------+------+
| John|     Sales|  3000|
| Jane|   Finance|  4000|
| Mike|     Sales|  3500|
|Alice|   Finance|  3800|
|  Bob|        IT|  4500|
+-----+----------+------+



In [14]:
# Changes case transformations
df_upper  = df.withColumn("Name_Upper",upper(col("Name")))
df_lower  = df.withColumn("Name_Lower",lower(col("Name")))
df_upper.show()
df_lower.show()


+-----+----------+------+----------+
| Name|Department|Salary|Name_Upper|
+-----+----------+------+----------+
| John|     Sales|  3000|      JOHN|
| Jane|   Finance|  4000|      JANE|
| Mike|     Sales|  3500|      MIKE|
|Alice|   Finance|  3800|     ALICE|
|  Bob|        IT|  4500|       BOB|
+-----+----------+------+----------+

+-----+----------+------+----------+
| Name|Department|Salary|Name_Lower|
+-----+----------+------+----------+
| John|     Sales|  3000|      john|
| Jane|   Finance|  4000|      jane|
| Mike|     Sales|  3500|      mike|
|Alice|   Finance|  3800|     alice|
|  Bob|        IT|  4500|       bob|
+-----+----------+------+----------+



In [15]:
# Concatenate columns
df_concat =df.withColumn("Name_Department",concat_ws(" - ","Name","Department"))
df_concat.show()

+-----+----------+------+---------------+
| Name|Department|Salary|Name_Department|
+-----+----------+------+---------------+
| John|     Sales|  3000|   John - Sales|
| Jane|   Finance|  4000| Jane - Finance|
| Mike|     Sales|  3500|   Mike - Sales|
|Alice|   Finance|  3800|Alice - Finance|
|  Bob|        IT|  4500|       Bob - IT|
+-----+----------+------+---------------+



In [16]:
df_lenth = df.withColumn("Name_Length",length(col("Name")))
df_lenth.show()

+-----+----------+------+-----------+
| Name|Department|Salary|Name_Length|
+-----+----------+------+-----------+
| John|     Sales|  3000|          4|
| Jane|   Finance|  4000|          4|
| Mike|     Sales|  3500|          4|
|Alice|   Finance|  3800|          5|
|  Bob|        IT|  4500|          3|
+-----+----------+------+-----------+



In [17]:
# Conditional Column (Salary Category)
df_conditional = df.withColumn("Salary Category",when(col("Salary") >= 4000,"High")
                  .when(col("Salary") >= 3500,"Med").otherwise("Low"))
df_conditional.show()

+-----+----------+------+---------------+
| Name|Department|Salary|Salary Category|
+-----+----------+------+---------------+
| John|     Sales|  3000|            Low|
| Jane|   Finance|  4000|           High|
| Mike|     Sales|  3500|            Med|
|Alice|   Finance|  3800|            Med|
|  Bob|        IT|  4500|           High|
+-----+----------+------+---------------+



In [18]:
df_renamed = df_conditional.withColumnRenamed("Salary","Base Salary")
df_renamed.show()

+-----+----------+-----------+---------------+
| Name|Department|Base Salary|Salary Category|
+-----+----------+-----------+---------------+
| John|     Sales|       3000|            Low|
| Jane|   Finance|       4000|           High|
| Mike|     Sales|       3500|            Med|
|Alice|   Finance|       3800|            Med|
|  Bob|        IT|       4500|           High|
+-----+----------+-----------+---------------+



## Advanced Transformations

In [20]:
from pyspark.sql import SparkSession
spark =SparkSession.builder.appName("Basics").getOrCreate()
columns = ["Name","Department","Salary"]
data = [
    ("John", "Sales", 3000),
    ("Jane", "Finance", 4000),
    ("Mike", "Sales", 3500),
    ("Alice", "Finance", 3800),
    ("Bob", "IT", 4500)
]

df = spark.createDataFrame(data, columns)

df.show()

+-----+----------+------+
| Name|Department|Salary|
+-----+----------+------+
| John|     Sales|  3000|
| Jane|   Finance|  4000|
| Mike|     Sales|  3500|
|Alice|   Finance|  3800|
|  Bob|        IT|  4500|
+-----+----------+------+



In [21]:
# Count by Department
df_count =  df.groupBy("Department").count()
df_count.show()

+----------+-----+
|Department|count|
+----------+-----+
|     Sales|    2|
|   Finance|    2|
|        IT|    1|
+----------+-----+



In [22]:
# Group by Department and calculate average salary
df.groupBy("Department").avg("Salary").show()

+----------+-----------+
|Department|avg(Salary)|
+----------+-----------+
|     Sales|     3250.0|
|   Finance|     3900.0|
|        IT|     4500.0|
+----------+-----------+



In [24]:
# Group by Department and calculate multiple aggregations
df.groupBy("Department").agg({"Salary": "sum", "Salary": "max","Salary":'min'}).show()

+----------+-----------+
|Department|min(Salary)|
+----------+-----------+
|     Sales|       3000|
|   Finance|       3800|
|        IT|       4500|
+----------+-----------+



In [27]:
from pyspark.sql import functions as f
df.groupBy('Department').agg(f.avg('Salary'),f.max('Salary'),f.min('Salary')).show()

+----------+-----------+-----------+-----------+
|Department|avg(Salary)|max(Salary)|min(Salary)|
+----------+-----------+-----------+-----------+
|     Sales|     3250.0|       3500|       3000|
|   Finance|     3900.0|       4000|       3800|
|        IT|     4500.0|       4500|       4500|
+----------+-----------+-----------+-----------+



In [28]:
# Create another DataFrame for department info
dept_data = [
    ("Sales", "Building A"),
    ("Finance", "Building B"),
    ("IT", "Building C")
]
dept_columns = ["Department", "Location"]


In [29]:
dept_df = spark.createDataFrame(dept_data, dept_columns)
joined_df = df.join(dept_df, on='Department', how='inner')
joined_df.show()

+----------+-----+------+----------+
|Department| Name|Salary|  Location|
+----------+-----+------+----------+
|   Finance| Jane|  4000|Building B|
|   Finance|Alice|  3800|Building B|
|        IT|  Bob|  4500|Building C|
|     Sales| John|  3000|Building A|
|     Sales| Mike|  3500|Building A|
+----------+-----+------+----------+



In [30]:
# Left join
left_joined_df = df.join(dept_df, on='Department', how='left')
left_joined_df.show()

+----------+-----+------+----------+
|Department| Name|Salary|  Location|
+----------+-----+------+----------+
|     Sales| John|  3000|Building A|
|   Finance| Jane|  4000|Building B|
|     Sales| Mike|  3500|Building A|
|   Finance|Alice|  3800|Building B|
|        IT|  Bob|  4500|Building C|
+----------+-----+------+----------+



In [31]:
# Employee DataFrame
emp_data = [
    (1, "John", "Sales", 3000),
    (2, "Jane", "Finance", 4000),
    (3, "Mike", "Sales", 3500),
    (4, "Alice", "HR", 3800),
    (5, "Bob", "IT", 4500),
    (6, "Sam", "Support", 3200)
]
emp_cols = ["EmpID", "Name", "Department", "Salary"]
emp_df = spark.createDataFrame(emp_data, emp_cols)

# Department DataFrame
dept_data = [
    ("Sales", "Building A"),
    ("Finance", "Building B"),
    ("IT", "Building C"),
    ("Admin", "Building D")
]
dept_cols = ["Department", "Location"]
dept_df = spark.createDataFrame(dept_data, dept_cols)

# Display both
emp_df.show()
dept_df.show()

+-----+-----+----------+------+
|EmpID| Name|Department|Salary|
+-----+-----+----------+------+
|    1| John|     Sales|  3000|
|    2| Jane|   Finance|  4000|
|    3| Mike|     Sales|  3500|
|    4|Alice|        HR|  3800|
|    5|  Bob|        IT|  4500|
|    6|  Sam|   Support|  3200|
+-----+-----+----------+------+

+----------+----------+
|Department|  Location|
+----------+----------+
|     Sales|Building A|
|   Finance|Building B|
|        IT|Building C|
|     Admin|Building D|
+----------+----------+



In [32]:
#Full outer join
full_df = emp_df.join(dept_df, on='Department', how='full')
full_df.show()

+----------+-----+-----+------+----------+
|Department|EmpID| Name|Salary|  Location|
+----------+-----+-----+------+----------+
|     Admin| NULL| NULL|  NULL|Building D|
|   Finance|    2| Jane|  4000|Building B|
|        HR|    4|Alice|  3800|      NULL|
|        IT|    5|  Bob|  4500|Building C|
|     Sales|    1| John|  3000|Building A|
|     Sales|    3| Mike|  3500|Building A|
|   Support|    6|  Sam|  3200|      NULL|
+----------+-----+-----+------+----------+

