In [0]:
# Small in-memory employee dataset; each tuple is (EmpID, Name, Department, Salary).
data = [
    (1, "Amit", "IT", 60000),
    (2, "Priya", "HR", 55000),
    (3, "Rahul", "Finance", 75000),
    (4, "Sneha", "IT", 80000),
    (5, "Karan", "HR", 65000),
]
columns = ["EmpID", "Name", "Department", "Salary"]

# Only column names are supplied, so Spark infers the column types from the values.
df = spark.createDataFrame(data, schema=columns)

# NOTE(review): describe() returns a new DataFrame that is discarded here, so
# nothing is displayed; call .show() on it if the summary statistics are wanted.
df.describe()

# Print the inferred schema, then preview the first three rows.
df.printSchema()
df.show(n=3)

root
 |-- EmpID: long (nullable = true)
 |-- Name: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- Salary: long (nullable = true)

+-----+-----+----------+------+
|EmpID| Name|Department|Salary|
+-----+-----+----------+------+
|    1| Amit|        IT| 60000|
|    2|Priya|        HR| 55000|
|    3|Rahul|   Finance| 75000|
+-----+-----+----------+------+
only showing top 3 rows



In [0]:
from pyspark.sql.functions import col

# Employees earning more than 70,000.
# show() only prints and returns None, so the DataFrame is bound to the
# variable first and displayed in a separate statement.
filter_df = df.filter(col('Salary') > 70000)
filter_df.show()

+-----+-----+----------+------+
|EmpID| Name|Department|Salary|
+-----+-----+----------+------+
|    3|Rahul|   Finance| 75000|
|    4|Sneha|        IT| 80000|
+-----+-----+----------+------+



In [0]:
from pyspark.sql.functions import avg

# Average salary per department.
# show() returns None, so keep the aggregated DataFrame in the variable and
# display it separately.
avg_sal_df = df.groupBy(col('Department')).agg(avg(col('Salary')).alias('Avg_Sal'))
avg_sal_df.show()

+----------+-------+
|Department|Avg_Sal|
+----------+-------+
|        IT|70000.0|
|        HR|60000.0|
|   Finance|75000.0|
+----------+-------+



In [0]:
# Employees whose name starts with 'A' (the prefix match is case-sensitive).
# show() returns None, so keep the filtered DataFrame in the variable and
# display it separately.
name_df = df.filter(col('Name').startswith('A'))
name_df.show()

+-----+----+----------+------+
|EmpID|Name|Department|Salary|
+-----+----+----------+------+
|    1|Amit|        IT| 60000|
+-----+----+----------+------+



In [0]:
from pyspark.sql.functions import count

# Number of employees per department (counts non-null EmpID values).
# show() returns None, so keep the aggregated DataFrame in the variable and
# display it separately.
count_emp_df = df.groupBy(col('Department')).agg(count(col('EmpID')).alias('Emp_count'))
count_emp_df.show()

+----------+---------+
|Department|Emp_count|
+----------+---------+
|        IT|        2|
|        HR|        2|
|   Finance|        1|
+----------+---------+



In [0]:
# Add a 'tax_deduction' column equal to 10% of Salary.
# show() returns None, so keep the extended DataFrame in the variable and
# display it separately.
tax_df = df.withColumn('tax_deduction', (0.1 * col('Salary')))
tax_df.show()

+-----+-----+----------+------+-------------+
|EmpID| Name|Department|Salary|tax_deduction|
+-----+-----+----------+------+-------------+
|    1| Amit|        IT| 60000|       6000.0|
|    2|Priya|        HR| 55000|       5500.0|
|    3|Rahul|   Finance| 75000|       7500.0|
|    4|Sneha|        IT| 80000|       8000.0|
|    5|Karan|        HR| 65000|       6500.0|
+-----+-----+----------+------+-------------+



In [0]:
from pyspark.sql.functions import desc

# Employees ordered by salary, highest first.
# show() returns None, so keep the sorted DataFrame in the variable and
# display it separately.
final_df = df.sort(desc('Salary'))
final_df.show()

+-----+-----+----------+------+
|EmpID| Name|Department|Salary|
+-----+-----+----------+------+
|    4|Sneha|        IT| 80000|
|    3|Rahul|   Finance| 75000|
|    5|Karan|        HR| 65000|
|    1| Amit|        IT| 60000|
|    2|Priya|        HR| 55000|
+-----+-----+----------+------+



In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import dense_rank

# Employee(s) with the second-highest salary.
# The window has no partitionBy, so all rows are ranked together by salary,
# highest first.
spec = Window.orderBy(desc('Salary'))
temp_df = df.withColumn('sal_rank', dense_rank().over(spec))

# Keep rank 2 and drop the helper rank column by selecting the original columns.
# show() returns None, so the DataFrame is bound first and displayed separately.
final_df = temp_df.filter(col('sal_rank') == 2).select('EmpID', 'Name', 'Department', 'Salary')
final_df.show()

+-----+-----+----------+------+
|EmpID| Name|Department|Salary|
+-----+-----+----------+------+
|    3|Rahul|   Finance| 75000|
+-----+-----+----------+------+



In [0]:
# Employees who work in the HR or IT department.
# show() returns None, so keep the filtered DataFrame in the variable and
# display it separately.
final_df = df.filter(col('Department').isin('HR', 'IT'))
final_df.show()

+-----+-----+----------+------+
|EmpID| Name|Department|Salary|
+-----+-----+----------+------+
|    1| Amit|        IT| 60000|
|    2|Priya|        HR| 55000|
|    4|Sneha|        IT| 80000|
|    5|Karan|        HR| 65000|
+-----+-----+----------+------+



In [0]:
# NOTE(review): this import rebinds the names `sum` and `round` to the Spark
# column functions, shadowing Python's built-ins from this cell onward.
from pyspark.sql.functions import sum, round

# Total salary across all employees, rounded to 2 decimal places.
# show() returns None, so keep the aggregated DataFrame in the variable and
# display it separately.
final_df = df.agg(round(sum(col('Salary')), 2).alias('Total_Salary'))
final_df.show()

+------------+
|Total_Salary|
+------------+
|      335000|
+------------+



In [0]:
# Load the players CSV: the first line is treated as the header row and the
# column types are inferred from the data.
file_path = '/FileStore/tables/players.csv'
players_df = spark.read.options(header=True, inferSchema=True).csv(file_path)
players_df.show()

+--------------+---------+-----+-------+
|        Player|  Country| Runs|Wickets|
+--------------+---------+-----+-------+
|   Virat Kohli|    India|12000|      4|
|  Rohit Sharma|    India|11000|      8|
|Jasprit Bumrah|    India| 1200|    200|
|   Steve Smith|Australia| 9500|     20|
+--------------+---------+-----+-------+



In [0]:
# Player(s) with the most runs.
# The window has no partitionBy, so all players are ranked together by Runs,
# highest first.
spec = Window.orderBy(col('Runs').desc())
rank_df = players_df.withColumn('rank', dense_rank().over(spec))

# select() already leaves out the helper 'rank' column, so no explicit drop is
# needed. show() returns None, so the DataFrame is bound first and displayed
# separately.
final_df = rank_df.filter(col('rank') == 1).select('Player', 'Runs')
final_df.show()

+-----------+-----+
|     Player| Runs|
+-----------+-----+
|Virat Kohli|12000|
+-----------+-----+



In [0]:
# Average runs scored by Indian players, rounded to 2 decimal places.
filter_df = players_df.filter(col('Country') == 'India')

# show() returns None, so keep the aggregated DataFrame in the variable and
# display it separately.
final_df = filter_df.agg(round(avg(col('Runs')), 2).alias('Avg_Indian_Run_Score'))
final_df.show()

+--------------------+
|Avg_Indian_Run_Score|
+--------------------+
|             8066.67|
+--------------------+



In [0]:
# Players who have taken more than 50 wickets.
# show() returns None, so keep the filtered DataFrame in the variable and
# display it separately.
final_df = players_df.filter(col('Wickets') > 50)
final_df.show()

+--------------+-------+----+-------+
|        Player|Country|Runs|Wickets|
+--------------+-------+----+-------+
|Jasprit Bumrah|  India|1200|    200|
+--------------+-------+----+-------+



In [0]:
# Load the cities JSON file into a DataFrame and preview it.
file_path = '/FileStore/tables/cities-2.json'
cities_df = spark.read.format('json').load(file_path)
cities_df.show()

+---------+----------+-----------+
|     City|Population|      State|
+---------+----------+-----------+
|   Mumbai|  20000000|Maharashtra|
|    Delhi|  18000000|      Delhi|
|Bangalore|  12000000|  Karnataka|
|Hyderabad|  10000000|  Telangana|
+---------+----------+-----------+



In [0]:
# Cities with a population above 15,000,000.
# show() returns None, so keep the filtered DataFrame in the variable and
# display it separately.
final_df = cities_df.filter(col('Population') > 15000000)
final_df.show()

+------+----------+-----------+
|  City|Population|      State|
+------+----------+-----------+
|Mumbai|  20000000|Maharashtra|
| Delhi|  18000000|      Delhi|
+------+----------+-----------+



In [0]:
# Total population per state.
# show() returns None, so keep the aggregated DataFrame in the variable and
# display it separately.
final_df = cities_df.groupBy(col('State')).agg(sum(col('Population')).alias('Population_per_State'))
final_df.show()

+-----------+--------------------+
|      State|Population_per_State|
+-----------+--------------------+
|  Karnataka|            12000000|
|      Delhi|            18000000|
|Maharashtra|            20000000|
|  Telangana|            10000000|
+-----------+--------------------+



In [0]:
# State(s) with the largest total population.
# Aggregate with groupBy so there is exactly one row per state. A window sum
# partitioned by State would keep one row per city and therefore repeat a
# state in the result whenever it has more than one city.
filter_df = cities_df.groupBy('State').agg(sum(col('Population')).alias('Population_per_State'))

# Rank the per-state totals, highest first, and keep the top rank; ties for
# first place are all returned.
spec2 = Window.orderBy(desc('Population_per_State'))
rank_df = filter_df.withColumn('pop_rank', dense_rank().over(spec2))

# show() returns None, so the DataFrame is bound first and displayed separately.
final_df = rank_df.filter(col('pop_rank') == 1).select('State', 'Population_per_State')
final_df.show()

+-----------+--------------------+
|      State|Population_per_State|
+-----------+--------------------+
|Maharashtra|            20000000|
+-----------+--------------------+



In [0]:
# Convert the employee Spark DataFrame to a pandas DataFrame and preview the
# first five rows.
pandas_df = df.toPandas()
pandas_df.head(n=5)

Unnamed: 0,EmpID,Name,Department,Salary
0,1,Amit,IT,60000
1,2,Priya,HR,55000
2,3,Rahul,Finance,75000
3,4,Sneha,IT,80000
4,5,Karan,HR,65000
