<a href="https://colab.research.google.com/github/dev0419/BDA_Lab/blob/main/Lab-2/SimplePysparkPrograms_L2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

1. Implement a PySpark script that applies transformations like filter and withColumn on a DataFrame.

In [None]:
import pandas as pd
import pyspark
from pyspark.sql import SparkSession

# Obtain (or reuse) the Spark session that every later cell relies on.
spark = (
    SparkSession.builder
    .appName('Basics')
    .getOrCreate()
)

# Sample employee records: (id, name, role, salary).
data = [
    (1, 'John', 'Engineer', 1000),
    (2, 'Alice', 'Manager', 2000),
    (3, 'Bob', 'Developer', 1500),
    (4, 'Jane', 'Manager', 2500),
    (5, 'Eve', 'CEO', 5000),
]

# Column names are supplied as the schema; the column types are left to Spark.
df = spark.createDataFrame(data, ['id', 'name', 'role', 'salary'])
df.show()

#map function
def square_salary(salary):
    """Return ``salary`` multiplied by itself."""
    squared = salary * salary
    return squared

# Turn each Row into a plain tuple whose last field is the squared salary,
# then rebuild a DataFrame with the new column names.
cols = ['id', 'name', 'role', 'squared_salary']
map_df = (
    df.rdd
    .map(lambda r: (r['id'], r['name'], r['role'], square_salary(r['salary'])))
    .toDF(cols)
)
print("map")
map_df.show()

# filter: keep only the rows whose role is 'Manager'
filter_df = df.where(df['role'] == 'Manager')
print("filter")
filter_df.show()

# groupby: average salary per role (result column is named 'avg(salary)')
group_df = df.groupBy('role').avg('salary')
print("group df")
group_df.show()

# join: project (id, role) out of df, then inner-join it back onto df by id

dept_data = df.select('id', 'role')
print("department data")
dept_data.show()
join_df = df.join(
    # Rename the projected column so the joined result has no duplicate 'role'.
    dept_data.withColumnRenamed('role', 'dept_role'),
    on='id',
    how='inner',
)
print("join df")
join_df.show()

# union: both DataFrames must have the same number of columns
# (union() pairs columns up by position).
data2 = [
    (6, 'Tom', 'Analyst', 3000),
    (7, 'Mary', 'Developer', 2800),
    (8, 'Chris', 'Manager', 3500),
    (9, 'Linda', 'Engineer', 3200),
    (10, 'Mike', 'CEO', 6000),
]

df2 = spark.createDataFrame(data2, schema=['id', 'name', 'role', 'salary'])
df2.show()

# Append the rows of df2 after the rows of df.
union_df = df.union(df2)
print("union df")
union_df.show()

# Remove fully duplicated rows; with no column subset this is the same as distinct().
distinct_df = union_df.dropDuplicates()
print("distinct_df")
distinct_df.show()

# flatMap: split every sentence on whitespace and flatten the pieces into one RDD of words.
words_rdd = spark.sparkContext.parallelize(['Hello World', 'How are you'])
flat_mapped_rdd = words_rdd.flatMap(lambda sentence: sentence.split())
print("FlatMapped RDD:")
flat_mapped_words = flat_mapped_rdd.collect()
print(flat_mapped_words)



+---+-----+---------+------+
| id| name|     role|salary|
+---+-----+---------+------+
|  1| John| Engineer|  1000|
|  2|Alice|  Manager|  2000|
|  3|  Bob|Developer|  1500|
|  4| Jane|  Manager|  2500|
|  5|  Eve|      CEO|  5000|
+---+-----+---------+------+

map
+---+-----+---------+--------------+
| id| name|     role|squared_salary|
+---+-----+---------+--------------+
|  1| John| Engineer|       1000000|
|  2|Alice|  Manager|       4000000|
|  3|  Bob|Developer|       2250000|
|  4| Jane|  Manager|       6250000|
|  5|  Eve|      CEO|      25000000|
+---+-----+---------+--------------+

filter
+---+-----+-------+------+
| id| name|   role|salary|
+---+-----+-------+------+
|  2|Alice|Manager|  2000|
|  4| Jane|Manager|  2500|
+---+-----+-------+------+

group df
+---------+-----------+
|     role|avg(salary)|
+---------+-----------+
| Engineer|     1000.0|
|  Manager|     2250.0|
|Developer|     1500.0|
|      CEO|     5000.0|
+---------+-----------+

department data
+---+-------

2. Write a PySpark script that performs actions like count and show on a DataFrame.

In [None]:
from pyspark.sql.functions import count
# show() is an action: it evaluates the DataFrame and prints its rows.
df.show()

# count() is an action that returns the number of rows. A bare expression in
# the middle of a notebook cell is not displayed, so print the result explicitly.
print(f"Row count: {df.count()}")

# Column-wise counts using the count() aggregate imported from pyspark.sql.functions.
df.select(
    count(df.name).alias('name_count'),
    count(df.role).alias('role_count'),
).show()

+---+-----+---------+------+
| id| name|     role|salary|
+---+-----+---------+------+
|  1| John| Engineer|  1000|
|  2|Alice|  Manager|  2000|
|  3|  Bob|Developer|  1500|
|  4| Jane|  Manager|  2500|
|  5|  Eve|      CEO|  5000|
+---+-----+---------+------+

+----------+----------+
|name_count|role_count|
+----------+----------+
|         5|         5|
+----------+----------+



3. Demonstrate how to perform basic aggregations (e.g., sum, average) on a PySpark DataFrame.

In [None]:
# Total salary per role (result column is named 'sum(salary)').
sum_df = df.groupBy('role').sum('salary')
sum_df.show()

# Average salary per role (result column is named 'avg(salary)').
avg_df = df.groupBy('role').avg('salary')
avg_df.show()

+---------+-----------+
|     role|sum(salary)|
+---------+-----------+
| Engineer|       1000|
|  Manager|       4500|
|Developer|       1500|
|      CEO|       5000|
+---------+-----------+

+---------+-----------+
|     role|avg(salary)|
+---------+-----------+
| Engineer|     1000.0|
|  Manager|     2250.0|
|Developer|     1500.0|
|      CEO|     5000.0|
+---------+-----------+



4. Show how to write a PySpark DataFrame to a CSV file.

In [None]:
# Spark writes a directory named 'output.csv' containing part files, not a single CSV file.
# mode='overwrite' lets this cell be re-run: the default save mode raises an
# error when the target path already exists.
df.write.csv('output.csv', header=True, mode='overwrite')

5. Implement a word count program in PySpark.

In [None]:
from operator import add
# Word count: split each line into words, pair every word with 1,
# then add up the 1s per distinct word.
data = ["Hello Spark", "How are you Spark", "Spark is awesome"]
lines_rdd = spark.sparkContext.parallelize(data)
word_counts = (
    lines_rdd
    .flatMap(lambda line: line.split())
    .map(lambda word: (word, 1))
    .reduceByKey(add)
)
print("Word counts:")
# The loop variable is deliberately not called `count`: notebook cells share one
# namespace, and that name would rebind the `count` function imported from
# pyspark.sql.functions in an earlier cell.
for word, occurrences in word_counts.collect():
    print(f"{word}: {occurrences}")

Word counts:
Hello: 1
Spark: 3
are: 1
is: 1
awesome: 1
How: 1
you: 1
