<a href="https://colab.research.google.com/github/Kiran45181/Pyspark/blob/main/RDD_Pyspark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##RDD - Resilient Distributed Dataset

In [14]:
# =====================================
# 1. Install and Import PySpark
# =====================================
# Uncomment this line in Colab (skip if PySpark already installed)
# !pip install pyspark

from pyspark import SparkConf, SparkContext

# =====================================
# 2. Stop Previous Spark Context (Fix Error)
# =====================================
try:
    sc.stop()
except:
    pass  # No active SparkContext to stop

# =====================================
# 3. Initialize SparkConf and SparkContext
# =====================================
conf = SparkConf().setMaster("local").setAppName("RDD Example")
sc = SparkContext(conf=conf)

# =====================================
# 4. Create RDD
# =====================================
rdd = sc.parallelize([1, 2, 3, 4, 5])

# =====================================
# 5. Transformation: Map
# Multiply each element by 2
# =====================================
rdd_mapped = rdd.map(lambda x: x * 2)

# Action: Collect
print("Mapped RDD:", rdd_mapped.collect())   # [2, 4, 6, 8, 10]

# =====================================
# 6. Transformation: Filter
# Keep only even numbers
# =====================================
rdd_filtered = rdd.filter(lambda x: x % 2 == 0)

# Action: Collect
print("Filtered RDD:", rdd_filtered.collect())  # [2, 4]

# =====================================
# 7. Action: Reduce
# Sum of all elements
# =====================================
sum_result = rdd.reduce(lambda a, b: a + b)
print("Sum using reduce:", sum_result)  # 15



#Transformation : flatmap
rdd_flatmap = rdd.flatMap(lambda x: range(x,x+2))
print("Flatmap RDD:", rdd_flatmap.collect())


rdd_nested_list = sc.parallelize([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
rdd_flatmap_nested = rdd_nested_list.flatMap(lambda x: x)
print(rdd_flatmap_nested.collect())


rdd_flatmap_string = sc.parallelize(["hello", "world"])
rdd_flatmap_string_result = rdd_flatmap_string.flatMap(lambda x: list(x))
print("Flatmap String RDD:", rdd_flatmap_string_result.collect())



#Input : 1,2,3,4,5
rdd_group = sc.parallelize([1,2,3,4,5])
#use group by on above series like odd and even
rdd_grouped = rdd_group.groupBy(lambda x: "even" if x % 2 == 0 else "odd")

#print like key value pair
# for key, value in rdd_grouped.collect():
#     print(key, list(value))


print([(key,list(value)) for key , value in rdd_grouped.collect()])




# =====================================
# 8. Action: Count
# Total number of elements
# =====================================
count_result = rdd.count()
print("Count of elements:", count_result)  # 5

# =====================================
# 9. Action: First
# Get first element
# =====================================
first_element = rdd.first()
print("First element:", first_element)  # 1

# =====================================
# 10. Stop Spark Context
# =====================================
sc.stop()


Mapped RDD: [2, 4, 6, 8, 10]
Filtered RDD: [2, 4]
Sum using reduce: 15
Flatmap RDD: [1, 2, 2, 3, 3, 4, 4, 5, 5, 6]
[1, 2, 3, 4, 5, 6, 7, 8, 9]
Flatmap String RDD: ['h', 'e', 'l', 'l', 'o', 'w', 'o', 'r', 'l', 'd']
odd [1, 3, 5]
even [2, 4]
[('odd', [1, 3, 5]), ('even', [2, 4])]
Count of elements: 5
First element: 1


##DATAFRAME

In [15]:
data = [
    (1, "John", "HR", 5000),
    (2, "Jane", "IT", 8000),
    (3, "Mike", "IT", 6000),
    (4, "Sara", "Finance", 7000),
    (5, "David", "HR", 5500)
]

from pyspark.sql import SparkSession

# Initialize SparkSession
spark = SparkSession.builder.appName("DataFrameOperations").getOrCreate()

# Define column names
columns = ["ID", "Name", "Department", "Salary"]

# Create a DataFrame from the sample data
df = spark.createDataFrame(data, columns)

# Show the DataFrame
df.show()

+---+-----+----------+------+
| ID| Name|Department|Salary|
+---+-----+----------+------+
|  1| John|        HR|  5000|
|  2| Jane|        IT|  8000|
|  3| Mike|        IT|  6000|
|  4| Sara|   Finance|  7000|
|  5|David|        HR|  5500|
+---+-----+----------+------+



In [16]:
df.select("Name","Salary").show()

+-----+------+
| Name|Salary|
+-----+------+
| John|  5000|
| Jane|  8000|
| Mike|  6000|
| Sara|  7000|
|David|  5500|
+-----+------+



In [17]:
df.filter(df.Salary > 6000).show()

+---+----+----------+------+
| ID|Name|Department|Salary|
+---+----+----------+------+
|  2|Jane|        IT|  8000|
|  4|Sara|   Finance|  7000|
+---+----+----------+------+



In [19]:
from pyspark.sql.functions import col

df = df.withColumn("Bonus", col("Salary") * 0.1)
df.show()

+---+-----+----------+------+-----+
| ID| Name|Department|Salary|Bonus|
+---+-----+----------+------+-----+
|  1| John|        HR|  5000|500.0|
|  2| Jane|        IT|  8000|800.0|
|  3| Mike|        IT|  6000|600.0|
|  4| Sara|   Finance|  7000|700.0|
|  5|David|        HR|  5500|550.0|
+---+-----+----------+------+-----+



In [20]:
df= df.drop("Bonus")
df.show()

+---+-----+----------+------+
| ID| Name|Department|Salary|
+---+-----+----------+------+
|  1| John|        HR|  5000|
|  2| Jane|        IT|  8000|
|  3| Mike|        IT|  6000|
|  4| Sara|   Finance|  7000|
|  5|David|        HR|  5500|
+---+-----+----------+------+



In [21]:
df = df.withColumnRenamed("Salary" , "Salary_After_Tax")
df.show()

+---+-----+----------+----------------+
| ID| Name|Department|Salary_After_Tax|
+---+-----+----------+----------------+
|  1| John|        HR|            5000|
|  2| Jane|        IT|            8000|
|  3| Mike|        IT|            6000|
|  4| Sara|   Finance|            7000|
|  5|David|        HR|            5500|
+---+-----+----------+----------------+



In [22]:
from pyspark.sql.functions import avg

df.groupBy("Department").agg(avg("Salary_After_Tax")).alias("Average Salary").show()

+----------+---------------------+
|Department|avg(Salary_After_Tax)|
+----------+---------------------+
|        HR|               5250.0|
|   Finance|               7000.0|
|        IT|               7000.0|
+----------+---------------------+



In [23]:
df.groupBy("Department").count().show()

+----------+-----+
|Department|count|
+----------+-----+
|        HR|    2|
|   Finance|    1|
|        IT|    2|
+----------+-----+



In [27]:
df.sort(df["Salary_After_Tax"].desc()).show()

+---+-----+----------+----------------+
| ID| Name|Department|Salary_After_Tax|
+---+-----+----------+----------------+
|  2| Jane|        IT|            8000|
|  4| Sara|   Finance|            7000|
|  3| Mike|        IT|            6000|
|  5|David|        HR|            5500|
|  1| John|        HR|            5000|
+---+-----+----------+----------------+



In [28]:
data_list = df.collect()

for row in data_list:
    print(row)

Row(ID=1, Name='John', Department='HR', Salary_After_Tax=5000)
Row(ID=2, Name='Jane', Department='IT', Salary_After_Tax=8000)
Row(ID=3, Name='Mike', Department='IT', Salary_After_Tax=6000)
Row(ID=4, Name='Sara', Department='Finance', Salary_After_Tax=7000)
Row(ID=5, Name='David', Department='HR', Salary_After_Tax=5500)


##DataSet

In [34]:
from pyspark.sql import Row
from pyspark import SparkContext, SparkConf
from pyspark.sql.functions import col

# Get the existing SparkContext
sc = SparkContext.getOrCreate()
rdd = sc.parallelize([Row(name="RAJ", age=23),Row(name="Hari",age=21)])
dataset=spark.createDataFrame(rdd)
dataset.show()
dataset.filter(col("age") > 20).show()
dataset.select("name").show()

+----+---+
|name|age|
+----+---+
| RAJ| 23|
|Hari| 21|
+----+---+

+----+---+
|name|age|
+----+---+
| RAJ| 23|
|Hari| 21|
+----+---+

+----+
|name|
+----+
| RAJ|
|Hari|
+----+



In [35]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("SparkSQLBasics").getOrCreate()

data = [
    (1, "Alice", "Sales", 3000),
    (2, "Bob", "IT", 4000),
    (3, "Cathy", "HR", 3500),
    (4, "David", "Sales", 4500),
    (5, "Eva", "IT", 4200)
]
columns = ["EmpID", "Name", "Department", "Salary"]

df = spark.createDataFrame(data, columns)
df.show()

+-----+-----+----------+------+
|EmpID| Name|Department|Salary|
+-----+-----+----------+------+
|    1|Alice|     Sales|  3000|
|    2|  Bob|        IT|  4000|
|    3|Cathy|        HR|  3500|
|    4|David|     Sales|  4500|
|    5|  Eva|        IT|  4200|
+-----+-----+----------+------+



In [37]:
#CONVERT TO RDD
rdd = df.rdd
print("RDD Example:" , rdd.map(lambda x: (x.Name,x.Salary)).collect())

RDD Example: [('Alice', 3000), ('Bob', 4000), ('Cathy', 3500), ('David', 4500), ('Eva', 4200)]


In [39]:
df.createOrReplaceTempView("employees")
str_sql = "SELECT Name, Salary FROM employees WHERE Salary > 3500"
sql_res = spark.sql(str_sql)
sql_res.show()

+-----+------+
| Name|Salary|
+-----+------+
|  Bob|  4000|
|David|  4500|
|  Eva|  4200|
+-----+------+

