In [0]:
from pyspark import SparkContext

# Obtain the SparkContext explicitly so this first cell also runs on a fresh
# kernel where `sc` has not already been provided by the environment.
sc = SparkContext.getOrCreate()

# collect() returns the RDD's elements as a Python list.
rdd = sc.parallelize([1, 2, 3])
print(rdd.collect())


[1, 2, 3]


In [0]:
from pyspark import SparkContext

# Reuse the active SparkContext when there is one, otherwise start it.
sc = SparkContext.getOrCreate()

# count() reports how many elements the RDD holds.
rdd = sc.parallelize([1, 2, 3, 4])
element_count = rdd.count()
print(element_count)


4


In [0]:
from pyspark import SparkContext

# Reuse the active SparkContext when there is one, otherwise start it.
sc = SparkContext.getOrCreate()

# first() hands back the leading element of the RDD.
rdd = sc.parallelize([10, 20, 30])
first_element = rdd.first()
print(first_element)


10


In [0]:

# take(2) returns the first two elements as a list.
rdd = sc.parallelize([5, 6, 7, 8])
leading_pair = rdd.take(2)
print(leading_pair)

[5, 6]


In [0]:
from pyspark import SparkContext

# Reuse the active SparkContext when there is one, otherwise start it.
sc = SparkContext.getOrCreate()

# Fold the elements pairwise with addition to obtain their sum.
rdd = sc.parallelize([1, 2, 3])
total = rdd.reduce(lambda left, right: left + right)
print(total)

6


In [0]:
from pyspark import SparkContext

# Reuse the active SparkContext when there is one, otherwise start it.
sc = SparkContext.getOrCreate()

# Write the RDD's elements out as text under the given path.
# NOTE(review): Spark normally writes a directory of part files at this path and
# refuses to overwrite an existing one, so re-running this cell may fail — confirm.
output_path = 'files.txt'
rdd = sc.parallelize([10, 20, 30])
rdd.saveAsTextFile(output_path)

In [0]:
# Double every element with map(), then gather the results.
rdd = sc.parallelize([1, 2, 3])
doubled = rdd.map(lambda value: value * 2)
print(doubled.collect())

[2, 4, 6]


In [0]:
# Keep only the even numbers.
rdd = sc.parallelize([1, 2, 3, 4])
evens = rdd.filter(lambda value: value % 2 == 0)
print(evens.collect())

[2, 4]


In [0]:
# Build two filtered views of the same input (multiples of 2 and of 3) and
# concatenate them. union() does not de-duplicate, so 6 appears twice.
union_inp = sc.parallelize([2,4,5,6,7,8,9])
union_rdd_1 = union_inp.filter(lambda n: n % 2 == 0)
union_rdd_2 = union_inp.filter(lambda n: n % 3 == 0)
combined = union_rdd_1.union(union_rdd_2)
print(combined.collect())

[2, 4, 6, 8, 6, 9]


In [0]:
# Split each sentence on spaces; flatMap flattens the per-sentence word lists
# into a single RDD of words.
rdd = sc.parallelize(["hello world", "PySpark RDD"])
words = rdd.flatMap(lambda sentence: sentence.split(" "))
print(words.collect())


['hello', 'world', 'PySpark', 'RDD']


In [0]:
# Each student appears twice; reduceByKey adds up the marks per name.
marks_rdd = sc.parallelize([
    ('Rahul', 25), ('Swati', 26), ('Shreya', 22), ('Abhay', 29), ('Rohan', 22),
    ('Rahul', 23), ('Swati', 19), ('Shreya', 28), ('Abhay', 26), ('Rohan', 22),
])
total_marks = marks_rdd.reduceByKey(lambda left, right: left + right)
print(total_marks.collect())

[('Shreya', 50), ('Swati', 45), ('Rahul', 48), ('Abhay', 55), ('Rohan', 44)]


In [0]:
# (name, mark) pairs; each student appears twice.
marks_rdd = sc.parallelize([('Rahul', 25), ('Swati', 26), ('Shreya', 22), ('Abhay', 29), ('Rohan', 22),
('Rahul', 23), ('Swati', 19), ('Shreya', 28), ('Abhay', 26), ('Rohan', 22)])
# sortByKey's first parameter is the boolean `ascending`. The string
# 'ascending' only worked because any non-empty string is truthy (so
# 'descending' would have sorted ascending too). Pass the flag explicitly.
print(marks_rdd.sortByKey(ascending=True).collect())

[('Abhay', 29), ('Abhay', 26), ('Rahul', 25), ('Rahul', 23), ('Rohan', 22), ('Rohan', 22), ('Shreya', 22), ('Shreya', 28), ('Swati', 26), ('Swati', 19)]


In [0]:
# (name, mark) pairs; each student appears twice.
marks_rdd = sc.parallelize([
    ('Rahul', 25), ('Swati', 26), ('Shreya', 22), ('Abhay', 29), ('Rohan', 22),
    ('Rahul', 23), ('Swati', 19), ('Shreya', 28), ('Abhay', 26), ('Rohan', 22),
])
# groupByKey yields (name, iterable-of-marks) pairs; turn each iterable into a
# list so it prints readably.
grouped_marks = marks_rdd.groupByKey().collect()
for name, marks in grouped_marks:
    print(name, list(marks))

Shreya [22, 28]
Swati [26, 19]
Rahul [25, 23]
Abhay [29, 26]
Rohan [22, 22]


In [0]:
# (name, mark) pairs; some students appear once, others twice.
marks_rdd = sc.parallelize([
    ('Rahul', 25), ('Swati', 26), ('Rohan', 22), ('Rahul', 23), ('Swati', 19),
    ('Shreya', 28), ('Abhay', 26), ('Rohan', 22),
])
# countByKey gives a mapping of name -> number of records carrying that name.
records_per_name = marks_rdd.countByKey()
for name, occurrences in records_per_name.items():
    print(name, occurrences)

Rahul 2
Swati 2
Rohan 2
Shreya 1
Abhay 1


In [0]:
from pyspark.sql import SparkSession

# Initialize Spark Session
spark = (
    SparkSession.builder
    .appName("PySparkExample")
    .getOrCreate()
)

# Sample rows and the column names they map onto
columns = ['Name', 'Age', 'City']
data = [
    ('Ravi', 25, 'Delhi'),
    ('Meena', 30, 'Mumbai'),
    ('Arun', 22, 'Chennai'),
]

df = spark.createDataFrame(data, columns)

# Select single column
names_only = df.select('Name')
names_only.show()

# Select multiple columns
names_and_ages = df.select('Name', 'Age')
names_and_ages.show()


+-----+
| Name|
+-----+
| Ravi|
|Meena|
| Arun|
+-----+

+-----+---+
| Name|Age|
+-----+---+
| Ravi| 25|
|Meena| 30|
| Arun| 22|
+-----+---+



In [0]:
# Relabel the 'Age' column as 'Years' in a new DataFrame
old_name, new_name = 'Age', 'Years'
renamed_df = df.withColumnRenamed(old_name, new_name)
renamed_df.show()


+-----+-----+-------+
| Name|Years|   City|
+-----+-----+-------+
| Ravi|   25|  Delhi|
|Meena|   30| Mumbai|
| Arun|   22|Chennai|
+-----+-----+-------+



In [0]:
# Keep only the rows whose Age exceeds 25
is_over_25 = df['Age'] > 25
filtered_df = df.filter(is_over_25)
filtered_df.show()


+-----+---+------+
| Name|Age|  City|
+-----+---+------+
|Meena| 30|Mumbai|
+-----+---+------+



In [0]:
# --- Derived column: each person's age plus one ---
age_next_year = df['Age'] + 1
df_with_new_col = df.withColumn('AgeNextYear', age_next_year)
df_with_new_col.show()

# --- Remove the City column ---
df_dropped = df.drop('City')
df_dropped.show()

# --- Order rows by Age, largest first ---
age_descending = df['Age'].desc()
df_sorted = df.orderBy(age_descending)
df_sorted.show()

# --- Average age per city ---
avg_age_by_city = df.groupBy('City').avg('Age')
avg_age_by_city.show()

# --- Attach each city's region through an inner join on City ---
columns2 = ['City', 'Region']
data2 = [
    ('Delhi', 'North'),
    ('Mumbai', 'West'),
    ('Chennai', 'South'),
]
df2 = spark.createDataFrame(data2, columns2)

joined_df = df.join(df2, on='City', how='inner')
joined_df.show()



+-----+---+-------+-----------+
| Name|Age|   City|AgeNextYear|
+-----+---+-------+-----------+
| Ravi| 25|  Delhi|         26|
|Meena| 30| Mumbai|         31|
| Arun| 22|Chennai|         23|
+-----+---+-------+-----------+

+-----+---+
| Name|Age|
+-----+---+
| Ravi| 25|
|Meena| 30|
| Arun| 22|
+-----+---+

+-----+---+-------+
| Name|Age|   City|
+-----+---+-------+
|Meena| 30| Mumbai|
| Ravi| 25|  Delhi|
| Arun| 22|Chennai|
+-----+---+-------+

+-------+--------+
|   City|avg(Age)|
+-------+--------+
|  Delhi|    25.0|
| Mumbai|    30.0|
|Chennai|    22.0|
+-------+--------+

+-------+-----+---+------+
|   City| Name|Age|Region|
+-------+-----+---+------+
|Chennai| Arun| 22| South|
|  Delhi| Ravi| 25| North|
| Mumbai|Meena| 30|  West|
+-------+-----+---+------+



In [0]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# Define a UDF to add "Mr./Ms." prefix
def add_prefix(name):
    """Return ``name`` with the honorific prefix "Mr./Ms. " prepended.

    Args:
        name: The name to decorate, or ``None`` for a missing value.

    Returns:
        The string ``"Mr./Ms. <name>"``, or ``None`` when ``name`` is ``None``.
    """
    # A missing value arrives here as None; pass it through instead of
    # formatting it into the literal text "Mr./Ms. None".
    if name is None:
        return None
    return f"Mr./Ms. {name}"

# Wrap the plain Python function as a Spark UDF declared to return strings
string_type = StringType()
add_prefix_udf = udf(add_prefix, string_type)

# Apply UDF
prefixed_name = add_prefix_udf(df['Name'])
df_with_prefix = df.withColumn('NameWithPrefix', prefixed_name)
df_with_prefix.show()


+-----+---+-------+--------------+
| Name|Age|   City|NameWithPrefix|
+-----+---+-------+--------------+
| Ravi| 25|  Delhi|  Mr./Ms. Ravi|
|Meena| 30| Mumbai| Mr./Ms. Meena|
| Arun| 22|Chennai|  Mr./Ms. Arun|
+-----+---+-------+--------------+



In [0]:
from pyspark.sql import SparkSession

# Initialize Spark Session
spark = (
    SparkSession.builder
    .appName("PySparkExample")
    .getOrCreate()
)

# Sample rows and the column names they map onto
columns = ['Name', 'Age', 'City']
data = [
    ('Ravi', 25, 'Delhi'),
    ('Meena', 30, 'Mumbai'),
    ('Arun', 22, 'Chennai'),
]

df = spark.createDataFrame(data, columns)

# Create a temporary view
view_name = "people"
df.createOrReplaceTempView(view_name)

# Query the view using SQL
people_over_25 = spark.sql("SELECT * FROM people WHERE Age > 25")
people_over_25.show()


+-----+---+------+
| Name|Age|  City|
+-----+---+------+
|Meena| 30|Mumbai|
+-----+---+------+



In [0]:
# Create a global temporary view. The "OrReplace" variant keeps this cell
# re-runnable: plain createGlobalTempView raises once a view with this name
# already exists in the running Spark application.
df.createOrReplaceGlobalTempView("global_people")

# Query the global view using SQL (prefix with `global_temp.`)
spark.sql("SELECT * FROM global_temp.global_people").show()


+-----+---+-------+
| Name|Age|   City|
+-----+---+-------+
| Ravi| 25|  Delhi|
|Meena| 30| Mumbai|
| Arun| 22|Chennai|
+-----+---+-------+

