In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when

spark = SparkSession.builder.appName("DataFrame Transformations").getOrCreate()

data = [
    (1, "Alice", 29, "New York"),
    (2, "Bob", 31, "San Francisco"),
    (3, "Cathy", 23, "Los Angeles"),
    (4, "David", 45, "Chicago"),    
    (5, "Eva", 35, "Boston")
]


columns = ["id", "name", "age", "city"]
df = spark.createDataFrame(data, columns)
print("Original DataFrame:")
df.show()

filtered_df = df.filter(col("age") > 30)
print("Filtered DataFrame (age > 30):")
filtered_df.show()


transformed_df = df.withColumn("is_above_30", when(col("age") >= 30, True).otherwise(False))
print("Transformed DataFrame with 'is_adult' column:")
transformed_df.show()

spark.stop()




Original DataFrame:
+---+-----+---+-------------+
| id| name|age|         city|
+---+-----+---+-------------+
|  1|Alice| 29|     New York|
|  2|  Bob| 31|San Francisco|
|  3|Cathy| 23|  Los Angeles|
|  4|David| 45|      Chicago|
|  5|  Eva| 35|       Boston|
+---+-----+---+-------------+

Filtered DataFrame (age > 30):
+---+-----+---+-------------+
| id| name|age|         city|
+---+-----+---+-------------+
|  2|  Bob| 31|San Francisco|
|  4|David| 45|      Chicago|
|  5|  Eva| 35|       Boston|
+---+-----+---+-------------+

Transformed DataFrame with 'is_adult' column:
+---+-----+---+-------------+-----------+
| id| name|age|         city|is_above_30|
+---+-----+---+-------------+-----------+
|  1|Alice| 29|     New York|      false|
|  2|  Bob| 31|San Francisco|       true|
|  3|Cathy| 23|  Los Angeles|      false|
|  4|David| 45|      Chicago|       true|
|  5|  Eva| 35|       Boston|       true|
+---+-----+---+-------------+-----------+



In [3]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("DataFrame Actions").getOrCreate()

data = [
    (1, "John", "Doe", 5000),
    (2, "Jane", "Smith", 6000),
    (3, "Sam", "Brown", 7000),
    (4, "Lisa", "Wilson", 8000),
    (5, "Paul", "Jones", 9000)
]

columns = ["id", "first_name", "last_name", "salary"]

df = spark.createDataFrame(data, columns)
df.show()

row_count = df.count()
print(f"Number of rows in the DataFrame: {row_count}")

df.select("first_name", "salary").show()
df.show(3)
df.describe().show()
spark.stop()




+---+----------+---------+------+
| id|first_name|last_name|salary|
+---+----------+---------+------+
|  1|      John|      Doe|  5000|
|  2|      Jane|    Smith|  6000|
|  3|       Sam|    Brown|  7000|
|  4|      Lisa|   Wilson|  8000|
|  5|      Paul|    Jones|  9000|
+---+----------+---------+------+

Number of rows in the DataFrame: 5
+----------+------+
|first_name|salary|
+----------+------+
|      John|  5000|
|      Jane|  6000|
|       Sam|  7000|
|      Lisa|  8000|
|      Paul|  9000|
+----------+------+

+---+----------+---------+------+
| id|first_name|last_name|salary|
+---+----------+---------+------+
|  1|      John|      Doe|  5000|
|  2|      Jane|    Smith|  6000|
|  3|       Sam|    Brown|  7000|
+---+----------+---------+------+
only showing top 3 rows

+-------+------------------+----------+---------+------------------+
|summary|                id|first_name|last_name|            salary|
+-------+------------------+----------+---------+------------------+
|  coun

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import sum, avg

spark = SparkSession.builder \
    .appName("DataFrame Aggregations") \
    .getOrCreate()

data = [
    (1, "John", "Doe", 5000),
    (2, "Jane", "Smith", 6000),
    (3, "Sam", "Brown", 7000),
    (4, "Lisa", "Wilson", 8000),
    (5, "Paul", "Jones", 9000)
]

columns = ["id", "first_name", "last_name", "salary"]

df = spark.createDataFrame(data, columns)

total_salary = df.agg(sum("salary")).collect()[0][0]
print(f"Total salary: {total_salary}")

average_salary = df.agg(avg("salary")).collect()[0][0]
print(f"Average salary: {average_salary}")

spark.stop()




Total salary: 35000
Average salary: 7000.0


In [9]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Write DataFrame to CSV").getOrCreate()

data = [
    (1, "John", "Doe", 5000),
    (2, "Jane", "Smith", 6000),
    (3, "Sam", "Brown", 7000),
    (4, "Lisa", "Wilson", 8000),
    (5, "Paul", "Jones", 9000)
]


columns = ["id", "first_name", "last_name", "salary"]
df = spark.createDataFrame(data, columns)
df.coalesce(1).write.csv("csvfile.csv", header=True, mode="overwrite")
spark.stop()


In [11]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split, col

spark = SparkSession.builder \
    .appName("WordCountFromFile") \
    .getOrCreate()

df = spark.read.text("word_count.txt")

word_counts = (df
    .select(explode(split(col("value"), " ")).alias("word"))
    .groupBy("word")
    .count()
)

word_counts.show()

spark.stop()




+-----+-----+
| word|count|
+-----+-----+
|    !|    1|
|  lab|    2|
|   is|    1|
|Hello|    3|
| This|    1|
|     |    1|
|    2|    1|
+-----+-----+

