In [0]:
# Build a small three-row DataFrame of (Name, Age) from in-memory Python tuples.
columns = ["Name", "Age"]
data = [
    ("Alice", 29),
    ("Bob", 31),
    ("Cathy", 25),
]
df = spark.createDataFrame(data, schema=columns)

# Show every row of the freshly created DataFrame.
display(df)

# Projection: keep just the "Name" column.
names_df = df.select(df["Name"])
display(names_df)

# Row filter: keep only people whose Age is strictly greater than 28.
filtered_df = df.where(df["Age"] > 28)
display(filtered_df)

# Count how many rows share each distinct Age value.
grouped_df = df.groupBy(df["Age"]).count()
display(grouped_df)

In [0]:
from pyspark.sql.functions import concat, lit

# Derived numeric column "AgePlus5": each person's Age shifted up by five.
df_with_age_plus_5 = df.withColumn("AgePlus5", df["Age"] + 5)
display(df_with_age_plus_5)

# Derived boolean column "IsAbove30": True when Age is strictly greater than 30.
df_with_is_above_30 = df.withColumn("IsAbove30", df["Age"] > 30)
display(df_with_is_above_30)

# Derived string column "NameAge": Name and Age joined by an underscore.
# Age is passed to concat as-is (no explicit cast), same as before.
df_with_name_age = df.withColumn(
    "NameAge",
    concat(df["Name"], lit("_"), df["Age"]),
)
display(df_with_name_age)

# Same data, but with the "Age" column exposed under the name "Years".
df_renamed = df.withColumnRenamed("Age", "Years")
display(df_renamed)

In [0]:
from pyspark.sql import functions as F

# Create a Spark DataFrame with sample data including an "Amount" column.
# Names repeat on purpose so the aggregation below has rows to combine.
sample_data = [("Alice", 100), ("Bob", 200), ("Cathy", 150), ("Alice", 50), ("Bob", 75)]
sample_columns = ["Name", "Amount"]
amount_df = spark.createDataFrame(sample_data, sample_columns)

display(amount_df)

# Aggregate: sum of Amount per Name, exposed as "TotalAmount".
# The output column is named directly with alias() rather than renaming the
# auto-generated "sum(Amount)" column afterwards: withColumnRenamed silently
# does nothing when the source name is absent, so depending on the generated
# name could leave the result without a "TotalAmount" column and no error.
aggregated_df = amount_df.groupBy("Name").agg(F.sum("Amount").alias("TotalAmount"))
display(aggregated_df)