# In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, lower, when, concat, lit, length, regexp_extract, avg, max, min, sum, desc

# Obtain the Spark session. On Databricks a session is already running, so
# getOrCreate() returns that one rather than building a new session.
spark = (
    SparkSession.builder
    .appName("SparkDataProcessing")
    .getOrCreate()
)

# Load the semicolon-separated CSV from the Databricks FileStore. The first
# line is treated as the header and column types are inferred from the data.
file_path = "dbfs:/FileStore/tables/session_info_colab.csv"  # Adjust the path if necessary
df = (
    spark.read
    .options(header=True, inferSchema=True, sep=";")
    .csv(file_path)
)

# Initial Data Inspection: print the inferred schema, then preview the first
# five rows without truncating long cell values.
print("Initial Schema:")
df.printSchema()

print("Initial Data:")
df.show(n=5, truncate=False)

# Data Cleaning and Preprocessing
#
# Steps, in order:
#   1. Drop rows that have no email, so the derived columns are only computed
#      for rows that are kept.
#   2. Normalise user_name to lower case.
#   3. Derive email_domain (everything after the first "@") and lower-case it:
#      domain names are case-insensitive, so without this "Gmail.com" and
#      "gmail.com" would land in different groups in later aggregations.
#      regexp_extract returns an empty string when the email contains no "@".
#   4. Keep one row per user_id. NOTE: which of several duplicate rows
#      survives is not specified by dropDuplicates.
#   5. Record the character length of the address (null when address is null).
cleaned_df = (
    df
    .filter(col("email").isNotNull())
    .withColumn("user_name", lower(col("user_name")))
    .withColumn("email_domain", lower(regexp_extract(col("email"), r'@(.+)', 1)))
    .dropDuplicates(["user_id"])
    .withColumn("address_length", length(col("address")))
)

# Transformations and New Columns
# status: "Detailed" when the address is longer than 50 characters, otherwise
# "Short" (this also covers rows where the comparison is not true).
_status_expr = when(col("address_length") > 50, lit("Detailed")).otherwise(lit("Short"))
# full_user_info: "<user_name> - <email_domain>".
_user_info_expr = concat(col("user_name"), lit(" - "), col("email_domain"))

transformed_df = (
    cleaned_df
    .withColumn("status", _status_expr)
    .withColumn("full_user_info", _user_info_expr)
)

# Aggregations: one row per user_name with the number of cleaned rows and the
# average / largest / smallest address length for that name.
# avg, max and min here are the pyspark.sql.functions column aggregates
# imported at the top of the file, not the Python built-ins.
_user_metrics = [
    count("*").alias("session_count"),
    avg("address_length").alias("avg_address_length"),
    max("address_length").alias("max_address_length"),
    min("address_length").alias("min_address_length"),
]
user_stats = cleaned_df.groupBy("user_name").agg(*_user_metrics)

# Sorting Example: user names with the most rows come first.
sorted_user_stats = user_stats.orderBy(col("session_count").desc())

# Joins: attach the per-user_name statistics to every transformed row.
# This is an inner join, so only rows whose user_name matches a row in
# user_stats are kept.
joined_df = transformed_df.join(
    other=user_stats,
    on="user_name",
    how="inner",
)

# Caching Example: mark joined_df for caching, then run count() as the action
# that actually computes it and fills the cache.
_joined_row_count = joined_df.cache().count()
print(f"Count after caching: {_joined_row_count}")

# Grouped Aggregations and Additional Insights
# Per email domain: number of non-null user_id values and the summed address
# length, with the most populated domains first.
# sum here is the pyspark.sql.functions aggregate imported at the top of the
# file, not the Python built-in.
email_domain_stats = (
    cleaned_df
    .groupBy("email_domain")
    .agg(
        count("user_id").alias("user_count"),
        sum("address_length").alias("total_address_length"),
    )
    .orderBy(col("user_count").desc())
)

# Data Insights: print a heading for each result set, followed by its first
# five rows with long values left untruncated.
for _title, _frame in (
    ("Transformed Data:", transformed_df),
    ("User Statistics:", sorted_user_stats),
    ("Joined Data:", joined_df),
    ("Email Domain Statistics:", email_domain_stats),
):
    print(_title)
    _frame.show(5, truncate=False)

# Cleanup: release the cached copy of joined_df now that all output has been
# produced.
#
# The Spark session is deliberately NOT stopped here. This script runs against
# the session that is already running in Databricks (obtained via
# getOrCreate() above); that session is shared with the rest of the notebook
# or cluster, and stopping it would break any code that runs afterwards.
# Outside Databricks the session is shut down when the Python process exits.
joined_df.unpersist()