In [None]:
from pyspark.sql import SparkSession

# Initialize a Spark session
spark = SparkSession.builder \
    .appName("Simple PySpark Example") \
    .getOrCreate()

# Create a simple DataFrame
data = [("John", "Doe", 28),
        ("Jane", "Smith", 35),
        ("Sam", "Brown", 50)]

columns = ["First Name", "Last Name", "Age"]


+----------+---------+---+
|First Name|Last Name|Age|
+----------+---------+---+
|      John|      Doe| 28|
|      Jane|    Smith| 35|
|       Sam|    Brown| 50|
+----------+---------+---+



In [None]:

df = spark.createDataFrame(data, columns)

# Show the DataFrame
df.show()

# Stop the Spark session
spark.stop()


In [11]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, sum, count, desc
from pyspark.sql.window import Window
import pandas as pd
# Initialize Spark session
spark = SparkSession.builder.appName("E-Commerce Analysis").getOrCreate()

# Load the dataset from the CSV file
# Assumes the file has been transferred using spark-submit --files "C:\Users\hp\Downloads\export.csv"
#file_path = "export.csv"  # Use the filename directly as it's now available in the working directory

# Load the CSV into a DataFrame
df = pd.read_csv(r"C:\Users\hp\Downloads\export.csv")

# Show the schema to ensure correct data types are inferred
df.printSchema()

# 1. Calculate total spending amount for each user
total_spent_df = df.groupBy("user_id").agg(sum("amount").alias("total_spent"))

# 2. Calculate average transaction amount per user
avg_transaction_df = df.groupBy("user_id").agg(avg("amount").alias("avg_transaction"))

# 3. Find the most frequently purchased category per user
category_count_df = df.groupBy("user_id", "category").agg(count("category").alias("category_count"))

# Find the most frequently purchased category per user using a window function
window_spec = Window.partitionBy("user_id").orderBy(desc("category_count"))
favorite_category_df = category_count_df.withColumn("rank", rank().over(window_spec)) \
                                        .filter(col("rank") == 1) \
                                        .select("user_id", "category")

# Join the three DataFrames together
final_df = total_spent_df \
    .join(avg_transaction_df, on="user_id", how="inner") \
    .join(favorite_category_df, on="user_id", how="inner") \
    .withColumnRenamed("category", "favorite_category")

# Show the result
final_df.show()

# Stop Spark session
spark.stop()

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\hp\\Downloads\\export.csv'

In [8]:
!spark-submit --files "C:\Users\hp\Downloads\export.csv" --master local[*] e_commerce_analysis.py

24/11/13 08:44:34 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Exception in thread "main" java.io.IOException: Illegal file pattern: error parsing regexp: invalid escape sequence: `\U`
	at org.apache.hadoop.fs.GlobFilter.init(GlobFilter.java:71)
	at org.apache.hadoop.fs.GlobFilter.<init>(GlobFilter.java:50)
	at org.apache.hadoop.fs.Globber.doGlob(Globber.java:265)
	at org.apache.hadoop.fs.Globber.glob(Globber.java:202)
	at org.apache.hadoop.fs.FileSystem.globStatus(FileSystem.java:2124)
	at org.apache.spark.util.DependencyUtils$.resolveGlobPath(DependencyUtils.scala:318)
	at org.apache.spark.util.DependencyUtils$.$anonfun$resolveGlobPaths$2(DependencyUtils.scala:273)
	at org.apache.spark.util.DependencyUtils$.$anonfun$resolveGlobPaths$2$adapted(DependencyUtils.scala:271)
	at scala.collection.TraversableLike.$anonfun$flatMap$1(TraversableLike.scala:293)
	at scala.collection.IndexedSeqOptimized.foreach(Indexe

In [12]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, DateType
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from datetime import date

# Initialize Spark session
spark = SparkSession.builder.appName("Sector Transactions Analysis").getOrCreate()

# Define the schema for the DataFrame
schema = StructType([
    StructField("transaction_id", StringType(), True),
    StructField("user_id", StringType(), True),
    StructField("product_id", StringType(), True),
    StructField("sector", StringType(), True),
    StructField("amount", DoubleType(), True),
    StructField("transaction_date", DateType(), True)
])

# Create a new sample dataset with different sectors
data = [
    ("T1", "U1", "P1", "Healthcare", 200.0, date(2024, 1, 10)),
    ("T2", "U1", "P2", "Education", 120.0, date(2024, 1, 14)),
    ("T3", "U2", "P3", "Retail", 150.0, date(2024, 2, 3)),
    ("T4", "U2", "P4", "Food", 80.0, date(2024, 2, 5)),
    ("T5", "U3", "P5", "Healthcare", 250.0, date(2024, 3, 1)),
    ("T6", "U3", "P6", "Education", 90.0, date(2024, 3, 3)),
    ("T7", "U4", "P7", "Retail", 300.0, date(2024, 4, 7)),
    ("T8", "U4", "P8", "Food", 60.0, date(2024, 4, 9))
]

# Create DataFrame
df = spark.createDataFrame(data, schema)

# Calculate total spending amount and average transaction amount per user
spending_summary_df = df.groupBy("user_id").agg(
    F.sum("amount").alias("total_spent"),
    F.avg("amount").alias("avg_transaction")
)

# Calculate the most frequently purchased sector for each user
sector_count_df = df.groupBy("user_id", "sector") \
    .count() \
    .withColumn("rank", F.row_number().over(
        Window.partitionBy("user_id").orderBy(F.desc("count"))
    )) \
    .filter(F.col("rank") == 1) \
    .select("user_id", F.col("sector").alias("favorite_sector"))

# Join the summary and favorite sector results
final_result_df = spending_summary_df.join(sector_count_df, on="user_id", how="left")

# Show the result
final_result_df.show()

# Stop Spark session
spark.stop()


+-------+-----------+---------------+---------------+
|user_id|total_spent|avg_transaction|favorite_sector|
+-------+-----------+---------------+---------------+
|     U2|      230.0|          115.0|           Food|
|     U1|      320.0|          160.0|      Education|
|     U4|      360.0|          180.0|         Retail|
|     U3|      340.0|          170.0|      Education|
+-------+-----------+---------------+---------------+

