In [8]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, sum,count, max, min, when

In [3]:
# Step 1: Initialize Spark Session
spark = SparkSession.builder.appName("Data_Analysis_Example").getOrCreate()

In [4]:
# Step 2: Load Data
# Sample data of customers' spending in different departments
data = [
    ("John", "Electronics", 200),
    ("John", "Clothing", 150),
    ("Anna", "Electronics", 300),
    ("Anna", "Furniture", 400),
    ("Mike", "Clothing", 100),
    ("Mike", "Furniture", 200),
    ("Sara", "Clothing", 250),
    ("Sara", "Electronics", 100)
]
columns = ["Customer_Name", "Department", "Amount_Spent"]
df = spark.createDataFrame(data, schema=columns)

In [5]:

# Step 3: Data Cleaning and Preparation
# Suppose we want to add a new column categorizing spending as "High" or "Low"
df = df.withColumn("Spending_Category", when(col("Amount_Spent") > 200, "High").otherwise("Low"))

In [6]:
# Step 4: Exploratory Data Analysis
# a) Calculate average spending per department
avg_spending_df = df.groupBy("Department").agg(avg("Amount_Spent").alias("Average_Spending"))
print("Average spending by department:")
avg_spending_df.show()

# b) Count customers per spending category
spending_count_df = df.groupBy("Spending_Category").agg(count("Customer_Name").alias("Customer_Count"))
print("Number of customers by spending category:")
spending_count_df.show()


Average spending by department:
+-----------+------------------+
| Department|  Average_Spending|
+-----------+------------------+
|Electronics|             200.0|
|   Clothing|166.66666666666666|
|  Furniture|             300.0|
+-----------+------------------+

Number of customers by spending category:
+-----------------+--------------+
|Spending_Category|Customer_Count|
+-----------------+--------------+
|             High|             3|
|              Low|             5|
+-----------------+--------------+



In [9]:
# Step 5: Feature Engineering
# Create a new feature for total spending by each customer
total_spending_df = df.groupBy("Customer_Name").agg(sum("Amount_Spent").alias("Total_Spending"))
print("Total spending by each customer:")
total_spending_df.show()

Total spending by each customer:
+-------------+--------------+
|Customer_Name|Total_Spending|
+-------------+--------------+
|         Anna|           700|
|         John|           350|
|         Mike|           300|
|         Sara|           350|
+-------------+--------------+



In [10]:
# Step 6: Data Transformation for Machine Learning (if needed)
# Convert categorical columns into numerical indices if used in ML modeling
from pyspark.ml.feature import StringIndexer
indexer = StringIndexer(inputCol="Department", outputCol="Department_Index")
indexed_df = indexer.fit(df).transform(df)
indexed_df.select("Department", "Department_Index").distinct().show()

+-----------+----------------+
| Department|Department_Index|
+-----------+----------------+
|   Clothing|             0.0|
|Electronics|             1.0|
|  Furniture|             2.0|
+-----------+----------------+



In [11]:
# Step 7: Stop the Spark session
spark.stop()