In [4]:
import pandas as pd
import numpy as np
import json



In [5]:
# The JSON data provided
raw_data = [
  {"order_id": "ORD1001", "order_date": "2025-01-10", "customer": {"customer_id": "CUST01", "name": "Rahul Sharma", "city": "Delhi"}, "items": [{"product_id": "P101", "product_name": "Laptop", "category": "Electronics", "price": 55000, "quantity": 1}, {"product_id": "P102", "product_name": "Mouse", "category": "Electronics", "price": 500, "quantity": 2}], "payment": {"method": "Credit Card", "status": "Success"}},
  {"order_id": "ORD1002", "order_date": "2025-01-11", "customer": {"customer_id": "CUST02", "name": "Sneha Verma", "city": "Mumbai"}, "items": [{"product_id": "P103", "product_name": "Mobile Phone", "category": "Electronics", "price": 32000, "quantity": 1}], "payment": {"method": "UPI", "status": "Success"}},
  {"order_id": "ORD1003", "order_date": "2025-01-12", "customer": {"customer_id": "CUST03", "name": "Amit Patel", "city": "Pune"}, "items": [{"product_id": "P201", "product_name": "Dining Table", "category": "Furniture", "price": 18000, "quantity": 1}, {"product_id": "P202", "product_name": "Chair", "category": "Furniture", "price": 3500, "quantity": 4}], "payment": {"method": "Cash", "status": "Failed"}},
  {"order_id": "ORD1004", "order_date": "2025-01-13", "customer": {"customer_id": "CUST04", "name": "Neha Singh", "city": "Bangalore"}, "items": [{"product_id": "P104", "product_name": "Headphones", "category": "Electronics", "price": 2500, "quantity": 2}], "payment": {"method": "Debit Card", "status": "Success"}},
  {"order_id": "ORD1005", "order_date": "2025-01-14", "customer": {"customer_id": "CUST05", "name": "Karan Mehta", "city": "Delhi"}, "items": [{"product_id": "P301", "product_name": "Office Chair", "category": "Furniture", "price": 12000, "quantity": 1}], "payment": {"method": "Net Banking", "status": "Success"}},
  {"order_id": "ORD1006", "order_date": "2025-01-15", "customer": {"customer_id": "CUST06", "name": "Pooja Iyer", "city": "Chennai"}, "items": [{"product_id": "P105", "product_name": "Smart Watch", "category": "Electronics", "price": 9000, "quantity": 2}], "payment": {"method": "UPI", "status": "Success"}}
]


print(f"Total records ingested: {len(raw_data)}")

Total records ingested: 6


In [6]:
#Flatten the nested 'items' while keeping order and customer info
df = pd.json_normalize(
    raw_data, 
    record_path=['items'], 
    meta=[
        'order_id', 
        'order_date', 
        ['customer', 'customer_id'], 
        ['customer', 'name'], 
        ['customer', 'city'],
        ['payment', 'method'], 
        ['payment', 'status']
    ]
)


df.head()

Unnamed: 0,product_id,product_name,category,price,quantity,order_id,order_date,customer.customer_id,customer.name,customer.city,payment.method,payment.status
0,P101,Laptop,Electronics,55000,1,ORD1001,2025-01-10,CUST01,Rahul Sharma,Delhi,Credit Card,Success
1,P102,Mouse,Electronics,500,2,ORD1001,2025-01-10,CUST01,Rahul Sharma,Delhi,Credit Card,Success
2,P103,Mobile Phone,Electronics,32000,1,ORD1002,2025-01-11,CUST02,Sneha Verma,Mumbai,UPI,Success
3,P201,Dining Table,Furniture,18000,1,ORD1003,2025-01-12,CUST03,Amit Patel,Pune,Cash,Failed
4,P202,Chair,Furniture,3500,4,ORD1003,2025-01-12,CUST03,Amit Patel,Pune,Cash,Failed


In [8]:
# 1. Standardize column names

df.columns = [
    'product_id', 'product_name', 'category', 'price', 'quantity',
    'order_id', 'order_date', 'customer_id', 'customer_name', 'customer_city',
    'payment_method', 'order_status'
]

In [9]:
# 2. Validate and convert data types
df['order_date'] = pd.to_datetime(df['order_date'])
df['price'] = df['price'].astype(float)
df['quantity'] = df['quantity'].astype(int)

In [10]:
print("Null values check:\n", df.isnull().sum())



Null values check:
 product_id        0
product_name      0
category          0
price             0
quantity          0
order_id          0
order_date        0
customer_id       0
customer_name     0
customer_city     0
payment_method    0
order_status      0
dtype: int64


In [11]:
df.head()

Unnamed: 0,product_id,product_name,category,price,quantity,order_id,order_date,customer_id,customer_name,customer_city,payment_method,order_status
0,P101,Laptop,Electronics,55000.0,1,ORD1001,2025-01-10,CUST01,Rahul Sharma,Delhi,Credit Card,Success
1,P102,Mouse,Electronics,500.0,2,ORD1001,2025-01-10,CUST01,Rahul Sharma,Delhi,Credit Card,Success
2,P103,Mobile Phone,Electronics,32000.0,1,ORD1002,2025-01-11,CUST02,Sneha Verma,Mumbai,UPI,Success
3,P201,Dining Table,Furniture,18000.0,1,ORD1003,2025-01-12,CUST03,Amit Patel,Pune,Cash,Failed
4,P202,Chair,Furniture,3500.0,4,ORD1003,2025-01-12,CUST03,Amit Patel,Pune,Cash,Failed


In [14]:
# Convert pandas columns to NumPy arrays for calculation

price_arr = df['price'].values
quantity_arr = df['quantity'].values

In [15]:
# Calculate Total Amount per row using NumPy multiplication

df['total_amount'] = np.multiply(price_arr, quantity_arr)

In [16]:
df['high_value_order_flag'] = np.where(df['total_amount'] > 15000, 'Yes', 'No')

df[['product_name', 'total_amount', 'high_value_order_flag']].head()

Unnamed: 0,product_name,total_amount,high_value_order_flag
0,Laptop,55000.0,Yes
1,Mouse,1000.0,No
2,Mobile Phone,32000.0,Yes
3,Dining Table,18000.0,Yes
4,Chair,14000.0,No


In [19]:
# Keep a copy of the full data for "Completed vs Cancelled" analysis later

full_dataset = df.copy()

In [20]:
df_final = df[df['order_status'] == 'Success'].copy()

In [22]:
# Select and Order Mandatory Columns

mandatory_columns = [
    'order_id', 'order_date', 'customer_id', 'customer_name', 'customer_city',
    'product_name', 'category', 'quantity', 'total_amount', 
    'payment_method', 'order_status', 'high_value_order_flag'
]

In [23]:
df_final = df_final[mandatory_columns]


df_final.head()


Unnamed: 0,order_id,order_date,customer_id,customer_name,customer_city,product_name,category,quantity,total_amount,payment_method,order_status,high_value_order_flag
0,ORD1001,2025-01-10,CUST01,Rahul Sharma,Delhi,Laptop,Electronics,1,55000.0,Credit Card,Success,Yes
1,ORD1001,2025-01-10,CUST01,Rahul Sharma,Delhi,Mouse,Electronics,2,1000.0,Credit Card,Success,No
2,ORD1002,2025-01-11,CUST02,Sneha Verma,Mumbai,Mobile Phone,Electronics,1,32000.0,UPI,Success,Yes
5,ORD1004,2025-01-13,CUST04,Neha Singh,Bangalore,Headphones,Electronics,2,5000.0,Debit Card,Success,No
6,ORD1005,2025-01-14,CUST05,Karan Mehta,Delhi,Office Chair,Furniture,1,12000.0,Net Banking,Success,No


In [24]:
print("--- BUSINESS ANALYTICS REPORT ---")

--- BUSINESS ANALYTICS REPORT ---


In [17]:

revenue_city = df_final.groupby('customer_city')['total_amount'].sum()
print(f"1. Total Revenue per City:\n{revenue_city}\n")

1. Total Revenue per City:
customer_city
Bangalore     5000.0
Chennai      18000.0
Delhi        68000.0
Mumbai       32000.0
Name: total_amount, dtype: float64



In [18]:
top_cat = df_final.groupby('category')['total_amount'].sum().idxmax()
print(f"2. Top Category: {top_cat}\n")

2. Top Category: Electronics



In [19]:
high_val_count = df_final[df_final['high_value_order_flag'] == 'Yes']['order_id'].nunique()
print(f"3. High-Value Order Count: {high_val_count}\n")


3. High-Value Order Count: 3



In [20]:
avg_val = df_final.groupby('order_id')['total_amount'].sum().mean()
print(f"4. Average Order Value: {avg_val:.2f}\n")

4. Average Order Value: 24600.00



In [21]:
top_payment = df_final['payment_method'].value_counts().idxmax()
print(f"5. Preferred Payment: {top_payment}\n")

5. Preferred Payment: Credit Card



In [22]:
top_cust = df_final.groupby('customer_name')['total_amount'].sum().sort_values(ascending=False).head(3)
print(f"6. Top Customers:\n{top_cust}\n")

6. Top Customers:
customer_name
Rahul Sharma    56000.0
Sneha Verma     32000.0
Pooja Iyer      18000.0
Name: total_amount, dtype: float64



In [23]:
status_report = full_dataset.groupby('order_id')['order_status'].first().value_counts()
print(f"7. Order Status Count:\n{status_report}")

7. Order Status Count:
order_status
Success    5
Failed     1
Name: count, dtype: int64
