In [74]:
#importing necessary libraries
import json
import pandas as pd
import numpy as np

In [75]:
#input data used here
# Six sample orders. Every order carries its own id and date, a nested customer,
# a list of line items, and a nested payment record.
data = [
    {
        "order_id": "ORD1001",
        "order_date": "2025-01-10",
        "customer": {"customer_id": "CUST01", "name": "Rahul Sharma", "city": "Delhi"},
        "items": [
            {"product_id": "P101", "product_name": "Laptop", "category": "Electronics", "price": 55000, "quantity": 1},
            {"product_id": "P102", "product_name": "Mouse", "category": "Electronics", "price": 500, "quantity": 2},
        ],
        "payment": {"method": "Credit Card", "status": "Success"},
    },
    {
        "order_id": "ORD1002",
        "order_date": "2025-01-11",
        "customer": {"customer_id": "CUST02", "name": "Sneha Verma", "city": "Mumbai"},
        "items": [
            {"product_id": "P103", "product_name": "Mobile Phone", "category": "Electronics", "price": 32000, "quantity": 1},
        ],
        "payment": {"method": "UPI", "status": "Success"},
    },
    {
        "order_id": "ORD1003",
        "order_date": "2025-01-12",
        "customer": {"customer_id": "CUST03", "name": "Amit Patel", "city": "Pune"},
        "items": [
            {"product_id": "P201", "product_name": "Dining Table", "category": "Furniture", "price": 18000, "quantity": 1},
            {"product_id": "P202", "product_name": "Chair", "category": "Furniture", "price": 3500, "quantity": 4},
        ],
        "payment": {"method": "Cash", "status": "Failed"},
    },
    {
        "order_id": "ORD1004",
        "order_date": "2025-01-13",
        "customer": {"customer_id": "CUST04", "name": "Neha Singh", "city": "Bangalore"},
        "items": [
            {"product_id": "P104", "product_name": "Headphones", "category": "Electronics", "price": 2500, "quantity": 2},
        ],
        "payment": {"method": "Debit Card", "status": "Success"},
    },
    {
        "order_id": "ORD1005",
        "order_date": "2025-01-14",
        "customer": {"customer_id": "CUST05", "name": "Karan Mehta", "city": "Delhi"},
        "items": [
            {"product_id": "P301", "product_name": "Office Chair", "category": "Furniture", "price": 12000, "quantity": 1},
        ],
        "payment": {"method": "Net Banking", "status": "Success"},
    },
    {
        "order_id": "ORD1006",
        "order_date": "2025-01-15",
        "customer": {"customer_id": "CUST06", "name": "Pooja Iyer", "city": "Chennai"},
        "items": [
            {"product_id": "P105", "product_name": "Smart Watch", "category": "Electronics", "price": 9000, "quantity": 2},
        ],
        "payment": {"method": "UPI", "status": "Success"},
    },
]


In [76]:
# Flatten the nested orders: record_path='items' yields one row per line item,
# and each meta path is copied onto every row that belongs to the same order.
meta_paths = [
    'order_id',
    'order_date',
    ['customer', 'customer_id'],
    ['customer', 'name'],
    ['customer', 'city'],
    ['payment', 'method'],
    ['payment', 'status'],
]
df = pd.json_normalize(data, record_path='items', meta=meta_paths)

df

Unnamed: 0,product_id,product_name,category,price,quantity,order_id,order_date,customer.customer_id,customer.name,customer.city,payment.method,payment.status
0,P101,Laptop,Electronics,55000,1,ORD1001,2025-01-10,CUST01,Rahul Sharma,Delhi,Credit Card,Success
1,P102,Mouse,Electronics,500,2,ORD1001,2025-01-10,CUST01,Rahul Sharma,Delhi,Credit Card,Success
2,P103,Mobile Phone,Electronics,32000,1,ORD1002,2025-01-11,CUST02,Sneha Verma,Mumbai,UPI,Success
3,P201,Dining Table,Furniture,18000,1,ORD1003,2025-01-12,CUST03,Amit Patel,Pune,Cash,Failed
4,P202,Chair,Furniture,3500,4,ORD1003,2025-01-12,CUST03,Amit Patel,Pune,Cash,Failed
5,P104,Headphones,Electronics,2500,2,ORD1004,2025-01-13,CUST04,Neha Singh,Bangalore,Debit Card,Success
6,P301,Office Chair,Furniture,12000,1,ORD1005,2025-01-14,CUST05,Karan Mehta,Delhi,Net Banking,Success
7,P105,Smart Watch,Electronics,9000,2,ORD1006,2025-01-15,CUST06,Pooja Iyer,Chennai,UPI,Success


In [77]:
#data cleaning
# Rename through an explicit old -> new mapping instead of assigning df.columns
# positionally. The mapping does not depend on the order or number of columns,
# so the cell stays re-runnable after later cells have added columns (a
# positional 12-name assignment raises a length-mismatch ValueError then).
# Names that are already final (product_id, price, order_id, ...) are untouched,
# and rename() ignores mapping keys that are no longer present.
df = df.rename(columns={
    'category': 'product_category',
    'customer.customer_id': 'customer_id',
    'customer.name': 'customer_name',
    'customer.city': 'customer_city',
    'payment.method': 'payment_method',
    'payment.status': 'payment_status',
})

# Enforce the dtypes the later calculations rely on.
df['order_date'] = pd.to_datetime(df['order_date'])
df['price'] = df['price'].astype(float)
df['quantity'] = df['quantity'].astype(int)

df

Unnamed: 0,product_id,product_name,product_category,price,quantity,order_id,order_date,customer_id,customer_name,customer_city,payment_method,payment_status
0,P101,Laptop,Electronics,55000.0,1,ORD1001,2025-01-10,CUST01,Rahul Sharma,Delhi,Credit Card,Success
1,P102,Mouse,Electronics,500.0,2,ORD1001,2025-01-10,CUST01,Rahul Sharma,Delhi,Credit Card,Success
2,P103,Mobile Phone,Electronics,32000.0,1,ORD1002,2025-01-11,CUST02,Sneha Verma,Mumbai,UPI,Success
3,P201,Dining Table,Furniture,18000.0,1,ORD1003,2025-01-12,CUST03,Amit Patel,Pune,Cash,Failed
4,P202,Chair,Furniture,3500.0,4,ORD1003,2025-01-12,CUST03,Amit Patel,Pune,Cash,Failed
5,P104,Headphones,Electronics,2500.0,2,ORD1004,2025-01-13,CUST04,Neha Singh,Bangalore,Debit Card,Success
6,P301,Office Chair,Furniture,12000.0,1,ORD1005,2025-01-14,CUST05,Karan Mehta,Delhi,Net Banking,Success
7,P105,Smart Watch,Electronics,9000.0,2,ORD1006,2025-01-15,CUST06,Pooja Iyer,Chennai,UPI,Success


In [78]:
#revenue calculation
# Line-item revenue is unit price times quantity, computed element-wise on the
# underlying numpy arrays.
price_array = df['price'].to_numpy()
qty_array = df['quantity'].to_numpy()
df['total_amount'] = price_array * qty_array

#boolean masking
# Only a 'Success' payment marks the row as completed; any other payment status
# is labelled cancelled.
payment_succeeded = df['payment_status'].eq('Success')
df['order_status'] = np.where(payment_succeeded, 'Completed', 'Cancelled')

df

Unnamed: 0,product_id,product_name,product_category,price,quantity,order_id,order_date,customer_id,customer_name,customer_city,payment_method,payment_status,total_amount,order_status
0,P101,Laptop,Electronics,55000.0,1,ORD1001,2025-01-10,CUST01,Rahul Sharma,Delhi,Credit Card,Success,55000.0,Completed
1,P102,Mouse,Electronics,500.0,2,ORD1001,2025-01-10,CUST01,Rahul Sharma,Delhi,Credit Card,Success,1000.0,Completed
2,P103,Mobile Phone,Electronics,32000.0,1,ORD1002,2025-01-11,CUST02,Sneha Verma,Mumbai,UPI,Success,32000.0,Completed
3,P201,Dining Table,Furniture,18000.0,1,ORD1003,2025-01-12,CUST03,Amit Patel,Pune,Cash,Failed,18000.0,Cancelled
4,P202,Chair,Furniture,3500.0,4,ORD1003,2025-01-12,CUST03,Amit Patel,Pune,Cash,Failed,14000.0,Cancelled
5,P104,Headphones,Electronics,2500.0,2,ORD1004,2025-01-13,CUST04,Neha Singh,Bangalore,Debit Card,Success,5000.0,Completed
6,P301,Office Chair,Furniture,12000.0,1,ORD1005,2025-01-14,CUST05,Karan Mehta,Delhi,Net Banking,Success,12000.0,Completed
7,P105,Smart Watch,Electronics,9000.0,2,ORD1006,2025-01-15,CUST06,Pooja Iyer,Chennai,UPI,Success,18000.0,Completed


In [79]:
#high value order flag
# Named threshold so the cut-off can be tuned in one place.
HIGH_VALUE_THRESHOLD = 30000
# Evaluated row by row, i.e. against each line item's total_amount.
df['high_value_order_flag'] = np.where(
    df['total_amount'] >= HIGH_VALUE_THRESHOLD,
    'Yes',
    'No'
)

#excluding failed payments
# Taken only after the flag has been added, so the completed subset carries every
# derived column of df. .copy() makes it an independent frame, so later
# assignments on it cannot trigger chained-assignment warnings.
completed_df = df[df['order_status'] == 'Completed'].copy()


In [80]:
#Final ready dataset
# Columns kept for reporting, listed in display order.
final_columns = [
    'order_id', 'order_date',
    'customer_id', 'customer_name', 'customer_city',
    'product_name', 'product_category',
    'quantity', 'total_amount',
    'payment_method', 'order_status', 'high_value_order_flag',
]
final_df = df[final_columns]

final_df

Unnamed: 0,order_id,order_date,customer_id,customer_name,customer_city,product_name,product_category,quantity,total_amount,payment_method,order_status,high_value_order_flag
0,ORD1001,2025-01-10,CUST01,Rahul Sharma,Delhi,Laptop,Electronics,1,55000.0,Credit Card,Completed,Yes
1,ORD1001,2025-01-10,CUST01,Rahul Sharma,Delhi,Mouse,Electronics,2,1000.0,Credit Card,Completed,No
2,ORD1002,2025-01-11,CUST02,Sneha Verma,Mumbai,Mobile Phone,Electronics,1,32000.0,UPI,Completed,Yes
3,ORD1003,2025-01-12,CUST03,Amit Patel,Pune,Dining Table,Furniture,1,18000.0,Cash,Cancelled,No
4,ORD1003,2025-01-12,CUST03,Amit Patel,Pune,Chair,Furniture,4,14000.0,Cash,Cancelled,No
5,ORD1004,2025-01-13,CUST04,Neha Singh,Bangalore,Headphones,Electronics,2,5000.0,Debit Card,Completed,No
6,ORD1005,2025-01-14,CUST05,Karan Mehta,Delhi,Office Chair,Furniture,1,12000.0,Net Banking,Completed,No
7,ORD1006,2025-01-15,CUST06,Pooja Iyer,Chennai,Smart Watch,Electronics,2,18000.0,UPI,Completed,No


In [81]:
#Total revenue per city
# Uses completed orders only, so failed payments contribute no revenue.
revenue_by_city = completed_df.groupby('customer_city')['total_amount'].sum()
revenue_by_city


customer_city
Bangalore     5000.0
Chennai      18000.0
Delhi        68000.0
Mumbai       32000.0
Name: total_amount, dtype: float64

In [82]:
#product category with highest revenue
# Sum completed-order revenue per category, then take the label of the largest sum.
revenue_by_category = completed_df.groupby('product_category')['total_amount'].sum()
revenue_by_category.idxmax()


'Electronics'

In [83]:
#count of high value orders
# Rows are line items, so count distinct order ids among the flagged rows
# rather than the rows themselves.
is_high_value = df['high_value_order_flag'].eq('Yes')
df.loc[is_high_value, 'order_id'].nunique()


2

In [84]:
#average order value
# Roll line items up to one total per order first, then average across orders.
order_totals = completed_df.groupby('order_id')['total_amount'].sum()
order_totals.mean()


np.float64(24600.0)

In [85]:
#most preferred payment method
# Rows are line items, and every item of an order repeats that order's payment
# method. Counting rows directly would give a multi-item order one vote per item
# (ORD1001 would count twice), so reduce to one payment method per order first.
payment_method_per_order = completed_df.groupby('order_id')['payment_method'].first()
payment_method_per_order.value_counts().idxmax()

'UPI'

In [86]:
#Top customers by revenue
# Completed-order revenue per customer name, largest first.
revenue_by_customer = completed_df.groupby('customer_name')['total_amount'].sum()
revenue_by_customer.sort_values(ascending=False)


customer_name
Rahul Sharma    56000.0
Sneha Verma     32000.0
Pooja Iyer      18000.0
Karan Mehta     12000.0
Neha Singh       5000.0
Name: total_amount, dtype: float64

In [89]:
#Completed vs cancelled orders
# order_status is repeated on every line item, so keep one value per order
# before counting.
status_per_order = final_df.groupby('order_id')['order_status'].first()
status_per_order.value_counts()


order_status
Completed    5
Cancelled    1
Name: count, dtype: int64