## FULL CASE STUDY: Business Data Transformation & Analytics using Pandas & NumPy (Jupyter Notebook)

In [11]:
import json
import pandas as pd
import numpy as np

# Json file loading
with open("/Users/bot/Downloads/demo_json.json", "r") as file:
    data = json.load(file)
data

[{'order_id': 'ORD1001',
  'order_date': '2025-01-10',
  'customer': {'customer_id': 'CUST01',
   'name': 'Rahul Sharma',
   'city': 'Delhi'},
  'items': [{'product_id': 'P101',
    'product_name': 'Laptop',
    'category': 'Electronics',
    'price': 55000,
    'quantity': 1},
   {'product_id': 'P102',
    'product_name': 'Mouse',
    'category': 'Electronics',
    'price': 500,
    'quantity': 2}],
  'payment': {'method': 'Credit Card', 'status': 'Success'}},
 {'order_id': 'ORD1002',
  'order_date': '2025-01-11',
  'customer': {'customer_id': 'CUST02',
   'name': 'Sneha Verma',
   'city': 'Mumbai'},
  'items': [{'product_id': 'P103',
    'product_name': 'Mobile Phone',
    'category': 'Electronics',
    'price': 32000,
    'quantity': 1}],
  'payment': {'method': 'UPI', 'status': 'Success'}},
 {'order_id': 'ORD1003',
  'order_date': '2025-01-12',
  'customer': {'customer_id': 'CUST03', 'name': 'Amit Patel', 'city': 'Pune'},
  'items': [{'product_id': 'P201',
    'product_name': 'Dini

### DataFrame Creation

In [12]:
df = pd.json_normalize(
    data,
    record_path="items",
    meta=[
        "order_id",
        "order_date",
        ["customer", "customer_id"],
        ["customer", "name"],
        ["customer", "city"],
        ["payment", "method"],
        ["payment", "status"]
    ]
)

df.head()

Unnamed: 0,product_id,product_name,category,price,quantity,order_id,order_date,customer.customer_id,customer.name,customer.city,payment.method,payment.status
0,P101,Laptop,Electronics,55000,1,ORD1001,2025-01-10,CUST01,Rahul Sharma,Delhi,Credit Card,Success
1,P102,Mouse,Electronics,500,2,ORD1001,2025-01-10,CUST01,Rahul Sharma,Delhi,Credit Card,Success
2,P103,Mobile Phone,Electronics,32000,1,ORD1002,2025-01-11,CUST02,Sneha Verma,Mumbai,UPI,Success
3,P201,Dining Table,Furniture,18000,1,ORD1003,2025-01-12,CUST03,Amit Patel,Pune,Cash,Failed
4,P202,Chair,Furniture,3500,4,ORD1003,2025-01-12,CUST03,Amit Patel,Pune,Cash,Failed


### Data Cleaning

In [13]:
df.columns = [
    "product_id",
    "product_name",
    "product_category",
    "price",
    "quantity_sold",
    "order_id",
    "order_date",
    "customer_id",
    "customer_name",
    "customer_city",
    "payment_method",
    "payment_status"
]

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   product_id        8 non-null      object
 1   product_name      8 non-null      object
 2   product_category  8 non-null      object
 3   price             8 non-null      int64 
 4   quantity_sold     8 non-null      int64 
 5   order_id          8 non-null      object
 6   order_date        8 non-null      object
 7   customer_id       8 non-null      object
 8   customer_name     8 non-null      object
 9   customer_city     8 non-null      object
 10  payment_method    8 non-null      object
 11  payment_status    8 non-null      object
dtypes: int64(2), object(10)
memory usage: 900.0+ bytes


### convert order_date from object to datetime type

In [14]:
df["order_date"] = pd.to_datetime(df["order_date"])

In [15]:
df["order_date"].info()

<class 'pandas.core.series.Series'>
RangeIndex: 8 entries, 0 to 7
Series name: order_date
Non-Null Count  Dtype         
--------------  -----         
8 non-null      datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 196.0 bytes


### Revenue calculation using NumPy

In [16]:
price_array = df["price"].to_numpy()
quantity_array = df["quantity_sold"].to_numpy()

df["total_amount"] = price_array * quantity_array

df

Unnamed: 0,product_id,product_name,product_category,price,quantity_sold,order_id,order_date,customer_id,customer_name,customer_city,payment_method,payment_status,total_amount
0,P101,Laptop,Electronics,55000,1,ORD1001,2025-01-10,CUST01,Rahul Sharma,Delhi,Credit Card,Success,55000
1,P102,Mouse,Electronics,500,2,ORD1001,2025-01-10,CUST01,Rahul Sharma,Delhi,Credit Card,Success,1000
2,P103,Mobile Phone,Electronics,32000,1,ORD1002,2025-01-11,CUST02,Sneha Verma,Mumbai,UPI,Success,32000
3,P201,Dining Table,Furniture,18000,1,ORD1003,2025-01-12,CUST03,Amit Patel,Pune,Cash,Failed,18000
4,P202,Chair,Furniture,3500,4,ORD1003,2025-01-12,CUST03,Amit Patel,Pune,Cash,Failed,14000
5,P104,Headphones,Electronics,2500,2,ORD1004,2025-01-13,CUST04,Neha Singh,Bangalore,Debit Card,Success,5000
6,P301,Office Chair,Furniture,12000,1,ORD1005,2025-01-14,CUST05,Karan Mehta,Delhi,Net Banking,Success,12000
7,P105,Smart Watch,Electronics,9000,2,ORD1006,2025-01-15,CUST06,Pooja Iyer,Chennai,UPI,Success,18000


### Exclude failed payments

In [17]:
#data before excluding
df

Unnamed: 0,product_id,product_name,product_category,price,quantity_sold,order_id,order_date,customer_id,customer_name,customer_city,payment_method,payment_status,total_amount
0,P101,Laptop,Electronics,55000,1,ORD1001,2025-01-10,CUST01,Rahul Sharma,Delhi,Credit Card,Success,55000
1,P102,Mouse,Electronics,500,2,ORD1001,2025-01-10,CUST01,Rahul Sharma,Delhi,Credit Card,Success,1000
2,P103,Mobile Phone,Electronics,32000,1,ORD1002,2025-01-11,CUST02,Sneha Verma,Mumbai,UPI,Success,32000
3,P201,Dining Table,Furniture,18000,1,ORD1003,2025-01-12,CUST03,Amit Patel,Pune,Cash,Failed,18000
4,P202,Chair,Furniture,3500,4,ORD1003,2025-01-12,CUST03,Amit Patel,Pune,Cash,Failed,14000
5,P104,Headphones,Electronics,2500,2,ORD1004,2025-01-13,CUST04,Neha Singh,Bangalore,Debit Card,Success,5000
6,P301,Office Chair,Furniture,12000,1,ORD1005,2025-01-14,CUST05,Karan Mehta,Delhi,Net Banking,Success,12000
7,P105,Smart Watch,Electronics,9000,2,ORD1006,2025-01-15,CUST06,Pooja Iyer,Chennai,UPI,Success,18000


In [19]:
df = df[df["payment_status"] == "Success"]
df

Unnamed: 0,product_id,product_name,product_category,price,quantity_sold,order_id,order_date,customer_id,customer_name,customer_city,payment_method,payment_status,total_amount
0,P101,Laptop,Electronics,55000,1,ORD1001,2025-01-10,CUST01,Rahul Sharma,Delhi,Credit Card,Success,55000
1,P102,Mouse,Electronics,500,2,ORD1001,2025-01-10,CUST01,Rahul Sharma,Delhi,Credit Card,Success,1000
2,P103,Mobile Phone,Electronics,32000,1,ORD1002,2025-01-11,CUST02,Sneha Verma,Mumbai,UPI,Success,32000
5,P104,Headphones,Electronics,2500,2,ORD1004,2025-01-13,CUST04,Neha Singh,Bangalore,Debit Card,Success,5000
6,P301,Office Chair,Furniture,12000,1,ORD1005,2025-01-14,CUST05,Karan Mehta,Delhi,Net Banking,Success,12000
7,P105,Smart Watch,Electronics,9000,2,ORD1006,2025-01-15,CUST06,Pooja Iyer,Chennai,UPI,Success,18000


### High-value order flag

In [20]:
#threshold for high value order being here 30000 assumption
df["high_value_order_flag"] = np.where(
    df["total_amount"] >= 30000, "Yes", "No"
)

In [21]:
df

Unnamed: 0,product_id,product_name,product_category,price,quantity_sold,order_id,order_date,customer_id,customer_name,customer_city,payment_method,payment_status,total_amount,high_value_order_flag
0,P101,Laptop,Electronics,55000,1,ORD1001,2025-01-10,CUST01,Rahul Sharma,Delhi,Credit Card,Success,55000,Yes
1,P102,Mouse,Electronics,500,2,ORD1001,2025-01-10,CUST01,Rahul Sharma,Delhi,Credit Card,Success,1000,No
2,P103,Mobile Phone,Electronics,32000,1,ORD1002,2025-01-11,CUST02,Sneha Verma,Mumbai,UPI,Success,32000,Yes
5,P104,Headphones,Electronics,2500,2,ORD1004,2025-01-13,CUST04,Neha Singh,Bangalore,Debit Card,Success,5000,No
6,P301,Office Chair,Furniture,12000,1,ORD1005,2025-01-14,CUST05,Karan Mehta,Delhi,Net Banking,Success,12000,No
7,P105,Smart Watch,Electronics,9000,2,ORD1006,2025-01-15,CUST06,Pooja Iyer,Chennai,UPI,Success,18000,No


### Assign order status

In [22]:
df["order_status"] = "Completed"

In [23]:
df

Unnamed: 0,product_id,product_name,product_category,price,quantity_sold,order_id,order_date,customer_id,customer_name,customer_city,payment_method,payment_status,total_amount,high_value_order_flag,order_status
0,P101,Laptop,Electronics,55000,1,ORD1001,2025-01-10,CUST01,Rahul Sharma,Delhi,Credit Card,Success,55000,Yes,Completed
1,P102,Mouse,Electronics,500,2,ORD1001,2025-01-10,CUST01,Rahul Sharma,Delhi,Credit Card,Success,1000,No,Completed
2,P103,Mobile Phone,Electronics,32000,1,ORD1002,2025-01-11,CUST02,Sneha Verma,Mumbai,UPI,Success,32000,Yes,Completed
5,P104,Headphones,Electronics,2500,2,ORD1004,2025-01-13,CUST04,Neha Singh,Bangalore,Debit Card,Success,5000,No,Completed
6,P301,Office Chair,Furniture,12000,1,ORD1005,2025-01-14,CUST05,Karan Mehta,Delhi,Net Banking,Success,12000,No,Completed
7,P105,Smart Watch,Electronics,9000,2,ORD1006,2025-01-15,CUST06,Pooja Iyer,Chennai,UPI,Success,18000,No,Completed


### Final Business-Ready Dataset

In [24]:
final_df = df[[
    "order_id",
    "order_date",
    "customer_id",
    "customer_name",
    "customer_city",
    "product_name",
    "product_category",
    "quantity_sold",
    "total_amount",
    "payment_method",
    "order_status",
    "high_value_order_flag"
]]

final_df


Unnamed: 0,order_id,order_date,customer_id,customer_name,customer_city,product_name,product_category,quantity_sold,total_amount,payment_method,order_status,high_value_order_flag
0,ORD1001,2025-01-10,CUST01,Rahul Sharma,Delhi,Laptop,Electronics,1,55000,Credit Card,Completed,Yes
1,ORD1001,2025-01-10,CUST01,Rahul Sharma,Delhi,Mouse,Electronics,2,1000,Credit Card,Completed,No
2,ORD1002,2025-01-11,CUST02,Sneha Verma,Mumbai,Mobile Phone,Electronics,1,32000,UPI,Completed,Yes
5,ORD1004,2025-01-13,CUST04,Neha Singh,Bangalore,Headphones,Electronics,2,5000,Debit Card,Completed,No
6,ORD1005,2025-01-14,CUST05,Karan Mehta,Delhi,Office Chair,Furniture,1,12000,Net Banking,Completed,No
7,ORD1006,2025-01-15,CUST06,Pooja Iyer,Chennai,Smart Watch,Electronics,2,18000,UPI,Completed,No


### Now starting with the Business question answers

In [25]:
#Total revenue per city
final_df.groupby("customer_city")["total_amount"].sum()

customer_city
Bangalore     5000
Chennai      18000
Delhi        68000
Mumbai       32000
Name: total_amount, dtype: int64

In [26]:
#Product category with highest revenue
final_df.groupby("product_category")["total_amount"].sum().idxmax()

'Electronics'

In [28]:
#Count of high-value orders
(final_df["high_value_order_flag"] == "Yes").sum()


np.int64(2)

In [30]:
#Average order value
final_df["total_amount"].mean()

np.float64(20500.0)

In [31]:
#Most preferred payment method
final_df["payment_method"].value_counts().idxmax()

'Credit Card'

In [34]:
#Top customers by revenue
final_df.groupby("customer_name")["total_amount"].sum().sort_values(ascending=False)


customer_name
Rahul Sharma    56000
Sneha Verma     32000
Pooja Iyer      18000
Karan Mehta     12000
Neha Singh       5000
Name: total_amount, dtype: int64

In [35]:
#Completed vs Cancelled orders
final_df["order_status"].value_counts()


order_status
Completed    6
Name: count, dtype: int64

### Explanations to the why

#### Raw nested JSON transaction data was ingested and transformed into a clean, flat dataset using Pandas and NumPy. The data was normalized to ensure one record per product sold, cleaned for consistency, and processed using NumPy for efficient revenue calculations. Business rules such as excluding failed payments and identifying high-value orders were applied to produce an analytics-ready dataset suitable for reporting and decision-making.