In [1]:
import pandas as pd
import numpy as np 

### Reading Data and Converting to DataFrame 

In [2]:
df = pd.read_json("/Users/bot/Downloads/demo_json.json")
df

Unnamed: 0,order_id,order_date,customer,items,payment
0,ORD1001,2025-01-10,"{'customer_id': 'CUST01', 'name': 'Rahul Sharm...","[{'product_id': 'P101', 'product_name': 'Lapto...","{'method': 'Credit Card', 'status': 'Success'}"
1,ORD1002,2025-01-11,"{'customer_id': 'CUST02', 'name': 'Sneha Verma...","[{'product_id': 'P103', 'product_name': 'Mobil...","{'method': 'UPI', 'status': 'Success'}"
2,ORD1003,2025-01-12,"{'customer_id': 'CUST03', 'name': 'Amit Patel'...","[{'product_id': 'P201', 'product_name': 'Dinin...","{'method': 'Cash', 'status': 'Failed'}"
3,ORD1004,2025-01-13,"{'customer_id': 'CUST04', 'name': 'Neha Singh'...","[{'product_id': 'P104', 'product_name': 'Headp...","{'method': 'Debit Card', 'status': 'Success'}"
4,ORD1005,2025-01-14,"{'customer_id': 'CUST05', 'name': 'Karan Mehta...","[{'product_id': 'P301', 'product_name': 'Offic...","{'method': 'Net Banking', 'status': 'Success'}"
5,ORD1006,2025-01-15,"{'customer_id': 'CUST06', 'name': 'Pooja Iyer'...","[{'product_id': 'P105', 'product_name': 'Smart...","{'method': 'UPI', 'status': 'Success'}"


In [3]:
df.info(), df.shape

<class 'pandas.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   order_id    6 non-null      str   
 1   order_date  6 non-null      str   
 2   customer    6 non-null      object
 3   items       6 non-null      object
 4   payment     6 non-null      object
dtypes: object(3), str(2)
memory usage: 372.0+ bytes


(None, (6, 5))

In [4]:
type(df)

pandas.DataFrame

### Flatten Nested Data

In [5]:
customer_df = pd.json_normalize(df['customer'])
customer_df

Unnamed: 0,customer_id,name,city
0,CUST01,Rahul Sharma,Delhi
1,CUST02,Sneha Verma,Mumbai
2,CUST03,Amit Patel,Pune
3,CUST04,Neha Singh,Bangalore
4,CUST05,Karan Mehta,Delhi
5,CUST06,Pooja Iyer,Chennai


In [6]:
payment_df = pd.json_normalize(df['payment'])
payment_df

Unnamed: 0,method,status
0,Credit Card,Success
1,UPI,Success
2,Cash,Failed
3,Debit Card,Success
4,Net Banking,Success
5,UPI,Success


### Expand arrays so that one row represents one product per order

In [7]:
items_df = df.explode("items")
items_expanded = pd.json_normalize(items_df["items"])
items_expanded

Unnamed: 0,product_id,product_name,category,price,quantity
0,P101,Laptop,Electronics,55000,1
0,P102,Mouse,Electronics,500,2
1,P103,Mobile Phone,Electronics,32000,1
2,P201,Dining Table,Furniture,18000,1
2,P202,Chair,Furniture,3500,4
3,P104,Headphones,Electronics,2500,2
4,P301,Office Chair,Furniture,12000,1
5,P105,Smart Watch,Electronics,9000,2


### Making Final DataFrame

In [8]:
final_df = pd.concat(
    [
        items_df[["order_id", "order_date"]],
        customer_df,
        items_expanded,
        payment_df
    ],
    axis=1
)

In [9]:
final_df

Unnamed: 0,order_id,order_date,customer_id,name,city,product_id,product_name,category,price,quantity,method,status
0,ORD1001,2025-01-10,CUST01,Rahul Sharma,Delhi,P101,Laptop,Electronics,55000,1,Credit Card,Success
0,ORD1001,2025-01-10,CUST01,Rahul Sharma,Delhi,P102,Mouse,Electronics,500,2,Credit Card,Success
1,ORD1002,2025-01-11,CUST02,Sneha Verma,Mumbai,P103,Mobile Phone,Electronics,32000,1,UPI,Success
2,ORD1003,2025-01-12,CUST03,Amit Patel,Pune,P201,Dining Table,Furniture,18000,1,Cash,Failed
2,ORD1003,2025-01-12,CUST03,Amit Patel,Pune,P202,Chair,Furniture,3500,4,Cash,Failed
3,ORD1004,2025-01-13,CUST04,Neha Singh,Bangalore,P104,Headphones,Electronics,2500,2,Debit Card,Success
4,ORD1005,2025-01-14,CUST05,Karan Mehta,Delhi,P301,Office Chair,Furniture,12000,1,Net Banking,Success
5,ORD1006,2025-01-15,CUST06,Pooja Iyer,Chennai,P105,Smart Watch,Electronics,9000,2,UPI,Success


In [10]:
final_df.isnull().sum()

order_id        0
order_date      0
customer_id     0
name            0
city            0
product_id      0
product_name    0
category        0
price           0
quantity        0
method          0
status          0
dtype: int64

In [11]:
final_df.info()

<class 'pandas.DataFrame'>
Index: 8 entries, 0 to 5
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   order_id      8 non-null      str  
 1   order_date    8 non-null      str  
 2   customer_id   8 non-null      str  
 3   name          8 non-null      str  
 4   city          8 non-null      str  
 5   product_id    8 non-null      str  
 6   product_name  8 non-null      str  
 7   category      8 non-null      str  
 8   price         8 non-null      int64
 9   quantity      8 non-null      int64
 10  method        8 non-null      str  
 11  status        8 non-null      str  
dtypes: int64(2), str(10)
memory usage: 832.0 bytes


### Data Cleaning

In [12]:
final_df["order_date"] = pd.to_datetime(final_df["order_date"])
final_df["price"] = final_df["price"].astype(float)
final_df["quantity"] = final_df["quantity"].astype(int)
final_df.info()

<class 'pandas.DataFrame'>
Index: 8 entries, 0 to 5
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   order_id      8 non-null      str           
 1   order_date    8 non-null      datetime64[us]
 2   customer_id   8 non-null      str           
 3   name          8 non-null      str           
 4   city          8 non-null      str           
 5   product_id    8 non-null      str           
 6   product_name  8 non-null      str           
 7   category      8 non-null      str           
 8   price         8 non-null      float64       
 9   quantity      8 non-null      int64         
 10  method        8 non-null      str           
 11  status        8 non-null      str           
dtypes: datetime64[us](1), float64(1), int64(1), str(9)
memory usage: 832.0 bytes


In [13]:
final_df.columns = (
    final_df.columns
    .str.lower()
    .str.replace(" ", "_")
)

In [14]:
final_df

Unnamed: 0,order_id,order_date,customer_id,name,city,product_id,product_name,category,price,quantity,method,status
0,ORD1001,2025-01-10,CUST01,Rahul Sharma,Delhi,P101,Laptop,Electronics,55000.0,1,Credit Card,Success
0,ORD1001,2025-01-10,CUST01,Rahul Sharma,Delhi,P102,Mouse,Electronics,500.0,2,Credit Card,Success
1,ORD1002,2025-01-11,CUST02,Sneha Verma,Mumbai,P103,Mobile Phone,Electronics,32000.0,1,UPI,Success
2,ORD1003,2025-01-12,CUST03,Amit Patel,Pune,P201,Dining Table,Furniture,18000.0,1,Cash,Failed
2,ORD1003,2025-01-12,CUST03,Amit Patel,Pune,P202,Chair,Furniture,3500.0,4,Cash,Failed
3,ORD1004,2025-01-13,CUST04,Neha Singh,Bangalore,P104,Headphones,Electronics,2500.0,2,Debit Card,Success
4,ORD1005,2025-01-14,CUST05,Karan Mehta,Delhi,P301,Office Chair,Furniture,12000.0,1,Net Banking,Success
5,ORD1006,2025-01-15,CUST06,Pooja Iyer,Chennai,P105,Smart Watch,Electronics,9000.0,2,UPI,Success


### Calculating revenue collection and Applying conditional logic using vectorized operations

In [15]:
price = final_df["price"].to_numpy()
quantity = final_df["quantity"].to_numpy()
payment_status = final_df["status"].to_numpy()

revenue = np.where(
    payment_status == "Success",
    price * quantity,
    0
)

final_df["total_amount"] = revenue

In [16]:
final_df

Unnamed: 0,order_id,order_date,customer_id,name,city,product_id,product_name,category,price,quantity,method,status,total_amount
0,ORD1001,2025-01-10,CUST01,Rahul Sharma,Delhi,P101,Laptop,Electronics,55000.0,1,Credit Card,Success,55000.0
0,ORD1001,2025-01-10,CUST01,Rahul Sharma,Delhi,P102,Mouse,Electronics,500.0,2,Credit Card,Success,1000.0
1,ORD1002,2025-01-11,CUST02,Sneha Verma,Mumbai,P103,Mobile Phone,Electronics,32000.0,1,UPI,Success,32000.0
2,ORD1003,2025-01-12,CUST03,Amit Patel,Pune,P201,Dining Table,Furniture,18000.0,1,Cash,Failed,0.0
2,ORD1003,2025-01-12,CUST03,Amit Patel,Pune,P202,Chair,Furniture,3500.0,4,Cash,Failed,0.0
3,ORD1004,2025-01-13,CUST04,Neha Singh,Bangalore,P104,Headphones,Electronics,2500.0,2,Debit Card,Success,5000.0
4,ORD1005,2025-01-14,CUST05,Karan Mehta,Delhi,P301,Office Chair,Furniture,12000.0,1,Net Banking,Success,12000.0
5,ORD1006,2025-01-15,CUST06,Pooja Iyer,Chennai,P105,Smart Watch,Electronics,9000.0,2,UPI,Success,18000.0


### Boolean Masking

In [17]:
success_mask = final_df['status'] =='Success'
final_df['successful_orders'] = success_mask
final_df

Unnamed: 0,order_id,order_date,customer_id,name,city,product_id,product_name,category,price,quantity,method,status,total_amount,successful_orders
0,ORD1001,2025-01-10,CUST01,Rahul Sharma,Delhi,P101,Laptop,Electronics,55000.0,1,Credit Card,Success,55000.0,True
0,ORD1001,2025-01-10,CUST01,Rahul Sharma,Delhi,P102,Mouse,Electronics,500.0,2,Credit Card,Success,1000.0,True
1,ORD1002,2025-01-11,CUST02,Sneha Verma,Mumbai,P103,Mobile Phone,Electronics,32000.0,1,UPI,Success,32000.0,True
2,ORD1003,2025-01-12,CUST03,Amit Patel,Pune,P201,Dining Table,Furniture,18000.0,1,Cash,Failed,0.0,False
2,ORD1003,2025-01-12,CUST03,Amit Patel,Pune,P202,Chair,Furniture,3500.0,4,Cash,Failed,0.0,False
3,ORD1004,2025-01-13,CUST04,Neha Singh,Bangalore,P104,Headphones,Electronics,2500.0,2,Debit Card,Success,5000.0,True
4,ORD1005,2025-01-14,CUST05,Karan Mehta,Delhi,P301,Office Chair,Furniture,12000.0,1,Net Banking,Success,12000.0,True
5,ORD1006,2025-01-15,CUST06,Pooja Iyer,Chennai,P105,Smart Watch,Electronics,9000.0,2,UPI,Success,18000.0,True


### Business Logic Application

In [18]:
final_df.loc[~success_mask, "total_amount"] = 0
final_df

Unnamed: 0,order_id,order_date,customer_id,name,city,product_id,product_name,category,price,quantity,method,status,total_amount,successful_orders
0,ORD1001,2025-01-10,CUST01,Rahul Sharma,Delhi,P101,Laptop,Electronics,55000.0,1,Credit Card,Success,55000.0,True
0,ORD1001,2025-01-10,CUST01,Rahul Sharma,Delhi,P102,Mouse,Electronics,500.0,2,Credit Card,Success,1000.0,True
1,ORD1002,2025-01-11,CUST02,Sneha Verma,Mumbai,P103,Mobile Phone,Electronics,32000.0,1,UPI,Success,32000.0,True
2,ORD1003,2025-01-12,CUST03,Amit Patel,Pune,P201,Dining Table,Furniture,18000.0,1,Cash,Failed,0.0,False
2,ORD1003,2025-01-12,CUST03,Amit Patel,Pune,P202,Chair,Furniture,3500.0,4,Cash,Failed,0.0,False
3,ORD1004,2025-01-13,CUST04,Neha Singh,Bangalore,P104,Headphones,Electronics,2500.0,2,Debit Card,Success,5000.0,True
4,ORD1005,2025-01-14,CUST05,Karan Mehta,Delhi,P301,Office Chair,Furniture,12000.0,1,Net Banking,Success,12000.0,True
5,ORD1006,2025-01-15,CUST06,Pooja Iyer,Chennai,P105,Smart Watch,Electronics,9000.0,2,UPI,Success,18000.0,True


In [19]:
total_sales_amount = final_df['total_amount'].sum()
print(f"The total sales amount is Rs.{total_sales_amount}.")

The total sales amount is Rs.123000.0.


In [20]:
order_total_df = (
    final_df
    .groupby("order_id")["total_amount"]
    .sum()
    .reset_index()
    .rename(columns={"total_amount": "order_total"})
)
order_total_df["high_value_order"] = order_total_df["order_total"] > 50000
final_df = final_df.merge(order_total_df, on="order_id", how="left")
final_df

Unnamed: 0,order_id,order_date,customer_id,name,city,product_id,product_name,category,price,quantity,method,status,total_amount,successful_orders,order_total,high_value_order
0,ORD1001,2025-01-10,CUST01,Rahul Sharma,Delhi,P101,Laptop,Electronics,55000.0,1,Credit Card,Success,55000.0,True,56000.0,True
1,ORD1001,2025-01-10,CUST01,Rahul Sharma,Delhi,P102,Mouse,Electronics,500.0,2,Credit Card,Success,1000.0,True,56000.0,True
2,ORD1002,2025-01-11,CUST02,Sneha Verma,Mumbai,P103,Mobile Phone,Electronics,32000.0,1,UPI,Success,32000.0,True,32000.0,False
3,ORD1003,2025-01-12,CUST03,Amit Patel,Pune,P201,Dining Table,Furniture,18000.0,1,Cash,Failed,0.0,False,0.0,False
4,ORD1003,2025-01-12,CUST03,Amit Patel,Pune,P202,Chair,Furniture,3500.0,4,Cash,Failed,0.0,False,0.0,False
5,ORD1004,2025-01-13,CUST04,Neha Singh,Bangalore,P104,Headphones,Electronics,2500.0,2,Debit Card,Success,5000.0,True,5000.0,False
6,ORD1005,2025-01-14,CUST05,Karan Mehta,Delhi,P301,Office Chair,Furniture,12000.0,1,Net Banking,Success,12000.0,True,12000.0,False
7,ORD1006,2025-01-15,CUST06,Pooja Iyer,Chennai,P105,Smart Watch,Electronics,9000.0,2,UPI,Success,18000.0,True,18000.0,False


In [21]:
final_df["order_status"] = np.where(
    final_df["status"] == "Success",
    "Completed",
    "Cancelled"
)

In [22]:
final_df

Unnamed: 0,order_id,order_date,customer_id,name,city,product_id,product_name,category,price,quantity,method,status,total_amount,successful_orders,order_total,high_value_order,order_status
0,ORD1001,2025-01-10,CUST01,Rahul Sharma,Delhi,P101,Laptop,Electronics,55000.0,1,Credit Card,Success,55000.0,True,56000.0,True,Completed
1,ORD1001,2025-01-10,CUST01,Rahul Sharma,Delhi,P102,Mouse,Electronics,500.0,2,Credit Card,Success,1000.0,True,56000.0,True,Completed
2,ORD1002,2025-01-11,CUST02,Sneha Verma,Mumbai,P103,Mobile Phone,Electronics,32000.0,1,UPI,Success,32000.0,True,32000.0,False,Completed
3,ORD1003,2025-01-12,CUST03,Amit Patel,Pune,P201,Dining Table,Furniture,18000.0,1,Cash,Failed,0.0,False,0.0,False,Cancelled
4,ORD1003,2025-01-12,CUST03,Amit Patel,Pune,P202,Chair,Furniture,3500.0,4,Cash,Failed,0.0,False,0.0,False,Cancelled
5,ORD1004,2025-01-13,CUST04,Neha Singh,Bangalore,P104,Headphones,Electronics,2500.0,2,Debit Card,Success,5000.0,True,5000.0,False,Completed
6,ORD1005,2025-01-14,CUST05,Karan Mehta,Delhi,P301,Office Chair,Furniture,12000.0,1,Net Banking,Success,12000.0,True,12000.0,False,Completed
7,ORD1006,2025-01-15,CUST06,Pooja Iyer,Chennai,P105,Smart Watch,Electronics,9000.0,2,UPI,Success,18000.0,True,18000.0,False,Completed


## Business Questions

### Total revenue per city

In [23]:
total_revenue_city = final_df.groupby("city")['total_amount'].sum().reset_index()
print("Revenue per City:")
print(total_revenue_city.to_string(index=False))

Revenue per City:
     city  total_amount
Bangalore        5000.0
  Chennai       18000.0
    Delhi       68000.0
   Mumbai       32000.0
     Pune           0.0


### Product category with highest revenue

In [24]:
category_revenue = final_df.groupby("category")['total_amount'].sum().reset_index()
top_category = category_revenue.sort_values(by='total_amount', ascending=False).iloc[0]
print(f"Highest revenue category: {top_category['category']} with revenue Rs.{top_category['total_amount']}")

Highest revenue category: Electronics with revenue Rs.111000.0


### Count of high-value orders

In [25]:
high_value_orders = final_df[final_df['high_value_order'] == True]
print(f"The no of high value orders is :{len(high_value_orders)}")

The no of high value orders is :2


### Average order value

In [26]:
order_level_df = (final_df[final_df["order_status"] == "Completed"].groupby('order_id')['total_amount'].sum().reset_index())
average_order_value = order_level_df['total_amount'].mean()
print(f"The average order value is : Rs.{average_order_value}")

The average order value is : Rs.24600.0


### Most preferred payment method

In [27]:
preferred_payment = (
    final_df[final_df["order_status"] == "Completed"]
    .groupby("method")
    .size()
    .reset_index(name="count")
    .sort_values(by="count", ascending=False)
)

print("Most preferred payment method:")
print(preferred_payment.iloc[0])

Most preferred payment method:
method    Credit Card
count               2
Name: 0, dtype: object


### Top customers by revenue

In [28]:
top_customer = (
    final_df[final_df["order_status"] == "Completed"]
    .groupby(["customer_id","name"])['total_amount']
    .sum()
    .reset_index()
    .sort_values(by="total_amount", ascending=False)
)

print("The customer with top revenue is:")
print(top_customer.iloc[0])

The customer with top revenue is:
customer_id           CUST01
name            Rahul Sharma
total_amount         56000.0
Name: 0, dtype: object


### Completed vs cancelled orders

In [29]:
order_status_df = (
    final_df
    .groupby("order_id")["order_status"]
    .first()
    .reset_index()
)
order_status_counts = order_status_df["order_status"].value_counts()
order_status_counts

order_status
Completed    5
Cancelled    1
Name: count, dtype: int64

## Final DataFrames and Datasets

In [30]:
clean_final_df = final_df.copy()
clean_final_df.to_csv("clean_final_dataframe.csv", index=False)

In [31]:
business_ready_df = clean_final_df[
    [
        "order_id",
        "order_date",
        "customer_id",
        "name",
        "city",
        "product_name",
        "category",
        "quantity",
        "total_amount",
        "method",
        "order_status",
        "high_value_order"
    ]
]
business_ready_df.to_csv("business_ready_dataset.csv", index=False)