In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import os

In [2]:
# Ensure directory structure
os.makedirs("data_warehouse", exist_ok=True)

### Ingestion

In [3]:
sales_df = pd.read_csv("sale_price.csv")
feedback_df = pd.read_json("customer_feedback.json")

In [4]:
sales_df

Unnamed: 0,sale_id,product_id,customer_id,sale_price,quantity,sale_date
0,1001,P101,C_001,$150.00,2.0,15/01/2023
1,1002,P102,C_002,$75,3.0,01/20/2023
2,1003,P103,C_001,$250.50,1.0,25/01/2023
3,1004,P101,C_003,$150.00,4.0,01/02/2023
4,1005,P104,C_004,$30.00,,05/02/2023
...,...,...,...,...,...,...
995,1996,P104,C_042,$250.50,4.0,03/29/2023
996,1997,P105,C_014,$30.00,1.0,03/02/2023
997,1998,P104,C_092,$75,,31/03/2023
998,1999,P103,C_063,$250.50,5.0,10/03/2023


In [5]:
feedback_df 

Unnamed: 0,customer_id,product_id,sentiment_score,review_date
0,C_001,P101,4.5,2023-01-16
1,C_002,P102,3.0,2023-01-21
2,C_003,P101,5.0,2023-02-02
3,C_005,P102,4.0,2023-02-11
4,C_001,P105,2.5,2023-03-11
...,...,...,...,...
995,C_066,P104,4.8,2023-03-27
996,C_035,P105,3.5,2023-03-04
997,C_086,P102,4.7,2023-03-18
998,C_088,P104,4.6,2023/02/18


### Cleansing

In [6]:
# Remove '$' and convert sale_price to float
sales_df['sale_price'] = sales_df['sale_price'].replace('[\$,]', '', regex=True).astype(float)

In [7]:
sales_df.head()

Unnamed: 0,sale_id,product_id,customer_id,sale_price,quantity,sale_date
0,1001,P101,C_001,150.0,2.0,15/01/2023
1,1002,P102,C_002,75.0,3.0,01/20/2023
2,1003,P103,C_001,250.5,1.0,25/01/2023
3,1004,P101,C_003,150.0,4.0,01/02/2023
4,1005,P104,C_004,30.0,,05/02/2023


In [8]:
sales_df.isnull().sum()


sale_id          0
product_id       0
customer_id      0
sale_price     200
quantity       188
sale_date        0
dtype: int64

In [9]:
# Fill missing quantity with 1 (default)
sales_df['quantity'] = sales_df['quantity'].fillna(1).astype(int)

In [10]:
sales_df.head()

Unnamed: 0,sale_id,product_id,customer_id,sale_price,quantity,sale_date
0,1001,P101,C_001,150.0,2,15/01/2023
1,1002,P102,C_002,75.0,3,01/20/2023
2,1003,P103,C_001,250.5,1,25/01/2023
3,1004,P101,C_003,150.0,4,01/02/2023
4,1005,P104,C_004,30.0,1,05/02/2023


In [11]:
# Standardize date format
sales_df['sale_date'] = pd.to_datetime(sales_df['sale_date'], format='mixed', errors='coerce')
feedback_df['review_date'] = pd.to_datetime(feedback_df['review_date'], format='mixed', errors='coerce')

In [12]:
feedback_df 

Unnamed: 0,customer_id,product_id,sentiment_score,review_date
0,C_001,P101,4.5,2023-01-16
1,C_002,P102,3.0,2023-01-21
2,C_003,P101,5.0,2023-02-02
3,C_005,P102,4.0,2023-02-11
4,C_001,P105,2.5,2023-03-11
...,...,...,...,...
995,C_066,P104,4.8,2023-03-27
996,C_035,P105,3.5,2023-03-04
997,C_086,P102,4.7,2023-03-18
998,C_088,P104,4.6,2023-02-18


In [13]:
sales_df

Unnamed: 0,sale_id,product_id,customer_id,sale_price,quantity,sale_date
0,1001,P101,C_001,150.0,2,2023-01-15
1,1002,P102,C_002,75.0,3,2023-01-20
2,1003,P103,C_001,250.5,1,2023-01-25
3,1004,P101,C_003,150.0,4,2023-01-02
4,1005,P104,C_004,30.0,1,2023-05-02
...,...,...,...,...,...,...
995,1996,P104,C_042,250.5,4,2023-03-29
996,1997,P105,C_014,30.0,1,2023-03-02
997,1998,P104,C_092,75.0,1,2023-03-31
998,1999,P103,C_063,250.5,5,2023-10-03


### Trans

In [14]:
# Compute total revenue
sales_df['total_revenue'] = sales_df['sale_price'] * sales_df['quantity']

In [15]:
# Keep the latest feedback entry per (product_id, customer_id)
feedback_df = feedback_df.sort_values('review_date').drop_duplicates(
    subset=['product_id', 'customer_id'], keep='last'
)

In [16]:
# Merge sales and feedback
merged_df = pd.merge(sales_df, feedback_df, on=['product_id', 'customer_id'], how='left')

In [17]:
merged_df

Unnamed: 0,sale_id,product_id,customer_id,sale_price,quantity,sale_date,total_revenue,sentiment_score,review_date
0,1001,P101,C_001,150.0,2,2023-01-15,300.0,4.5,2023-01-16
1,1002,P102,C_002,75.0,3,2023-01-20,225.0,4.5,2023-02-05
2,1003,P103,C_001,250.5,1,2023-01-25,250.5,,NaT
3,1004,P101,C_003,150.0,4,2023-01-02,600.0,3.7,2023-02-18
4,1005,P104,C_004,30.0,1,2023-05-02,30.0,3.9,2023-02-01
...,...,...,...,...,...,...,...,...,...
995,1996,P104,C_042,250.5,4,2023-03-29,1002.0,,NaT
996,1997,P105,C_014,30.0,1,2023-03-02,30.0,,NaT
997,1998,P104,C_092,75.0,1,2023-03-31,75.0,1.7,2023-03-31
998,1999,P103,C_063,250.5,5,2023-10-03,1252.5,,NaT


In [18]:
# Convert sale_price to numeric (force invalid entries to NaN)
merged_df['sale_price'] = pd.to_numeric(merged_df['sale_price'], errors='coerce')

# Remove rows where sale_price is 0 or NaN, and review_date is missing
merged_df = merged_df[(merged_df['sale_price'] > 0) & (merged_df['review_date'].notna())]


### 4. Loading to warehouse


In [19]:
processed_path = "data_warehouse/processed_sales_data.csv"
merged_df.to_csv(processed_path, index=False)

print(f"Processed data saved to {processed_path}")

Processed data saved to data_warehouse/processed_sales_data.csv
