In [79]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

In [80]:
# Load the two raw inputs from the notebook's working directory:
# the order-level table (CSV) and the customer-level table (JSON).
df_csv = pd.read_csv(filepath_or_buffer="orders.csv")
df_json = pd.read_json(path_or_buf="customers.json")

In [81]:
# Plain-text preview of the first five rows of the CSV frame
# (header line and table are separated by a newline, as with two print calls).
print("\nFirst 5 rows of CSV dataset:", df_csv.head(n=5), sep="\n")


First 5 rows of CSV dataset:
  customer_id order_id   age  gender       city product_category   price  \
0        C001    O1001  23.0    Male     Mumbai      Electronics  599.99   
1        C002    O1002  35.0  Female      Delhi          Fashion   49.99   
2        C003    O1003  29.0    Male  Bangalore             Home  120.50   
3        C004    O1004  41.0  Female    Chennai           Beauty   35.00   
4        C005    O1005   NaN    Male    Kolkata           Sports   89.99   

   quantity  discount  order_date  
0       1.0      10.0  2025-01-01  
1       2.0       5.0  2025-01-02  
2       1.0       NaN  2025-01-03  
3       3.0       0.0  2025-01-04  
4       2.0      15.0  2025-01-05  


In [82]:
# Plain-text preview of the first five rows of the JSON frame
# (header line and table are separated by a newline, as with two print calls).
print("\nFirst 5 rows of JSON dataset:", df_json.head(n=5), sep="\n")


First 5 rows of JSON dataset:
  customer_id  loyalty_points  membership_years  avg_rating  \
0        C001          1200.0               3.0         4.5   
1        C002           850.0               2.0         4.2   
2        C003             NaN               1.0         3.8   
3        C004           640.0               4.0         4.7   
4        C005           500.0               NaN         4.0   

  preferred_payment_method  last_login_days_ago  
0              Credit Card                    5  
1                      UPI                   12  
2               Debit Card                   20  
3              Net Banking                    2  
4                      UPI                   15  


In [83]:
# Combine the CSV and JSON frames on their shared customer_id key.
# how="inner" keeps only customer_ids that appear in both frames.
df = df_csv.merge(df_json, how="inner", on="customer_id")

# Plain-text preview of the merged frame.
print("\nFirst 5 rows AFTER merge:", df.head(n=5), sep="\n")


First 5 rows AFTER merge:
  customer_id order_id   age  gender       city product_category   price  \
0        C001    O1001  23.0    Male     Mumbai      Electronics  599.99   
1        C002    O1002  35.0  Female      Delhi          Fashion   49.99   
2        C003    O1003  29.0    Male  Bangalore             Home  120.50   
3        C004    O1004  41.0  Female    Chennai           Beauty   35.00   
4        C005    O1005   NaN    Male    Kolkata           Sports   89.99   

   quantity  discount  order_date  loyalty_points  membership_years  \
0       1.0      10.0  2025-01-01          1200.0               3.0   
1       2.0       5.0  2025-01-02           850.0               2.0   
2       1.0       NaN  2025-01-03             NaN               1.0   
3       3.0       0.0  2025-01-04           640.0               4.0   
4       2.0      15.0  2025-01-05           500.0               NaN   

   avg_rating preferred_payment_method  last_login_days_ago  
0         4.5              

In [84]:
# Count missing cells per column once, then reuse the counts for the grand total.
missing_before = df.isna().sum()

print("\nMissing values in each column before cleaning:")
print(missing_before)
print("\nTotal missing values before cleaning:", missing_before.sum())


Missing values in each column before cleaning:
customer_id                  0
order_id                     0
age                         10
gender                       0
city                         0
product_category             0
price                        5
quantity                     7
discount                     4
order_date                   0
loyalty_points              14
membership_years             1
avg_rating                   2
preferred_payment_method     0
last_login_days_ago          0
dtype: int64

Total missing values before cleaning: 43


In [85]:
# Impute missing numeric values with each column's mean.
# "number" selects every numeric dtype rather than only int64/float64, so a
# numeric column stored at another width (e.g. int32 or float32) is no longer
# silently skipped and left with its NaNs.
numeric_cols = df.select_dtypes(include="number").columns

# The column means are computed once (mean() skips NaN by default) and
# DataFrame.fillna aligns them to the columns by label, so each column is
# filled with its own mean. Columns without missing values are left unchanged.
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

In [86]:
# Re-count missing cells per column after imputation; the total reuses the same counts.
missing_after = df.isna().sum()

print("\nMissing values in each column after cleaning:")
print(missing_after)
print("\nTotal missing values after cleaning:", missing_after.sum())


Missing values in each column after cleaning:
customer_id                 0
order_id                    0
age                         0
gender                      0
city                        0
product_category            0
price                       0
quantity                    0
discount                    0
order_date                  0
loyalty_points              0
membership_years            0
avg_rating                  0
preferred_payment_method    0
last_login_days_ago         0
dtype: int64

Total missing values after cleaning: 0


In [87]:
# Derived feature: element-wise product of price and quantity, stored as a new column.
# NOTE(review): the discount column is not used in this amount — confirm that is intended.
df["total_amount"] = df["price"].mul(df["quantity"])

In [88]:
# Numeric columns that are rescaled below (the derived total_amount included).
cols_to_normalize = [
    "age",
    "price",
    "quantity",
    "discount",
    "loyalty_points",
    "membership_years",
    "avg_rating",
    "last_login_days_ago",
    "total_amount",
]

# Learn the per-column min/max with a default-configured MinMaxScaler, then
# overwrite the selected columns in place with their scaled values.
scaler = MinMaxScaler()
scaler.fit(df[cols_to_normalize])
df[cols_to_normalize] = scaler.transform(df[cols_to_normalize])

In [89]:
# Write the processed frame to disk without the row index, then confirm on stdout.
df.to_csv(path_or_buf="final_processed_data.csv", index=False)
print("\nProcessing complete. File saved as final_processed_data.csv")


Processing complete. File saved as final_processed_data.csv
