# 🧹 Customer Orders Data Cleaning
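This notebook walks through cleaning a simulated customer-orders dataset: it loads the raw CSV, inspects it for missing and duplicate values, applies the cleaning steps (de-duplication, dropping rows without an Order ID or Shipping Address, name normalization and splitting, date parsing, removal of negative prices, and median imputation of missing quantities), and saves the cleaned result to a new CSV.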

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide display settings: never truncate the column list when a
# DataFrame is rendered, and apply seaborn's theme with the "whitegrid" style.
pd.options.display.max_columns = None
sns.set(style='whitegrid')


## 🔽 Load the Raw Dataset

In [None]:
# Read the raw orders CSV (the path is relative to the notebook's working
# directory) and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="../data/raw_customer_orders.csv")
df.head(n=5)

## 📋 Basic Data Overview

In [None]:
# Structural summary first (printed by .info()), then descriptive statistics
# for every column regardless of dtype as the cell's displayed result.
df.info()
df.describe(include="all")

## 🔎 Missing & Duplicate Values

In [None]:
# Count missing cells per column and fully duplicated rows up front, then
# report both.
missing_counts = df.isna().sum()
n_duplicates = df.duplicated().sum()

print("Missing values per column:", missing_counts, sep="\n")
print("\nDuplicate rows:", n_duplicates)

## 🛠 Data Cleaning Steps

In [None]:
# Row-level filters first: remove exact duplicate rows, then rows that lack
# an Order ID or a Shipping Address. The explicit .copy() makes the result an
# independent frame, so the column assignments further down do not raise
# SettingWithCopyWarning.
df = (
    df.drop_duplicates()
      .dropna(subset=['Order ID', 'Shipping Address'])
      .copy()
)

# Normalize Customer Name (trim, Title Case) and split it at the first run of
# whitespace: the part before it is the first name, the remainder the last
# name. Splitting on whitespace instead of a literal ' ' keeps a doubled space
# from leaking into Last Name, and reindexing to columns 0/1 guarantees two
# columns even when no name in the data contains a space (the missing part is
# then NaN).
name_parts = (
    df['Customer Name']
    .str.strip()
    .str.title()
    .str.split(n=1, expand=True)
    .reindex(columns=[0, 1])
)

# Replace Customer Name with First Name / Last Name at the same position.
name_pos = df.columns.get_loc('Customer Name')
df = df.drop(columns=['Customer Name'])
df.insert(name_pos, 'First Name', name_parts[0])
df.insert(name_pos + 1, 'Last Name', name_parts[1])

# Convert Order Date to datetime, reading ambiguous dates day-first; values
# that cannot be parsed become NaT instead of raising.
df['Order Date'] = pd.to_datetime(df['Order Date'], dayfirst=True, errors='coerce')

# Remove negative prices. NOTE: a missing price compares False against >= 0,
# so rows without a Product Price are dropped by this filter as well.
# The .copy() keeps the Quantity assignment below from writing to a slice.
df = df[df['Product Price'] >= 0].copy()

# Fill missing quantities with the median of the remaining rows.
df['Quantity'] = df['Quantity'].fillna(df['Quantity'].median())


## ✅ Final Cleaned Dataset Preview

In [None]:
# Show the first five rows of the frame as it stands at this point.
df.head(n=5)

## 💾 Save Cleaned Dataset

In [None]:
# Write the frame to CSV without the index column, then confirm.
cleaned_path = "../data/clean_customer_orders.csv"
df.to_csv(cleaned_path, index=False)
print("Cleaned data saved successfully.")