In [12]:
import pandas as pd

def clean_orders_data(file_path):
    """Load an orders CSV and return a cleaned, date-sorted DataFrame.

    The cleaning steps are:

    1. Strip surrounding whitespace from the column names.
    2. Parse 'Order Date'; rows whose date cannot be parsed are dropped.
    3. Drop exact duplicate rows.
    4. Drop rows missing 'Customer Name', 'Quantity' or 'Unit Price'.
    5. Coerce 'Quantity', 'Unit Price' and 'Total Revenue' to numbers
       (non-numeric values become NaN).
    6. Keep only rows where both 'Quantity' and 'Unit Price' are positive.
    7. Replace 'Total Revenue' with Quantity * Unit Price wherever the
       recorded value is missing or differs from it by more than 0.01.
    8. Sort by 'Order Date' and renumber the index from 0.

    Args:
        file_path: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        A new DataFrame with the cleaned rows.

    Raises:
        KeyError: If one of the required columns is absent from the file.
    """
    df = pd.read_csv(file_path)
    df.columns = df.columns.str.strip()

    df['Order Date'] = pd.to_datetime(df['Order Date'], errors='coerce', dayfirst=False)
    df = df.dropna(subset=['Order Date'])
    df = df.drop_duplicates()
    # Explicit copy: the columns assigned below must land on an independent
    # frame, not on a filtered view (avoids chained-assignment warnings).
    df = df.dropna(subset=['Customer Name', 'Quantity', 'Unit Price']).copy()

    for column in ('Quantity', 'Unit Price', 'Total Revenue'):
        df[column] = pd.to_numeric(df[column], errors='coerce')

    # Values that failed numeric coercion are NaN; NaN > 0 is False, so
    # those rows are removed here together with the non-positive ones.
    df = df[(df['Quantity'] > 0) & (df['Unit Price'] > 0)].copy()

    # Vectorised reconciliation. Unlike a row-wise ``apply`` this also works
    # when no rows survive the filters above. A NaN recorded revenue makes
    # the comparison False, so the computed value is used in that case too.
    computed = df['Quantity'] * df['Unit Price']
    recorded = df['Total Revenue']
    agrees = (computed - recorded).abs() <= 0.01
    df['Total Revenue'] = recorded.where(agrees, computed)

    # Stable sort keeps orders that share a date in their original order.
    df = df.sort_values('Order Date', kind='mergesort')
    df = df.reset_index(drop=True)
    return df

# Clean the raw orders in 'sales.csv' and write the result to
# 'cleaned_sales.csv'; index=False leaves the row index out of the file.
cleaned_df = clean_orders_data('sales.csv')
cleaned_df.to_csv('cleaned_sales.csv', index=False)

print("Cleaning complete. Saved to 'cleaned_sales.csv'")
# Bare trailing expression: as the last statement of a notebook cell this
# shows the first rows of the cleaned frame; it has no effect in a script.
cleaned_df.head()



Cleaning complete. Saved to 'cleaned_sales.csv'


Unnamed: 0,Order ID,Customer Name,Order Date,Product,Quantity,Unit Price,Total Revenue
0,1001,John Doe,2024-01-01,Widget A,10.0,25.0,250.0
1,1002,Jane Smith,2024-01-02,Widget B,5.0,40.0,200.0
2,1006,John Doe,2024-06-01,Widget A,4.0,25.0,100.0
