In [1]:
#Loading the file first
import pandas as pd
from pathlib import Path

# Set path to your file. If the file is in the notebook folder, this is fine:
FILE = Path("Book1.xlsx")

# Load first sheet
df = pd.read_excel(FILE, sheet_name='Week-2-Sales-Data')

# Quick peek
print("Loaded rows:", len(df))
df.head()


Loaded rows: 100


Unnamed: 0,Order_ID,Product,Region,Units_Sold,Unit_Price,Revenue,Sales_Rep,Order_Date
0,ORD001,Printer,Limpopo,45,2985,134325,Rep-2,2024-03-28
1,ORD002,Headphones,Western Cape,16,15076,241216,Rep-18,2024-04-11
2,ORD003,Laptop,Western Cape,45,14860,668700,Rep-16,2024-05-18
3,ORD004,External Hard Drive,KwaZulu-Natal,21,16237,340977,Rep-3,2024-05-16
4,ORD005,Smartphone,Western Cape,41,9420,386220,Rep-17,2024-02-21


In [2]:
# Basic info
print("Columns:", list(df.columns))
print("\nShape (rows, cols):", df.shape)
print("\nData types:")
print(df.dtypes)

# More details
display(df.info())
display(df.describe(include='all').T)


Columns: ['Order_ID', 'Product', 'Region', 'Units_Sold', 'Unit_Price', 'Revenue', 'Sales_Rep', 'Order_Date']

Shape (rows, cols): (100, 8)

Data types:
Order_ID              object
Product               object
Region                object
Units_Sold             int64
Unit_Price             int64
Revenue                int64
Sales_Rep             object
Order_Date    datetime64[ns]
dtype: object
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   Order_ID    100 non-null    object        
 1   Product     100 non-null    object        
 2   Region      100 non-null    object        
 3   Units_Sold  100 non-null    int64         
 4   Unit_Price  100 non-null    int64         
 5   Revenue     100 non-null    int64         
 6   Sales_Rep   100 non-null    object        
 7   Order_Date  100 non-null    datetime64[ns]
dtypes: datetime64[

None

Unnamed: 0,count,unique,top,freq,mean,min,25%,50%,75%,max,std
Order_ID,100.0,100.0,ORD001,1.0,,,,,,,
Product,100.0,7.0,Smartwatch,21.0,,,,,,,
Region,100.0,7.0,Western Cape,24.0,,,,,,,
Units_Sold,100.0,,,,28.23,2.0,16.75,27.5,42.25,49.0,14.159877
Unit_Price,100.0,,,,12616.57,927.0,6916.0,12380.5,17670.25,24995.0,6927.580312
Revenue,100.0,,,,352953.38,7914.0,110355.0,324875.0,528600.0,1040204.0,263247.77183
Sales_Rep,100.0,20.0,Rep-1,8.0,,,,,,,
Order_Date,100.0,,,,2024-03-24 10:04:48,2024-01-01 00:00:00,2024-02-05 00:00:00,2024-03-27 12:00:00,2024-05-13 18:00:00,2024-06-29 00:00:00,


In [3]:
# Missing values per column
missing = df.isnull().sum().sort_values(ascending=False)
print("Missing values per column:\n", missing)

# Duplicate rows
total_dups = df.duplicated().sum()
print(f"\nTotal exact duplicate rows: {total_dups}")

# If there's an Order_ID column, check duplicates by that id
if 'Order_ID' in df.columns:
    dup_order_ids = df.duplicated(subset=['Order_ID']).sum()
    print("Duplicate Order_ID count:", dup_order_ids)


Missing values per column:
 Order_ID      0
Product       0
Region        0
Units_Sold    0
Unit_Price    0
Revenue       0
Sales_Rep     0
Order_Date    0
dtype: int64

Total exact duplicate rows: 0
Duplicate Order_ID count: 0


In [4]:
# Trim whitespace in object/string columns and standardize Region/Product casing
obj_cols = df.select_dtypes(include=['object', 'string']).columns.tolist()
for c in obj_cols:
    # Leave NaN as NaN; apply .str.strip() only to strings
    df[c] = df[c].where(df[c].isnull(), df[c].str.strip())

# Optional normalizations (uncomment if you want them)
if 'Region' in df.columns:
    df['Region'] = df['Region'].where(df['Region'].isnull(), df['Region'].str.title())
if 'Product' in df.columns:
    df['Product'] = df['Product'].where(df['Product'].isnull(), df['Product'].str.title())

# Quick check
df[obj_cols].head()


Unnamed: 0,Order_ID,Product,Region,Sales_Rep
0,ORD001,Printer,Limpopo,Rep-2
1,ORD002,Headphones,Western Cape,Rep-18
2,ORD003,Laptop,Western Cape,Rep-16
3,ORD004,External Hard Drive,Kwazulu-Natal,Rep-3
4,ORD005,Smartphone,Western Cape,Rep-17


In [5]:
numeric_candidates = ['Revenue', 'Units_Sold', 'Unit_Price']
for col in numeric_candidates:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

# Show how many NaNs appeared after coercion
for col in numeric_candidates:
    if col in df.columns:
        print(f"{col} - nulls after coercion:", df[col].isnull().sum())


Revenue - nulls after coercion: 0
Units_Sold - nulls after coercion: 0
Unit_Price - nulls after coercion: 0


In [6]:
if set(['Revenue','Units_Sold','Unit_Price']).issubset(df.columns):
    missing_before = df['Revenue'].isnull().sum()
    mask = df['Revenue'].isnull() & df['Units_Sold'].notnull() & df['Unit_Price'].notnull()
    df.loc[mask, 'Revenue'] = df.loc[mask, 'Units_Sold'] * df.loc[mask, 'Unit_Price']
    missing_after = df['Revenue'].isnull().sum()
    print(f"Revenue missing before: {missing_before}; after filling from units*price: {missing_after}")


Revenue missing before: 0; after filling from units*price: 0


In [7]:
date_col = 'Order_Date'  # adjust if different
if date_col in df.columns:
    # If your dates are in day/month/year format set dayfirst=True
    df[date_col] = pd.to_datetime(df[date_col], errors='coerce', dayfirst=False)
    print("Order_Date - nulls after conversion:", df[date_col].isnull().sum())
    display(df[[date_col]].head())
else:
    print("No Order_Date column found.")


Order_Date - nulls after conversion: 0


Unnamed: 0,Order_Date
0,2024-03-28
1,2024-04-11
2,2024-05-18
3,2024-05-16
4,2024-02-21


In [8]:
# Option A: drop exact duplicate rows
before = len(df)
df = df.drop_duplicates()
after = len(df)
print(f"Dropped {before-after} exact duplicate rows (exact match on all columns).")

# Option B: if Order_ID exists and should be unique, drop by Order_ID keeping first appearance
if 'Order_ID' in df.columns:
    before = len(df)
    df = df.drop_duplicates(subset=['Order_ID'], keep='first')
    after = len(df)
    print(f"After ensuring unique Order_IDs, rows dropped: {before-after}")


Dropped 0 exact duplicate rows (exact match on all columns).
After ensuring unique Order_IDs, rows dropped: 0


In [9]:
# Summary of nulls
print("Nulls before handling:")
display(df.isnull().sum())

# Strategy example:
# - Drop rows missing Order_Date or Product because these are critical for analysis.
critical_cols = []
if 'Order_Date' in df.columns:
    critical_cols.append('Order_Date')
if 'Product' in df.columns:
    critical_cols.append('Product')

# Drop rows missing critical info
df = df.dropna(subset=critical_cols)
print("\nRows after dropping rows missing critical columns:", len(df))

# Fill non-critical categorical with 'Unknown'
if 'Region' in df.columns:
    df['Region'] = df['Region'].fillna('Unknown')

# Impute numeric columns sensibly
if 'Units_Sold' in df.columns:
    df['Units_Sold'] = df['Units_Sold'].fillna(0)            # if missing treat as 0 sold
if 'Unit_Price' in df.columns and df['Unit_Price'].isnull().any():
    df['Unit_Price'] = df['Unit_Price'].fillna(df['Unit_Price'].median())

# Final check
print("\nNulls after handling:")
display(df.isnull().sum())


Nulls before handling:


Order_ID      0
Product       0
Region        0
Units_Sold    0
Unit_Price    0
Revenue       0
Sales_Rep     0
Order_Date    0
dtype: int64


Rows after dropping rows missing critical columns: 100

Nulls after handling:


Order_ID      0
Product       0
Region        0
Units_Sold    0
Unit_Price    0
Revenue       0
Sales_Rep     0
Order_Date    0
dtype: int64

In [10]:
print("Final dtypes:")
print(df.dtypes)

# Example: add YearMonth column for reporting
if 'Order_Date' in df.columns:
    df['YearMonth'] = df['Order_Date'].dt.to_period('M').astype(str)

# Quick value counts to inspect categories
if 'Region' in df.columns:
    print("\nRegion distribution:")
    display(df['Region'].value_counts().head(20))
if 'Product' in df.columns:
    print("\nTop products by record count:")
    display(df['Product'].value_counts().head(20))


Final dtypes:
Order_ID              object
Product               object
Region                object
Units_Sold             int64
Unit_Price             int64
Revenue                int64
Sales_Rep             object
Order_Date    datetime64[ns]
dtype: object

Region distribution:


Region
Western Cape     24
Gauteng          16
North West       14
Eastern Cape     14
Limpopo          11
Free State       11
Kwazulu-Natal    10
Name: count, dtype: int64


Top products by record count:


Product
Smartwatch             21
Tablet                 16
Printer                15
Smartphone             15
Laptop                 13
Headphones             11
External Hard Drive     9
Name: count, dtype: int64

In [11]:
OUT = Path("Book1_cleaned.xlsx")
df.to_excel(OUT, index=False)
print("Saved cleaned dataset to", OUT.resolve())


Saved cleaned dataset to C:\Users\CAPACITI-JHB\Documents\Book1\Book1_cleaned.xlsx
