In [8]:
import pandas as pd

# Read the raw Sales export; the file is decoded as ISO-8859-1
sales_source = 'Sales.csv'
sales_df = pd.read_csv(sales_source, encoding='ISO-8859-1')

# Preview the leading rows as a quick sanity check on the load
first_rows = sales_df.head()
print(first_rows)


   Order Number  Line Item Order Date Delivery Date  CustomerKey  StoreKey  \
0        366000          1   1/1/2016           NaN       265598        10   
1        366001          1   1/1/2016     1/13/2016      1269051         0   
2        366001          2   1/1/2016     1/13/2016      1269051         0   
3        366002          1   1/1/2016     1/12/2016       266019         0   
4        366002          2   1/1/2016     1/12/2016       266019         0   

   ProductKey  Quantity Currency Code  
0        1304         1           CAD  
1        1048         2           USD  
2        2007         1           USD  
3        1106         7           CAD  
4         373         1           CAD  


In [9]:
# Tally the null entries per column of the Sales frame and report them
missing_values = sales_df.isna().sum()
print("Missing values in each column:", missing_values, sep="\n")


Missing values in each column:
Order Number         0
Line Item            0
Order Date           0
Delivery Date    49719
CustomerKey          0
StoreKey             0
ProductKey           0
Quantity             0
Currency Code        0
dtype: int64


In [10]:
# Keep only the rows that carry a Delivery Date.
# NOTE(review): per the recorded output this discards 49719 of 62884 rows;
# confirm that a missing Delivery Date really marks a row as unusable.
required_columns = ['Delivery Date']
sales_cleaned_df = sales_df.dropna(subset=required_columns)

# Report the row counts on either side of the drop
print(f"Number of rows before dropping missing values: {len(sales_df)}")
print(f"Number of rows after dropping missing values: {len(sales_cleaned_df)}")


Number of rows before dropping missing values: 62884
Number of rows after dropping missing values: 13165


In [11]:
# A sales line is identified by the (Order Number, Line Item) pair;
# flag every row whose pair already appeared earlier in the frame.
line_key = ['Order Number', 'Line Item']
duplicates = sales_df.duplicated(subset=line_key)
print(f"Number of duplicate entries: {duplicates.sum()}")

# Count nulls in each identifier column
id_columns = ['Order Number', 'CustomerKey', 'StoreKey', 'ProductKey']
missing_keys = sales_df.loc[:, id_columns].isna().sum()
print(f"Missing values in key columns:\n{missing_keys}")

# Compare distinct order numbers against the number of rows.
# Note: total_orders is the row count of the frame (one row per line item),
# so it is expected to exceed the number of distinct Order Numbers.
order_numbers = sales_df['Order Number']
unique_orders = order_numbers.nunique()
total_orders = sales_df.shape[0]
print(f"Unique Order Numbers: {unique_orders}")
print(f"Total number of orders: {total_orders}")


Number of duplicate entries: 0
Missing values in key columns:
Order Number    0
CustomerKey     0
StoreKey        0
ProductKey      0
dtype: int64
Unique Order Numbers: 26326
Total number of orders: 62884


In [12]:
# Parse both date columns to datetime; values that cannot be parsed
# become NaT (errors='coerce') instead of raising.
for date_column in ('Order Date', 'Delivery Date'):
    sales_df[date_column] = pd.to_datetime(sales_df[date_column], errors='coerce')

# Force 'Quantity' to a numeric dtype; non-numeric entries become NaN
quantity_raw = sales_df['Quantity']
sales_df['Quantity'] = pd.to_numeric(quantity_raw, errors='coerce')

# Show the resulting column dtypes to confirm the conversions took effect
column_types = sales_df.dtypes
print(column_types)


Order Number              int64
Line Item                 int64
Order Date       datetime64[ns]
Delivery Date    datetime64[ns]
CustomerKey               int64
StoreKey                  int64
ProductKey                int64
Quantity                  int64
Currency Code            object
dtype: object


In [13]:
# Re-run the key integrity checks on the current state of sales_df.

# Rows repeating an (Order Number, Line Item) pair seen earlier in the frame
pair_columns = ['Order Number', 'Line Item']
duplicates = sales_df.duplicated(subset=pair_columns)
print(f"Number of duplicate Order Number and Line Item combinations: {duplicates.sum()}")

# Null counts for each identifier column
identifier_columns = ['Order Number', 'CustomerKey', 'StoreKey', 'ProductKey']
identifier_frame = sales_df[identifier_columns]
missing_keys = identifier_frame.isna().sum()
print(f"Missing values in key columns:\n{missing_keys}")

# Distinct order numbers versus number of rows.
# Note: total_orders is the row count (one row per line item), not the
# number of distinct orders.
unique_orders = sales_df['Order Number'].nunique()
total_orders = sales_df.shape[0]
print(f"Unique Order Numbers: {unique_orders}")
print(f"Total number of orders: {total_orders}")


Number of duplicate Order Number and Line Item combinations: 0
Missing values in key columns:
Order Number    0
CustomerKey     0
StoreKey        0
ProductKey      0
dtype: int64
Unique Order Numbers: 26326
Total number of orders: 62884


In [14]:
# Save the cleaned Sales dataset.
# sales_cleaned_df was originally derived from sales_df before the date and
# quantity type conversions were applied to sales_df, so that earlier copy
# still holds the raw, unconverted values. Re-derive it here from the current
# sales_df so the exported file reflects those conversions; any Delivery Date
# that failed to parse (coerced to NaT) is excluded by the same dropna.
sales_cleaned_df = sales_df.dropna(subset=['Delivery Date'])

# index=False keeps the positional index out of the exported file
sales_cleaned_df.to_csv('cleaned_Sales.csv', index=False)