# Task I - Data Preparation

This notebook performs data cleaning and initial exploration of the online retail dataset.

In [None]:
import os
import warnings

import numpy as np
import pandas as pd

warnings.filterwarnings('ignore')

## 1. Load the Dataset

In [None]:
# Location of the raw workbook, relative to the notebook's working directory.
data_path = '../online_retail_II.xlsx'

# read_excel is called with its defaults, so only the first worksheet is read.
# NOTE(review): if the workbook contains more than one sheet, the others are
# ignored here — confirm that is intended.
df = pd.read_excel(data_path)

# Quick sanity check of what was loaded: dimensions, then per-column dtypes.
print("Dataset shape:", df.shape)
print("\nColumn names and types:", df.dtypes, sep="\n")

# Kept as the cell's last expression so the notebook renders the preview table.
print("\nFirst few rows:")
df.head()

## 2. Initial Data Overview

In [None]:
# Structural overview of the raw frame: dtypes, non-null counts, memory usage.
print("Dataset Info:")
# DataFrame.info() writes its report straight to stdout and returns None, so it
# must not be wrapped in print() — doing so appends a stray "None" line.
df.info()

# Number of missing cells in each column.
print("\nMissing values:")
print(df.isnull().sum())

# Summary statistics; kept as the cell's last expression for rich display.
print("\nDataset statistics:")
df.describe()

## 3. Data Cleaning

In [None]:
print(f"Original dataset size: {len(df)} rows")

# Work on a copy so the raw frame `df` stays available for the before/after
# comparison printed at the end of this cell.
df_clean = df.copy()

# Step 1: drop every row that has a missing value in any column.
print("\n--- Removing Null Values ---")
# Count affected ROWS, not missing cells: a row with two empty fields must be
# counted once for the number to match its label.
rows_with_nulls = df_clean.isnull().any(axis=1).sum()
print(f"Rows with null values before removal: {rows_with_nulls}")
df_clean = df_clean.dropna()
print(f"Rows after removing nulls: {len(df_clean)}")

# Step 2: drop rows that are exact duplicates across all columns.
print("\n--- Removing Duplicate Rows ---")
print(f"Duplicate rows before removal: {df_clean.duplicated().sum()}")
df_clean = df_clean.drop_duplicates()
print(f"Rows after removing duplicates: {len(df_clean)}")

# Step 3: keep only rows whose Price and Quantity are both strictly positive.
print("\n--- Handling Negative Values ---")
print(f"Negative prices: {(df_clean['Price'] < 0).sum()}")
print(f"Negative quantities: {(df_clean['Quantity'] < 0).sum()}")
print(f"Zero prices: {(df_clean['Price'] == 0).sum()}")
print(f"Zero quantities: {(df_clean['Quantity'] == 0).sum()}")

is_positive = (df_clean['Price'] > 0) & (df_clean['Quantity'] > 0)
# .copy() makes the result an independent frame, so later cells that add
# columns to df_clean are not assigning into a slice of the previous frame.
df_clean = df_clean[is_positive].copy()
print(f"\nRows after removing negative and zero values: {len(df_clean)}")

# Overall effect of the three steps relative to the raw frame.
print(f"\nTotal rows removed: {len(df) - len(df_clean)}")
# Guard the ratio so an empty input reports 0% instead of raising ZeroDivisionError.
retained_pct = len(df_clean) / len(df) * 100 if len(df) else 0.0
print(f"Final dataset size: {len(df_clean)} rows ({retained_pct:.2f}% of original)")

## 4. Analysis of Items (Products)

In [None]:
# Headline cardinalities for the product dimension.
n_stock_codes = df_clean['StockCode'].nunique()
n_descriptions = df_clean['Description'].nunique()
print(f"Total unique items (StockCode): {n_stock_codes}")
print(f"Total unique descriptions: {n_descriptions}")

print("\n--- Item Statistics ---")
# One row per StockCode. Each keyword names an output column and pairs it with
# the (source column, aggregation) that produces it; numeric results are
# rounded to two decimals.
item_stats = (
    df_clean
    .groupby('StockCode')
    .agg(
        Description=('Description', 'first'),
        Avg_Price=('Price', 'mean'),
        Min_Price=('Price', 'min'),
        Max_Price=('Price', 'max'),
        Std_Price=('Price', 'std'),
        Total_Quantity=('Quantity', 'sum'),
        Avg_Quantity=('Quantity', 'mean'),
        Transaction_Count=('Quantity', 'count'),   # rows per item
        Invoice_Count=('Invoice', 'nunique'),      # distinct invoices per item
    )
    .round(2)
)

print(item_stats.head(10))

In [None]:
# Aggregate once per StockCode; both rankings below are derived from this one
# table instead of repeating the same groupby twice. Named aggregation labels
# the columns directly, so no positional renaming is needed.
item_totals = df_clean.groupby('StockCode').agg(
    Description=('Description', 'first'),
    Total_Quantity=('Quantity', 'sum'),
    Avg_Price=('Price', 'mean'),
    Transactions=('Invoice', 'count'),   # non-null Invoice values, i.e. rows per item
)

print("Top 10 Items by Total Quantity Sold:")
top_items_quantity = item_totals.sort_values('Total_Quantity', ascending=False).head(10)
print(top_items_quantity)

print("\n\nTop 10 Items by Average Price:")
# Columns are reordered so the ranking metric appears right after the description.
top_items_price = (
    item_totals[['Description', 'Avg_Price', 'Total_Quantity', 'Transactions']]
    .sort_values('Avg_Price', ascending=False)
    .head(10)
)
print(top_items_price)

## 5. Customer and Geographic Analysis

In [None]:
# Headline cardinalities for the customer and country dimensions.
n_customers = df_clean['Customer ID'].nunique()
n_countries = df_clean['Country'].nunique()
print(f"Total unique customers: {n_customers}")
print(f"Total unique countries: {n_countries}")

# Row counts per country; value_counts() already returns them in descending order.
print("\nTop 10 Countries by Transaction Count:")
country_transactions = df_clean['Country'].value_counts().head(10)
print(country_transactions)

# Units sold per country, largest first.
print("\nTop 10 Countries by Total Quantity:")
quantity_by_country = df_clean.groupby('Country')['Quantity'].sum()
country_quantity = quantity_by_country.sort_values(ascending=False).head(10)
print(country_quantity)

print("\nTop 10 Customers by Total Spending:")
# Adds a per-row spend column to df_clean itself, so it remains on the frame
# for any cell that runs after this one.
df_clean['Total_Spending'] = df_clean['Price'].mul(df_clean['Quantity'])
spending_by_customer = df_clean.groupby('Customer ID')['Total_Spending'].sum()
customer_spending = spending_by_customer.sort_values(ascending=False).head(10)
print(customer_spending)

## 6. Price and Quantity Distribution

In [None]:
# Price and Quantity get identical treatment, so both are driven from one
# (heading, column) table rather than two copies of the same statements.
for heading, column in (("Price Statistics:", 'Price'), ("\nQuantity Statistics:", 'Quantity')):
    print(heading)
    print(df_clean[column].describe().round(2))

print("\nTransaction Value (Price x Quantity) Statistics:")
# Stored on df_clean itself as a per-row value column.
df_clean['Transaction_Value'] = df_clean['Price'].mul(df_clean['Quantity'])
transaction_value_summary = df_clean['Transaction_Value'].describe().round(2)
print(transaction_value_summary)

## 7. Summary and Data Quality Report

In [None]:
# Row counts before and after cleaning, computed once and reused below.
n_original = len(df)
n_cleaned = len(df_clean)
n_removed = n_original - n_cleaned

# Bounds of the invoice timestamps that survived cleaning.
first_invoice = df_clean['InvoiceDate'].min()
last_invoice = df_clean['InvoiceDate'].max()

# Key/value pairs are printed in insertion order. Values that need a fixed
# number format are pre-rendered as strings.
# Relies on the 'Transaction_Value' column already being present on df_clean.
# NOTE(review): money amounts are prefixed with "$" — confirm that matches the
# dataset's currency.
summary_report = {
    'Original Rows': n_original,
    'Cleaned Rows': n_cleaned,
    'Rows Removed': n_removed,
    'Removal Percentage': f"{n_removed / n_original * 100:.2f}%",
    'Unique Products': df_clean['StockCode'].nunique(),
    'Unique Customers': df_clean['Customer ID'].nunique(),
    'Countries': df_clean['Country'].nunique(),
    'Date Range': f"{first_invoice} to {last_invoice}",
    'Avg Price': f"${df_clean['Price'].mean():.2f}",
    'Avg Quantity': f"{df_clean['Quantity'].mean():.2f}",
    'Total Revenue': f"${df_clean['Transaction_Value'].sum():.2f}"
}

# Render the report between two 50-character rules.
rule = "=" * 50
print("\n" + rule)
print("DATA PREPARATION SUMMARY")
print(rule)
for label, shown in summary_report.items():
    print(label, shown, sep=": ")
print(rule)

## 8. Export Cleaned Data

In [None]:
# Destination of the cleaned dataset, relative to the notebook's working directory.
output_path = 'cleaned_data.csv'

# index=False keeps the DataFrame index out of the CSV.
df_clean.to_csv(output_path, index=False)
print(f"Cleaned data exported to '{output_path}'")

# Report the size of the file actually written to disk. DataFrame.memory_usage()
# measures the in-memory frame, which is not the size of the exported CSV.
file_size = os.path.getsize(output_path) / (1024**2)
print("File size:", round(file_size, 2), "MB")