### Data Analysis

- **How many rows**: Number of rows in the dataset.
- **How many columns**: Number of columns in the dataset.
- **List of the columns**: List of columns in the dataset.
- **Total Products**: Total number of product line items (rows) in the dataset.
- **Distinct Products**: Number of unique products.
- **Total Transactions**: Number of transactions recorded.
- **Total Customers**: Number of customers in the dataset.
- **Distinct Categories**: Number of unique product categories.


### Data Validation Part

In [9]:
import pandas as pd
import os

# ===== INPUT FILES =====
current_dir = os.getcwd()
dataset_dir = os.path.join(current_dir, "Dataset")

excel_file = 'not_validated_dataset_with_category.xlsx'
excel_file_path = os.path.join(dataset_dir, excel_file)

excel_file2 = 'validated_distinct_products_with_categories.xlsx'
excel_file_path2 = os.path.join(dataset_dir, excel_file2)

# df1 is the not-yet-validated dataset, df2 the validated item/category
# lookup (going by the file names above).
df1 = pd.read_excel(excel_file_path)
df2 = pd.read_excel(excel_file_path2)

# ===== DIAGNOSTICS =====
print("=== DIAGNOSTIC ANALYSIS ===")
print(f"df1 shape: {df1.shape}")
print(f"df2 shape: {df2.shape}")

print(f"\ndf1 columns: {list(df1.columns)}")
print(f"df2 columns: {list(df2.columns)}")

# Everything below joins on 'Itemname', so stop here with a clear message
# instead of letting the first df['Itemname'] lookup fail with a bare KeyError.
frames_missing_key = [
    frame_label
    for frame_label, frame in (('df1', df1), ('df2', df2))
    if 'Itemname' not in frame.columns
]
for frame_label in frames_missing_key:
    print(f" ERROR: 'Itemname' column not found in {frame_label}!")
if frames_missing_key:
    raise KeyError(
        "'Itemname' column is required in both inputs; missing from: "
        + ", ".join(frames_missing_key)
    )

print(f"\nNull values in df1['Itemname']: {df1['Itemname'].isnull().sum()}")
print(f"Null values in df2['Itemname']: {df2['Itemname'].isnull().sum()}")

df1_items = set(df1['Itemname'].dropna())
df2_items = set(df2['Itemname'].dropna())

print(f"\nUnique items in df1: {len(df1_items)}")
print(f"Unique items in df2: {len(df2_items)}")

# Exact (case- and whitespace-sensitive) set differences between the two files.
items_in_df1_not_in_df2 = df1_items - df2_items
items_in_df2_not_in_df1 = df2_items - df1_items

print(f"\nItems in df1 but NOT in df2: {len(items_in_df1_not_in_df2)}")
print(f"Items in df2 but NOT in df1: {len(items_in_df2_not_in_df1)}")

print(f"\n{'='*80}")
print(f"ALL ITEMS IN DF1 BUT NOT IN DF2 ({len(items_in_df1_not_in_df2)} items):")
print(f"{'='*80}")
for i, item in enumerate(sorted(items_in_df1_not_in_df2), 1):
    print(f"{i:2d}. '{item}'")

print(f"\n{'='*80}")
print(f"ALL ITEMS IN DF2 BUT NOT IN DF1 ({len(items_in_df2_not_in_df1)} items):")
print(f"{'='*80}")
for i, item in enumerate(sorted(items_in_df2_not_in_df1), 1):
    print(f"{i:2d}. '{item}'")

print(f"\n{'='*80}")
print("ANALYSIS OF UNMATCHED ITEMS FROM DF1:")
print(f"{'='*80}")

df1_unmatched_counts = df1[df1['Itemname'].isin(items_in_df1_not_in_df2)]['Itemname'].value_counts()
print(f"Total rows affected by unmatched items: {df1_unmatched_counts.sum()}")
print("\nTop 10 most frequent unmatched items in df1:")
for item, count in df1_unmatched_counts.head(10).items():
    print(f"  '{item}': {count} occurrences")

# Count every item once up front; the per-item lookup below is then a cheap
# .get() instead of a full value_counts() pass over df1 for each item.
df1_item_counts = df1['Itemname'].value_counts()
unmatched_df1 = pd.DataFrame({
    'Itemname': list(items_in_df1_not_in_df2),
    'Frequency_in_df1': [df1_item_counts.get(item, 0) for item in items_in_df1_not_in_df2]
}).sort_values('Frequency_in_df1', ascending=False)

unmatched_df2 = pd.DataFrame({
    'Itemname': list(items_in_df2_not_in_df1)
})

print("\n=== STRING ANALYSIS ===")
print("Sample items from df1:")
for item in df1['Itemname'].dropna().head(5):
    print(f"  '{item}' (length: {len(str(item))})")

print("\nSample items from df2:")
for item in df2['Itemname'].dropna().head(5):
    print(f"  '{item}' (length: {len(str(item))})")

# ===== MERGE =====
# Left join keeps every df1 row; rows whose item has no match in df2 get NaN
# in the columns that come from df2.
merged_df = df1.merge(df2, on='Itemname', how='left')

print("\n=== MERGE RESULTS ===")
print(f"Original df1 rows: {len(df1)}")
print(f"Merged rows: {len(merged_df)}")
if len(merged_df) != len(df1):
    # A left join only gains rows when the right-hand key is duplicated.
    print("Warning: merge changed the row count - df2 contains duplicate 'Itemname' values.")

df2_columns = [col for col in df2.columns if col != 'Itemname']
if df2_columns:
    missing_data_mask = merged_df[df2_columns].isnull().all(axis=1)
    rows_with_missing_data = missing_data_mask.sum()
    print(f"Rows with NO data from df2: {rows_with_missing_data}")
    print(f"Rows with data from df2: {len(merged_df) - rows_with_missing_data}")

    if rows_with_missing_data > 0:
        print("\nFirst 10 items that didn't get matched:")
        unmatched_items = merged_df[missing_data_mask]['Itemname'].unique()[:10]
        for item in unmatched_items:
            print(f"  '{item}'")

output_file = 'full_validated_dataset.xlsx'
output_file_path = os.path.join(dataset_dir, output_file)

# Replace the old 'category' with the validated one only when the validated
# column is actually present; otherwise the original column would be dropped
# with nothing to take its place.
if 'Corrected_Category' in merged_df.columns:
    merged_df = (
        merged_df
        .drop(columns=['category'], errors='ignore')
        .rename(columns={'Corrected_Category': 'category'})
    )
elif 'category' in merged_df.columns:
    print("Warning: 'Corrected_Category' not found; keeping the original 'category' column.")

# ===== CATEGORY VALIDATION CHECK =====
print("\n=== CATEGORY VALIDATION CHECK ===")

if 'category' in merged_df.columns:
    missing_categories = merged_df['category'].isnull().sum()
    print(f"Rows with missing categories: {missing_categories}")

    if missing_categories > 0:
        # Rows without a category are labelled 'Miscellaneous'.
        merged_df['category'] = merged_df['category'].fillna('Miscellaneous')
        print(f"Filled {missing_categories} missing categories with 'Miscellaneous'")
    else:
        print("No missing categories found - all rows have valid categories")

    category_counts = merged_df['category'].value_counts()
else:
    print("Warning: No 'category' column found in merged dataset!")

merged_df.to_excel(output_file_path, index=False)

print(f"\nMerged dataset saved to: {output_file_path}")



=== DIAGNOSTIC ANALYSIS ===
df1 shape: (520609, 8)
df2 shape: (4185, 2)

df1 columns: ['BillNo', 'Itemname', 'Quantity', 'Date', 'Price', 'CustomerID', 'Country', 'category']
df2 columns: ['Itemname', 'Corrected_Category']

Null values in df1['Itemname']: 0
Null values in df2['Itemname']: 0

Unique items in df1: 4185
Unique items in df2: 4185

Items in df1 but NOT in df2: 3
Items in df2 but NOT in df1: 3

ALL ITEMS IN DF1 BUT NOT IN DF2 (3 items):
 1. 'OOPS ! adjustment'
 2. 'Wrongly mrked had 85123a in box'
 3. 'crushed ctn'

ALL ITEMS IN DF2 BUT NOT IN DF1 (3 items):
 1. 'OOPS ! Adjustment'
 2. 'crushed ctn HAPPY STENCIL CRAFT'
 3. 'wrongly mrked had 85123a in box'

ANALYSIS OF UNMATCHED ITEMS FROM DF1:
Total rows affected by unmatched items: 3

Top 10 most frequent unmatched items in df1:
  'crushed ctn': 1 occurrences
  'OOPS ! adjustment': 1 occurrences
  'Wrongly mrked had 85123a in box': 1 occurrences

=== STRING ANALYSIS ===
Sample items from df1:
  'WHITE HANGING HEART T-LIGHT

In [10]:
import pandas as pd
import os

# ===== CONFIGURATION =====
BACKGROUND_COLOR = "#f5f5f5"
TEXT_COLOR = "#333333"

current_dir = os.getcwd()
dataset_dir = os.path.join(current_dir, "Dataset")

excel_file = 'full_validated_dataset.xlsx'
excel_file_path = os.path.join(dataset_dir, excel_file)

# ===== DATA LOADING =====
df = pd.read_excel(excel_file_path)

# Rows labelled 'Miscellaneous' are excluded from the analysis.
df = df[df['category'] != 'Miscellaneous']
print(f"Dataset after removing Miscellaneous category: {len(df)} records")

print("\n ===== DATA ANALYSIS ===== \n")

# Use the same plots folder as the plotting cells (Dataset/Analysis_plots)
# rather than creating a second, empty folder next to the notebook.
plots_dir = os.path.join(dataset_dir, 'Analysis_plots')
os.makedirs(plots_dir, exist_ok=True)

# ===== FUNCTION DEFINITION =====
def compute_stats(dataframe):
    """
    Summarise a DataFrame as a dictionary of headline counts.

    Reads the 'Itemname', 'BillNo', 'CustomerID' and 'category' columns.
    'Total Products' is the row count (one row per line item), so it always
    equals 'Rows'. Missing CustomerID values are dropped before counting
    distinct customers.
    """
    row_count = len(dataframe)
    return {
        'Rows': row_count,
        'Columns': len(dataframe.columns),
        'Total Products': row_count,
        'Distinct Products': dataframe['Itemname'].nunique(),
        'Total Transactions': dataframe['BillNo'].nunique(),
        'Total Customers': int(dataframe['CustomerID'].dropna().nunique()),
        'Distinct Categories': dataframe['category'].nunique(),
    }

# ===== ANALYSIS FOR DIFFERENT PARTITIONS =====
# The split is positional (by row), so a bill whose rows straddle the cut
# point is counted in both partitions.
one_third_index = len(df) // 3
df_first_third, df_last_two_thirds = df.iloc[:one_third_index], df.iloc[one_third_index:]

stats_whole, stats_first_third, stats_last_two_thirds = (
    compute_stats(part) for part in (df, df_first_third, df_last_two_thirds)
)

# ===== PRINT OVERVIEW FOR WHOLE DATASET =====
separator = "=" * 50
overview_lines = [
    "Dataset Overview (Whole Dataset):",
    separator,
    f"  Rows: {stats_whole['Rows']:,}",
    f"  Columns: {stats_whole['Columns']}",
    f"  Column Names: {', '.join(df.columns.tolist())}",
    separator,
    f"  Total Products: {stats_whole['Total Products']:,}",
    f"  Distinct Products: {stats_whole['Distinct Products']:,}",
    f"  Total Transactions: {stats_whole['Total Transactions']:,}",
    f"  Total Customers: {stats_whole['Total Customers']:,}",
    f"  Distinct Categories: {stats_whole['Distinct Categories']}",
    separator,
]
print("\n".join(overview_lines))

# ===== COMPARISON TABLE =====
# One column per partition, one row per metric.
comparison_data = {
    "Full Dataset": stats_whole,
    "First Third": stats_first_third,
    "Last Two Thirds": stats_last_two_thirds
}
comparison_table = pd.DataFrame(comparison_data).rename_axis("Metric")

print("\nComparison Table:")
print(comparison_table)



Dataset after removing Miscellaneous category: 517587 records

 ===== DATA ANALYSIS ===== 

Dataset Overview (Whole Dataset):
  Rows: 517,587
  Columns: 8
  Column Names: BillNo, Itemname, Quantity, Date, Price, CustomerID, Country, category
  Total Products: 517,587
  Distinct Products: 4,009
  Total Transactions: 19,505
  Total Customers: 4,297
  Distinct Categories: 19

Comparison Table:
                     Full Dataset  First Third  Last Two Thirds
Metric                                                         
Rows                       517587       172529           345058
Columns                         8            8                8
Total Products             517587       172529           345058
Distinct Products            4009         3254             3615
Total Transactions          19505         6706            12800
Total Customers              4297         2487             3632
Distinct Categories            19           19               19


In [11]:
import pandas as pd
import os

# Resolve the validated dataset relative to the notebook's working directory.
excel_file = 'full_validated_dataset.xlsx'

current_dir = os.getcwd()
dataset_dir = os.path.join(current_dir, "Dataset")
excel_file_path = os.path.join(dataset_dir, excel_file)


def load_dataset(file_path):
    """Read the Excel workbook at ``file_path`` and return it as a DataFrame."""
    dataset = pd.read_excel(file_path)
    return dataset

data_excel = load_dataset(excel_file_path)

data_excel = data_excel[data_excel['category'] != 'Miscellaneous']
print(f"Dataset after removing Miscellaneous category: {len(data_excel)} records")

# Re-bind instead of dropna(inplace=True): data_excel is a filtered slice at
# this point, and mutating a slice in place can trigger SettingWithCopyWarning.
data_excel = data_excel.dropna(subset=['Itemname'])

# Export 1: the distinct item names (written next to the notebook).
output_file = os.path.join(current_dir, 'unique_items.xlsx')

unique_items_df = pd.DataFrame({'unique_items': data_excel['Itemname'].unique()})
unique_items_df.to_excel(output_file, index=False)

# Export 2: one row per distinct (item, category) pair, ordered by category
# then item. The index is reset after sorting so it runs 0..n-1 in that order.
distinct_products_df = (
    data_excel[['Itemname', 'category']]
    .drop_duplicates()
    .sort_values(['category', 'Itemname'])
    .reset_index(drop=True)
)

output_file = os.path.join(dataset_dir, 'distinct_products_with_categories.xlsx')
distinct_products_df.to_excel(output_file, index=False)

print(f"Excel file saved to: {output_file}")



Dataset after removing Miscellaneous category: 517587 records
Excel file saved to: c:\Users\moham\Coding-Projects\Apriori_VS_Word2Vec\Dataset\distinct_products_with_categories.xlsx


### Transaction Patterns

Analyze the purchasing patterns visible in the data:

- **Average items per transaction**.
- **Distribution of transaction sizes**: Histogram showing the number of items per invoice.


In [12]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
import seaborn as sns

# ===== VISUALIZATION SETTINGS =====
MAIN_COLOR = "#1f77b4"
TERTIARY_COLOR = "#2ca02c"
BACKGROUND_COLOR = "#f5f5f5"
TEXT_COLOR = "#333333"

plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette([MAIN_COLOR, TERTIARY_COLOR])

# Apply all rcParams overrides in one call: fonts first, then sizes, then colours.
plt.rcParams.update({
    'font.family': 'sans-serif',
    'font.sans-serif': ['Arial', 'Helvetica', 'DejaVu Sans'],
    'axes.labelsize': 12,
    'axes.titlesize': 14,
    'xtick.labelsize': 10,
    'ytick.labelsize': 10,
    'axes.facecolor': BACKGROUND_COLOR,
    'axes.edgecolor': TEXT_COLOR,
    'axes.labelcolor': TEXT_COLOR,
    'text.color': TEXT_COLOR,
    'xtick.color': TEXT_COLOR,
    'ytick.color': TEXT_COLOR,
})

# ===== DATA LOADING =====
current_dir = os.getcwd()
dataset_dir = os.path.join(current_dir, "Dataset")
excel_file = 'full_validated_dataset.xlsx'
excel_file_path = os.path.join(dataset_dir, excel_file)

df = pd.read_excel(excel_file_path)

# .copy() gives an independent frame, so the 'Date' assignment below does not
# write into a filtered slice (SettingWithCopyWarning).
df = df[df['category'] != 'Miscellaneous'].copy()
print(f"Dataset after removing Miscellaneous category: {len(df)} records")

# Convert 'Date' only when the column exists and is not already datetime-typed.
if 'Date' in df.columns and not pd.api.types.is_datetime64_any_dtype(df['Date']):
    df['Date'] = pd.to_datetime(df['Date'])

plots_dir = os.path.join(dataset_dir, 'Analysis_plots')
os.makedirs(plots_dir, exist_ok=True)

# ===== HELPER FUNCTION FOR TRANSACTION ANALYSIS =====
def compute_transaction_metrics(dataframe):
    """
    Summarise how many rows each bill contains.

    Rows are grouped by 'BillNo' and counted. The result is a tuple
    ``(metrics, items_per_transaction)`` where ``metrics`` holds the mean,
    median, min and max of those counts plus the 25th, 50th, 75th, 90th,
    95th and 99th percentiles (keys 'P25' ... 'P99'), and
    ``items_per_transaction`` is the per-bill count Series itself.
    """
    items_per_transaction = dataframe.groupby('BillNo').size()

    metrics = {
        'Avg Items': items_per_transaction.mean(),
        'Median Items': items_per_transaction.median(),
        'Min Items': items_per_transaction.min(),
        'Max Items': items_per_transaction.max(),
    }

    # Percentile keys are appended after the four summary statistics.
    percentile_levels = [25, 50, 75, 90, 95, 99]
    percentile_values = np.percentile(items_per_transaction, percentile_levels)
    for level, value in zip(percentile_levels, percentile_values):
        metrics[f"P{level}"] = value

    return metrics, items_per_transaction

# ===== DATA PARTITIONING =====
# Positional split by row: a bill whose rows straddle the cut point appears
# (with a partial row count) in both partitions.
n = len(df)
one_third_index = n // 3

df_whole = df.copy()
df_first_third, df_last_two_thirds = df.iloc[:one_third_index], df.iloc[one_third_index:]

# ===== ANALYZE TRANSACTION PATTERNS =====
print("\n ===== TRANSACTION PATTERNS ANALYSIS ===== \n")

metrics_whole, items_whole = compute_transaction_metrics(df_whole)
metrics_first_third, items_first_third = compute_transaction_metrics(df_first_third)
metrics_last_two_thirds, items_last_two_thirds = compute_transaction_metrics(df_last_two_thirds)

separator = "=" * 50

print("Average Items per Transaction (Whole Dataset):")
print(separator)
print(f" Average Items: {metrics_whole['Avg Items']:.2f}")
for summary_key in ('Median Items', 'Min Items', 'Max Items'):
    print(f" {summary_key}: {metrics_whole[summary_key]}")
print(separator)

print("\nTransaction Size Percentiles (Whole Dataset):")
percentile_rows = [
    ('P25', '25th percentile'),
    ('P50', '50th percentile (median)'),
    ('P75', '75th percentile'),
    ('P90', '90th percentile'),
    ('P95', '95th percentile'),
    ('P99', '99th percentile'),
]
for metric_key, metric_label in percentile_rows:
    print(f" {metric_label}: {metrics_whole[metric_key]:.1f} items")
print(separator)

# ===== COMPARISON TABLE =====
# One column per partition, one row per metric.
comparison_data = {
    "Whole": metrics_whole,
    "First Third": metrics_first_third,
    "Last Two Thirds": metrics_last_two_thirds
}
comparison_table = pd.DataFrame(comparison_data).rename_axis("Metric")
print("\nComparison Table (Transaction Metrics):")
print(comparison_table)

comparison_table_file = os.path.join(plots_dir, "transaction_comparison_table.xlsx")
comparison_table.to_excel(comparison_table_file)
print(f"\nComparison table saved to: {comparison_table_file}")

# ===== PLOTTING FUNCTION =====
def plot_transaction_distribution(items_series, title, save_path):
    """
    Plot how transactions are distributed across size buckets and save the figure.

    Parameters
    ----------
    items_series : pandas.Series
        Number of items per transaction (one value per transaction).
    title : str
        Partition name appended to the plot title.
    save_path : str
        Path the PNG is written to.

    Each bar is annotated with its share of all transactions, and the tallest
    bar is outlined in black.
    """
    size_bins = [1, 5, 10, 15, 20, 30, 50, 100, np.inf]
    size_labels = ['1-4', '5-9', '10-14', '15-19', '20-29', '30-49', '50-99', '100+']

    # right=False makes the bins left-closed: [1, 5), [5, 10), ... [100, inf),
    # which is what the labels describe. With pandas' default (right-closed)
    # bins, a transaction of exactly 1 item falls outside every bin and each
    # boundary value (5, 10, ...) lands one bucket too low.
    transaction_binned = pd.cut(items_series, bins=size_bins, labels=size_labels, right=False)
    size_counts = transaction_binned.value_counts().sort_index()
    total_transactions = len(items_series)

    plt.figure(figsize=(12, 7), facecolor=BACKGROUND_COLOR)
    bars = plt.bar(size_counts.index.astype(str), size_counts.values, color=MAIN_COLOR, edgecolor='white')
    plt.title(f'Transaction Size Distribution - {title}', fontweight='bold', fontsize=16, color=TEXT_COLOR)
    plt.xlabel('Number of Items', fontsize=14, color=TEXT_COLOR)
    plt.ylabel('Transactions', fontsize=14, color=TEXT_COLOR)

    # Percentage label above each bar; an empty series is reported as 0.0%
    # rather than dividing by zero.
    for i, v in enumerate(size_counts.values):
        percentage = (v / total_transactions) * 100 if total_transactions else 0.0
        plt.text(i, v + 0.05 * total_transactions, f"{percentage:.1f}%", ha='center', fontsize=10,
                 fontweight='bold', color=TEXT_COLOR)

    # Outline the most populated bucket (first one on ties).
    max_idx = int(size_counts.values.argmax())
    bars[max_idx].set_edgecolor('black')
    bars[max_idx].set_linewidth(2)

    plt.grid(axis='y', alpha=0.3, color="#cccccc")
    plt.tight_layout()
    plt.savefig(save_path, dpi=300)
    # Close the figure so repeated calls do not accumulate open figures.
    plt.close()

# ===== GENERATE AND SAVE SEPARATE PLOTS =====

# Per-bill item counts for each partition, keyed by the name used in file names.
partitions = {
    "Whole_Dataset": items_whole,
    "First_Third": items_first_third,
    "Last_Two_Thirds": items_last_two_thirds
}

# One PNG per partition; the human-readable title swaps underscores for spaces.
for partition_name, items_series in partitions.items():
    plot_title = " ".join(partition_name.split("_"))
    file_name = "transaction_size_distribution_" + partition_name + ".png"
    save_file = os.path.join(plots_dir, file_name)
    plot_transaction_distribution(items_series, plot_title, save_file)
    print(f"Plot saved for {plot_title} at: {save_file}")


Dataset after removing Miscellaneous category: 517587 records

 ===== TRANSACTION PATTERNS ANALYSIS ===== 

Average Items per Transaction (Whole Dataset):
 Average Items: 26.54
 Median Items: 15.0
 Min Items: 1
 Max Items: 1111

Transaction Size Percentiles (Whole Dataset):
 25th percentile: 6.0 items
 50th percentile (median): 15.0 items
 75th percentile: 29.0 items
 90th percentile: 54.0 items
 95th percentile: 79.0 items
 99th percentile: 222.0 items

Comparison Table (Transaction Metrics):
                    Whole  First Third  Last Two Thirds
Metric                                                 
Avg Items       26.536119    25.727557        26.957656
Median Items    15.000000    15.000000        15.000000
Min Items        1.000000     1.000000         1.000000
Max Items     1111.000000   673.000000      1111.000000
P25              6.000000     6.000000         6.000000
P50             15.000000    15.000000        15.000000
P75             29.000000    28.000000        29.0000

### Products Analysis

Examine the product-related characteristics:

- **Top 10 most frequently purchased products**.
- **Product category distribution**: Percentage of items in each category.


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
import seaborn as sns

banner = "=" * 80
print(banner)
print("PRODUCT ANALYSIS SCRIPT STARTED")
print(banner)

# ===== VISUALIZATION SETTINGS =====
print("\nSetting up visualization configuration...")
MAIN_COLOR = "#1f77b4"
TERTIARY_COLOR = "#2ca02c"
BACKGROUND_COLOR = "#f5f5f5"
TEXT_COLOR = "#333333"

plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette([MAIN_COLOR, TERTIARY_COLOR])

# Apply all rcParams overrides in one call: fonts first, then sizes, then colours.
plt.rcParams.update({
    'font.family': 'sans-serif',
    'font.sans-serif': ['Arial', 'Helvetica', 'DejaVu Sans'],
    'axes.labelsize': 12,
    'axes.titlesize': 14,
    'xtick.labelsize': 10,
    'ytick.labelsize': 10,
    'axes.facecolor': BACKGROUND_COLOR,
    'axes.edgecolor': TEXT_COLOR,
    'axes.labelcolor': TEXT_COLOR,
    'text.color': TEXT_COLOR,
    'xtick.color': TEXT_COLOR,
    'ytick.color': TEXT_COLOR,
})
print("Visualization settings configured successfully")

# ===== DATA LOADING =====
print("\nLoading dataset...")
current_dir = os.getcwd()
print(f"Current working directory: {current_dir}")

dataset_dir = os.path.join(current_dir, "Dataset")
print(f"Dataset directory: {dataset_dir}")

excel_file = 'full_validated_dataset.xlsx'
excel_file_path = os.path.join(dataset_dir, excel_file)
print(f"Excel file path: {excel_file_path}")

if not os.path.exists(excel_file_path):
    print(f"ERROR: Excel file not found at {excel_file_path}")
    # Raise instead of calling exit(1): exit() is an interactive-shell helper
    # and is not a reliable way to abort a notebook cell.
    raise FileNotFoundError(f"Excel file not found at {excel_file_path}")

print("Reading Excel file...")
df = pd.read_excel(excel_file_path)
print(f"Dataset loaded successfully with {len(df)} records")
print(f"Dataset shape: {df.shape}")
print(f"Columns: {list(df.columns)}")

print("\nFiltering out 'Miscellaneous' category...")
original_count = len(df)
# .copy() gives an independent frame, so the 'Date' assignment below does not
# write into a filtered slice (SettingWithCopyWarning).
df = df[df['category'] != 'Miscellaneous'].copy()
filtered_count = len(df)
removed_count = original_count - filtered_count
# Guard the percentage against a file with no rows.
removed_percentage = (removed_count / original_count) * 100 if original_count else 0.0
print(f"Original records: {original_count:,}")
print(f"After filtering: {filtered_count:,}")
print(f"Removed {removed_count:,} records ({removed_percentage:.1f}%)")

print("\nChecking Date column...")
if 'Date' in df.columns:
    print("Date column found")
    if not pd.api.types.is_datetime64_any_dtype(df['Date']):
        print("Converting Date column to datetime...")
        df['Date'] = pd.to_datetime(df['Date'])
        print("Date conversion completed")
    else:
        print("Date column already in datetime format")
else:
    print("No Date column found in dataset")

print("\nSetting up plots directory...")
plots_dir = os.path.join(dataset_dir, 'Analysis_plots')
print(f"Plots directory: {plots_dir}")
if not os.path.exists(plots_dir):
    # exist_ok=True tolerates the directory appearing between the check and the call.
    os.makedirs(plots_dir, exist_ok=True)
    print("Plots directory created")
else:
    print("Plots directory already exists")

# ===== DATA PARTITIONING =====
# Positional split by row: the first len(df) // 3 rows versus the remainder.
print("\nPartitioning dataset...")
n = len(df)
one_third_index = n // 3
print(f"Total records: {n:,}")
print(f"One-third index: {one_third_index:,}")

df_whole = df.copy()
df_first_third = df.iloc[:one_third_index]
df_last_two_thirds = df.iloc[one_third_index:]

print(f"Whole dataset: {len(df_whole):,} records")
print(f"First third: {len(df_first_third):,} records")
print(f"Last two thirds: {len(df_last_two_thirds):,} records")

partitions = {
    "Whole_Dataset": df_whole,
    "First_Third": df_first_third,
    "Last_Two_Thirds": df_last_two_thirds
}
print(f"Created {len(partitions)} data partitions")

# ===== PRODUCT ANALYSIS FUNCTIONS =====
def plot_top_products(dataframe, partition_name, save_dir):
    """
    Report and plot the 10 most frequent products of a partition.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Partition to analyse; must contain an 'Itemname' column.
    partition_name : str
        Name used in printed headers, the plot title and the file name.
    save_dir : str
        Directory the plot is written to as 'top_products_{partition_name}.png'.

    Prints overall statistics and a ranked table, then saves a horizontal bar
    chart. When the partition has no item names, a message is printed and no
    plot is produced.
    """
    print(f"\n" + "="*70)
    print(f"PROCESSING TOP PRODUCTS FOR: {partition_name.upper()}")
    print(f"="*70)
    print(f"Analyzing {len(dataframe):,} records...")

    all_products = dataframe['Itemname'].value_counts()
    top_products = all_products.head(10)
    total_items = len(dataframe)
    unique_products = dataframe['Itemname'].nunique()

    # Nothing to rank: the summary lines below index top_products[0] and
    # divide by total_items, so bail out before reaching them.
    if top_products.empty:
        print(f"No products found for {partition_name}. Skipping top products plot.")
        return

    print(f"\nOVERALL STATISTICS:")
    print(f"  Total items in partition: {total_items:,}")
    print(f"  Unique products in partition: {unique_products:,}")
    print(f"  Coverage of top 10: {top_products.sum():,} items ({(top_products.sum()/total_items)*100:.1f}%)")

    print(f"\nTOP 10 PRODUCTS BREAKDOWN:")
    print("-" * 80)
    print(f"{'Rank':<4} {'Product Name':<40} {'Count':<10} {'Percentage':<12}")
    print("-" * 80)

    for i, (product, count) in enumerate(top_products.items(), 1):
        percentage = (count / total_items) * 100
        # str() so numeric item names do not break len()/slicing.
        product_name = str(product)
        product_truncated = product_name[:37] + "..." if len(product_name) > 40 else product_name
        # Format the number and its % sign together, then pad, so the sign
        # stays attached to the value and the column lines up with the header.
        percentage_text = f"{percentage:.2f}%"
        print(f"{i:<4} {product_truncated:<40} {count:<10} {percentage_text:<12}")

    print("-" * 80)
    print(f"Top product: '{top_products.index[0]}' with {top_products.iloc[0]:,} occurrences ({(top_products.iloc[0]/total_items)*100:.1f}%)")
    print(f"Least frequent in top 10: '{top_products.index[-1]}' with {top_products.iloc[-1]:,} occurrences ({(top_products.iloc[-1]/total_items)*100:.1f}%)")

    # Everything outside the listed top products (fewer than 10 may be listed).
    remaining_products = unique_products - len(top_products)
    remaining_items = total_items - top_products.sum()
    if remaining_products > 0:
        avg_frequency_remaining = remaining_items / remaining_products
        print(f"Remaining {remaining_products:,} products: {remaining_items:,} items (avg {avg_frequency_remaining:.1f} per product)")

    print(f"\nCreating horizontal bar chart...")
    plt.figure(figsize=(14, 10), facecolor=BACKGROUND_COLOR)

    # Reverse the order so the most frequent product is drawn at the top.
    bar_labels = top_products.index.astype(str)[::-1]
    bar_values = top_products.values[::-1]
    bars = plt.barh(bar_labels, bar_values, color=MAIN_COLOR, edgecolor='white')
    plt.title(f"Top 10 Products by Frequency - {partition_name.replace('_', ' ')}",
              fontweight='bold', fontsize=16, color=TEXT_COLOR)
    plt.xlabel("Number of Occurrences", fontsize=14, color=TEXT_COLOR)
    plt.ylabel("Product Name", fontsize=14, color=TEXT_COLOR)

    # Value and share printed just to the right of each bar.
    for bar, value in zip(bars, bar_values):
        percentage = (value / total_items) * 100
        plt.text(value + 10, bar.get_y() + bar.get_height()/2,
                 f"{value} ({percentage:.1f}%)", va='center', fontsize=10, color=TEXT_COLOR)

    plt.grid(axis='x', alpha=0.3, color="#cccccc")
    plt.tight_layout()

    save_path = os.path.join(save_dir, f"top_products_{partition_name}.png")
    plt.savefig(save_path, dpi=300)
    plt.close()
    print(f"Top products plot saved to: {save_path}")

def plot_category_distribution(dataframe, partition_name, save_dir):
    """
    Report the category breakdown of a partition and plot its 10 largest categories.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Partition to analyse; the function returns early if it has no
        'category' column.
    partition_name : str
        Name used in printed headers, the plot title and the file name.
    save_dir : str
        Directory the plot is written to as
        'category_distribution_{partition_name}.png'.

    Prints overall statistics, a full ranked table with cumulative share and a
    size-band summary, then saves a bar chart of the top 10 categories. When
    the partition has no category values, a message is printed and no plot is
    produced.
    """
    if 'category' not in dataframe.columns:
        print(f"No 'category' column found for {partition_name}. Skipping category distribution plot.")
        return

    print(f"\n" + "="*70)
    print(f"PROCESSING CATEGORY DISTRIBUTION FOR: {partition_name.upper()}")
    print(f"="*70)
    print(f"Analyzing {len(dataframe):,} records...")

    all_categories = dataframe['category'].value_counts()
    total_items = len(dataframe)
    unique_categories = dataframe['category'].nunique()

    # Nothing to rank: the summary lines below index all_categories[0] and
    # divide by total_items, so bail out before reaching them.
    if all_categories.empty:
        print(f"No category values found for {partition_name}. Skipping category distribution plot.")
        return

    print(f"\nOVERALL CATEGORY STATISTICS:")
    print(f"  Total items in partition: {total_items:,}")
    print(f"  Unique categories in partition: {unique_categories}")
    print(f"  Most popular category: '{all_categories.index[0]}' with {all_categories.iloc[0]:,} items ({(all_categories.iloc[0]/total_items)*100:.1f}%)")
    print(f"  Least popular category: '{all_categories.index[-1]}' with {all_categories.iloc[-1]:,} items ({(all_categories.iloc[-1]/total_items)*100:.1f}%)")

    print(f"\nCOMPLETE CATEGORY BREAKDOWN:")
    print("-" * 80)
    print(f"{'Rank':<4} {'Category Name':<30} {'Count':<12} {'Percentage':<12} {'Cumulative %':<12}")
    print("-" * 80)

    cumulative_percentage = 0
    for i, (category, count) in enumerate(all_categories.items(), 1):
        percentage = (count / total_items) * 100
        cumulative_percentage += percentage
        # str() so non-string category labels do not break len()/slicing.
        category_name = str(category)
        category_truncated = category_name[:27] + "..." if len(category_name) > 30 else category_name
        # Format each number with its % sign first, then pad, so the sign stays
        # attached to the value and the columns line up with the header.
        percentage_text = f"{percentage:.1f}%"
        cumulative_text = f"{cumulative_percentage:.1f}%"
        print(f"{i:<4} {category_truncated:<30} {count:<12,} {percentage_text:<12} {cumulative_text:<12}")

    print("-" * 80)

    print(f"\nCATEGORY SIZE ANALYSIS:")
    # Size bands by share of all rows: >=10%, 5-10%, 1-5%, <1%.
    large_categories = all_categories[all_categories >= total_items * 0.1]
    medium_categories = all_categories[(all_categories >= total_items * 0.05) & (all_categories < total_items * 0.1)]
    small_categories = all_categories[(all_categories >= total_items * 0.01) & (all_categories < total_items * 0.05)]
    tiny_categories = all_categories[all_categories < total_items * 0.01]

    print(f"  Large categories (≥10%): {len(large_categories)} categories, {large_categories.sum():,} items ({(large_categories.sum()/total_items)*100:.1f}%)")
    print(f"  Medium categories (5-10%): {len(medium_categories)} categories, {medium_categories.sum():,} items ({(medium_categories.sum()/total_items)*100:.1f}%)")
    print(f"  Small categories (1-5%): {len(small_categories)} categories, {small_categories.sum():,} items ({(small_categories.sum()/total_items)*100:.1f}%)")
    print(f"  Tiny categories (<1%): {len(tiny_categories)} categories, {tiny_categories.sum():,} items ({(tiny_categories.sum()/total_items)*100:.1f}%)")

    # Only the 10 largest categories are charted; the table above lists all.
    category_counts = all_categories.head(10)

    print(f"\nCreating bar chart for top 10 categories...")
    plt.figure(figsize=(14, 8), facecolor=BACKGROUND_COLOR)
    bars = plt.bar(category_counts.index.astype(str), category_counts.values, color=MAIN_COLOR, edgecolor='white')
    plt.title(f"Product Categories Distribution - {partition_name.replace('_', ' ')}",
              fontweight='bold', fontsize=16, color=TEXT_COLOR)
    plt.xlabel("Category", fontsize=14, color=TEXT_COLOR)
    plt.ylabel("Number of Items", fontsize=14, color=TEXT_COLOR)
    plt.xticks(rotation=45, ha='right', color=TEXT_COLOR)

    # Count and share printed above each bar.
    for bar, value in zip(bars, category_counts.values):
        percentage = (value / total_items) * 100
        plt.text(bar.get_x() + bar.get_width()/2, value + 5,
                 f"{value:,}\n({percentage:.1f}%)", ha='center', va='bottom',
                 fontsize=10, color=TEXT_COLOR, fontweight='bold')

    plt.grid(axis='y', alpha=0.3, color="#cccccc")
    plt.tight_layout()

    save_path = os.path.join(save_dir, f"category_distribution_{partition_name}.png")
    plt.savefig(save_path, dpi=300)
    plt.close()
    print(f"Category distribution plot saved to: {save_path}")


wide_rule = "=" * 80

print("\n" + wide_rule)
print("GENERATING COMPREHENSIVE ANALYSIS FOR ALL PARTITIONS")
print(wide_rule)

partition_total = len(partitions)
for i, (partition_name, partition_df) in enumerate(partitions.items(), 1):
    print(f"\n[{i}/{partition_total}] PROCESSING PARTITION: {partition_name.upper()}")
    print(f"Partition size: {len(partition_df):,} records")
    if 'Date' in partition_df.columns:
        print(f"Date range: {partition_df['Date'].min()} to {partition_df['Date'].max()}")
    else:
        print("No date information")

    # Two plots per partition: top products, then category distribution.
    plot_top_products(partition_df, partition_name, plots_dir)
    plot_category_distribution(partition_df, partition_name, plots_dir)

    print(f"\nCompleted processing for {partition_name}")
    print("-" * 70)

print("\n" + wide_rule)
print("PRODUCT ANALYSIS COMPLETED SUCCESSFULLY!")
print(f"All plots saved to: {plots_dir}")
print(f"Generated {partition_total * 2} visualization files")
print(wide_rule)

PRODUCT ANALYSIS SCRIPT STARTED

Setting up visualization configuration...
Visualization settings configured successfully

Loading dataset...
Current working directory: c:\Users\moham\Coding-Projects\Apriori_VS_Word2Vec
Dataset directory: c:\Users\moham\Coding-Projects\Apriori_VS_Word2Vec\Dataset
Excel file path: c:\Users\moham\Coding-Projects\Apriori_VS_Word2Vec\Dataset\full_validated_dataset.xlsx
Reading Excel file...
Dataset loaded successfully with 520609 records
Dataset shape: (520609, 8)
Columns: ['BillNo', 'Itemname', 'Quantity', 'Date', 'Price', 'CustomerID', 'Country', 'category']

Filtering out 'Miscellaneous' category...
Original records: 520,609
After filtering: 517,587
Removed 3,022 records (0.6%)

Checking Date column...
Date column found
Date column already in datetime format

Setting up plots directory...
Plots directory: c:\Users\moham\Coding-Projects\Apriori_VS_Word2Vec\Dataset\Analysis_plots
Plots directory already exists

Partitioning dataset...
Total records: 517,5