### Data Analysis

- **How many rows**: Number of rows in the dataset.
- **How many columns**: Number of columns in the dataset.
- **List of the columns**: List of columns in the dataset.
- **Total Products**: Number of products in the dataset.
- **Distinct Products**: Number of unique products.
- **Total Transactions**: Number of transactions recorded.
- **Total Customers**: Number of customers in the dataset.
- **Distinct Categories**: Number of unique product categories.


In [3]:
import pandas as pd
import os

# ===== CONFIGURATION =====
BACKGROUND_COLOR = "#f5f5f5"  # Light gray background
TEXT_COLOR = "#333333"        # Dark gray for text

# Folder that holds the workbook, and the workbook to analyse
path = r'C:\Users\moham\Apriori_VS_Word2Vec\Dataset'
excel_file = 'df_merged_items_category.xlsx'
excel_file_path = os.path.join(path, excel_file)

# ===== DATA LOADING =====
# Load the dataset
df = pd.read_excel(excel_file_path)
print("\n ===== DATA ANALYSIS ===== \n")

# Make sure the output directory for plots exists. exist_ok=True replaces the
# separate os.path.exists() test, so there is no gap between checking and creating.
plots_dir = os.path.join(path, 'Analysis_plots')
os.makedirs(plots_dir, exist_ok=True)

# ===== FUNCTION DEFINITION =====
def compute_stats(dataframe):
    """
    Summarise a DataFrame as a dictionary of headline counts.

    The frame must provide the columns 'Itemname', 'BillNo', 'CustomerID'
    and 'category'. The returned dictionary holds, in this order: 'Rows',
    'Columns', 'Total Products', 'Distinct Products', 'Total Transactions',
    'Total Customers' and 'Distinct Categories'.
    """
    row_count = len(dataframe)
    return {
        'Rows': row_count,
        'Columns': len(dataframe.columns),
        # Each row represents a product purchase, so this equals the row count
        'Total Products': row_count,
        'Distinct Products': dataframe['Itemname'].nunique(),
        'Total Transactions': dataframe['BillNo'].nunique(),
        # Missing customer IDs are discarded before counting
        'Total Customers': int(dataframe['CustomerID'].dropna().nunique()),
        'Distinct Categories': dataframe['category'].nunique(),
    }

# ===== ANALYSIS FOR DIFFERENT PARTITIONS =====
# Statistics for the full dataset
stats_whole = compute_stats(df)

# Row position separating the first third from the remaining two thirds.
# NOTE(review): the cut is positional, so a BillNo whose rows straddle this
# index is counted as a transaction in both partitions.
one_third_index = len(df) // 3
df_first_third, df_last_two_thirds = df.iloc[:one_third_index], df.iloc[one_third_index:]

stats_first_third = compute_stats(df_first_third)
stats_last_two_thirds = compute_stats(df_last_two_thirds)

# ===== PRINT OVERVIEW FOR WHOLE DATASET =====
# One print call; sep="\n" puts every entry on its own line.
print(
    "Dataset Overview (Whole Dataset):",
    "=" * 50,
    f"  Rows: {stats_whole['Rows']:,}",
    f"  Columns: {stats_whole['Columns']}",
    f"  Column Names: {', '.join(df.columns.tolist())}",
    "=" * 50,
    f"  Total Products: {stats_whole['Total Products']:,}",
    f"  Distinct Products: {stats_whole['Distinct Products']:,}",
    f"  Total Transactions: {stats_whole['Total Transactions']:,}",
    f"  Total Customers: {stats_whole['Total Customers']:,}",
    f"  Distinct Categories: {stats_whole['Distinct Categories']}",
    "=" * 50,
    sep="\n",
)

# ===== COMPARISON TABLE =====
# One column per partition, one row per metric.
comparison_data = {
    "Full Dataset": stats_whole,
    "First Third": stats_first_third,
    "Last Two Thirds": stats_last_two_thirds
}
comparison_table = pd.DataFrame(comparison_data).rename_axis("Metric")

print("\nComparison Table:", comparison_table, sep="\n")





 ===== DATA ANALYSIS ===== 

Dataset Overview (Whole Dataset):
  Rows: 520,609
  Columns: 8
  Column Names: BillNo, Itemname, Quantity, Date, Price, CustomerID, Country, category
  Total Products: 520,609
  Distinct Products: 4,185
  Total Transactions: 20,208
  Total Customers: 4,297
  Distinct Categories: 21

Comparison Table:
                     Full Dataset  First Third  Last Two Thirds
Metric                                                         
Rows                       520609       173536           347073
Columns                         8            8                8
Total Products             520609       173536           347073
Distinct Products            4185         3326             3738
Total Transactions          20208         6847            13362
Total Customers              4297         2489             3632
Distinct Categories            21           21               21


In [11]:
import pandas as pd
import os

# Folder holding the dataset workbook, the workbook's file name, and the
# full path obtained by joining the two.
path = r'C:\Users\moham\Apriori_VS_Word2Vec\Dataset'
excel_file = 'df_merged_items_category.xlsx'
excel_file_path = os.path.join(path, excel_file)

def load_dataset(file_path):
    """Read the Excel file at ``file_path`` with ``pd.read_excel`` and return the result."""
    return pd.read_excel(file_path)

data_excel = load_dataset(excel_file_path)
# Rows without an item name cannot contribute a product, so drop them first.
data_excel = data_excel.dropna(subset=['Itemname'])

# Export the list of unique item names
output_file = os.path.join(path, 'unique_items.xlsx')
unique_items_df = pd.DataFrame({'unique_items': data_excel['Itemname'].unique()})
unique_items_df.to_excel(output_file, index=False)

# DataFrame with distinct products and their categories.
# Built from the frame loaded and cleaned above rather than from a `df`
# variable that this cell never defines.
distinct_products_df = data_excel[['Itemname', 'category']].drop_duplicates().reset_index(drop=True)

# Sort by category and then by product name for better organization
distinct_products_df = distinct_products_df.sort_values(['category', 'Itemname'])

# Save to Excel file
output_file = os.path.join(path, 'distinct_products_with_categories.xlsx')
distinct_products_df.to_excel(output_file, index=False)

print(f"Excel file saved to: {output_file}")



Excel file saved to: C:\Users\moham\Apriori_VS_Word2Vec\Dataset\distinct_products_with_categories.xlsx


### Transaction Patterns

Analyze the purchasing patterns visible in the data:

- **Average items per transaction**.
- **Distribution of transaction sizes**: Histogram showing the number of items per invoice.


In [6]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
import seaborn as sns

# ===== VISUALIZATION SETTINGS =====
MAIN_COLOR = "#1f77b4"       # Primary blue
TERTIARY_COLOR = "#2ca02c"   # Green for additional elements
BACKGROUND_COLOR = "#f5f5f5" # Light gray background
TEXT_COLOR = "#333333"       # Dark gray for text

plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette([MAIN_COLOR, TERTIARY_COLOR])
# Font, label-size and colour overrides applied in a single call
plt.rcParams.update({
    'font.family': 'sans-serif',
    'font.sans-serif': ['Arial', 'Helvetica', 'DejaVu Sans'],
    'axes.labelsize': 12,
    'axes.titlesize': 14,
    'xtick.labelsize': 10,
    'ytick.labelsize': 10,
    'axes.facecolor': BACKGROUND_COLOR,
    'axes.edgecolor': TEXT_COLOR,
    'axes.labelcolor': TEXT_COLOR,
    'text.color': TEXT_COLOR,
    'xtick.color': TEXT_COLOR,
    'ytick.color': TEXT_COLOR,
})

# ===== DATA LOADING =====
path = r'C:\Users\moham\Apriori_VS_Word2Vec\Dataset'
excel_file = 'df_merged_items_category.xlsx'
excel_file_path = os.path.join(path, excel_file)

# Load the dataset
df = pd.read_excel(excel_file_path)

# Convert 'Date' to datetime only when the column exists and is not datetime already
if 'Date' in df.columns and not pd.api.types.is_datetime64_any_dtype(df['Date']):
    df['Date'] = pd.to_datetime(df['Date'])

# Make sure the output directory for plots exists. exist_ok=True replaces the
# separate os.path.exists() test, so there is no gap between checking and creating.
plots_dir = os.path.join(path, 'Analysis_plots')
os.makedirs(plots_dir, exist_ok=True)

# ===== HELPER FUNCTION FOR TRANSACTION ANALYSIS =====
def compute_transaction_metrics(dataframe):
    """
    Describe how many rows (items) each transaction contains.

    Rows are grouped by 'BillNo' and the group sizes are summarised by their
    mean, median, minimum and maximum, plus the 25th, 50th, 75th, 90th, 95th
    and 99th percentiles (stored under 'P25' ... 'P99').

    Returns a tuple (metrics dictionary, items-per-transaction Series).
    """
    basket_sizes = dataframe.groupby('BillNo').size()

    metrics = {
        'Avg Items': basket_sizes.mean(),
        'Median Items': basket_sizes.median(),
        'Min Items': basket_sizes.min(),
        'Max Items': basket_sizes.max(),
    }

    # One np.percentile call for all levels; P50 repeats the median.
    levels = [25, 50, 75, 90, 95, 99]
    for level, value in zip(levels, np.percentile(basket_sizes, levels)):
        metrics[f'P{level}'] = value

    return metrics, basket_sizes

# ===== DATA PARTITIONING =====
# Three views of the data: everything, the first third of the rows, and the rest.
# NOTE(review): the cut is positional, so a BillNo whose rows straddle the
# boundary appears, with a partial item count, in both partitions.
n = len(df)
one_third_index = n // 3

df_whole = df.copy()
df_first_third, df_last_two_thirds = df.iloc[:one_third_index], df.iloc[one_third_index:]

# ===== ANALYZE TRANSACTION PATTERNS =====
print("\n ===== TRANSACTION PATTERNS ANALYSIS ===== \n")

# Metrics dictionary and items-per-transaction series for every partition.
metrics_whole, items_whole = compute_transaction_metrics(df_whole)
metrics_first_third, items_first_third = compute_transaction_metrics(df_first_third)
metrics_last_two_thirds, items_last_two_thirds = compute_transaction_metrics(df_last_two_thirds)

# Report the whole-dataset figures; sep="\n" puts every entry on its own line.
print(
    "Average Items per Transaction (Whole Dataset):",
    "=" * 50,
    f" Average Items: {metrics_whole['Avg Items']:.2f}",
    f" Median Items: {metrics_whole['Median Items']}",
    f" Min Items: {metrics_whole['Min Items']}",
    f" Max Items: {metrics_whole['Max Items']}",
    "=" * 50,
    "\nTransaction Size Percentiles (Whole Dataset):",
    f" 25th percentile: {metrics_whole['P25']:.1f} items",
    f" 50th percentile (median): {metrics_whole['P50']:.1f} items",
    f" 75th percentile: {metrics_whole['P75']:.1f} items",
    f" 90th percentile: {metrics_whole['P90']:.1f} items",
    f" 95th percentile: {metrics_whole['P95']:.1f} items",
    f" 99th percentile: {metrics_whole['P99']:.1f} items",
    "=" * 50,
    sep="\n",
)

# ===== COMPARISON TABLE =====
# One column per partition, one row per metric.
comparison_data = {
    "Whole": metrics_whole,
    "First Third": metrics_first_third,
    "Last Two Thirds": metrics_last_two_thirds
}
comparison_table = pd.DataFrame(comparison_data).rename_axis("Metric")
print("\nComparison Table (Transaction Metrics):", comparison_table, sep="\n")

# Write the comparison table next to the plots.
comparison_table_file = os.path.join(plots_dir, "transaction_comparison_table.xlsx")
comparison_table.to_excel(comparison_table_file)
print(f"\nComparison table saved to: {comparison_table_file}")

# ===== PLOTTING FUNCTION =====
def plot_transaction_distribution(items_series, title, save_path):
    """
    Plot the transaction size distribution as a bar chart and save it.

    Args:
        items_series: series with the number of items of each transaction.
        title: partition label appended to the chart title.
        save_path: file path the figure is written to.

    Transaction sizes are grouped into the buckets 1-4, 5-9, 10-14, 15-19,
    20-29, 30-49, 50-99 and 100+. Every bar is annotated with its share of
    all transactions and the fullest bucket is outlined in black.
    """
    # Define bins and labels
    size_bins = [1, 5, 10, 15, 20, 30, 50, 100, np.inf]
    size_labels = ['1-4', '5-9', '10-14', '15-19', '20-29', '30-49', '50-99', '100+']

    # Bin the transaction sizes.
    # right=False makes every bucket left-closed ([1, 5), [5, 10), ...), which
    # is what the labels describe. With the default right-closed intervals the
    # first bucket is (1, 5], so one-item transactions match no bin at all and
    # each boundary value (5, 10, 15, ...) lands one bucket below its label.
    transaction_binned = pd.cut(items_series, bins=size_bins, labels=size_labels, right=False)
    size_counts = transaction_binned.value_counts().sort_index()
    total_transactions = len(items_series)

    # Create the plot
    plt.figure(figsize=(12, 7), facecolor=BACKGROUND_COLOR)
    bars = plt.bar(size_counts.index.astype(str), size_counts.values, color=MAIN_COLOR, edgecolor='white')
    plt.title(f'Transaction Size Distribution - {title}', fontweight='bold', fontsize=16, color=TEXT_COLOR)
    plt.xlabel('Number of Items', fontsize=14, color=TEXT_COLOR)
    plt.ylabel('Transactions', fontsize=14, color=TEXT_COLOR)

    # Annotate bars with percentage values
    for i, v in enumerate(size_counts.values):
        percentage = (v / total_transactions) * 100
        plt.text(i, v + 0.05 * total_transactions, f"{percentage:.1f}%", ha='center', fontsize=10,
                 fontweight='bold', color=TEXT_COLOR)

    # Highlight the bin with the maximum count
    max_bin = size_counts.idxmax()
    max_idx = list(size_counts.index).index(max_bin)
    bars[max_idx].set_edgecolor('black')
    bars[max_idx].set_linewidth(2)

    plt.grid(axis='y', alpha=0.3, color="#cccccc")
    plt.tight_layout()
    plt.savefig(save_path, dpi=300)
    plt.close()  # Close the figure to free memory

# ===== GENERATE AND SAVE SEPARATE PLOTS =====
# Partition name -> items-per-transaction series; the name is reused in the
# output file name and, with underscores turned into spaces, as the plot title.
partitions = dict(
    Whole_Dataset=items_whole,
    First_Third=items_first_third,
    Last_Two_Thirds=items_last_two_thirds,
)

for partition_name in partitions:
    items_series = partitions[partition_name]
    plot_title = " ".join(partition_name.split("_"))
    save_file = os.path.join(plots_dir, f"transaction_size_distribution_{partition_name}.png")
    plot_transaction_distribution(items_series, plot_title, save_file)
    print(f"Plot saved for {plot_title} at: {save_file}")



 ===== TRANSACTION PATTERNS ANALYSIS ===== 

Average Items per Transaction (Whole Dataset):
 Average Items: 25.76
 Median Items: 15.0
 Min Items: 1
 Max Items: 1114

Transaction Size Percentiles (Whole Dataset):
 25th percentile: 5.0 items
 50th percentile (median): 15.0 items
 75th percentile: 28.0 items
 90th percentile: 53.0 items
 95th percentile: 77.0 items
 99th percentile: 218.0 items

Comparison Table (Transaction Metrics):
                   Whole  First Third  Last Two Thirds
Metric                                                
Avg Items       25.76252    25.344823         25.97463
Median Items    15.00000    14.000000         15.00000
Min Items        1.00000     1.000000          1.00000
Max Items     1114.00000   675.000000       1114.00000
P25              5.00000     6.000000          5.00000
P50             15.00000    14.000000         15.00000
P75             28.00000    28.000000         29.00000
P90             53.00000    52.000000         53.00000
P95          

### Products Analysis

Examine the product-related characteristics:

- **Top 10 most frequently purchased products**.
- **Product category distribution**: Percentage of items in each category.


In [7]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
import seaborn as sns

# ===== VISUALIZATION SETTINGS =====
MAIN_COLOR = "#1f77b4"       # Primary blue
TERTIARY_COLOR = "#2ca02c"   # Green for additional elements
BACKGROUND_COLOR = "#f5f5f5" # Light gray background
TEXT_COLOR = "#333333"       # Dark gray for text

plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette([MAIN_COLOR, TERTIARY_COLOR])
# Font, label-size and colour overrides applied in a single call
plt.rcParams.update({
    'font.family': 'sans-serif',
    'font.sans-serif': ['Arial', 'Helvetica', 'DejaVu Sans'],
    'axes.labelsize': 12,
    'axes.titlesize': 14,
    'xtick.labelsize': 10,
    'ytick.labelsize': 10,
    'axes.facecolor': BACKGROUND_COLOR,
    'axes.edgecolor': TEXT_COLOR,
    'axes.labelcolor': TEXT_COLOR,
    'text.color': TEXT_COLOR,
    'xtick.color': TEXT_COLOR,
    'ytick.color': TEXT_COLOR,
})

# ===== DATA LOADING =====
path = r'C:\Users\moham\Apriori_VS_Word2Vec\Dataset'
excel_file = 'df_merged_items_category.xlsx'
excel_file_path = os.path.join(path, excel_file)

# Load the dataset
df = pd.read_excel(excel_file_path)

# Convert 'Date' to datetime only when the column exists and is not datetime already
if 'Date' in df.columns and not pd.api.types.is_datetime64_any_dtype(df['Date']):
    df['Date'] = pd.to_datetime(df['Date'])

# Make sure the output directory for plots exists. exist_ok=True replaces the
# separate os.path.exists() test, so there is no gap between checking and creating.
plots_dir = os.path.join(path, 'Analysis_plots')
os.makedirs(plots_dir, exist_ok=True)

# ===== DATA PARTITIONING =====
# Split dataset into Whole, First Third, and Last Two Thirds (by row position).
n = len(df)
one_third_index = n // 3

df_whole = df.copy()
df_first_third = df.iloc[:one_third_index]
df_last_two_thirds = df.iloc[one_third_index:]

# Partition name -> DataFrame; the name is reused in plot titles and file names.
partitions = {
    "Whole_Dataset": df_whole,
    "First_Third": df_first_third,
    "Last_Two_Thirds": df_last_two_thirds
}

# ===== PRODUCT ANALYSIS FUNCTIONS =====
def plot_top_products(dataframe, partition_name, save_dir):
    """
    Plot the top 10 products by frequency for the given partition.

    Args:
        dataframe: data with an 'Itemname' column whose values are counted.
        partition_name: partition label used in the title and the file name.
        save_dir: directory the image is written to.

    The plot is saved as 'top_products_{partition_name}.png' inside save_dir.
    """
    print(f"\nProcessing Top Products for: {partition_name}")
    # Reverse once so the most frequent product is drawn at the top of the
    # horizontal bar chart.
    top_products = dataframe['Itemname'].value_counts().head(10).iloc[::-1]

    plt.figure(figsize=(14, 10), facecolor=BACKGROUND_COLOR)
    bars = plt.barh(top_products.index, top_products.values, color=MAIN_COLOR, edgecolor='white')
    plt.title(f"Top 10 Products by Frequency - {partition_name.replace('_', ' ')}",
              fontweight='bold', fontsize=16, color=TEXT_COLOR)
    plt.xlabel("Number of Occurrences", fontsize=14, color=TEXT_COLOR)
    plt.ylabel("Product Name", fontsize=14, color=TEXT_COLOR)

    # Write each count just to the right of its bar
    for bar, value in zip(bars, top_products.values):
        plt.text(value + 10, bar.get_y() + bar.get_height()/2,
                 f"{value}", va='center', fontsize=10, color=TEXT_COLOR)

    plt.grid(axis='x', alpha=0.3, color="#cccccc")
    plt.tight_layout()
    save_path = os.path.join(save_dir, f"top_products_{partition_name}.png")
    plt.savefig(save_path, dpi=300)
    plt.close()
    print(f"Top products plot saved to: {save_path}")

def plot_category_distribution(dataframe, partition_name, save_dir):
    """
    Plot the 10 most frequent product categories for the given partition.

    Args:
        dataframe: data to analyse; the plot is skipped when it has no
            'category' column.
        partition_name: partition label used in the title and the file name.
        save_dir: directory the image is written to.

    Each bar is annotated with its item count and with that count as a
    percentage of all rows in the partition. The plot is saved as
    'category_distribution_{partition_name}.png' inside save_dir.
    """
    if 'category' not in dataframe.columns:
        print(f"No 'category' column found for {partition_name}. Skipping category distribution plot.")
        return

    print(f"\nProcessing Category Distribution for: {partition_name}")
    category_counts = dataframe['category'].value_counts().head(10)
    total_items = len(dataframe)

    plt.figure(figsize=(14, 8), facecolor=BACKGROUND_COLOR)
    bars = plt.bar(category_counts.index, category_counts.values, color=MAIN_COLOR, edgecolor='white')
    plt.title(f"Product Categories Distribution - {partition_name.replace('_', ' ')}",
              fontweight='bold', fontsize=16, color=TEXT_COLOR)
    plt.xlabel("Category", fontsize=14, color=TEXT_COLOR)
    plt.ylabel("Number of Items", fontsize=14, color=TEXT_COLOR)
    plt.xticks(rotation=45, ha='right', color=TEXT_COLOR)

    # Annotate every bar with its count and its share of the partition's rows.
    # No bars are drawn for an empty partition, so total_items is never zero here.
    for bar, value in zip(bars, category_counts.values):
        percentage = value / total_items * 100
        plt.text(bar.get_x() + bar.get_width()/2, value + 5,
                 f"{value}\n({percentage:.1f}%)", ha='center', va='bottom',
                 fontsize=10, color=TEXT_COLOR, fontweight='bold')

    plt.grid(axis='y', alpha=0.3, color="#cccccc")
    plt.tight_layout()
    save_path = os.path.join(save_dir, f"category_distribution_{partition_name}.png")
    plt.savefig(save_path, dpi=300)
    plt.close()
    print(f"Category distribution plot saved to: {save_path}")

# ===== GENERATE PLOTS FOR EACH PARTITION =====
# For every partition: first the top-products chart, then the category chart.
for partition_name in partitions:
    partition_df = partitions[partition_name]
    plot_top_products(partition_df, partition_name, plots_dir)
    plot_category_distribution(partition_df, partition_name, plots_dir)



Processing Top Products for: Whole_Dataset
Top products plot saved to: C:\Users\moham\Apriori_VS_Word2Vec\Dataset\Analysis_plots\top_products_Whole_Dataset.png

Processing Category Distribution for: Whole_Dataset
Category distribution plot saved to: C:\Users\moham\Apriori_VS_Word2Vec\Dataset\Analysis_plots\category_distribution_Whole_Dataset.png

Processing Top Products for: First_Third
Top products plot saved to: C:\Users\moham\Apriori_VS_Word2Vec\Dataset\Analysis_plots\top_products_First_Third.png

Processing Category Distribution for: First_Third
Category distribution plot saved to: C:\Users\moham\Apriori_VS_Word2Vec\Dataset\Analysis_plots\category_distribution_First_Third.png

Processing Top Products for: Last_Two_Thirds
Top products plot saved to: C:\Users\moham\Apriori_VS_Word2Vec\Dataset\Analysis_plots\top_products_Last_Two_Thirds.png

Processing Category Distribution for: Last_Two_Thirds
Category distribution plot saved to: C:\Users\moham\Apriori_VS_Word2Vec\Dataset\Analysis_p