## **THE WORD2VEC MODEL**

In [None]:
import pandas as pd
import numpy as np
import os
from gensim.models import Word2Vec
import logging

# Configure root logging: timestamped records at INFO level and above.
# gensim reports vocabulary building, training and saving progress through
# the logging module, so this is what makes those messages visible.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

def load_and_preprocess_data(file_path):
    """
    Load transaction line items from an Excel file and group them into baskets.

    Rows without an ``Itemname`` are dropped, the remaining item names are
    normalised to ``str``, and the items are collected per ``BillNo`` so that
    every bill becomes one list of product tokens.

    Args:
        file_path (str): Path to the Excel file. The sheet must contain the
            columns ``BillNo`` and ``Itemname``.

    Returns:
        tuple: (transactions as list of lists, unique products list, original dataframe)
            The dataframe is the cleaned frame, i.e. without the rows whose
            item name was missing.

    Raises:
        KeyError: If ``BillNo`` or ``Itemname`` is not a column of the sheet.
    """
    required_columns = ('BillNo', 'Itemname')

    # Read Excel file
    df = pd.read_excel(file_path)

    # Fail early with a message that names the missing column(s) instead of
    # the bare KeyError pandas would raise further down.
    missing = [column for column in required_columns if column not in df.columns]
    if missing:
        raise KeyError(f"Missing required column(s) in {file_path!r}: {missing}")

    # Clean data
    df.dropna(subset=['Itemname'], inplace=True)

    # Excel cells holding purely numeric item names are read back as numbers.
    # Tokens must be strings: gensim's KeyedVectors accepts integers as row
    # indices, so a numeric "name" would be looked up as a position.
    df['Itemname'] = df['Itemname'].astype(str)

    # Group transactions (rows whose BillNo is missing are left out by groupby)
    transactions = df.groupby('BillNo')['Itemname'].apply(list).values.tolist()

    # Get unique products
    unique_products = df['Itemname'].unique().tolist()

    return transactions, unique_products, df

def train_word2vec_model(transactions, vector_size=100, window=5, min_count=2,
                         *, sg=1, negative=5, workers=4, epochs=10):
    """
    Train Word2Vec model on transaction data.

    Each transaction is treated as a sentence and each product name as a
    word. The defaults reproduce the original configuration: Skip-Gram with
    Negative Sampling, 4 worker threads and 10 epochs.

    Args:
        transactions (list of lists): Transaction data
        vector_size (int): Dimensionality of the word vectors
        window (int): Context window size
        min_count (int): Minimum frequency for a word to be included
        sg (int): 1 for Skip-gram, 0 for CBOW (keyword-only)
        negative (int): Number of negative samples per positive example
            (keyword-only)
        workers (int): Number of worker threads used for training
            (keyword-only)
        epochs (int): Number of passes over the corpus (keyword-only)

    Returns:
        Word2Vec model
    """
    # Products occurring fewer than `min_count` times are left out of the
    # vocabulary and therefore get no embedding.
    model = Word2Vec(
        sentences=transactions,
        vector_size=vector_size,  # Embedding dimensionality
        window=window,  # Context window size
        sg=sg,  # Skip-gram model (1) vs CBOW (0)
        negative=negative,  # Negative sampling
        min_count=min_count,  # Ignore words with frequency below this
        workers=workers,  # Parallel training
        epochs=epochs,  # Number of training epochs
    )

    return model

def generate_comprehensive_complementary_products(model, unique_products, topn=10):
    """
    Generate a comprehensive dataset of complementary products for each unique item.

    For every product the `topn` nearest neighbours in the embedding space
    are queried through ``model.wv.most_similar`` and written out as one row
    per (product, neighbour) pair. Products that are not in the model's
    vocabulary are reported on stdout and skipped.

    Args:
        model (Word2Vec): Trained Word2Vec model
        unique_products (list): List of unique products
        topn (int): Number of top similar products to find

    Returns:
        pandas.DataFrame: Comprehensive dataset of complementary products with
            the columns 'Original Product', 'Complementary Product',
            'Similarity Score' and 'Rank' (1 = most similar). The columns are
            present even when no product produced a result.
    """
    columns = ['Original Product', 'Complementary Product', 'Similarity Score', 'Rank']

    # Prepare list to store results
    comprehensive_results = []

    # Find complementary products for each unique product
    for product in unique_products:
        try:
            # Find most similar products
            similar_products = model.wv.most_similar(product, topn=topn)
        except KeyError:
            # Skip products not in the model's vocabulary
            print(f"Product '{product}' not found in the model vocabulary.")
            continue

        comprehensive_results.extend(
            {
                'Original Product': product,
                'Complementary Product': comp_product,
                'Similarity Score': similarity,
                'Rank': rank,
            }
            for rank, (comp_product, similarity) in enumerate(similar_products, start=1)
        )

    # Passing the columns explicitly keeps the schema stable: without it an
    # empty result would be a DataFrame with no columns at all, and any later
    # column access would raise KeyError.
    complementary_products_df = pd.DataFrame(comprehensive_results, columns=columns)

    return complementary_products_df

def analyze_complementary_products(complementary_products_df):
    """
    Analyze the generated complementary products dataset.

    Args:
        complementary_products_df (pandas.DataFrame): Complementary products
            dataset with at least the columns 'Original Product',
            'Complementary Product' and 'Similarity Score'.

    Returns:
        dict: Analysis insights with the keys
            'Total Unique Original Products', 'Total Complementary Product Pairs',
            'Average Similarity Score', 'Median Similarity Score' and
            'Top 10 Strongest Complementary Relationships' (a DataFrame).
            For an empty input the counts are 0, the scores are NaN and the
            top-10 table is empty.
    """
    top_columns = ['Original Product', 'Complementary Product', 'Similarity Score']

    # An empty frame may have no columns at all, or an object-dtype score
    # column that `nlargest` rejects, so report "nothing found" explicitly.
    if complementary_products_df.empty:
        return {
            'Total Unique Original Products': 0,
            'Total Complementary Product Pairs': 0,
            'Average Similarity Score': float('nan'),
            'Median Similarity Score': float('nan'),
            'Top 10 Strongest Complementary Relationships': pd.DataFrame(columns=top_columns),
        }

    scores = complementary_products_df['Similarity Score']

    analysis = {
        'Total Unique Original Products': complementary_products_df['Original Product'].nunique(),
        'Total Complementary Product Pairs': len(complementary_products_df),
        'Average Similarity Score': scores.mean(),
        'Median Similarity Score': scores.median(),
        # Rows are taken as-is, so a pair can appear once per direction
        # (A -> B and B -> A).
        'Top 10 Strongest Complementary Relationships': (
            complementary_products_df.nlargest(10, 'Similarity Score')[top_columns]
        ),
    }

    return analysis

def main(path=r'C:\Users\moham\Apriori_VS_Word2Vec\Dataset',
         excel_file='df_merged_items_category.xlsx'):
    """
    Run the full product2vec pipeline: load, train, export, analyze, save.

    Reads the transaction workbook, trains the Word2Vec model, writes the
    complementary-products table to 'comprehensive_complementary_products.xlsx',
    prints the analysis, and saves the model to
    'comprehensive_product2vec_model.bin'. Both output files are written
    into `path`.

    Args:
        path (str): Directory containing the input workbook; also used as the
            output directory. Defaults to the original author's dataset folder.
        excel_file (str): File name of the transaction workbook inside `path`.
    """
    excel_file_path = os.path.join(path, excel_file)

    # Load and preprocess transaction data
    print("Loading and preprocessing transaction data...")
    # The cleaned dataframe is returned as well but is not needed here.
    transactions, unique_products, _ = load_and_preprocess_data(excel_file_path)

    # Train Word2Vec model
    print("Training Word2Vec model...")
    word2vec_model = train_word2vec_model(transactions)

    # Generate comprehensive complementary products dataset
    print("Generating comprehensive complementary products dataset...")
    comprehensive_complementary_products_df = generate_comprehensive_complementary_products(
        word2vec_model,
        unique_products,
        topn=10  # Number of top complementary products per item
    )

    # Analyze complementary products
    print("Analyzing complementary products...")
    analysis_results = analyze_complementary_products(comprehensive_complementary_products_df)

    # Save results
    output_path = os.path.join(path, 'comprehensive_complementary_products.xlsx')
    comprehensive_complementary_products_df.to_excel(output_path, index=False)

    # Print analysis results
    print("\nComplementary Products Analysis:")
    for key, value in analysis_results.items():
        print(f"\n{key}:")
        print(value)

    # Save model
    # NOTE(review): Word2Vec.save() appears to write gensim's own format, not
    # the word2vec C binary format the '.bin' suffix suggests; confirm it is
    # reloaded with Word2Vec.load().
    model_path = os.path.join(path, 'comprehensive_product2vec_model.bin')
    word2vec_model.save(model_path)

    print(f"\nResults saved to {output_path}")
    print(f"Model saved to {model_path}")

# Run the pipeline only when this code is executed directly (as a script or
# a notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

Loading and preprocessing transaction data...


2025-04-06 15:30:28,299 : INFO : collecting all words and their counts
2025-04-06 15:30:28,300 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2025-04-06 15:30:28,348 : INFO : PROGRESS: at sentence #10000, processed 244376 words, keeping 3614 word types
2025-04-06 15:30:28,386 : INFO : PROGRESS: at sentence #20000, processed 513626 words, keeping 4179 word types
2025-04-06 15:30:28,388 : INFO : collected 4185 word types from a corpus of 520609 raw words and 20208 sentences
2025-04-06 15:30:28,389 : INFO : Creating a fresh vocabulary
2025-04-06 15:30:28,409 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=2 retains 3877 unique words (92.64% of original 4185, drops 308)', 'datetime': '2025-04-06T15:30:28.409092', 'gensim': '4.3.3', 'python': '3.10.16 | packaged by Anaconda, Inc. | (main, Dec 11 2024, 16:19:12) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.26100-SP0', 'event': 'prepare_vocab'}
2025-04-06 15:30:28,410 : INFO : Word2Vec 

Training Word2Vec model...


2025-04-06 15:30:28,498 : INFO : sample=0.001 downsamples 6 most-common words
2025-04-06 15:30:28,500 : INFO : Word2Vec lifecycle event {'msg': 'downsampling leaves estimated 518266.4566297258 word corpus (99.6%% of prior 520301)', 'datetime': '2025-04-06T15:30:28.500949', 'gensim': '4.3.3', 'python': '3.10.16 | packaged by Anaconda, Inc. | (main, Dec 11 2024, 16:19:12) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.26100-SP0', 'event': 'prepare_vocab'}
2025-04-06 15:30:28,615 : INFO : estimated required memory for 3877 words and 100 dimensions: 5040100 bytes
2025-04-06 15:30:28,616 : INFO : resetting layer weights
2025-04-06 15:30:28,623 : INFO : Word2Vec lifecycle event {'update': False, 'trim_rule': 'None', 'datetime': '2025-04-06T15:30:28.623789', 'gensim': '4.3.3', 'python': '3.10.16 | packaged by Anaconda, Inc. | (main, Dec 11 2024, 16:19:12) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.26100-SP0', 'event': 'build_vocab'}
2025-04-06 15:30:28,624 : INFO :

Generating comprehensive complementary products dataset...
Product 'GIRLY PINK TOOL SET' not found in the model vocabulary.
Product 'PINK MARSHMALLOW SCARF KNITTING KIT' not found in the model vocabulary.
Product 'CAKESTAND, 3 TIER, LOVEHEART' not found in the model vocabulary.
Product 'GLASS BELL JAR SMALL' not found in the model vocabulary.
Product 'GLASS BELL JAR LARGE' not found in the model vocabulary.
Product 'VINTAGE BLUE TINSEL REEL' not found in the model vocabulary.
Product 'JARDIN ETCHED GLASS BUTTER DISH' not found in the model vocabulary.
Product 'WEEKEND BAG VINTAGE ROSE PAISLEY' not found in the model vocabulary.
Product 'WRAP  PINK FLOCK' not found in the model vocabulary.
Product 'OCEAN SCENT CANDLE JEWELLED DRAWER' not found in the model vocabulary.
Product 'PINK BAROQUE FLOCK CANDLE HOLDER' not found in the model vocabulary.
Product 'BEADED LOVE HEART JEWELLERY SET' not found in the model vocabulary.
Product 'PINK PAINTED KASHMIRI CHAIR' not found in the model vocabu

2025-04-06 15:30:59,388 : INFO : Word2Vec lifecycle event {'fname_or_handle': 'C:\\Users\\moham\\Apriori_VS_Word2Vec\\Dataset\\comprehensive_product2vec_model.bin', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2025-04-06T15:30:59.388482', 'gensim': '4.3.3', 'python': '3.10.16 | packaged by Anaconda, Inc. | (main, Dec 11 2024, 16:19:12) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.26100-SP0', 'event': 'saving'}
2025-04-06 15:30:59,389 : INFO : not storing attribute cum_table
2025-04-06 15:30:59,401 : INFO : saved C:\Users\moham\Apriori_VS_Word2Vec\Dataset\comprehensive_product2vec_model.bin



Complementary Products Analysis:

Total Unique Original Products:
3877

Total Complementary Product Pairs:
38770

Average Similarity Score:
0.7385364575348535

Median Similarity Score:
0.7312331795692444

Top 10 Strongest Complementary Relationships:
                         Original Product               Complementary Product  \
34540          AMBER 3 BEAD DROP EARRINGS          BLACK 3 BEAD DROP EARRINGS   
27120          BLACK 3 BEAD DROP EARRINGS          AMBER 3 BEAD DROP EARRINGS   
26150  TURQ PENDANT TRIPLE SHELL NECKLACE        GREEN STONE/CRYSTAL EARRINGS   
20460        GREEN STONE/CRYSTAL EARRINGS  TURQ PENDANT TRIPLE SHELL NECKLACE   
27121          BLACK 3 BEAD DROP EARRINGS    BLACK DROP EARRINGS W LONG BEADS   
28430    BLACK DROP EARRINGS W LONG BEADS          BLACK 3 BEAD DROP EARRINGS   
26930       PINK ROSEBUD & PEARL NECKLACE       CLASSIC DIAMANTE EARRINGS JET   
29440       CLASSIC DIAMANTE EARRINGS JET       PINK ROSEBUD & PEARL NECKLACE   
23480          IVOR

## **COMPARISON METRICS**