In [1]:
import pandas as pd
import numpy as np
import os
import time
import json
from itertools import product, combinations
from collections import defaultdict
from scipy.stats import chi2_contingency, ttest_ind
from gensim.models import Word2Vec
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation and pandas copy warnings raised by
# the cells below — confirm that blanket suppression is intended.
warnings.filterwarnings('ignore')
# All locations are derived from the process working directory: the project
# root is taken to be the PARENT of the directory the kernel runs in.
current_dir = os.getcwd()
project_root = os.path.dirname(current_dir)
# Input folder; passed as `data_path` when the analyzer is constructed.
path = os.path.join(project_root, "Dataset")
# Output folder; read as a module-level global by
# MaximumAgreementGridSearch.save_results.
path_results = os.path.join(project_root, "Results")
# Workbook file name, joined with `path` inside the analyzer's constructor.
excel_file = 'full_validated_dataset.xlsx'

In [2]:
def extract_apriori_pairs_optimized(basket_encoded_values, basket_encoded_columns, min_support, min_confidence):
    """
    Mine one-to-one association rules and collapse them into unordered pairs.

    Parameters
    ----------
    basket_encoded_values : array-like
        Row-per-transaction matrix handed to ``pd.DataFrame``.
        NOTE(review): presumably the boolean one-hot output of
        ``TransactionEncoder`` — verify against the caller.
    basket_encoded_columns : list
        Column labels (item identifiers) for ``basket_encoded_values``.
    min_support : float
        Forwarded to ``apriori`` as ``min_support``.
    min_confidence : float
        Forwarded to ``association_rules`` as the confidence ``min_threshold``.

    Returns
    -------
    dict
        ``{(item_a, item_b): confidence}``. Each key is a sorted 2-tuple; when
        both rule directions exist the higher confidence is kept. An empty
        dict is returned when no frequent itemsets exist or when mining fails
        (the failure is printed rather than raised, so a grid search can
        continue with the remaining combinations).
    """
    try:
        basket_encoded = pd.DataFrame(basket_encoded_values, columns=basket_encoded_columns)

        frequent_itemsets = apriori(basket_encoded, min_support=min_support, use_colnames=True)
        if len(frequent_itemsets) == 0:
            return {}

        rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_confidence)

        pairs = {}
        # Walk the three relevant columns directly instead of materialising a
        # Series per row with iterrows().
        for antecedents, consequents, confidence in zip(
            rules['antecedents'], rules['consequents'], rules['confidence']
        ):
            # Only single-item -> single-item rules describe a product pair.
            if len(antecedents) != 1 or len(consequents) != 1:
                continue

            # Unpack by length, not truthiness, so falsy labels such as 0 or
            # "" are not mistaken for "no item".
            (antecedent,) = antecedents
            (consequent,) = consequents

            pair = tuple(sorted([antecedent, consequent]))
            if pair not in pairs or pairs[pair] < confidence:
                pairs[pair] = confidence

        return pairs

    except Exception as e:
        # Best effort: keep returning an empty result, but say why, so a
        # failed run is distinguishable from "no pairs found".
        print(f"WARNING: Apriori pair extraction failed "
              f"(min_support={min_support}, min_confidence={min_confidence}): "
              f"{type(e).__name__}: {e}")
        return {}

def extract_word2vec_pairs_optimized(transactions, vector_size, window, min_count, similarity_threshold):
    """
    Train a skip-gram Word2Vec model on the transactions and return every
    vocabulary pair whose similarity reaches the threshold.

    Parameters
    ----------
    transactions : list of list
        One list of item identifiers per transaction; passed to ``Word2Vec``
        as ``sentences``.
    vector_size, window, min_count : int
        Forwarded unchanged to ``Word2Vec``.
    similarity_threshold : float
        Minimum ``model.wv.similarity`` value for a pair to be kept.

    Returns
    -------
    dict
        ``{(item_a, item_b): similarity}`` with each key a sorted 2-tuple.
        An empty dict is returned when training or scoring fails (the failure
        is printed rather than raised, so a grid search can continue).
    """
    try:
        # NOTE(review): gensim documents that a fixed seed is only fully
        # reproducible with a single worker thread; the default worker count
        # is kept here so run time is unchanged — confirm whether exact
        # reproducibility is required.
        model = Word2Vec(
            sentences=transactions,
            vector_size=vector_size,
            window=window,
            min_count=min_count,
            epochs=10,
            sg=1,
            seed=42
        )

        pairs = {}
        vocab_words = list(model.wv.key_to_index.keys())

        # combinations() yields each unordered pair once, in the same order as
        # a nested i < j loop, without copying a list slice per outer step.
        for word1, word2 in combinations(vocab_words, 2):
            try:
                similarity = model.wv.similarity(word1, word2)
            except KeyError:
                # Defensive: both words come from the model vocabulary.
                continue
            if similarity >= similarity_threshold:
                pairs[tuple(sorted([word1, word2]))] = similarity

        return pairs

    except Exception as e:
        # Best effort: keep returning an empty result, but say why, so a
        # failed run is distinguishable from "no pairs found".
        print(f"WARNING: Word2Vec pair extraction failed "
              f"(vector_size={vector_size}, window={window}, min_count={min_count}): "
              f"{type(e).__name__}: {e}")
        return {}

def calculate_agreement_metrics_optimized(apriori_pairs, word2vec_pairs):
    """
    Compare the pair sets found by the two methods.

    Both arguments map a pair key to a score. The agreement rate is the size
    of the key intersection divided by the size of the key union (0 when both
    inputs are empty). The returned dict also carries the set sizes, the mean
    score each method assigned to the shared pairs (0 when nothing is shared),
    and the three pair lists themselves.
    """
    from_apriori = set(apriori_pairs)
    from_word2vec = set(word2vec_pairs)

    shared = from_apriori.intersection(from_word2vec)
    combined = from_apriori.union(from_word2vec)
    only_apriori = from_apriori.difference(from_word2vec)
    only_word2vec = from_word2vec.difference(from_apriori)

    # Share of all reported pairs that both methods agree on.
    if combined:
        rate = len(shared) / len(combined)
    else:
        rate = 0

    # Scores each method gave to the pairs they have in common.
    shared_apriori_scores = [apriori_pairs[key] for key in shared]
    shared_word2vec_scores = [word2vec_pairs[key] for key in shared]

    mean_apriori = np.mean(shared_apriori_scores) if shared_apriori_scores else 0
    mean_word2vec = np.mean(shared_word2vec_scores) if shared_word2vec_scores else 0

    return {
        'agreement_rate': rate,
        'total_pairs': len(combined),
        'apriori_pairs': len(from_apriori),
        'word2vec_pairs': len(from_word2vec),
        'agreed_pairs': len(shared),
        'apriori_only_pairs': len(only_apriori),
        'word2vec_only_pairs': len(only_word2vec),
        'avg_agreed_apriori_score': mean_apriori,
        'avg_agreed_word2vec_score': mean_word2vec,
        'intersection_pairs': list(shared),
        'apriori_only_list': list(only_apriori),
        'word2vec_only_list': list(only_word2vec),
    }

def process_single_combination(combo, basket_encoded_values, basket_encoded_columns, transactions):
    """
    Evaluate one grid point: run both pair extractors with the parameters in
    ``combo`` and score how much their results overlap.

    ``combo`` must provide ``combination_id``, ``apriori_params``
    (``min_support``, ``min_confidence``) and ``word2vec_params``
    (``vector_size``, ``window``, ``min_count``, ``similarity_threshold``).

    Returns a flat dict holding the id, the six parameter values, the elapsed
    wall-clock seconds and every agreement metric. If anything raises, a
    reduced dict with an ``error`` message and zeroed ``agreement_rate`` /
    ``total_pairs`` is returned instead.
    """
    started = time.time()

    try:
        # Apriori side.
        apriori_cfg = combo['apriori_params']
        support = apriori_cfg['min_support']
        confidence = apriori_cfg['min_confidence']
        apriori_pairs = extract_apriori_pairs_optimized(
            basket_encoded_values, basket_encoded_columns, support, confidence
        )

        # Word2Vec side.
        w2v_cfg = combo['word2vec_params']
        vector_size = w2v_cfg['vector_size']
        window = w2v_cfg['window']
        min_count = w2v_cfg['min_count']
        threshold = w2v_cfg['similarity_threshold']
        word2vec_pairs = extract_word2vec_pairs_optimized(
            transactions, vector_size, window, min_count, threshold
        )

        metrics = calculate_agreement_metrics_optimized(apriori_pairs, word2vec_pairs)
        elapsed = time.time() - started

        # Parameters first, metrics merged last so their keys take precedence.
        summary = {
            'combination_id': combo['combination_id'],
            'apriori_min_support': support,
            'apriori_min_confidence': confidence,
            'word2vec_vector_size': vector_size,
            'word2vec_window': window,
            'word2vec_min_count': min_count,
            'word2vec_similarity_threshold': threshold,
            'execution_time': elapsed,
        }
        summary.update(metrics)
        return summary

    except Exception as e:
        failure = {
            'combination_id': combo['combination_id'],
            'error': str(e),
            'agreement_rate': 0,
            'total_pairs': 0,
            'execution_time': time.time() - started,
        }
        return failure

In [None]:
class MaximumAgreementGridSearch:
    """
    Grid search over Apriori and Word2Vec parameters that looks for the
    combination with the highest ``agreement_rate`` between the product pairs
    reported by the two methods.

    Intended call order: construct, ``run_grid_search()``,
    ``analyze_optimal_parameters()``, ``save_results()``.
    """

    def __init__(self, data_path, excel_file, use_subset=True):
        """
        Load the workbook, build per-bill transactions and pre-compute the
        one-hot transaction encoding.

        Parameters
        ----------
        data_path : str
            Directory containing the workbook.
        excel_file : str
            Workbook file name. The sheet must provide the columns
            ``BillNo``, ``Itemname`` and ``category``.
        use_subset : bool, default True
            When true, only the first third of the transactions (at least
            one) is kept for the search.
        """
        print("="*90)
        print("MAXIMUM AGREEMENT GRID SEARCH ANALYSIS")
        print("="*90)
        print("Phase 1.1: Data Preparation")
        print("-" * 50)

        self.data_path = data_path
        self.excel_file = excel_file
        self.file_path = os.path.join(data_path, excel_file)
        self.use_subset = use_subset

        print(f"Loading dataset from: {self.file_path}")
        raw_data = pd.read_excel(self.file_path)
        print(f"Raw dataset shape: {raw_data.shape}")

        # Drop the 'Miscellaneous' category and rows without an item name.
        # Chained so a new frame is produced instead of mutating a filtered
        # slice in place.
        original_size = len(raw_data)
        self.data_excel = (
            raw_data[raw_data['category'] != 'Miscellaneous']
            .dropna(subset=['Itemname'])
        )
        print(f"Filtered dataset: {len(self.data_excel)} rows ({original_size - len(self.data_excel)} removed)")

        # One list of item names per bill.
        self.basket = self.data_excel.groupby('BillNo')['Itemname'].apply(list)
        self.transactions = self.basket.values.tolist()

        if self.use_subset:
            # Keep at least one transaction so tiny inputs do not collapse to
            # an empty encoding.
            subset_size = max(1, len(self.transactions) // 3)
            self.transactions = self.transactions[:subset_size]
            print(f"Using subset: {len(self.transactions)} transactions (first third of data)")

        # NOTE(review): these two are derived from the full filtered frame,
        # not from the subset of transactions — confirm that is intended.
        self.unique_products = sorted(self.data_excel['Itemname'].unique().tolist())
        self.product_categories = dict(zip(
            self.data_excel['Itemname'],
            self.data_excel['category']
        ))

        # Encode once; every grid point reuses the same matrix.
        print("Pre-computing transaction encodings...")
        self.te = TransactionEncoder()
        self.te_ary = self.te.fit(self.transactions).transform(self.transactions)
        self.basket_encoded = pd.DataFrame(self.te_ary, columns=self.te.columns_)
        print(f"✓ Transaction encoding completed: {self.basket_encoded.shape}")

        # Search state, filled by run_grid_search / analyze_optimal_parameters.
        self.grid_search_results = []
        self.best_agreement_params = None
        self.best_agreement_score = 0
        self.optimal_params = None

        avg_transaction_size = np.mean([len(t) for t in self.transactions])
        unique_categories = self.data_excel['category'].nunique()

        print(f"✓ Transactions: {len(self.transactions)}")
        print(f"✓ Unique products: {len(self.unique_products)}")
        print(f"✓ Product categories: {unique_categories}")
        print(f"✓ Average transaction size: {avg_transaction_size:.2f}")
        print("✓ Pre-computed encodings ready")

    def define_parameter_search_space(self):
        """
        Define the parameter grids and expand them into ``param_combinations``.

        Sets ``apriori_params``, ``word2vec_params`` and ``param_combinations``
        (one dict per grid point, ids starting at 1) and returns the total
        number of combinations.
        """
        print("\n" + "="*90)
        print("Phase 1.2: Define Parameter Search Space")
        print("-" * 50)

        self.apriori_params = {
            'min_support': [0.01, 0.05, 0.1],
            'min_confidence': [0.1, 0.3, 0.5]
        }

        self.word2vec_params = {
            'vector_size': [50, 100, 300],
            'window': [3, 5, 10],
            'min_count': [5, 10, 20],
            'similarity_threshold': [0.1, 0.2, 0.5]
        }

        apriori_combinations = len(list(product(*self.apriori_params.values())))
        word2vec_combinations = len(list(product(*self.word2vec_params.values())))
        total_combinations = apriori_combinations * word2vec_combinations

        print("APRIORI PARAMETER SPACE:")
        for param, values in self.apriori_params.items():
            print(f"   {param}: {values}")
        print(f"   → {apriori_combinations} combinations")

        print("\nWORD2VEC PARAMETER SPACE:")
        for param, values in self.word2vec_params.items():
            print(f"   {param}: {values}")
        print(f"   → {word2vec_combinations} combinations")

        print(f"\nTOTAL GRID SEARCH COMBINATIONS: {total_combinations}")

        # Cross product of the two grids; Apriori varies slowest.
        self.param_combinations = []
        combination_id = 0
        for support, confidence in product(*self.apriori_params.values()):
            for vector_size, window, min_count, sim_threshold in product(*self.word2vec_params.values()):
                combination_id += 1
                self.param_combinations.append({
                    'combination_id': combination_id,
                    'apriori_params': {'min_support': support, 'min_confidence': confidence},
                    'word2vec_params': {
                        'vector_size': vector_size,
                        'window': window,
                        'min_count': min_count,
                        'similarity_threshold': sim_threshold
                    }
                })

        print(f"Prepared {len(self.param_combinations)} parameter combinations")
        print("Phase 1.2 Complete: Parameter search space defined")
        return total_combinations

    def run_grid_search(self):
        """
        Evaluate every parameter combination sequentially.

        Results are collected in ``grid_search_results``; the best non-error
        result is tracked in ``best_agreement_score`` /
        ``best_agreement_params``. State from a previous call is discarded
        first, so calling this twice does not duplicate results.
        """
        # Build the grid first so its banner precedes the search banner.
        total_combinations = self.define_parameter_search_space()

        print("\n" + "="*90)
        print("Phase 2.2: Sequential Grid Search Implementation")
        print("-" * 50)

        # Start from a clean slate on every run.
        self.grid_search_results = []
        self.best_agreement_params = None
        self.best_agreement_score = 0

        print(f"Starting sequential processing...")
        print("Focus: Finding maximum agreement on complementary product pairs")

        start_time = time.time()

        # Hoisted: the encoded matrix and labels are identical for every combo.
        encoded_values = self.basket_encoded.values
        encoded_columns = self.basket_encoded.columns.tolist()

        for i, combo in enumerate(self.param_combinations, 1):
            try:
                result = process_single_combination(
                    combo,
                    encoded_values,
                    encoded_columns,
                    self.transactions
                )

                self.grid_search_results.append(result)

                if 'error' not in result and result['agreement_rate'] > self.best_agreement_score:
                    self.best_agreement_score = result['agreement_rate']
                    self.best_agreement_params = result.copy()

                # Progress line with a linear time estimate every 10 combos.
                if i % 10 == 0:
                    elapsed_time = time.time() - start_time
                    avg_time_per_combo = elapsed_time / i
                    remaining_combos = total_combinations - i
                    estimated_remaining = avg_time_per_combo * remaining_combos

                    print(f"Progress: {i}/{total_combinations} "
                          f"({i/total_combinations*100:.1f}%) "
                          f"Best agreement so far: {self.best_agreement_score:.3f} "
                          f"Est. remaining time: {estimated_remaining/60:.1f} min")

            except Exception as exc:
                print(f'Combination {combo["combination_id"]} generated an exception: {exc}')

        total_time = time.time() - start_time
        print(f"\nPhase 2.2 Complete: Tested {total_combinations} parameter combinations")
        print(f"Total execution time: {total_time/60:.2f} minutes")
        print(f"Average time per combination: {total_time/total_combinations:.2f} seconds")
        print(f"Maximum agreement found: {self.best_agreement_score:.3f}")

    def analyze_optimal_parameters(self):
        """
        Rank the valid results by agreement rate and record the best one.

        Prints the top five combinations, stores the winner in
        ``optimal_params`` (native Python numbers, ready for JSON) and returns
        the winning row. Returns ``None`` when there are no results or every
        combination failed.
        """
        print("\n" + "="*90)
        print("Phase 2.3 & 3: Optimal Parameter Analysis")
        print("-" * 50)

        if not self.grid_search_results:
            print("ERROR: No grid search results found. Run grid search first.")
            return None

        valid_results = [r for r in self.grid_search_results if 'error' not in r]

        if not valid_results:
            print("ERROR: No valid results found. All combinations failed.")
            return None

        results_df = pd.DataFrame(valid_results)
        top_5 = results_df.nlargest(5, 'agreement_rate')

        print("TOP 5 PARAMETER COMBINATIONS BY AGREEMENT RATE:")
        print("=" * 70)
        for i, (_, row) in enumerate(top_5.iterrows(), 1):
            print(f"\nRANK {i}: Agreement Rate = {row['agreement_rate']:.4f}")
            print(f"   Apriori: support={row['apriori_min_support']}, confidence={row['apriori_min_confidence']}")
            print(f"   Word2Vec: vector={row['word2vec_vector_size']}, window={row['word2vec_window']}")
            print(f"             min_count={row['word2vec_min_count']}, similarity={row['word2vec_similarity_threshold']}")
            print(f"   Pairs: {row['agreed_pairs']} agreed, {row['apriori_pairs']} Apriori, {row['word2vec_pairs']} Word2Vec")
            print(f"   Execution time: {row['execution_time']:.2f} seconds")

        optimal = top_5.iloc[0]
        # Cast everything to built-in types: numpy integers in particular are
        # rejected by json.dump in save_results.
        self.optimal_params = {
            'apriori': {
                'min_support': float(optimal['apriori_min_support']),
                'min_confidence': float(optimal['apriori_min_confidence'])
            },
            'word2vec': {
                'vector_size': int(optimal['word2vec_vector_size']),
                'window': int(optimal['word2vec_window']),
                'min_count': int(optimal['word2vec_min_count']),
                'similarity_threshold': float(optimal['word2vec_similarity_threshold'])
            },
            'performance': {
                'agreement_rate': float(optimal['agreement_rate']),
                'agreed_pairs': int(optimal['agreed_pairs']),
                'total_pairs': int(optimal['total_pairs'])
            }
        }

        print(f"\nOPTIMAL PARAMETERS SELECTED:")
        print(f"Agreement Rate: {optimal['agreement_rate']:.4f}")
        print(f"Agreed Pairs: {optimal['agreed_pairs']}")

        total_time = sum(result['execution_time'] for result in valid_results)
        print(f"\nPERFORMANCE SUMMARY:")
        print(f"Total combinations tested: {len(valid_results)}")
        print(f"Failed combinations: {len(self.grid_search_results) - len(valid_results)}")
        print(f"Total execution time: {total_time/60:.2f} minutes")
        print(f"Average time per combination: {total_time/len(valid_results):.2f} seconds")

        return optimal

    def save_results(self, results_dir=None):
        """
        Write the optimal parameters, the full result table and a run summary.

        Parameters
        ----------
        results_dir : str, optional
            Target directory (created if missing). Defaults to the
            module-level ``path_results``.

        Files written: ``maximum_agreement_optimal_parameters.json``,
        ``maximum_agreement_grid_search_results.csv`` and
        ``analysis_summary.json``. Does nothing but print an error when
        ``analyze_optimal_parameters`` has not produced a result yet.
        """
        if self.optimal_params is None:
            print("ERROR: No optimal parameters to save. Run analysis first.")
            return

        if results_dir is None:
            # Fall back to the notebook-wide output folder.
            results_dir = path_results

        os.makedirs(results_dir, exist_ok=True)

        with open(os.path.join(results_dir, 'maximum_agreement_optimal_parameters.json'), 'w') as f:
            json.dump(self.optimal_params, f, indent=2)

        # Full table, including failed combinations.
        results_df = pd.DataFrame(self.grid_search_results)
        results_df.to_csv(os.path.join(results_dir, 'maximum_agreement_grid_search_results.csv'), index=False)

        valid_results = [r for r in self.grid_search_results if 'error' not in r]
        total_time = sum(result['execution_time'] for result in valid_results)

        summary = {
            'analysis_details': {
                'used_data_subset': self.use_subset,
                'subset_size': len(self.transactions),
                'pre_computed_encodings': True,
                'processing_type': 'sequential'
            },
            'performance_metrics': {
                'total_combinations': len(self.grid_search_results),
                'valid_combinations': len(valid_results),
                'failed_combinations': len(self.grid_search_results) - len(valid_results),
                'total_execution_time_minutes': total_time / 60,
                'average_time_per_combination_seconds': total_time / len(valid_results) if valid_results else 0
            },
            'optimal_results': self.optimal_params
        }

        with open(os.path.join(results_dir, 'analysis_summary.json'), 'w') as f:
            json.dump(summary, f, indent=2)

        print("Results saved successfully!")

In [4]:

print("INITIALIZING MAXIMUM AGREEMENT GRID SEARCH")
print("="*90)

try:
    # Build the analyzer on a subset of the transactions, then run the three
    # pipeline stages in order: search, rank, persist.
    analyzer = MaximumAgreementGridSearch(path, excel_file, use_subset=True)
    analyzer.run_grid_search()
    optimal_results = analyzer.analyze_optimal_parameters()

    if optimal_results is None:
        # Ranking found nothing usable; there is nothing to save.
        print("Analysis failed - no valid results found.")
    else:
        analyzer.save_results()

        closing_lines = (
            "\n" + "="*90,
            "ANALYSIS PIPELINE COMPLETED SUCCESSFULLY!",
            "="*90,
            "Sequential processing completed.",
            "All results have been saved to the Results directory.",
        )
        for line in closing_lines:
            print(line)

except Exception as e:
    # Report the failure with a full traceback instead of stopping the kernel.
    print(f"Error in main execution: {e}")
    import traceback
    traceback.print_exc()

INITIALIZING MAXIMUM AGREEMENT GRID SEARCH
MAXIMUM AGREEMENT GRID SEARCH ANALYSIS
Phase 1.1: Data Preparation
--------------------------------------------------
Loading dataset from: c:\Users\moham\Coding-Projects\Apriori_VS_Word2Vec\Dataset\full_validated_dataset.xlsx
Raw dataset shape: (520609, 8)
Filtered dataset: 517587 rows (3022 removed)
Using subset: 6501 transactions (first third of data)
Pre-computing transaction encodings...
✓ Transaction encoding completed: (6501, 3235)
✓ Transactions: 6501
✓ Unique products: 4009
✓ Product categories: 19
✓ Average transaction size: 25.78
✓ Pre-computed encodings ready

Phase 2.2: Sequential Grid Search Implementation
--------------------------------------------------

Phase 1.2: Define Parameter Search Space
--------------------------------------------------
APRIORI PARAMETER SPACE:
   min_support: [0.01, 0.05, 0.1]
   min_confidence: [0.1, 0.3, 0.5]
   → 9 combinations

WORD2VEC PARAMETER SPACE:
   vector_size: [50, 100, 300]
   window: [3