In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict

In [None]:
# Input files are resolved relative to the notebook's working directory.
TRANSACTIONS_CSV = '400_transactions.csv'
PRODUCTS_CSV = '400_products.csv'

# Raw tables; later cells join them on PRODUCT_NUM.
transactions_df = pd.read_csv(TRANSACTIONS_CSV)
products_df = pd.read_csv(PRODUCTS_CSV)

In [None]:
# Left-join product attributes onto every transaction row, keyed on
# PRODUCT_NUM; rows whose product is absent from products_df are kept.
basket_data = pd.merge(transactions_df, products_df, how='left', on='PRODUCT_NUM')

# Size summary of the merged table.
print("Total number of transactions:", basket_data.shape[0])
print("Number of unique baskets:", basket_data['BASKET_NUM'].nunique())
print("Number of unique products:", basket_data['PRODUCT_NUM'].nunique())

# Row count per department.
print("\nProduct departments in our data:")
print(basket_data['DEPARTMENT'].value_counts())

In [None]:
def find_frequent_pairs(data, min_support=0.01):
    """
    Find pairs of departments that are frequently bought together.

    A basket contributes at most one count to a given pair, however many rows
    of either department it contains. Rows with a missing DEPARTMENT (for
    example, products without a match after a left join) are ignored instead
    of being treated as a department of their own.

    Args:
        data: DataFrame with 'BASKET_NUM' and 'DEPARTMENT' columns.
        min_support: Minimum support threshold (as a fraction of total baskets)

    Returns:
        Dictionary mapping (dept_a, dept_b) tuples to their support, i.e. the
        number of baskets containing both departments divided by the number
        of distinct baskets. Each tuple is in sorted order, so a pair appears
        under exactly one key. Only pairs with support >= min_support are kept.
    """
    # Local import: combinations is not part of the notebook's import cell.
    from itertools import combinations

    # Denominator counts every distinct basket, including baskets whose
    # departments are all missing.
    total_baskets = data['BASKET_NUM'].nunique()
    pairs = defaultdict(int)

    for _, basket_items in data.groupby('BASKET_NUM'):
        # dropna() is required: a NaN department cannot be ordered against
        # strings and would make sorted() raise TypeError.
        departments = sorted(basket_items['DEPARTMENT'].dropna().unique())

        # combinations() over a sorted sequence yields each unordered pair
        # once, already in sorted order.
        for pair in combinations(departments, 2):
            pairs[pair] += 1

    return {
        pair: count / total_baskets
        for pair, count in pairs.items()
        if count / total_baskets >= min_support
    }

# Pairs present in at least 1% of baskets (the function's default threshold).
frequent_pairs = find_frequent_pairs(basket_data, min_support=0.01)

print(frequent_pairs)

In [None]:
def create_cooccurrence_matrix(pairs):
    """
    Create a matrix showing how often departments are bought together.

    Args:
        pairs: Mapping of (dept_a, dept_b) tuples to a support value.

    Returns:
        Square, symmetric float DataFrame indexed and labelled by the sorted
        department names found in `pairs`. Cell [a, b] holds the support of
        the pair; cells with no entry in `pairs` (including the diagonal)
        are 0.0.
    """
    departments = sorted({dept for pair in pairs for dept in pair})

    # Initialise with a float zero: the supports written below are floats, and
    # assigning them into an integer-typed frame triggers pandas'
    # incompatible-dtype upcast warning (an error in newer releases).
    matrix = pd.DataFrame(0.0, index=departments, columns=departments)

    # Write both orientations so the matrix is symmetric.
    for (dept1, dept2), support in pairs.items():
        matrix.loc[dept1, dept2] = support
        matrix.loc[dept2, dept1] = support

    return matrix

cooccurrence_matrix = create_cooccurrence_matrix(frequent_pairs)

# Annotated heatmap of pairwise support, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    cooccurrence_matrix,
    ax=ax,
    cmap='YlOrRd',
    annot=True,
    fmt='.2f'
)
ax.set_title('Department Co-occurrence Matrix')
fig.tight_layout()
plt.show()

In [None]:
# One row per basket, one column per department; each cell is the number of
# merged rows that basket has in that department.
basket_matrix = pd.crosstab(
    index=basket_data['BASKET_NUM'],
    columns=basket_data['DEPARTMENT'],
)

def train_department_models(matrix, test_size=0.2, n_estimators=100,
                            max_depth=10, random_state=42):
    """
    Train a Random Forest model for each department to predict its presence
    based on other departments in the basket.

    The target is binarised (value > 0) so every model is a presence/absence
    classifier even when `matrix` holds per-basket counts rather than 0/1
    flags. Without this, the classifier would treat each distinct count as
    its own class. Feature columns are passed through unchanged.

    Args:
        matrix: DataFrame with one row per basket and one numeric column per
            department.
        test_size: Fraction of rows held out for the printed accuracy.
        n_estimators: Number of trees in each forest.
        max_depth: Maximum depth of each tree.
        random_state: Seed used for both the train/test split and the forest.

    Returns:
        Tuple (models, importances), both dicts keyed by department:
        models[dept] is the fitted RandomForestClassifier, and
        importances[dept] is a Series of feature importances indexed by the
        other departments, sorted in descending order.

    Side effects:
        Prints the held-out accuracy and the three most important feature
        departments for every model.
    """
    departments = matrix.columns
    models = {}
    importances = {}

    for target_dept in departments:
        X = matrix.drop(columns=[target_dept])
        # Presence flag, not the raw count.
        y = (matrix[target_dept] > 0).astype(int)

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )

        rf = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            random_state=random_state
        )
        rf.fit(X_train, y_train)

        models[target_dept] = rf
        # Importances are unsigned: they rank how useful a department is for
        # the prediction, not whether it raises or lowers the likelihood.
        importances[target_dept] = pd.Series(
            rf.feature_importances_,
            index=X.columns
        ).sort_values(ascending=False)

        print(f"\nModel for {target_dept}:")
        print(f"Accuracy: {rf.score(X_test, y_test):.3f}")
        print("Top 3 complementary departments:")
        print(importances[target_dept].head(3))

    return models, importances

department_models, department_importances = train_department_models(basket_matrix)

# Bar charts of the five highest-importance features for the first five
# departments, placed on a 2x3 grid (the sixth slot is left empty).
plt.figure(figsize=(15, 10))
for slot, dept in enumerate(list(department_importances)[:5], start=1):
    plt.subplot(2, 3, slot)
    department_importances[dept].head(5).plot.bar()
    plt.title(f'Top predictors for {dept}')
    plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
def get_cross_selling_recommendations(department, importance_threshold=0.01,
                                      importances=None, cooccurrence=None):
    """
    Get recommended departments for cross-selling with a target department.

    Args:
        department: Target department name.
        importance_threshold: Only departments whose feature importance for
            the target is strictly greater than this value are returned.
        importances: Mapping of department -> Series of feature importances
            indexed by the other departments. Defaults to the notebook-level
            `department_importances`.
        cooccurrence: Square DataFrame of pairwise support values. Defaults
            to the notebook-level `cooccurrence_matrix`.

    Returns:
        List of dicts with keys 'department', 'importance' and 'support',
        sorted by importance in descending order. Returns an empty list when
        `department` has no entry in `importances`.
    """
    if importances is None:
        importances = department_importances
    if cooccurrence is None:
        cooccurrence = cooccurrence_matrix

    if department not in importances:
        return []

    scores = importances[department]
    important_departments = scores[scores > importance_threshold]

    # The co-occurrence matrix may not contain every department that has an
    # importance score, so look the support up defensively.
    target_known = department in cooccurrence.index

    recommendations = []
    for dept, importance in important_departments.items():
        if target_known and dept in cooccurrence.columns:
            support = float(cooccurrence.loc[department, dept])
        else:
            # A missing label is reported as zero support, matching the value
            # the matrix uses for pairs it has no entry for.
            support = 0.0
        recommendations.append({
            'department': dept,
            'importance': float(importance),
            'support': support,
        })

    return sorted(recommendations, key=lambda x: x['importance'], reverse=True)

# Department to generate cross-selling suggestions for.
target_department = 'GROCERY'
recommendations = get_cross_selling_recommendations(department=target_department)

# Plain-text report: one stanza per recommended department, separated by a
# blank line.
print(f"\nCross-selling recommendations for {target_department}:")
for rec in recommendations:
    print(f"Department: {rec['department']}")
    print(f"Importance Score: {rec['importance']:.3f}")
    print(f"Support (Co-occurrence): {rec['support']:.3f}")
    print()