# In [None]:
# -*- coding: utf-8 -*-
"""Recommendation System using Association Rules Evaluation
"""

#%pip uninstall mlxtend
#%pip install mlxtend==0.17.3

import mlxtend
print(mlxtend.__version__)

# Commented out IPython magic to ensure Python compatibility.
# %pip install apyori

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import mlxtend
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
import networkx as nx
import warnings
from statsmodels.formula.api import ols
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import scipy.stats as stats
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from itertools import combinations
from apyori import apriori
import warnings
from sklearn.preprocessing import MultiLabelBinarizer
from mlxtend.frequent_patterns import fpgrowth, association_rules
from sklearn.metrics import precision_score, recall_score, f1_score

# Directory that holds the extracted CSV files. Every input below is read from
# here, so pointing the script at another copy of the data means editing only
# this one constant.
DATA_DIR = r"C:\Users\User\Downloads\dunnhumby_The-Complete-Journey\dunnhumby_The-Complete-Journey\dunnhumby_The-Complete-Journey CSV"


def _load_csv(filename):
    """Read the CSV file named *filename* from DATA_DIR into a DataFrame."""
    # Joining with an explicit backslash yields exactly the Windows paths the
    # script used before the directory was factored out.
    return pd.read_csv(DATA_DIR + "\\" + filename)


transaction_df = _load_csv("transaction_data.csv")
couponred_df = _load_csv("coupon_redempt.csv")
coupon_df = _load_csv("coupon.csv")
campaign_df = _load_csv("campaign_table.csv")
campaigndesc_df = _load_csv("campaign_desc.csv")
causal_df = _load_csv("causal_data.csv")
product_df = _load_csv("product.csv")
demographic_df = _load_csv("hh_demographic.csv")

# Length of each campaign: END_DAY minus START_DAY.
campaigndesc_df['Duration'] = campaigndesc_df['END_DAY'] - campaigndesc_df['START_DAY']

# Attach the households of each campaign to the campaign description rows
# (left join keyed on CAMPAIGN).
campaign = campaigndesc_df[
    ['CAMPAIGN', 'DESCRIPTION', 'START_DAY', 'END_DAY', 'Duration']
].merge(
    campaign_df[['household_key', 'CAMPAIGN']],
    how="left",
    on="CAMPAIGN",
)

# Count the campaign rows per household, then keep only the first row for each
# (household_key, total_campaign) pair.
campaign['total_campaign'] = campaign.groupby('household_key')['CAMPAIGN'].transform('count')
campaign = campaign.drop_duplicates(subset=['household_key', 'total_campaign'], keep="first")

# Number of coupon redemption rows per household. The resulting Series is named
# 'CAMPAIGN', so the merge below produces CAMPAIGN_x / CAMPAIGN_y columns.
redemptions_per_household = couponred_df.groupby('household_key')['CAMPAIGN'].count()

campaignred = campaign.merge(redemptions_per_household, on=['household_key'], how="left")

# Transactions joined with household demographics; any row that contains a
# missing value after the join is discarded.
customertrans = transaction_df.merge(demographic_df, on='household_key', how='left')
customertrans = customertrans.dropna()

# Add product descriptors, then the per-household campaign summary.
transacts = customertrans.merge(
    product_df[['PRODUCT_ID', 'DEPARTMENT', 'COMMODITY_DESC']],
    on='PRODUCT_ID',
    how='left',
)
final_df = (
    transacts.merge(campaignred, on='household_key', how='left')
    .drop(columns='CAMPAIGN_y')
    .rename(columns={'CAMPAIGN_x': 'CAMPAIGN'})
)

# Drop irrelevant columns for market basket analysis
columns_to_drop = ['household_key', 'DAY', 'QUANTITY', 'STORE_ID', 'RETAIL_DISC', 'TRANS_TIME', 'WEEK_NO',
                   'classification_4', 'KID_CATEGORY_DESC', 'CAMPAIGN', 'START_DAY', 'END_DAY', 'Duration', 'total_campaign']
final_df = final_df.drop(columns=columns_to_drop)

# Handle missing values: rows lacking a product id or commodity description are
# removed; remaining gaps in the two descriptor columns become 'Unknown'.
final_df = final_df.dropna(subset=['PRODUCT_ID', 'COMMODITY_DESC'])
for descriptor_column in ('DEPARTMENT', 'COMMODITY_DESC'):
    final_df[descriptor_column] = final_df[descriptor_column].fillna('Unknown')

# Group transactions by 'BASKET_ID': one row per basket, with the list of
# commodity descriptions bought in that basket.
grouped = final_df.groupby('BASKET_ID')['COMMODITY_DESC'].apply(list)
transactions_df = grouped.reset_index()
transactions_df.columns = ['BASKET_ID', 'ITEMS']

# Use random sampling to keep a sample of at most SAMPLE_SIZE baskets.
# DataFrame.sample raises ValueError when n exceeds the number of rows, so the
# request is clamped to what is actually available.
SAMPLE_SIZE = 20000
transactions_df = transactions_df.sample(
    n=min(SAMPLE_SIZE, len(transactions_df)),
    random_state=42,
)

# Filter out less frequent items: keep only items that occur more than
# MIN_ITEM_OCCURRENCES times in the sampled baskets.
MIN_ITEM_OCCURRENCES = 10
item_counts = transactions_df['ITEMS'].explode().value_counts()
top_items = item_counts[item_counts > MIN_ITEM_OCCURRENCES].index
# Build the lookup set once instead of probing the Index for every item.
top_item_set = set(top_items)
transactions_df['ITEMS'] = transactions_df['ITEMS'].apply(
    lambda basket: [item for item in basket if item in top_item_set]
)

# Save the DataFrame to a pickle file
# NOTE(review): an existing 'transactions_df.pkl' is never overwritten, so the
# file on disk can be stale relative to the frame built above; confirm that
# this is intended.
import os
if not os.path.exists('transactions_df.pkl'):
    transactions_df.to_pickle('transactions_df.pkl')

# Process transactions to generate association rules
def process_transactions(transactions_df, min_support=0.005, min_confidence=0.1,
                         min_rule_support=0.01, min_lift=3, top_k=20):
    """
    Mine frequent itemsets with FP-Growth and derive association rules.

    Args:
        transactions_df (pd.DataFrame): DataFrame with an 'ITEMS' column; each
            value is an iterable of the item labels in one transaction.
        min_support (float): Minimum support threshold passed to ``fpgrowth``.
        min_confidence (float): Minimum confidence threshold passed to
            ``association_rules``.
        min_rule_support (float): A rule enters the "top rules" selection only
            if its support is strictly greater than this value.
        min_lift (float): A rule enters the "top rules" selection only if its
            lift is strictly greater than this value.
        top_k (int): Maximum number of rules in the "top rules" selection.

    Returns:
        tuple: ``(transactions_encoded, rules, top_rules)`` where
            ``transactions_encoded`` is the sparse one-hot encoded DataFrame
            (one column per item), ``rules`` holds every rule that met
            ``min_confidence``, and ``top_rules`` holds the ``top_k``
            highest-lift rules among those passing the ``min_rule_support``
            and ``min_lift`` filters. The frequent itemsets themselves are not
            returned. If no frequent itemset is found, a message is printed
            and ``(None, None, None)`` is returned.
    """
    # One-hot encode using MultiLabelBinarizer with sparse output
    mlb = MultiLabelBinarizer(sparse_output=True)
    transactions_encoded_sparse = mlb.fit_transform(transactions_df['ITEMS'])

    # Convert sparse matrix to DataFrame, one column per item label
    transactions_encoded = pd.DataFrame.sparse.from_spmatrix(
        transactions_encoded_sparse, columns=mlb.classes_
    )

    # Find frequent itemsets
    frequent_itemsets = fpgrowth(transactions_encoded, min_support=min_support, use_colnames=True)

    if frequent_itemsets.empty:
        print("No frequent itemsets found. Try lowering min_support.")
        return None, None, None

    # Generate association rules
    rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_confidence)

    # Keep the strongest rules only for the "top" selection; `rules` itself is
    # returned unfiltered.
    strong_rules = rules[(rules['support'] > min_rule_support) & (rules['lift'] > min_lift)]
    top_rules = strong_rules.nlargest(top_k, 'lift')

    return transactions_encoded, rules, top_rules

def recommend_products(purchased_items, rules, top_n=5):
    """
    Recommend products based on association rules.

    A rule applies when every item of its antecedent is among the purchased
    items. Applicable rules are ranked by lift, then confidence (both
    descending), and their consequents are collected in that order.

    Args:
        purchased_items (iterable): Items already in the basket.
        rules (pd.DataFrame): Association rules with 'antecedents',
            'consequents', 'lift' and 'confidence' columns; antecedents and
            consequents are collections of item labels.
        top_n (int): Maximum number of recommendations to return.

    Returns:
        list: Up to ``top_n`` distinct items, best-ranked first. Items that
            were already purchased are never recommended. Empty when no rule
            applies, when ``rules`` is ``None`` or empty, or when ``top_n`` is
            not positive.
    """
    if rules is None or rules.empty or top_n <= 0:
        return []

    # Built once; it is reused for every rule below.
    purchased = set(purchased_items)

    # The antecedent must be contained in the basket, not the other way round:
    # requiring the whole basket to sit inside a (typically tiny) antecedent
    # would make rules match almost never.
    applies = rules['antecedents'].apply(
        lambda antecedent: set(antecedent).issubset(purchased)
    )
    related_rules = rules[applies.astype(bool)]

    if related_rules.empty:
        return []

    ranked_rules = related_rules.sort_values(
        by=['lift', 'confidence'],
        ascending=[False, False]
    )

    # An ordered list (plus a "seen" set) keeps the ranking; collecting into a
    # plain set would lose it and make truncation to top_n arbitrary.
    recommended_items = []
    seen = set(purchased)
    for consequents in ranked_rules['consequents']:
        # Sort inside one rule so the output is deterministic.
        for item in sorted(consequents, key=str):
            if item in seen:
                continue
            seen.add(item)
            recommended_items.append(item)
            if len(recommended_items) >= top_n:
                return recommended_items

    return recommended_items

def evaluate_recommender(test_df, rules, top_n=5, holdout_fraction=0.5):
    """
    Evaluate the recommendation system with a per-basket hold-out split.

    Each test basket is de-duplicated (keeping first-seen order) and split in
    two: the leading part is shown to the recommender, the trailing part is
    hidden and serves as ground truth. Feeding the whole basket in and then
    subtracting the recommendations from it would make truth and prediction
    disjoint by construction, so nothing could ever count as a hit.

    Args:
        test_df (pd.DataFrame): DataFrame of testing transactions with an
            'ITEMS' column holding one list of items per basket.
        rules (pd.DataFrame): Association rules DataFrame.
        top_n (int): Number of recommendations to return per basket.
        holdout_fraction (float): Share of each basket's distinct items that
            is hidden, rounded down; must satisfy ``0 < holdout_fraction < 1``.

    Returns:
        tuple: ``(y_true, y_pred)``, two lists with one entry per row of
            ``test_df``. ``y_true[i]`` is the list of hidden items of basket
            ``i`` (empty when the basket is too small to hide anything) and
            ``y_pred[i]`` is the list recommended from the visible items.

    Raises:
        ValueError: If ``holdout_fraction`` is outside the open interval (0, 1).
    """
    if not 0 < holdout_fraction < 1:
        raise ValueError("holdout_fraction must be strictly between 0 and 1")

    y_true = []
    y_pred = []

    for basket in test_df['ITEMS']:
        # dict.fromkeys removes repeats while preserving order.
        distinct_items = list(dict.fromkeys(basket))

        hidden_count = int(len(distinct_items) * holdout_fraction)
        split_at = len(distinct_items) - hidden_count
        visible_items = distinct_items[:split_at]
        hidden_items = distinct_items[split_at:]

        # True labels are the held-out items; predictions come only from the
        # visible part of the basket.
        y_true.append(hidden_items)
        y_pred.append(recommend_products(visible_items, rules, top_n=top_n))

    return y_true, y_pred

# Split transactions into training and testing sets
train_df, test_df = train_test_split(transactions_df, test_size=0.2, random_state=42)

# Generate rules using only the training data
transactions_encoded, rules, _ = process_transactions(train_df, min_support=0.001)

if transactions_encoded is not None and rules is not None:
    # Get true and predicted labels (one list of items per test basket)
    y_true, y_pred = evaluate_recommender(test_df, rules)

    # Flattened views, kept in case code outside this section reads them.
    # They must NOT be passed to the sklearn metrics: the two lists differ in
    # length and their positions do not correspond to each other.
    y_true_flat = [item for sublist in y_true for item in sublist]
    y_pred_flat = [item for sublist in y_pred for item in sublist]

    # Turn both sides into aligned multilabel indicator matrices (one row per
    # basket, one column per item) so that the metrics compare like with like.
    label_binarizer = MultiLabelBinarizer()
    label_binarizer.fit(y_true + y_pred)

    if len(label_binarizer.classes_) == 0:
        # Neither truth nor predictions contain any item: nothing to score.
        precision = recall = f1 = 0.0
    else:
        y_true_matrix = label_binarizer.transform(y_true)
        y_pred_matrix = label_binarizer.transform(y_pred)

        # Calculate precision, recall, and F1-score (macro-averaged over items)
        precision = precision_score(y_true_matrix, y_pred_matrix, average='macro', zero_division=0)
        recall = recall_score(y_true_matrix, y_pred_matrix, average='macro', zero_division=0)
        f1 = f1_score(y_true_matrix, y_pred_matrix, average='macro', zero_division=0)

    print(f'Precision: {precision}')
    print(f'Recall: {recall}')
    print(f'F1 Score: {f1}')


# Notebook output: 0.17.3
