At this stage, we will extract association rules using the Apriori or FP-Growth algorithms. Let's look at the association rules and, if possible, create features based on the associations found or use them to improve the class balance.

In [28]:
import pandas as pd
from itertools import combinations
from collections import defaultdict
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import fpgrowth
from mlxtend.frequent_patterns import association_rules

In [2]:
# Load the training table from CSV. The path is absolute (presumably a Colab
# working directory — adjust it when running elsewhere).
training_data_df = pd.read_csv('/content/training_data.csv')

In [3]:
# Drop the 'Unnamed: 0' column read from the CSV. Re-assigning instead of
# using inplace=True, together with errors='ignore', keeps this cell safe to
# re-run: a second execution no longer raises KeyError for the missing column.
training_data_df = training_data_df.drop(columns='Unnamed: 0', errors='ignore')

In [5]:
# Preview the first rows of the training table.
training_data_df.head()

Unnamed: 0,user_id,verb_bought,verb_cancelled,verb_ordered,verb_paid,verb_returned,verb_sold,object_accessory,object_book,object_laptop,object_phone,object_product,object_service,mask_time,mask_amount,unix_time_norm,amount_norm
0,40578,True,False,False,False,False,False,False,True,False,False,False,False,1,1,0.99147,0.652465
1,94768,False,False,False,True,False,False,False,False,True,False,False,False,1,0,0.991085,0.0
2,65142,False,False,False,True,False,False,False,False,False,False,False,True,1,0,0.992044,0.0
3,10499,False,False,False,False,True,False,False,True,False,False,False,False,1,0,0.994159,0.0
4,89538,False,False,True,False,False,False,False,False,False,True,False,False,1,1,0.998656,0.894989


In [6]:
# Column names, non-null counts and dtypes of the training table.
training_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   user_id           20000 non-null  int64  
 1   verb_bought       20000 non-null  bool   
 2   verb_cancelled    20000 non-null  bool   
 3   verb_ordered      20000 non-null  bool   
 4   verb_paid         20000 non-null  bool   
 5   verb_returned     20000 non-null  bool   
 6   verb_sold         20000 non-null  bool   
 7   object_accessory  20000 non-null  bool   
 8   object_book       20000 non-null  bool   
 9   object_laptop     20000 non-null  bool   
 10  object_phone      20000 non-null  bool   
 11  object_product    20000 non-null  bool   
 12  object_service    20000 non-null  bool   
 13  mask_time         20000 non-null  int64  
 14  mask_amount       20000 non-null  int64  
 15  unix_time_norm    20000 non-null  float64
 16  amount_norm       20000 non-null  float64

In [7]:
# Load the raw transactions from CSV. The path is absolute (presumably a Colab
# working directory — adjust it when running elsewhere).
transactions_df = pd.read_csv('/content/transactions.csv')

In [22]:
# Preview the first rows of the raw transactions.
transactions_df.head()

Unnamed: 0,subject,verb,object,time,amount,mask
0,User 40578,bought,book,2023-01-11,1524,"[1, 1, 1, 1, 1]"
1,User 94768,paid,laptop,2022-11-23,-1,"[1, 1, 1, 1, 0]"
2,User 65142,paid,service,2023-03-25,-1,"[1, 1, 1, 1, 0]"
3,User 10499,returned,book,2023-12-19,-1,"[1, 1, 1, 1, 0]"
4,User 89538,ordered,phone,2025-07-13,3949,"[1, 1, 1, 1, 1]"


In [10]:
# Column names, non-null counts and dtypes of the raw transactions.
transactions_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   subject  20000 non-null  object
 1   verb     20000 non-null  object
 2   object   20000 non-null  object
 3   time     20000 non-null  object
 4   amount   20000 non-null  int64 
 5   mask     20000 non-null  object
dtypes: int64(1), object(5)
memory usage: 937.6+ KB


In [11]:
# preparing transactions for apriori
def prepare_transactions(df):
    """Turn a transactions frame into a list of item lists for itemset mining.

    Every row becomes a two-item transaction: the row's verb prefixed with
    ``verb_`` and the row's object prefixed with ``object_``.

    Args:
        df: DataFrame providing ``verb`` and ``object`` columns.

    Returns:
        list[list[str]]: one ``[verb_item, object_item]`` list per row, in
        row order. An empty frame yields an empty list.

    Raises:
        KeyError: if ``df`` lacks the ``verb`` or ``object`` column.
    """
    # Walk the two columns in parallel instead of using iterrows(), which
    # materialises a Series for every row and may change value dtypes.
    return [
        [f"verb_{verb}", f"object_{obj}"]
        for verb, obj in zip(df['verb'], df['object'])
    ]

In [12]:
# Build the [verb_item, object_item] transaction list consumed by apriori().
transactions = prepare_transactions(transactions_df)

In [13]:
# apriori implementation
def apriori(transactions, min_support=0.1):
    """Mine frequent itemsets level by level with the Apriori scheme.

    Args:
        transactions: iterable of transactions, each an iterable of hashable
            items. An item repeated inside one transaction counts once.
        min_support: minimum fraction of transactions (0..1) in which an
            itemset must occur to be kept.

    Returns:
        dict[int, dict[frozenset, int]]: for each itemset size ``k``, a
        mapping from itemset to its absolute occurrence count. Key ``1`` is
        always present (possibly empty); a larger ``k`` appears only when at
        least one itemset of that size is frequent.
    """
    # Deduplicate items once per transaction (keeping first-seen order) so a
    # repeated item cannot inflate 1-itemset counts, and so that every level
    # is counted against the same per-transaction sets.
    unique_items = [tuple(dict.fromkeys(transaction)) for transaction in transactions]
    transaction_sets = [frozenset(items) for items in unique_items]
    num_transactions = len(transaction_sets)

    # First pass: count individual items.
    item_counts = defaultdict(int)
    for items in unique_items:
        for item in items:
            item_counts[frozenset([item])] += 1

    k = 1
    frequent_itemsets = {
        k: {
            itemset: count
            for itemset, count in item_counts.items()
            if count / num_transactions >= min_support
        }
    }

    # Following passes: grow itemsets one item at a time until no candidate
    # can be built or none of the candidates is frequent.
    while True:
        k += 1
        candidate_itemsets = generate_candidates(frequent_itemsets[k - 1], k - 1)
        if not candidate_itemsets:
            break

        # Count in how many transactions each candidate is fully contained.
        candidate_counts = defaultdict(int)
        for transaction_set in transaction_sets:
            for candidate in candidate_itemsets:
                if candidate.issubset(transaction_set):
                    candidate_counts[candidate] += 1

        # Keep only the candidates reaching the minimum support.
        frequent_k_itemsets = {
            itemset: count
            for itemset, count in candidate_counts.items()
            if count / num_transactions >= min_support
        }
        if not frequent_k_itemsets:
            break

        frequent_itemsets[k] = frequent_k_itemsets

    return frequent_itemsets

In [14]:
def generate_candidates(prev_itemsets, k):
    """Build size-(k+1) candidate itemsets from the frequent size-k itemsets.

    Two frequent itemsets are joined when their union has exactly ``k + 1``
    items. A joined candidate is kept only if every one of its size-``k``
    subsets is itself in ``prev_itemsets`` (the Apriori prune step): a
    candidate with an infrequent subset cannot be frequent, so counting its
    support would be wasted work.

    Args:
        prev_itemsets: iterable of frequent frozensets of size ``k`` (a dict
            keyed by those frozensets works too).
        k: size of the itemsets in ``prev_itemsets``.

    Returns:
        set[frozenset]: the surviving candidates of size ``k + 1``; empty
        when no candidate can be formed.
    """
    frequent = set(prev_itemsets)
    candidates = set()
    # combinations() visits each unordered pair once, rather than both
    # orderings plus the useless self-pairs of a nested double loop.
    for itemset1, itemset2 in combinations(frequent, 2):
        union = itemset1 | itemset2
        if len(union) != k + 1:
            continue
        if all(frozenset(subset) in frequent for subset in combinations(union, k)):
            candidates.add(union)
    return candidates

In [15]:
# extracting association rules
def generate_rules(frequent_itemsets, min_confidence=0.01):
    """Derive association rules from the frequent itemsets.

    For every frequent itemset with at least two items, each non-empty
    proper subset is tried as the antecedent and the remaining items form
    the consequent. Confidence is the itemset count divided by the
    antecedent count.

    Args:
        frequent_itemsets: dict mapping itemset size -> {frozenset: count},
            in the shape returned by ``apriori``.
        min_confidence: minimum confidence (0..1) a rule needs to be kept.

    Returns:
        list[tuple]: ``(antecedent, consequent, confidence, support)`` tuples
        where ``antecedent`` and ``consequent`` are frozensets and ``support``
        is the absolute count of the full itemset.
    """
    rules = []
    for k, itemsets in frequent_itemsets.items():
        if k < 2:
            continue

        for itemset, support in itemsets.items():
            # Antecedents of every size from 1 up to len(itemset) - 1, so
            # that larger itemsets also yield multi-item antecedents.
            for size in range(1, len(itemset)):
                known_counts = frequent_itemsets.get(size, {})
                for items in combinations(itemset, size):
                    antecedent = frozenset(items)
                    # Direct lookup: the antecedent's count is stored at
                    # the level matching its own size.
                    antecedent_support = known_counts.get(antecedent, 0)
                    if antecedent_support == 0:
                        # Count unknown: confidence cannot be computed.
                        continue

                    confidence = support / antecedent_support
                    if confidence >= min_confidence:
                        consequent = itemset - antecedent
                        rules.append((antecedent, consequent, confidence, support))

    return rules

In [16]:
# Thresholds for the hand-written Apriori run.
# NOTE(review): each transaction from prepare_transactions() holds exactly one
# verb and one object, so a rule needs a single verb-object pair to appear in
# at least min_support of all rows. With 0.1 that looks too strict for this
# data — consider a much lower value.
min_support = 0.1
min_confidence = 0.01

# Mine the frequent itemsets, then derive rules from them.
frequent_itemsets = apriori(transactions, min_support)
rules = generate_rules(frequent_itemsets, min_confidence)

In [17]:
# Report the mined itemsets (grouped by size) and the derived rules, with
# absolute counts converted to fractions of all transactions.
total_transactions = len(transactions)

print('Frequent sets of items:')
for size, level in frequent_itemsets.items():
    print(f'Size {size}:')
    for members, count in level.items():
        print(f'  {set(members)}: support = {count/total_transactions:.2f}')

print('\nAssociation Rules:')
for rule in rules:
    lhs, rhs, rule_confidence, count = rule
    print(f'  {set(lhs)} => {set(rhs)} (trust: {rule_confidence:.2f}, support: {count/total_transactions:.2f})')

Frequent sets of items:
Size 1:
  {'verb_bought'}: support = 0.16
  {'object_book'}: support = 0.17
  {'verb_paid'}: support = 0.17
  {'object_laptop'}: support = 0.17
  {'object_service'}: support = 0.17
  {'verb_returned'}: support = 0.17
  {'verb_ordered'}: support = 0.17
  {'object_phone'}: support = 0.16
  {'object_product'}: support = 0.17
  {'verb_sold'}: support = 0.17
  {'verb_cancelled'}: support = 0.17
  {'object_accessory'}: support = 0.16

Association Rules:


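With min_support=0.1 only single items are frequent: every transaction contains exactly one verb and one object, so no verb–object pair reaches the threshold, and without multi-item sets no rules can be generated. Let's try FP-Growth with a much lower threshold (min_support=0.001) and with time and amount added as items.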

In [23]:
# Build one four-item transaction per row: verb, object, time and amount,
# each prefixed with its column name. The columns are walked in parallel
# instead of via iterrows(), which builds a Series for every row.
# NOTE(review): time and amount are used as raw values, so values such as
# amount -1 (seen where the row's mask ends in 0) also become items — they
# look like missing-value placeholders; confirm before interpreting them.
itemsets = [
    [f"verb_{verb}", f"object_{obj}", f"time_{when}", f"amount_{amount}"]
    for verb, obj, when, amount in zip(
        transactions_df['verb'],
        transactions_df['object'],
        transactions_df['time'],
        transactions_df['amount'],
    )
]

In [25]:
# One-hot encode the transactions: learn the item vocabulary first, then
# turn every transaction into a boolean row with one column per item.
te = TransactionEncoder()
te.fit(itemsets)
te_ary = te.transform(itemsets)
df = pd.DataFrame(data=te_ary, columns=te.columns_)

In [32]:
# Mine frequent itemsets with FP-Growth and order them from the most to the
# least frequent.
frequent_itemsets = (
    fpgrowth(df, min_support=0.001, use_colnames=True)
    .sort_values(by='support', ascending=False)
)

In [33]:
# Keep rules with confidence of at least 0.7, strongest first (ties broken
# by lift).
rules = (
    association_rules(frequent_itemsets, metric="confidence", min_threshold=0.7)
    .sort_values(by=['confidence', 'lift'], ascending=False)
)

In [34]:
# Show only the columns needed to read a rule and judge its strength.
print(rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']])

Empty DataFrame
Columns: [antecedents, consequents, support, confidence, lift]
Index: []


In [36]:
# The ten itemsets with the highest support.
top_by_support = frequent_itemsets.sort_values(by='support', ascending=False).head(10)
print(top_by_support)

# How many frequent itemsets exist for each itemset size.
itemset_sizes = frequent_itemsets['itemsets'].apply(len)
print(itemset_sizes.value_counts())

    support          itemsets
2   0.30060       (amount_-1)
12  0.19750      (time_[PAD])
0   0.16975     (object_book)
3   0.16875   (object_laptop)
11  0.16860       (verb_sold)
8   0.16750    (verb_ordered)
6   0.16740   (verb_returned)
10  0.16665  (object_product)
4   0.16630       (verb_paid)
5   0.16585  (object_service)
itemsets
3    84
2    61
4    36
1    22
Name: count, dtype: int64


In [37]:
# Manual check: share of rows in which two of the selected items occur
# together.
top_items = ['amount_-1', 'object_book', 'object_laptop', 'verb_sold']
cooccurrence_matrix = pd.DataFrame(index=top_items, columns=top_items)
n_rows = len(df)

# Co-occurrence is symmetric, so each unordered pair is computed once and
# written to both cells; the diagonal is never filled and stays NaN.
for first, second in combinations(top_items, 2):
    rate = int((df[first] & df[second]).sum()) / n_rows
    cooccurrence_matrix.loc[first, second] = rate
    cooccurrence_matrix.loc[second, first] = rate

print("Co-occurrence rates:")
print(cooccurrence_matrix)

Co-occurrence rates:
              amount_-1 object_book object_laptop verb_sold
amount_-1           NaN      0.0508       0.05085    0.0523
object_book      0.0508         NaN           0.0    0.0294
object_laptop   0.05085         0.0           NaN   0.02745
verb_sold        0.0523      0.0294       0.02745       NaN


Well, it looks like there is still not enough data to find reliable associations; we'll come back to this if the number of rows grows beyond 100,000.
We'll leave it at that for now.