In [1]:
import pandas as pd

# Load the dataset (replace 'your_data.csv' with the path to your uploaded file)
df = pd.read_excel('data_report2.xlsx')

# Assuming the dataframe `df` is as per the structure seen in the screenshot:
# Convert 'Y'/'N' columns to boolean True/False
boolean_columns = ['frisked', 'searched', 'pistol', 'pf_hcuff']
for col in boolean_columns:
    df[col] = df[col] == 'Y'

# Create a new dataframe where each row represents a transaction
# and each transaction contains a set of items
transactions = []

for index, row in df.iterrows():
    transaction = set()
    for col in boolean_columns:
        if row[col]:
            transaction.add(col)
    # Add other categorical attributes as items
    transaction.add(f"race_{row['race']}")
    transaction.add(f"sex_{row['sex']}")
    transaction.add(f"crimsusp_{row['crimsusp']}")
    transaction.add(f"city_{row['city']}")

    transactions.append(transaction)

# Now, `transactions` is a list of sets, where each set is a transaction

# Example: a CSV where each row is a transaction and items are comma-separated
with open('transactions.csv', 'w') as f:
    for transaction in transactions:
        f.write(','.join(transaction) + '\n')


In [3]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
import matplotlib.pyplot as plt
import seaborn as sns

# One-hot encode the transaction data
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
df = pd.DataFrame(te_ary, columns=te.columns_)

# Using the apriori algorithm to find frequent itemsets
frequent_itemsets = apriori(df, min_support=0.5, use_colnames=True)  # adjust the min_support as necessary

# Generating association rules
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.7)  # adjust the min_threshold as necessary

# Displaying tables of frequent itemsets and association rules
print(frequent_itemsets)
print(rules)

# Visualization of results:

# Frequent Itemsets Visualization
plt.figure(figsize=(12, 6))
sns.barplot(x=frequent_itemsets['support'], y=frequent_itemsets['itemsets'].astype(str))
plt.title('Frequent Itemsets')
plt.xlabel('Support')
plt.ylabel('Itemsets')
plt.show()

# Association Rules Visualization
import numpy as np

# Let's assume 'rules' is your DataFrame and it has a 'lift' column.
# We add small random noise to 'support' and 'confidence' to prevent overlapping.
support_jitter = rules['support'] + np.random.uniform(-0.0005, 0.0005, size=len(rules))
confidence_jitter = rules['confidence'] + np.random.uniform(-0.0005, 0.0005, size=len(rules))

plt.figure(figsize=(10, 6))
scatter = plt.scatter(support_jitter, confidence_jitter, alpha=0.5, c=rules['lift'], cmap='viridis')
plt.title('Association Rules - Support vs Confidence Colored by Lift')
plt.xlabel('Support')
plt.ylabel('Confidence')
plt.colorbar(scatter, label='Lift')
plt.show()


MemoryError: Unable to allocate 3.40 GiB for an array with shape (532911, 6844) and data type bool