In [None]:
from apyori import apriori
import pandas as pd
import numpy as np
from scipy import stats
import pyfpgrowth


In [None]:

# Read the cleaned breast-cancer table and, in the same chain, replace the
# numeric diagnosis codes with readable class names (1 -> 'malignant',
# 0 -> 'benign'). Codes outside this mapping become NaN via Series.map.
df = pd.read_csv("breast-cancer_cleaned.csv").assign(
    diagnosis=lambda frame: frame['diagnosis'].map({1: 'malignant', 0: 'benign'})
)

def get_range_labels(data, num_bins):
    """Build text labels of the form 'LOWtoHIGH' from quantile bin edges.

    Args:
        data: Numeric values handed to ``pd.qcut``.
        num_bins: Number of quantile bins requested from ``pd.qcut``.

    Returns:
        A list with one label per pair of consecutive edges, each edge
        formatted with two decimal places (e.g. ``'0.00to2.00'``).
    """
    # retbins=True makes qcut return (binned values, edges); only the edges matter here.
    _, edges = pd.qcut(data, num_bins, retbins=True)
    return [f'{low:.2f}to{high:.2f}' for low, high in zip(edges[:-1], edges[1:])]



# Shapiro-Wilk Test Results

In [None]:

# Run the Shapiro-Wilk normality test on every column except the last one
# and report the test statistic together with its p-value.
feature_columns = df.columns[:-1]
for column in feature_columns:
    shapiro_result = stats.shapiro(df[column])
    w_statistic, p = shapiro_result
    print(f"{column}: Statistic = {w_statistic:.4f}, p-value = {p:.4e}")
print("-" * 50)
# Author's reading of the output above: the data are not normally distributed.



# Compute the number of bins for each column

In [None]:

# Discretize every feature column (all but the last) into equal-width bins
# whose count is chosen with the Freedman-Diaconis rule.
#
# NOTE: this cell overwrites the numeric columns of `df` with categoricals.
# Columns that are not numeric (for example because the cell already ran)
# are skipped instead of raising.


def _format_bin_labels(edges, min_decimals=2, max_decimals=12):
    """Return one 'LOWtoHIGH' label per pair of consecutive ``edges``.

    Formatting starts at ``min_decimals`` decimal places and precision is
    increased only while two bins would otherwise get the same text, because
    category labels have to be distinct.
    """
    labels = []
    for decimals in range(min_decimals, max_decimals + 1):
        labels = [
            f'{low:.{decimals}f}to{high:.{decimals}f}'
            for low, high in zip(edges[:-1], edges[1:])
        ]
        if len(set(labels)) == len(labels):
            break
    return labels


bins_array = []  # (column, number of bins) for every discretized column
for column in df.columns[:-1]:
    col_data = df[column]
    if not pd.api.types.is_numeric_dtype(col_data):
        continue  # nothing to bin (non-numeric or already discretized)

    observed = col_data.dropna()
    if observed.empty:
        continue

    # Interquartile range of the observed values
    q1 = np.percentile(observed, 25)
    q3 = np.percentile(observed, 75)
    iqr = q3 - q1

    # Freedman-Diaconis: bin width = 2 * IQR / n^(1/3)
    value_range = observed.max() - observed.min()
    bin_width = 2 * iqr / len(observed) ** (1 / 3)
    if value_range == 0:
        # Constant column: a single bin holds every value.
        num_bins = 1
    elif bin_width > 0:
        num_bins = int(np.ceil(value_range / bin_width))
    else:
        # IQR is zero, so the rule would divide by zero; fall back to
        # Sturges' rule (log2(n) + 1 bins).
        num_bins = int(np.ceil(np.log2(len(observed)))) + 1
    num_bins = max(num_bins, 1)

    # Cut first and ask for the edges, so the labels are built from the very
    # same equal-width edges that were used to assign the values.
    binned, edges = pd.cut(col_data, bins=num_bins, retbins=True)
    # pd.cut pads the outer edges slightly so the extremes fall inside a bin;
    # clip for display so labels show the observed range.
    display_edges = np.clip(edges, observed.min(), observed.max())
    bin_labels = _format_bin_labels(display_edges)
    df[column] = binned.cat.rename_categories(bin_labels)

    bins_array.append((column, num_bins))
    print(f"Column: {column}, Bins: {num_bins}")
print("-" * 50)

In [None]:

# Turn every row of `df` into a transaction: a list of "column=value" items,
# leaving out cells whose value is missing.
#
# Rows are read positionally (itertuples) rather than through df[col][i],
# which is a label lookup: it is slow per cell and breaks as soon as the
# index is not exactly 0..n-1 (e.g. after filtering rows).
column_names = list(df.columns)
transactions = [
    [
        f"{col}={value}"
        for col, value in zip(column_names, row)
        if pd.notna(value)
    ]
    for row in df.itertuples(index=False, name=None)
]


In [None]:


# Mine association rules with the Apriori algorithm and print them as
# predicates, e.g. "radius(1.00to2.00) -> diagnosis(benign)".
MIN_SUPPORT = 0.05      # minimum fraction of transactions containing the itemset
MIN_CONFIDENCE = 0.6    # minimum confidence of a reported rule


def _as_predicates(items):
    """Render 'column=value' items as a comma-separated 'column(value)' list.

    Items are sorted so the printed rule is the same on every run (the rule
    sides are iterated as unordered collections). Only the first '=' separates
    the column name from the value, so a value that itself contains '=' is
    kept intact.
    """
    rendered = []
    for item in sorted(items):
        name, _, value = item.partition('=')
        rendered.append(f"{name}({value})")
    return ", ".join(rendered)


# Apply Apriori Algorithm
association_rules = apriori(transactions, min_support=MIN_SUPPORT, min_confidence=MIN_CONFIDENCE)
association_results = list(association_rules)

# Display Results in Predicate Format
print("Association Rules in Predicate Format:\n")
for rule in association_results:
    for ordered_stat in rule.ordered_statistics:
        # Same threshold that was passed to apriori above; kept as a guard.
        if ordered_stat.confidence < MIN_CONFIDENCE:
            continue
        antecedent = _as_predicates(ordered_stat.items_base)
        consequent = _as_predicates(ordered_stat.items_add)
        print(
            f"{antecedent} -> {consequent} (support: {rule.support:.2f}, confidence: {ordered_stat.confidence:.2f})")



In [None]:

# Mine the same transactions with FP-Growth, using the thresholds of the
# Apriori run: 5% support and 0.6 confidence.
fp_min_support = 0.05     # fraction of transactions
fp_min_confidence = 0.6

# The support threshold is passed as an absolute transaction count, so derive
# it from the number of transactions instead of hard-coding a value that only
# matches one dataset size. The small epsilon keeps float noise from pushing
# an exact product (e.g. 26.0) up to the next integer.
fp_support_count = max(1, int(np.ceil(fp_min_support * len(transactions) - 1e-9)))

FrequentPatterns = pyfpgrowth.find_frequent_patterns(
    transactions=transactions, support_threshold=fp_support_count
)

# Generate association rules with confidence threshold of 0.6
Rules = pyfpgrowth.generate_association_rules(
    patterns=FrequentPatterns, confidence_threshold=fp_min_confidence
)

print("Association Rules in Predicate Format:\n")
# Rules maps each antecedent to a (consequent, confidence) pair, so one
# consequent is printed per antecedent.
for antecedent, (consequent, confidence) in Rules.items():
    # Join the items of each side into a readable, comma-separated string
    antecedent_str = ", ".join(antecedent)
    consequent_str = ", ".join(consequent)

    print(f"({antecedent_str}) ⇒ ({consequent_str}) with confidence {confidence}")