# Data mining Project

In [79]:
# Standard Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
import time

# Third-Party Libraries
from mlxtend.frequent_patterns import fpgrowth, association_rules
from mlxtend.preprocessing import TransactionEncoder
from memory_profiler import memory_usage
from pymining import itemmining, assocrules


# Converting .txt to .csv

In [80]:
import pandas as pd

# Read the data from the file
file_path = 'data.txt'
df = pd.read_csv(file_path, delimiter='\t', header=None)

# Assign column names to the DataFrame
columns = [
    "ID", "Gender", "Age_Group", "Residence", "Education_Level", "Source_of_Income",
    "Marital_Status", "Smoked_Cigarettes", "Year_Diagnosed", "Surgical_Treatment",
    "Chemotherapy", "Radiotherapy", "Immunotherapy", "Molecular_targeted_Therapy",
    "Hospitalization_Number", "Time_to_Treatment", "Medical_Treatment_Need",
    "Emotional_Impact", "Travel_Impact", "Quality_of_Life", "Symptoms_exp_cough",
    "Symptoms_exp_Hoarseness", "Symptoms_exp_Blood_cough", "Symptoms_exp_chestpain",
    "Symptoms_exp_Shortness_of_breath", "Symptoms_exp_weakness", "Symptoms_exp_None",
    "Symptom_Frequency", "Symptom_Household_Impact", "Sleep_Issues", "Support_From_Close",
    "Dependency_Fear", "Health_Satisfaction", "Daily_Life_Impact_physical", "Daily_Life_Impact_Psychological",
    "Daily_Life_Impact_proffesional", "Daily_Life_Impact_family_life", "Daily_Life_Impact_social_life",
    "Daily_Life_Impact_no_effect", "Energy_Level", "Self_Care", "Daily_Activities_Difficulty",
    "Work_Readiness", "Support_Satisfaction", "Coping_Strategy", "Negative_Emotions"
]
df.columns = columns

# List of single-choice categorical variables to be one-hot encoded
single_choice_vars = [
    'Gender', 'Age_Group', 'Residence', 'Education_Level', 'Source_of_Income', 'Marital_Status',
    'Hospitalization_Number', 'Time_to_Treatment', 'Medical_Treatment_Need', 'Quality_of_Life',
    'Symptom_Frequency', 'Symptom_Household_Impact', 'Sleep_Issues', 'Dependency_Fear',
    'Energy_Level', 'Self_Care', 'Daily_Activities_Difficulty', 'Work_Readiness',
    'Support_Satisfaction', 'Coping_Strategy', 'Negative_Emotions'
]

# Mapping of variables to their options for meaningful labels
options_dict = {
    'Gender': {1: 'Female', 2: 'Male', 3: 'Nonbinary', 4: 'Prefer_not_to_say'},
    'Age_Group': {1: 'Below_30', 2: '30_45', 3: '46_60', 4: 'Above_60'},
    'Residence': {1: 'Village', 2: 'Town_upto_100k', 3: 'Town_100k_500k', 4: 'City_over_500k'},
    'Education_Level': {1: 'Primary', 2: 'Vocational', 3: 'Secondary', 4: 'Higher'},
    'Source_of_Income': {1: 'Employment', 2: 'Pension', 3: 'Retirement', 4: 'Other'},
    'Marital_Status': {1: 'Single', 2: 'Married', 3: 'Divorced', 4: 'Widowed'},
    'Hospitalization_Number': {1: 'Zero', 2: 'One_to_three', 3: 'Three_to_five', 4: 'Above_five'},
    'Time_to_Treatment': {1: 'Up_to_one_month', 2: 'One_to_three_months', 3: 'More_than_three_months'},
    'Medical_Treatment_Need': {1: 'Not_at_all', 2: 'Moderately', 3: 'Large_extent', 4: 'Very_large_extent'},
    'Quality_of_Life': {1: 'Very_good', 2: 'Good', 3: 'Bad', 4: 'Very_bad'},
    'Symptom_Frequency': {1: 'Did_not_occur', 2: 'Rather_rarely', 3: 'Most_of_time', 4: 'All_the_time'},
    'Symptom_Household_Impact': {1: 'Do_not_affect', 2: 'Rarely_affect', 3: 'Often_affect', 4: 'Always_affect'},
    'Sleep_Issues': {1: 'Not_once', 2: 'Rather_rarely', 3: 'Most_of_time', 4: 'All_the_time'},
    'Dependency_Fear': {1: 'Not_afraid', 2: 'Minimally', 3: 'Large_extent', 4: 'Very_large_extent'},
    'Energy_Level': {1: 'Fully', 2: 'Mostly', 3: 'Very_little', 4: 'Not_at_all'},
    'Self_Care': {1: 'No_problems', 2: 'Minor_problems', 3: 'Serious_problems', 4: 'Cannot_perform'},
    'Daily_Activities_Difficulty': {1: 'No_problems', 2: 'Minor_problems', 3: 'Moderate_problems', 4: 'Serious_problems', 5: 'Unable_to_perform'},
    'Work_Readiness': {1: 'Satisfied', 2: 'Moderately_satisfied', 3: 'Dissatisfied'},
    'Support_Satisfaction': {1: 'Satisfied', 2: 'Moderately_satisfied', 3: 'Dissatisfied'},
    'Coping_Strategy': {1: 'Well', 2: 'Hard_to_say', 3: 'Badly'},
    'Negative_Emotions': {1: 'Never', 2: 'Rarely', 3: 'Often', 4: 'All_the_time'}
}

# Map numerical codes to meaningful labels
for var in single_choice_vars:
    if var in options_dict:
        df[var] = df[var].map(options_dict[var])

# One-hot encode the single-choice categorical variables
df = pd.get_dummies(df, columns=single_choice_vars)

# Save the processed data to a CSV file
df.to_csv('processed_survey_data_columns.csv', index=False)

# Display the first few rows of the processed DataFrame
print(df.head())


   ID  Smoked_Cigarettes  Year_Diagnosed  Surgical_Treatment  Chemotherapy  \
0   1                  1            2021                   0             1   
1   2                  1            2020                   1             1   
2   3                  1            2020                   1             0   
3   4                  1            2021                   1             1   
4   5                  1            2019                   1             1   

   Radiotherapy  Immunotherapy  Molecular_targeted_Therapy  Emotional_Impact  \
0             0              1                           0                 1   
1             0              0                           0                 1   
2             0              0                           0                 1   
3             0              0                           0                 1   
4             0              0                           0                 1   

   Travel_Impact  ...  Support_Satisfaction_Dissat

# Data cleaning

In [81]:
df.shape

(300, 103)

In [82]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Columns: 103 entries, ID to Negative_Emotions_Rarely
dtypes: bool(78), int64(25)
memory usage: 81.6 KB


In [83]:
# 1. Remove duplicate rows
df = df.drop_duplicates()
print(f"\nData after removing duplicates: {df.shape}")

# 2. Handle missing values
numeric_columns = df.select_dtypes(include=['number']).columns
df[numeric_columns] = df[numeric_columns].fillna(df[numeric_columns].mean())

# Fill missing categorical values with mode
categorical_columns = df.select_dtypes(include=['object']).columns
df[categorical_columns] = df[categorical_columns].apply(lambda x: x.fillna(x.mode()[0] if not x.mode().empty else 'Unknown'))
print(f"\nData after handling missing values:\n{df.isnull().sum()}")

# 3. Drop irrelevant or redundant columns  
if 'ID' in df.columns:
    df = df.drop(columns=['ID'])

# 4. Standardize column names  
df.columns = [col.strip().replace(' ', '_').lower() for col in df.columns]


Data after removing duplicates: (300, 103)

Data after handling missing values:
ID                                0
Smoked_Cigarettes                 0
Year_Diagnosed                    0
Surgical_Treatment                0
Chemotherapy                      0
                                 ..
Coping_Strategy_Well              0
Negative_Emotions_All_the_time    0
Negative_Emotions_Never           0
Negative_Emotions_Often           0
Negative_Emotions_Rarely          0
Length: 103, dtype: int64


**All columns are integer value, numerical columns. But we have categorical columns in them like Gender, Marital Status, Smoked cigaretes, Chemoheraphy etc. ( some are binary, some are multiclass)**

In [84]:
import pandas as pd

 
df = pd.read_csv('processed_survey_data_columns.csv')

 
columns_to_exclude = ['ID', 'Year_Diagnosed']
df_for_itemsets = df.drop(columns=columns_to_exclude)

# Convert all non-zero values to 1 (binary)
df_for_itemsets = df_for_itemsets.applymap(lambda x: 1 if x != 0 else 0)

 
transactions = []
 
for index, row in df_for_itemsets.iterrows():
    # Extract the column names where the value is 1
    itemset = row[row == 1].index.tolist()
    transactions.append(itemset)

# Display the first 5 transactions
for i, transaction in enumerate(transactions[:5]):
    print(f"Transaction {i+1}: {transaction}")


Transaction 1: ['Smoked_Cigarettes', 'Chemotherapy', 'Immunotherapy', 'Emotional_Impact', 'Travel_Impact', 'Symptoms_exp_cough', 'Symptoms_exp_Shortness_of_breath', 'Support_From_Close', 'Daily_Life_Impact_Psychological', 'Gender_Female', 'Age_Group_Below_30', 'Residence_Town_upto_100k', 'Education_Level_Higher', 'Source_of_Income_Employment', 'Marital_Status_Single', 'Hospitalization_Number_Three_to_five', 'Time_to_Treatment_One_to_three_months', 'Medical_Treatment_Need_Large_extent', 'Quality_of_Life_Good', 'Symptom_Frequency_Most_of_time', 'Symptom_Household_Impact_Often_affect', 'Sleep_Issues_Most_of_time', 'Dependency_Fear_Large_extent', 'Energy_Level_Mostly', 'Self_Care_No_problems', 'Daily_Activities_Difficulty_No_problems', 'Work_Readiness_Moderately_satisfied', 'Support_Satisfaction_Moderately_satisfied', 'Coping_Strategy_Badly', 'Negative_Emotions_Often']
Transaction 2: ['Smoked_Cigarettes', 'Surgical_Treatment', 'Chemotherapy', 'Emotional_Impact', 'Travel_Impact', 'Symptoms_

  df_for_itemsets = df_for_itemsets.applymap(lambda x: 1 if x != 0 else 0)


In [85]:
import mlxtend
print("mlxtend version:", mlxtend.__version__)


mlxtend version: 0.23.3


In [86]:
# Load the data
df = pd.read_csv('processed_survey_data_columns.csv')

# List of columns to exclude (not relevant for itemsets)
columns_to_exclude = ['ID', 'Year_Diagnosed']

# Drop the columns to exclude
df_for_itemsets = df.drop(columns=columns_to_exclude)

# Ensure all values are binary (1 or 0)
df_for_itemsets = df_for_itemsets.apply(lambda x: x.apply(lambda y: 1 if y != 0 else 0))

# Prepare transactions
transactions = []
for index, row in df_for_itemsets.iterrows():
    # Get the features where the value is 1
    itemset = row[row == 1].index.tolist()
    transactions.append(itemset)


def run_fp_growth():
    
    te = TransactionEncoder()
    te_ary = te.fit(transactions).transform(transactions)
    df_fpgrowth = pd.DataFrame(te_ary, columns=te.columns_)
    return fpgrowth(df_fpgrowth, min_support= 0.2, use_colnames=True)

 
memory_usage_fp = memory_usage(run_fp_growth, interval=0.1)   

peak_memory_fp = max(memory_usage_fp)

print(f"Peak Memory Usage for FP-Growth: {peak_memory_fp:.2f} MiB")

start_time = time.time()
fp_frequent_itemsets = run_fp_growth()
end_time = time.time()

fp_growth_execution_time = end_time - start_time

print(f"Execution Time for FP-Growth: {fp_growth_execution_time:.2f} seconds")
print("Frequent Itemsets:")
print(fp_frequent_itemsets.sort_values(by='support', ascending=False))

Peak Memory Usage for FP-Growth: 473.09 MiB
Execution Time for FP-Growth: 4.69 seconds
Frequent Itemsets:
       support                                           itemsets
0     0.903333                               (Support_From_Close)
1     0.863333                                 (Emotional_Impact)
65    0.810000             (Support_From_Close, Emotional_Impact)
33    0.776667                   (Support_Satisfaction_Satisfied)
6811  0.776667  (Support_From_Close, Support_Satisfaction_Sati...
...        ...                                                ...
2990  0.200000  (Symptom_Frequency_Most_of_time, Travel_Impact...
2992  0.200000  (Emotional_Impact, Health_Satisfaction, Sympto...
2993  0.200000  (Emotional_Impact, Symptom_Frequency_Most_of_t...
2994  0.200000  (Health_Satisfaction, Symptom_Frequency_Most_o...
9719  0.200000  (Self_Care_No_problems, Symptom_Frequency_Did_...

[9720 rows x 2 columns]


## Lowest ST, highest number of items 

In [87]:
print("Frequent Itemsets:") 

for i in fp_frequent_itemsets[-1:]["itemsets"]:
    print(i)

Frequent Itemsets:
frozenset({'Self_Care_No_problems', 'Symptom_Frequency_Did_not_occur', 'Symptom_Household_Impact_Do_not_affect', 'Coping_Strategy_Well', 'Symptoms_exp_None'})


# Relim

In [88]:
#!pip install pymining
from pymining import itemmining

In [89]:
 
def run_relim():
    trans_sets = [set(transaction) for transaction in transactions]
    relim_input = itemmining.get_relim_input(trans_sets)
    total_transactions = len(transactions)
    min_support_ratio = 0.2
    min_support = max(1, int(min_support_ratio * total_transactions))
    start_time = time.time()
    # Relim
    relim_itemsets = itemmining.relim(relim_input, min_support=min_support)
    # End the timer
    end_time = time.time()
    # Calculate execution time
    relim_execution_time = end_time - start_time
    print("Frequent Itemsets:")
    for itemset, support in relim_itemsets.items():
        itemset_str = ', '.join(itemset)
        print(f"Itemset: {{{itemset_str}}}, Support: {support}")
    return relim_itemsets, relim_execution_time

# Measure memory usage
mem_usage, (relim_itemsets, relim_execution_time) = memory_usage(run_relim, retval=True)

# Get peak memory usage
peak_mem_relim = max(mem_usage)
print(f"Peak memory usage: {peak_mem_relim} MiB")


Frequent Itemsets:
Itemset: {Self_Care_Minor_problems}, Support: 60
Itemset: {Self_Care_Minor_problems, Emotional_Impact}, Support: 60
Itemset: {Symptom_Frequency_Did_not_occur}, Support: 61
Itemset: {Symptoms_exp_None, Symptom_Frequency_Did_not_occur}, Support: 60
Itemset: {Symptoms_exp_None, Symptom_Frequency_Did_not_occur, Symptom_Household_Impact_Do_not_affect}, Support: 60
Itemset: {Symptoms_exp_None, Symptom_Frequency_Did_not_occur, Symptom_Household_Impact_Do_not_affect, Coping_Strategy_Well}, Support: 60
Itemset: {Symptom_Frequency_Did_not_occur, Self_Care_No_problems, Symptom_Household_Impact_Do_not_affect, Coping_Strategy_Well, Symptoms_exp_None}, Support: 60
Itemset: {Symptoms_exp_None, Symptom_Frequency_Did_not_occur, Self_Care_No_problems, Symptom_Household_Impact_Do_not_affect}, Support: 60
Itemset: {Symptoms_exp_None, Symptom_Frequency_Did_not_occur, Coping_Strategy_Well}, Support: 60
Itemset: {Symptoms_exp_None, Symptom_Frequency_Did_not_occur, Self_Care_No_problems, Co

# Comparing the algorithms

In [90]:
fp_frequent_itemsets # itermsets genenrated by fpgrowth
relim_itemsets # itemsets generated by relim

{frozenset({'Self_Care_Minor_problems'}): 60,
 frozenset({'Emotional_Impact', 'Self_Care_Minor_problems'}): 60,
 frozenset({'Symptom_Frequency_Did_not_occur'}): 61,
 frozenset({'Symptom_Frequency_Did_not_occur', 'Symptoms_exp_None'}): 60,
 frozenset({'Symptom_Frequency_Did_not_occur',
            'Symptom_Household_Impact_Do_not_affect',
            'Symptoms_exp_None'}): 60,
 frozenset({'Coping_Strategy_Well',
            'Symptom_Frequency_Did_not_occur',
            'Symptom_Household_Impact_Do_not_affect',
            'Symptoms_exp_None'}): 60,
 frozenset({'Coping_Strategy_Well',
            'Self_Care_No_problems',
            'Symptom_Frequency_Did_not_occur',
            'Symptom_Household_Impact_Do_not_affect',
            'Symptoms_exp_None'}): 60,
 frozenset({'Self_Care_No_problems',
            'Symptom_Frequency_Did_not_occur',
            'Symptom_Household_Impact_Do_not_affect',
            'Symptoms_exp_None'}): 60,
 frozenset({'Coping_Strategy_Well',
            'Sympto

### Time

In [91]:
print("execution time for fpgrowth",fp_growth_execution_time)
print("execution time for relim", relim_execution_time)

execution time for fpgrowth 4.69186544418335
execution time for relim 0.25864362716674805


### Memory usage

In [92]:
print("peak memory usage for fpgrowth", peak_memory_fp)
print("peak memory usage for relim", peak_mem_relim)   

peak memory usage for fpgrowth 473.0859375
peak memory usage for relim 473.203125


### Frequent itemset generation

In [93]:
num_itemsets_fp_growth = len(fp_frequent_itemsets)
num_itemsets_relim = len(relim_itemsets)

print("Number of itemsets generated by FP-Growth:", num_itemsets_fp_growth)
print("Number of itemsets generated by Relim:", num_itemsets_relim)

Number of itemsets generated by FP-Growth: 9720
Number of itemsets generated by Relim: 9720


###  Itemset Sizes (Lengths) Comparison

In [94]:
from collections import Counter

# Extract lengths of itemsets from FP-Growth results
fp_growth_lengths = Counter([len(itemset) for itemset in fp_frequent_itemsets['itemsets']])

# Extract lengths of itemsets from RElim results
relim_lengths = Counter([len(itemset) for itemset in relim_itemsets.keys()])

# Display the counts of itemsets by length
print("Lengths of frequent itemsets generated by FP-Growth:", fp_growth_lengths)
print("Lengths of frequent itemsets generated by RElim:", relim_lengths)


Lengths of frequent itemsets generated by FP-Growth: Counter({4: 2732, 5: 2537, 3: 1696, 6: 1481, 7: 550, 2: 532, 8: 117, 1: 65, 9: 10})
Lengths of frequent itemsets generated by RElim: Counter({4: 2732, 5: 2537, 3: 1696, 6: 1481, 7: 550, 2: 532, 8: 117, 1: 65, 9: 10})


### Assosiation Rule metrics

In [95]:
def run_fp_growth():
    te = TransactionEncoder()
    te_ary = te.fit(transactions).transform(transactions)
    df_fpgrowth = pd.DataFrame(te_ary, columns=te.columns_)
    frequent_itemsets = fpgrowth(df_fpgrowth, min_support=0.2, use_colnames=True)
    num_itemsets = len(frequent_itemsets)  # Number of frequent itemsets generated
    return df_fpgrowth, frequent_itemsets, num_itemsets

df_fpgrowth, fp_frequent_itemsets, num_itemsets_fp = run_fp_growth()

print(f"Number of frequent itemsets generated by FP-Growth: {num_itemsets_fp}")

# Generate association rules from FP-Growth frequent itemsets
fp_rules = mlxtend_association_rules(fp_frequent_itemsets,num_itemsets= 9720, metric="confidence", min_threshold=0.6)

# Calculate additional metrics for FP-Growth rules
def compute_additional_metrics(rule):
    antecedent_support = rule['antecedent support']
    consequent_support = rule['consequent support']
    confidence = rule['confidence']
    support = rule['support']

    # Kulczynski measure
    # The reversed confidence is the confidence of the rule with antecedent and consequent swapped
    confidence_reversed = support / consequent_support if consequent_support > 0 else 0
    kulczynski = 0.5 * (confidence + confidence_reversed)

    # Cosine similarity
    cosine_similarity = support / np.sqrt(antecedent_support * consequent_support) if (antecedent_support * consequent_support) > 0 else 0

    return pd.Series([kulczynski, cosine_similarity])

fp_rules[['kulczynski', 'cosine_similarity']] = fp_rules.apply(compute_additional_metrics, axis=1)

print("FP-Growth Association Rules with Metrics:")
print(fp_rules[['antecedents', 'consequents', 'support', 'confidence', 'lift', 'kulczynski', 'cosine_similarity']])

# Relim Model
def run_relim():
    trans_sets = [set(transaction) for transaction in transactions]
    relim_input = itemmining.get_relim_input(trans_sets)
    total_transactions = len(transactions)
    min_support_ratio = 0.2
    min_support = max(1, int(min_support_ratio * total_transactions))
    relim_itemsets = itemmining.relim(relim_input, min_support=min_support)
    num_itemsets = len(relim_itemsets)  # Number of frequent itemsets generated
    return relim_itemsets, num_itemsets

relim_itemsets, num_itemsets_relim = run_relim()

print(f"\nNumber of frequent itemsets generated by Relim: {num_itemsets_relim}")

# Convert relim_itemsets to DataFrame
relim_frequent_itemsets = pd.DataFrame([
    {'itemsets': frozenset(itemset), 'support': support / len(transactions)}
    for itemset, support in relim_itemsets.items()
])

# Generate association rules from Relim frequent itemsets
if not relim_frequent_itemsets.empty:
    relim_rules = mlxtend_association_rules(relim_frequent_itemsets, num_itemsets=9720, metric="confidence", min_threshold=0.6)

    # Calculate additional metrics for Relim rules
    relim_rules[['kulczynski', 'cosine_similarity']] = relim_rules.apply(compute_additional_metrics, axis=1)

    print("\nRelim Association Rules with Metrics:")
    print(relim_rules[['antecedents', 'consequents', 'support', 'confidence', 'lift', 'kulczynski', 'cosine_similarity']])
else:
    print("\nNo association rules generated by Relim with the given parameters.")


Number of frequent itemsets generated by FP-Growth: 9720
FP-Growth Association Rules with Metrics:
                                      antecedents  \
0                            (Support_From_Close)   
1                              (Emotional_Impact)   
2                                 (Travel_Impact)   
3                              (Emotional_Impact)   
4                            (Support_From_Close)   
...                                           ...   
163918  (Symptoms_exp_None, Coping_Strategy_Well)   
163919          (Symptom_Frequency_Did_not_occur)   
163920   (Symptom_Household_Impact_Do_not_affect)   
163921                     (Coping_Strategy_Well)   
163922                        (Symptoms_exp_None)   

                                              consequents   support  \
0                                      (Emotional_Impact)  0.810000   
1                                    (Support_From_Close)  0.810000   
2                                      (Emotional_I

In [96]:
print(fp_rules.head())

            antecedents           consequents  antecedent support  \
0  (Support_From_Close)    (Emotional_Impact)            0.903333   
1    (Emotional_Impact)  (Support_From_Close)            0.863333   
2       (Travel_Impact)    (Emotional_Impact)            0.680000   
3    (Emotional_Impact)       (Travel_Impact)            0.863333   
4  (Support_From_Close)       (Travel_Impact)            0.903333   

   consequent support   support  confidence      lift  representativity  \
0            0.863333  0.810000    0.896679  1.038624               1.0   
1            0.903333  0.810000    0.938224  1.038624               1.0   
2            0.863333  0.676667    0.995098  1.152623               1.0   
3            0.680000  0.676667    0.783784  1.152623               1.0   
4            0.680000  0.636667    0.704797  1.036466               1.0   

   leverage  conviction  zhangs_metric   jaccard  certainty  kulczynski  \
0  0.030122    1.322738       0.384703  0.846690   0.243992

In [97]:
print(relim_rules.head())

                                         antecedents  \
0                         (Self_Care_Minor_problems)   
1                                (Symptoms_exp_None)   
2                  (Symptom_Frequency_Did_not_occur)   
3  (Symptoms_exp_None, Symptom_Frequency_Did_not_...   
4  (Symptoms_exp_None, Symptom_Household_Impact_D...   

                                consequents  antecedent support  \
0                        (Emotional_Impact)            0.200000   
1         (Symptom_Frequency_Did_not_occur)            0.203333   
2                       (Symptoms_exp_None)            0.203333   
3  (Symptom_Household_Impact_Do_not_affect)            0.200000   
4         (Symptom_Frequency_Did_not_occur)            0.200000   

   consequent support  support  confidence      lift  representativity  \
0            0.863333      0.2    1.000000  1.158301               1.0   
1            0.203333      0.2    0.983607  4.837409               1.0   
2            0.203333      0.2    0.98

In [98]:
import pandas as pd

# Function to calculate aggregate metrics
def calculate_aggregate_metrics(rules_df, model_name):
    aggregate_metrics = {
        'Model': model_name,
        'Total_Rules': len(rules_df),
        'Average_Support': rules_df['support'].mean(),
        'Median_Support': rules_df['support'].median(),
        'Support_25th_Percentile': rules_df['support'].quantile(0.25),
        'Support_75th_Percentile': rules_df['support'].quantile(0.75),
        'Average_Confidence': rules_df['confidence'].mean(),
        'Median_Confidence': rules_df['confidence'].median(),
        'Confidence_25th_Percentile': rules_df['confidence'].quantile(0.25),
        'Confidence_75th_Percentile': rules_df['confidence'].quantile(0.75),
        'Average_Lift': rules_df['lift'].mean(),
        'Median_Lift': rules_df['lift'].median(),
        'Average_Kulczynski': rules_df['kulczynski'].mean(),
        'Median_Kulczynski': rules_df['kulczynski'].median(),
        'Average_Cosine_Similarity': rules_df['cosine_similarity'].mean(),
        'Median_Cosine_Similarity': rules_df['cosine_similarity'].median()
    }
    return aggregate_metrics

# Calculate for FP-Growth
fp_aggregate = calculate_aggregate_metrics(fp_rules, 'FP-Growth')

# Calculate for Relim
relim_aggregate = calculate_aggregate_metrics(relim_rules, 'Relim')

# Create a summary DataFrame
summary_df = pd.DataFrame([fp_aggregate, relim_aggregate])

print("Aggregate Metrics for Models:")
print(summary_df)

Aggregate Metrics for Models:
       Model  Total_Rules  Average_Support  Median_Support  \
0  FP-Growth       163923         0.238765        0.223333   
1      Relim       163923         0.238765        0.223333   

   Support_25th_Percentile  Support_75th_Percentile  Average_Confidence  \
0                 0.206667                 0.253333            0.788707   
1                 0.206667                 0.253333            0.788707   

   Median_Confidence  Confidence_25th_Percentile  Confidence_75th_Percentile  \
0              0.775                    0.684685                    0.895349   
1              0.775                    0.684685                    0.895349   

   Average_Lift  Median_Lift  Average_Kulczynski  Median_Kulczynski  \
0        1.5397     1.514423            0.631696           0.627413   
1        1.5397     1.514423            0.631696           0.627413   

   Average_Cosine_Similarity  Median_Cosine_Similarity  
0                   0.599702                 