In [1]:
import pandas as pd
import numpy as np

## Q1
Create a dataset containing two categorical attributes (one with 2 categories and the other
one with 3 categories) and 2 numerical attributes with 100 data points. Write a program
to form contingency table for the categorical attributes and compute correlation between
the attributes using Chi-square test and compute correlation between numerical attributes
using correlation coefficient.

In [9]:
num_data_points = 100


cat1_categories = ['A', 'B']
category1 = np.random.choice(cat1_categories, size=num_data_points)
cat2_categories = ['X', 'Y', 'Z']
category2 = np.random.choice(cat2_categories, size=num_data_points)

numerical1 = np.random.rand(num_data_points) * 100
numerical2 = np.random.rand(num_data_points) * 50 + 20
data = {
    'Category1': category1,
    'Category2': category2,
    'Numerical1': numerical1,
    'Numerical2': numerical2
}
df = pd.DataFrame(data)

In [10]:
df.head()

Unnamed: 0,Category1,Category2,Numerical1,Numerical2
0,B,Z,0.151617,48.61987
1,A,Y,40.792677,55.249218
2,B,Y,48.453818,27.871206
3,B,Z,5.827235,66.335076
4,A,Z,47.183782,46.532804


In [11]:
contingency_table = pd.crosstab(df['Category1'], df['Category2'])
contingency_table

Category2,X,Y,Z
Category1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,23,17,20
B,11,14,15


In [14]:
contingency_dict = {}
for i in range(df['Category1'].unique().shape[0]):
  if df['Category1'].unique()[i] not in contingency_dict:
    contingency_dict[df['Category1'].unique()[i]] = {}
    for j in range(df['Category2'].unique().shape[0]):
      contingency_dict[df['Category1'].unique()[i]][df['Category2'].unique()[j]] = 0

for index, row in df.iterrows():
  contingency_dict[row['Category1']][row['Category2']] += 1

contingency_table = pd.DataFrame(contingency_dict.values(), index=contingency_dict.keys())

Unnamed: 0,Z,Y,X
B,15,14,11
A,20,17,23


In [19]:
total = contingency_table.sum().sum()
row_sums = contingency_table.sum(axis=1).to_dict()
col_sums = contingency_table.sum(axis=0).to_dict()

chi_square = 0
for i in range(contingency_table.shape[0]):
  for j in range(contingency_table.shape[1]):
    observed = contingency_table.iloc[i, j]
    expected = (row_sums[contingency_table.index[i]] * col_sums[contingency_table.columns[j]]) / total
    chi_square += (observed - expected) ** 2 / expected

print(f"Chi-square value: {chi_square}")

Chi-square value: 1.2915650131020153


In [21]:
from scipy.stats import chi2

dof = (contingency_table.shape[0] - 1) * (contingency_table.shape[1] - 1)
alpha = 0.05
p_value = 1 - chi2.cdf(chi_square, dof)

chisq_thresh = chi2.ppf(1-alpha, dof)

print(f"p-value: {p_value}")
print(f"chisq_thresh: {chisq_thresh}")

p-value: 0.5242521508182028
chisq_thresh: 5.991464547107979


<b>p value > level of significance (0.05) => they are not correlated (or) chisq_value < chisq_thresh so they are not correlated</b>

In [22]:
p_corr = 0
m1 = np.mean(df['Numerical1'])
m2 = np.mean(df['Numerical2'])
for i in range(df.shape[0]):
  p_corr += (df['Numerical1'][i] - m1) * (df['Numerical2'][i] - m2)
p_corr /= df.shape[0]
p_corr = p_corr / (np.std(df['Numerical1']) * np.std(df['Numerical2']))
print(f"Correlation coefficient: {p_corr}")

Correlation coefficient: -0.08100829361891143


## Q2
Write a program to transform the numerical attributes in the generated dataset using min-
max normalization and Z-score normalization.

In [23]:
from numpy.random import normal
def min_max_normalization(column):
  mini = column.min()
  maxi = column.max()
  return (column - mini) / (maxi - mini)

def z_score_normalization(column):
  mean = column.mean()
  std = column.std()
  return (column - mean) / std

normalized_df = df.copy()
normalized_df['Numerical1_min_max'] = min_max_normalization(normalized_df['Numerical1'])
normalized_df['Numerical2_min_max'] = min_max_normalization(normalized_df['Numerical2'])
normalized_df['Numerical1_z_score'] = z_score_normalization(normalized_df['Numerical1'])
normalized_df['Numerical2_z_score'] = z_score_normalization(normalized_df['Numerical2'])

normalized_df

Unnamed: 0,Category1,Category2,Numerical1,Numerical2,Numerical1_min_max,Numerical2_min_max,Numerical1_z_score,Numerical2_z_score
0,B,Z,0.151617,48.619870,0.000000,0.577266,-1.825400,0.203926
1,A,Y,40.792677,55.249218,0.416989,0.711095,-0.392906,0.639673
2,B,Y,48.453818,27.871206,0.495594,0.158405,-0.122870,-1.159886
3,B,Z,5.827235,66.335076,0.058233,0.934889,-1.625349,1.368347
4,A,Z,47.183782,46.532804,0.482563,0.535134,-0.167636,0.066743
...,...,...,...,...,...,...,...,...
95,A,Z,95.552791,59.767425,0.978843,0.802306,1.537249,0.936655
96,A,X,31.205960,62.346985,0.318626,0.854380,-0.730813,1.106210
97,A,X,57.242989,40.900398,0.585774,0.421430,0.186926,-0.303476
98,A,X,75.311997,59.048301,0.771167,0.787788,0.823812,0.889387


In [24]:
print('Minimum and maximum value of min-max columns (Numerical1) =>', normalized_df['Numerical1_min_max'].min(), normalized_df['Numerical1_min_max'].max())
print('Minimum and maximum value of z_score columns (Numerical1) =>', normalized_df['Numerical1_z_score'].min(), normalized_df['Numerical1_z_score'].max())

Minimum and maximum value of min-max columns (Numerical1) => 0.0 1.0
Minimum and maximum value of z_score columns (Numerical1) => -1.8254000909852266 1.609929962409665


## Q3
Write a program to perform the following. (Refer Chapter 6 in the book)
a. Find frequent itemsets from the transactional dataset using Apriori algorithm and
FP Growth algorithm. Show the itemsets at each step.
b. Mine association rules from the frequent itemsets.
c. Display whether the rules are positively correlated or negatively correlated using
lift measure. (Refer Example 6.8).

In [2]:
transactions = { "T1": ["A", "B", "C"], "T2": ["A", "C"], "T3": ["B", "D"] }

In [3]:
unique_items = set()
for transaction in transactions.values():
  for item in transaction:
    unique_items.add(item)
unique_items

{'A', 'B', 'C', 'D'}

In [35]:
def generate_combinations(itemset, n, poss_combin,curr_combin, i):
    if i == n:
        curr_combin = sorted(curr_combin)
        if curr_combin not in poss_combin:
            poss_combin.append(curr_combin)
        return
    else:
        for item in itemset:
            if item not in curr_combin:
                generate_combinations(itemset, n, poss_combin, curr_combin + [item], i+1)

a)

In [57]:
def aprioi_freq_itemset_mining(unique_items, transactions, min_support = 0.2):
    freq_item_sets = {}
    items_considered = list(unique_items)

    while(len(items_considered) > 0):
        num_items_added = 0
        for i in range(len(items_considered)):
            itemset = items_considered[i]
            support = 0
            for j in range(len(transactions.keys())):
                if set(itemset.split(',')).issubset(set(transactions[list(transactions.keys())[j]])):
                    support += 1
            support /= len(transactions.keys())
            if support >= min_support:
                freq_item_sets[itemset] = support
                num_items_added += 1

        items_considered = []
        len_itemset_prev = len(list(freq_item_sets.keys())[-1].split(','))
        for itemset in list(freq_item_sets.keys())[-num_items_added:]:
            for singleton_items in unique_items:

                if (singleton_items not in freq_item_sets.keys()) or (singleton_items in itemset.split(',')):
                    continue

                valid_combination = True
                        
                poss_combin = []
                if len_itemset_prev > 1:
                    generate_combinations(itemset.split(','), len_itemset_prev - 1, poss_combin,[], 0)
                    poss_combin = [sorted(combin + [singleton_items]) for combin in poss_combin]

                    for i in range(len(poss_combin)):
                        itemset_t = ','.join(poss_combin[i])
                        if itemset_t not in freq_item_sets.keys():
                            valid_combination = False
                            break

                    temp = ','.join(sorted(itemset.split(',') + [singleton_items]))
                    if valid_combination and temp not in items_considered:
                        items_considered.append(temp)

                else:
                    items_considered.append(','.join(sorted(itemset.split(',') + [singleton_items])))

    return freq_item_sets

In [58]:
freq_itemsets = aprioi_freq_itemset_mining(unique_items, transactions)
freq_itemsets

{'B': 0.6666666666666666,
 'A': 0.6666666666666666,
 'D': 0.3333333333333333,
 'C': 0.6666666666666666,
 'A,B': 0.3333333333333333,
 'B,D': 0.3333333333333333,
 'B,C': 0.3333333333333333,
 'A,C': 0.6666666666666666,
 'A,B,C': 0.3333333333333333}

In [59]:
from typing import Dict, List, Tuple

def generate_combinations(items: List[str], r: int, 
                         result: List[List[str]], 
                         current: List[str], 
                         start: int):
    """Helper to generate all combinations of size r"""
    if len(current) == r:
        result.append(current[:])
        return
    for i in range(start, len(items)):
        current.append(items[i])
        generate_combinations(items, r, result, current, i + 1)
        current.pop()


def mine_association_rules(freq_itemsets: Dict[str, float], 
                           min_confidence: float = 0.2) -> List[Tuple[str, str, float]]:
    """
    Mines strong association rules from frequent itemsets.
    
    Returns: List of (antecedent, consequent, confidence)
    Example: ('A', 'B,C', 0.85) means A => B,C with 85% confidence
    """
    rules = []
    
    for itemset_str, support in freq_itemsets.items():
        items = itemset_str.split(',')
        
        if len(items) < 2:  # Skip singletons
            continue
            
        # Generate all proper non-empty subsets (antecedents)
        subsets = []
        for r in range(1, len(items)):  # from size 1 to |itemset|-1
            comb = []
            generate_combinations(items, r, comb, [], 0)
            subsets.extend(comb)
        
        for subset in subsets:
            antecedent = ','.join(sorted(subset))
            remaining = [item for item in items if item not in subset]
            consequent = ','.join(sorted(remaining))
            
            # Skip if support of antecedent not available (should always be, but safe)
            if antecedent not in freq_itemsets:
                continue
                
            confidence = support / freq_itemsets[antecedent]
            
            if confidence >= min_confidence:
                rules.append((antecedent, consequent, round(confidence, 4)))
    
    # Optional: sort by confidence descending
    rules.sort(key=lambda x: x[2], reverse=True)
    
    return rules

In [60]:
associations = mine_association_rules(freq_itemsets)
associations

[('D', 'B', 1.0),
 ('A', 'C', 1.0),
 ('C', 'A', 1.0),
 ('A,B', 'C', 1.0),
 ('B,C', 'A', 1.0),
 ('A', 'B', 0.5),
 ('B', 'A', 0.5),
 ('B', 'D', 0.5),
 ('B', 'C', 0.5),
 ('C', 'B', 0.5),
 ('A', 'B,C', 0.5),
 ('B', 'A,C', 0.5),
 ('C', 'A,B', 0.5),
 ('A,C', 'B', 0.5)]

## Q4
Write a program to find closed and maximal frequent itemsets from the transactional
dataset.

In [66]:
# Closed frequent itemsets
closed_freq_itemsets = []

for i in range(len(freq_itemsets.keys())):
    itemset = list(freq_itemsets.keys())[i]
    flag = True
    for j in range(len(freq_itemsets.keys())):
        temp = set(list(freq_itemsets.keys())[j].split(','))
        if i == j:
            continue
        if set(itemset.split(',')).issubset(temp) and freq_itemsets[list(freq_itemsets.keys())[i]] == freq_itemsets[list(freq_itemsets.keys())[j]]:
            flag = False
            break
    if flag:
        closed_freq_itemsets.append(itemset)

closed_freq_itemsets

['B', 'B,D', 'A,C', 'A,B,C']

In [65]:
#Maximal frequent itemsets
maximal_itemsets = []
for i in range(len(freq_itemsets.keys())):
    itemset = list(freq_itemsets.keys())[i]
    flag = True
    for j in range(len(freq_itemsets.keys())):
        temp = set(list(freq_itemsets.keys())[j].split(','))
        if i == j:
            continue
        if set(itemset.split(',')).issubset(temp):
            flag = False
            break
    if flag:
        maximal_itemsets.append(itemset)

maximal_itemsets

['B,D', 'A,B,C']

## Q5
Write a program to convert the given transactional dataset into vertical format. ie., (TID,
Items) to (ItemID, Transactions).


In [25]:
transactions = { "T1": ["A", "B", "C"], "T2": ["A", "C"], "T3": ["B", "D"] }
vertical_format = {}
for transaction in transactions.keys():
  for item in transactions[transaction]:
    if item not in vertical_format:
      vertical_format[item] = [transaction]
    else:
      vertical_format[item].append(transaction)
pd.DataFrame(index = vertical_format.keys(), data = vertical_format.values())

Unnamed: 0,0,1
A,T1,T2
B,T1,T3
C,T1,T2
D,T3,
