In [None]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
import matplotlib.pyplot as plt
import scipy.stats as stats


In [None]:
def fisher_p_value(rule):
    N = len(df)
    support_a_b = rule['support']
    support_a = rule['antecedent support']
    support_b = rule['consequent support']

    # Criando tabela de contingência
    a_and_b = support_a_b * N
    a_not_b = (support_a - support_a_b) * N
    not_a_b = (support_b - support_a_b) * N
    not_a_not_b = (1 - (support_a + support_b - support_a_b)) * N

    contingency_table = [[a_and_b, a_not_b], [not_a_b, not_a_not_b]]
    print(contingency_table)
    print('------')

    # Aplicando o teste de Fisher
    _, p_value = stats.fisher_exact(contingency_table)
    return p_value

def eclat(df, min_support):
    vertical_db = {item: set(df.index[df[item] == 1]) for item in df.columns}

    def recursive_eclat(prefix, items):
        results = []
        for i, (item, tids) in enumerate(items):
            new_prefix = prefix + [item]
            new_tids = tids
            support = len(new_tids)
            relative_support = support / len(df)

            if relative_support >= min_support:
                results.append((set(new_prefix), support, relative_support))
                new_items = [(other, other_tids & new_tids) for other, other_tids in items[i+1:] if len(other_tids & new_tids) >= min_support]
                results.extend(recursive_eclat(new_prefix, new_items))

        return results

    frequent_itemsets = recursive_eclat([], sorted(vertical_db.items(), key=lambda x: len(x[1]), reverse=True))

    return pd.DataFrame(frequent_itemsets, columns=["itemsets", "support (freq)", "support"])

In [None]:
df = pd.read_csv('data/trabalho4_dados_4.csv')
df = df.astype(bool)
df

In [None]:
itemsets = apriori(df, min_support=0.01, use_colnames=True)
itemsets

In [None]:
print(f"Mediana de suporte para itemsets: {itemsets['support'].median()}")
print(f"75% maiores suportes: {itemsets['support'].quantile(0.75)}")
print(f"90% maiores suportes: {itemsets['support'].quantile(0.90)}")
print(f"95% maiores suportes: {itemsets['support'].quantile(0.95)}")
itemsets['support'].hist(bins=100)
plt.yscale('log')

In [None]:
itemsets = apriori(df, min_support=0.02, use_colnames=True)
regras = association_rules(itemsets, len(df), metric="confidence", min_threshold=0.01)
regras.sort_values(by="lift", ascending=False)
regras  

In [None]:
print(f"Mediana de confiaça para regras: {regras['confidence'].median()}")
print(f"75% maiores confianças: {regras['confidence'].quantile(0.75)}")
print(f"90% maiores confianças: {regras['confidence'].quantile(0.90)}")
print(f"95% maiores confianças: {regras['confidence'].quantile(0.95)}")
regras['confidence'].hist(bins=100)
# plt.yscale('log')

In [None]:
itemsets = apriori(df, min_support=0.02, use_colnames=True)
regras = association_rules(itemsets, len(df), metric="confidence", min_threshold=0.5)
regras.sort_values(by="lift", ascending=False)
regras['p-value'] = regras.apply(fisher_p_value, axis=1)
regras  

In [100]:
itemsets = eclat(df, min_support=0.02)
regras = association_rules(itemsets, len(df), metric="confidence", min_threshold=0.5)
regras.sort_values(by="lift", ascending=False)
regras['p-value'] = regras.apply(fisher_p_value, axis=1)
regras

[[107.0, 58.99999999999999], [703.0, 3914.9999999999995]]
------
[[97.0, 72.99999999999999], [283.0, 4331.0]]
------
[[97.0, 24.0], [586.9999999999999, 4076.0]]
------
[[106.0, 106.0], [675.0, 3897.0]]
------
[[103.0, 88.00000000000001], [678.0, 3915.0000000000005]]
------
[[245.0, 135.0], [438.9999999999999, 3965.0]]
------
[[125.0, 111.00000000000001], [559.0, 3989.0000000000005]]
------
[[108.0, 104.0], [576.0, 3996.0000000000005]]
------
[[132.0, 27.999999999999996], [318.00000000000006, 4306.0]]
------
[[168.0, 0.0], [239.0, 4377.0]]
------
[[96.0, 61.99999999999999], [274.0, 4352.0]]
------
[[106.0, 106.0], [190.0, 4382.0]]
------
[[98.0, 93.0], [163.0, 4430.0]]
------
[[102.0, 89.0], [110.0, 4483.0]]
------


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski,p-value
0,(MORTADELA),(PAES),0.034699,0.169314,0.022366,0.644578,3.806991,1.0,0.016491,2.337183,0.763829,0.12313,0.572135,0.388339,1.0264629999999999e-44
1,"(LEGUMES, PROD_LIMPEZA)",(LIMP_ROUPAS),0.035535,0.079431,0.020276,0.570588,7.183406,1.0,0.017453,2.14379,0.892506,0.214128,0.533536,0.412926,6.495071e-66
2,"(LEGUMES, LIMP_ROUPAS)",(PROD_LIMPEZA),0.025293,0.142977,0.020276,0.801653,5.606882,1.0,0.01666,4.320826,0.842969,0.137006,0.768563,0.471733,1.187068e-61
3,(FEIJAO),(LEGUMES),0.044314,0.163253,0.022157,0.5,3.06274,1.0,0.014923,1.673495,0.704724,0.119504,0.402448,0.317862,1.599007e-31
4,(ARROZ),(LEGUMES),0.039925,0.163253,0.02153,0.539267,3.303269,1.0,0.015012,1.816122,0.726266,0.118527,0.449376,0.335575,2.384744e-34
5,(LIMP_ROUPAS),(PROD_LIMPEZA),0.079431,0.142977,0.051212,0.644737,4.509388,1.0,0.039856,2.412362,0.845391,0.299145,0.585469,0.501462,7.2843829999999995e-127
6,(PAPEL_HIGIENICO),(PROD_LIMPEZA),0.049331,0.142977,0.026129,0.529661,3.70453,1.0,0.019076,1.82214,0.767944,0.157233,0.451195,0.356205,1.832602e-47
7,(FEIJAO),(PROD_LIMPEZA),0.044314,0.142977,0.022575,0.509434,3.563059,1.0,0.016239,1.747009,0.752698,0.137056,0.427593,0.333664,1.0362599999999999e-38
8,(PRESUNTO),(QUEIJOS),0.033445,0.094064,0.027592,0.825,8.770667,1.0,0.024446,5.17678,0.91664,0.276151,0.80683,0.559167,3.561973e-115
9,(CREME_DE_LEITE),(LEITE),0.035117,0.085075,0.035117,1.0,11.7543,1.0,0.032129,inf,0.948224,0.412776,1.0,0.706388,8.045834e-197
