In [1]:
#setup, grabbing all relevant libraries
import sys
!{sys.executable} -m pip install mlxtend
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
import math





In [2]:
#Q1
# Load the mammographic-masses table; the first CSV row supplies the column names.
data = pd.read_csv(r"mammographic_masses.csv",delimiter=",",header=0)

#preprocessing: turn every row into a transaction of "column=value" items so that
# mlxtend's TransactionEncoder / apriori can be applied.
# Missing (NaN) cells are skipped while each value is still paired with its own
# column name. Deleting them first and labelling by list position afterwards would
# shift every later value in that row onto the wrong column name.
d = [
    [column + "=" + str(value)
     for column, value in zip(data.columns, row)
     if not pd.isna(value)]
    for row in data.values.tolist()
]

# One-hot encode the transactions: one boolean column per distinct item.
te = TransactionEncoder()
te_ary = te.fit(d).transform(d)

df = pd.DataFrame(te_ary, columns=te.columns_)

#computing frequent itemsets and association rules
frequent_itemsets = apriori(df, min_support=0.2, use_colnames=True)

a=association_rules(frequent_itemsets, metric="confidence", min_threshold=0.9)

# Keep the rules whose consequent mentions Severity and show them with the
# most confident rule first.
fname = 'Severity'
filtered_solution = a[a['consequents'].astype(str).str.contains(fname)]
filtered_solution.sort_values(by=['confidence'], ascending=False)

#Interestingly, BI-RADS=5 and Shape=4 appear together in the antecedents of both high-support, high-confidence rules that
# predict a malignant lesion (Severity=1). This could suggest that these two values together are good predictors, or that the
# doctors' BI-RADS assessment is more heavily biased by irregular shapes (that is, Shape=4 -> BI-RADS=5 whether or not
# Shape=4 -> Severity=1).

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
10,"(Density=3, BI-RADS=5, Shape=4)",(Severity=1),0.245578,0.463059,0.224766,0.915254,1.976538,0.111049,6.3359
2,"(BI-RADS=4, Margin=1)",(Severity=0),0.328824,0.536941,0.299688,0.911392,1.69738,0.123129,5.225955
5,"(BI-RADS=5, Shape=4)",(Severity=1),0.271592,0.463059,0.246618,0.908046,1.960971,0.120855,5.83923
7,"(BI-RADS=4, Density=3, Margin=1)",(Severity=0),0.263267,0.536941,0.238293,0.905138,1.685732,0.096934,4.881417


In [10]:
#Q2

# Re-mine the one-hot frame with a lower support floor (0.1); the confidence
# floor stays at 0.9.
frequent_itemsets = apriori(df, min_support=0.1, use_colnames=True)
a = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.9)

#visualizing association rules results
a[["antecedents","consequents","support","confidence"]]

cons_name = 'Severity'
excl_name = 'BI-RADS'
filt_name = ','

# The rule sides are matched on their string form, so a ',' in the antecedent
# text means the antecedent holds more than one item.
antecedent_text = a['antecedents'].astype(str)
consequent_text = a['consequents'].astype(str)
keep_rule = (
    consequent_text.str.contains(cons_name)
    & ~antecedent_text.str.contains(excl_name)
    & antecedent_text.str.contains(filt_name)
)
filtered_solution = a[keep_rule]
filtered_solution.sort_values(by=['confidence'], ascending=False)
#https://stackoverflow.com/questions/17097643/search-for-does-not-contain-on-a-dataframe-in-pandas
#https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.sort_values.html

#Rows 12 and 29 suggest that Margin=1 together with Shape<=2 most likely indicates that the lesion is benign (Severity=0).
# Conservatively, Margin should equal exactly 1, since that exact value is observed in both rules, while Shape seems to be able
# to vary slightly. Also, since Density=3 increases support but decreases confidence, it is notable but is not the value with
# the greatest predictive power: it is observed more often but is predictive in a smaller share of those observations.

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
12,"(Shape=2, Margin=1)",(Severity=0),0.150884,0.536941,0.136316,0.903448,1.682585,0.0553,4.795971
29,"(Density=3, Margin=1, Shape=1)",(Severity=0),0.159209,0.536941,0.1436,0.901961,1.679815,0.058115,4.723205


In [4]:
#Q3
# Same mining thresholds as before: support >= 0.1, confidence >= 0.9.
frequent_itemsets = apriori(df, min_support=0.1, use_colnames=True)
a = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.9)

#visualizing association rules results
a[["antecedents","consequents","support","confidence"]]

cons_name = 'Severity=0'
incl_name = 'BI-RADS'
filt_name = ','

# Rules are matched on the string form of each side: the consequent must mention
# Severity=0 and contain no ',' (a single item), and the antecedent must mention
# a BI-RADS item.
antecedent_text = a['antecedents'].astype(str)
consequent_text = a['consequents'].astype(str)
keep_rule = (
    consequent_text.str.contains(cons_name)
    & antecedent_text.str.contains(incl_name)
    & ~consequent_text.str.contains(filt_name)
)
filtered_solution = a[keep_rule]
filtered_solution.sort_values(by=['confidence'], ascending=False)

#All significant results where BI-RADS=4 is an antecedent result in Severity=0, meaning the lesion is benign even when doctors
# appear to show concern about the lesion. This frequently coincides with Margin=1, Shape<=2, and Density=3, so
# something about these particular values appears to confuse the human observers and cause them to rate the lesion as more
# likely to be malignant than the raw data suggests. My theory is that the observers are taking a 'better safe than sorry' view
# and performing tests on more people than is perhaps necessary, in order to avoid lawsuits and avoidable death and suffering.
# To support this, I would focus on rows 3 and 13, as these have the highest support and sufficiently high confidence,
# illustrating that when an observer gives a BI-RADS rating of 4 specifically when Margin=1, all else equal, the lesion ends
# up being benign.

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
22,"(BI-RADS=4, Margin=1, Shape=2)",(Severity=0),0.133195,0.536941,0.12487,0.9375,1.746003,0.053352,7.408949
30,"(BI-RADS=4, Density=3, Margin=1, Shape=1)",(Severity=0),0.144641,0.536941,0.133195,0.920863,1.715019,0.055531,5.851386
17,"(BI-RADS=4, Density=3, Shape=1)",(Severity=0),0.159209,0.536941,0.145682,0.915033,1.70416,0.060196,5.449852
3,"(BI-RADS=4, Margin=1)",(Severity=0),0.328824,0.536941,0.299688,0.911392,1.69738,0.123129,5.225955
20,"(BI-RADS=4, Margin=1, Shape=1)",(Severity=0),0.171696,0.536941,0.156087,0.909091,1.693094,0.063897,5.093652
5,"(BI-RADS=4, Shape=1)",(Severity=0),0.191467,0.536941,0.173777,0.907609,1.690333,0.070971,5.011936
13,"(BI-RADS=4, Density=3, Margin=1)",(Severity=0),0.263267,0.536941,0.238293,0.905138,1.685732,0.096934,4.881417
6,"(BI-RADS=4, Shape=2)",(Severity=0),0.180021,0.536941,0.162331,0.901734,1.679392,0.06567,4.712309


In [5]:
#Q4

# Mine with near-zero support/confidence floors so that rare items such as a
# single age value are not pruned away before the rules are inspected.
all_items = apriori(df, min_support=0.0001, use_colnames=True)
a4=association_rules(all_items, metric="confidence", min_threshold=0.0001)


cons1_name = 'Severity=1'
cons0_name = 'Severity=0'
incl_name = 'Age=35'
filt_name = ','

# The rule sides are frozensets of item strings, so they are tested with set
# membership and length instead of substring matches on their printed form
# (a substring pattern such as 'Age=3' would also hit 'Age=35').
has_age = a4['antecedents'].apply(lambda items: incl_name in items).astype(bool)
single_consequent = a4['consequents'].apply(len).eq(1)
a4 = a4[has_age & single_consequent]

# Every remaining consequent is a single item, so membership means equality.
predicts_sev1 = a4['consequents'].apply(lambda items: cons1_name in items).astype(bool)
predicts_sev0 = a4['consequents'].apply(lambda items: cons0_name in items).astype(bool)
multi_antecedent = a4['antecedents'].apply(len).gt(1)

# "*_only": the age item is the sole antecedent; otherwise it is combined with
# at least one other item.
filtered_solution1_only = a4[predicts_sev1 & ~multi_antecedent]
filtered_solution1 = a4[predicts_sev1 & multi_antecedent]

filtered_solution0_only = a4[predicts_sev0 & ~multi_antecedent]
filtered_solution0 = a4[predicts_sev0 & multi_antecedent]

filtered_solution0_only

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
301,(Age=35),(Severity=0),0.013528,0.536941,0.012487,0.923077,1.719141,0.005223,6.019771


In [6]:
# Ten most confident rows of filtered_solution0.
filtered_solution0.sort_values(by='confidence', ascending=False).iloc[:10]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
4688,"(Age=35, BI-RADS=2)",(Severity=0),0.001041,0.536941,0.001041,1.0,1.862403,0.000482,inf
111777,"(BI-RADS=4, Margin=1, Age=35, Shape=1)",(Severity=0),0.005203,0.536941,0.005203,1.0,1.862403,0.002409,inf
111686,"(BI-RADS=4, Density=?, Age=35, Margin=2)",(Severity=0),0.001041,0.536941,0.001041,1.0,1.862403,0.000482,inf
111627,"(BI-RADS=4, Density=3, Age=35, Shape=4)",(Severity=0),0.001041,0.536941,0.001041,1.0,1.862403,0.000482,inf
111567,"(BI-RADS=4, Density=3, Age=35, Shape=1)",(Severity=0),0.004162,0.536941,0.004162,1.0,1.862403,0.001927,inf
111476,"(BI-RADS=4, Margin=4, Density=3, Age=35)",(Severity=0),0.002081,0.536941,0.002081,1.0,1.862403,0.000964,inf
111327,"(BI-RADS=4, Density=2, Age=35, Shape=3)",(Severity=0),0.001041,0.536941,0.001041,1.0,1.862403,0.000482,inf
111297,"(BI-RADS=4, Density=2, Age=35, Shape=1)",(Severity=0),0.001041,0.536941,0.001041,1.0,1.862403,0.000482,inf
111236,"(BI-RADS=4, Density=2, Age=35, Margin=?)",(Severity=0),0.001041,0.536941,0.001041,1.0,1.862403,0.000482,inf
111176,"(BI-RADS=4, Density=2, Margin=1, Age=35)",(Severity=0),0.001041,0.536941,0.001041,1.0,1.862403,0.000482,inf


In [7]:
# Display the Severity=1 rules whose antecedent text contains no ',' (built in the Q4 cell).
filtered_solution1_only

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
303,(Age=35),(Severity=1),0.013528,0.463059,0.001041,0.076923,0.166119,-0.005223,0.581686


In [8]:
# Ten most confident rows of filtered_solution1.
filtered_solution1.sort_values(by='confidence', ascending=False).iloc[:10]

#The first output shows that Age=35 -> Severity=0 has very high confidence but very low support. This could mean that there are
# not many instances where Age=35 was observed, but most of them resulted in Severity=0. This is supported by the third output,
# where Age=35 -> Severity=1 shows very low support and very low confidence. So the data we've gathered may not be sufficiently
# comprehensive to support any conclusion about the predictive power of Age=35. This is bolstered by the second
# and fourth outputs, which show Age=35 plus other antecedents and their resulting Severity=0 and Severity=1, respectively.

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
111840,"(BI-RADS=4, Margin=1, Age=35, Shape=2)",(Severity=1),0.003122,0.463059,0.001041,0.333333,0.71985,-0.000405,0.805411
112140,"(Shape=2, Density=3, Margin=1, Age=35)",(Severity=1),0.003122,0.463059,0.001041,0.333333,0.71985,-0.000405,0.805411
202629,"(Density=3, Age=35, BI-RADS=4, Shape=2, Margin=1)",(Severity=1),0.003122,0.463059,0.001041,0.333333,0.71985,-0.000405,0.805411
33794,"(BI-RADS=4, Age=35, Shape=2)",(Severity=1),0.004162,0.463059,0.001041,0.25,0.539888,-0.000887,0.715921
34060,"(Shape=2, Density=3, Age=35)",(Severity=1),0.004162,0.463059,0.001041,0.25,0.539888,-0.000887,0.715921
34144,"(Shape=2, Margin=1, Age=35)",(Severity=1),0.004162,0.463059,0.001041,0.25,0.539888,-0.000887,0.715921
111660,"(BI-RADS=4, Density=3, Age=35, Shape=2)",(Severity=1),0.004162,0.463059,0.001041,0.25,0.539888,-0.000887,0.715921
4963,"(Shape=2, Age=35)",(Severity=1),0.005203,0.463059,0.001041,0.2,0.43191,-0.001369,0.671176
33935,"(Density=3, Margin=1, Age=35)",(Severity=1),0.007284,0.463059,0.001041,0.142857,0.308507,-0.002332,0.626431
111390,"(BI-RADS=4, Density=3, Margin=1, Age=35)",(Severity=1),0.007284,0.463059,0.001041,0.142857,0.308507,-0.002332,0.626431


In [9]:
#Q5
import numpy as np

# Reload the raw table and keep only the rows whose Age is known ('?' marks a
# missing age in this file). Filtering into a copy leaves the loaded frame's
# rows untouched and makes the cell safe to re-run.
dataset5 = pd.read_csv('mammographic_masses.csv')
dataset5 = dataset5[dataset5['Age'] != '?'].copy()

# Bucket ages into 12-year-wide bins: age / 12, rounded to the nearest whole number.
dataset5['AgeBucket'] = (dataset5['Age'].astype(int) / 12).round(0)

# Build one transaction per row as "column=value" items. NaN cells are skipped
# while each value is still paired with its own column name; removing them
# first and labelling by position afterwards would attach later values to the
# wrong column.
d5 = [
    [column + "=" + str(value)
     for column, value in zip(dataset5.columns, row)
     if not pd.isna(value)]
    for row in dataset5.values.tolist()
]

# One-hot encode the transactions: one boolean column per distinct item.
te = TransactionEncoder()
te_ary = te.fit(d5).transform(d5)
df5 = pd.DataFrame(te_ary, columns=te.columns_)

#computing frequent itemsets and association rules
frequent_itemsets5 = apriori(df5, min_support=0.1, use_colnames=True)

a5=association_rules(frequent_itemsets5, metric="confidence", min_threshold=0.9)

# Keep the rules that predict Severity=1 and use an AgeBucket item in the antecedent.
cons_name = 'Severity=1'
ante_name = 'AgeBucket'
filtered_solution5 = a5[a5['consequents'].astype(str).str.contains(cons_name)]
filtered_solution5 = filtered_solution5[filtered_solution5['antecedents'].astype(str).str.contains(ante_name)]

filtered_solution5

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
2,"(BI-RADS=5, AgeBucket=6.0)",(Severity=1),0.116109,0.460251,0.107741,0.927928,2.016134,0.054301,7.489017
17,"(BI-RADS=5, Density=3, AgeBucket=6.0)",(Severity=1),0.105649,0.460251,0.100418,0.950495,2.065167,0.051794,10.902929
