# Imports

In [126]:
import pandas as pd

# Itemset Analysis

## Markdown Cells

In [127]:
# load in the itemsets
md_itemsets_original = pd.read_pickle('md_frequent_itemsets.pkl')

In [128]:
# copy the original dataframe
md_itemsets = md_itemsets_original.copy()

In [147]:
# initial look at the data
md_itemsets.head()
print("original length is " + str(len(md_itemsets_original)) + " itemsets")

original length is 1267 itemsets


In [130]:
# filter out itemsets with length 1
md_itemsets_small = md_itemsets[md_itemsets['length'] == 1]
md_itemsets =  md_itemsets.drop(index = md_itemsets_small.index)
md_itemsets.head()

Unnamed: 0,support,itemsets,length
19,0.515932,"(has_author_F, longer_beginning_T)",2
20,0.50371,"(longer_beginning_T, num_contrib_lower)",2
21,0.503274,"(longer_beginning_T, has_comments_T)",2
22,0.516368,"(exec_inorder_higher, longer_beginning_T)",2
23,0.713226,"(has_author_F, longer_ending_F)",2


In [131]:
# drop all sets that are subsets of other sets
num_itemsets = len(md_itemsets)
for index in md_itemsets.index:
    
    # check if this index is a subset of a later frozenset
    for i in range(index + 1, num_itemsets):
        if md_itemsets.loc[index]['itemsets'].issubset(md_itemsets.loc[i]['itemsets']):
            md_itemsets = md_itemsets.drop(index = index)
            break
            
print("filtered down to " + str(len(md_itemsets)) + " itemsets")

filtered down to 301 itemsets


In [132]:
# add back the length 1 itemsets
md_itemsets = md_itemsets.append(md_itemsets_small)
print("added back " + str(len(md_itemsets_small)) + " itemsets")

added back 19 itemsets


In [133]:
# look at the filtered itemsets
md_itemsets

Unnamed: 0,support,itemsets,length
19,0.515932,"(has_author_F, longer_beginning_T)",2
20,0.503710,"(longer_beginning_T, num_contrib_lower)",2
21,0.503274,"(longer_beginning_T, has_comments_T)",2
22,0.516368,"(exec_inorder_higher, longer_beginning_T)",2
39,0.516805,"(has_author_F, image_prop_low)",2
...,...,...,...
14,0.539502,(md_format_F),1
15,0.874291,(non_exec_prop_low),1
16,0.951550,(exec_inorder_higher),1
17,0.840244,(exec_skips_lower),1


In [134]:
# create a copy of the filtered dataframe we will export
md_itemsets_export = md_itemsets.copy()

# change itemset column to string
md_itemsets_export['itemsets'] = md_itemsets_export['itemsets'].apply(lambda x : str(x).replace('frozenset', ''))

In [135]:
# check the export dataframe
md_itemsets_export

Unnamed: 0,support,itemsets,length
19,0.515932,"({'has_author_F', 'longer_beginning_T'})",2
20,0.503710,"({'longer_beginning_T', 'num_contrib_lower'})",2
21,0.503274,"({'longer_beginning_T', 'has_comments_T'})",2
22,0.516368,"({'exec_inorder_higher', 'longer_beginning_T'})",2
39,0.516805,"({'has_author_F', 'image_prop_low'})",2
...,...,...,...
14,0.539502,({'md_format_F'}),1
15,0.874291,({'non_exec_prop_low'}),1
16,0.951550,({'exec_inorder_higher'}),1
17,0.840244,({'exec_skips_lower'}),1


In [136]:
# export the dataframe
md_itemsets_export.to_csv('md_itemsets_filtered.csv')

## No Markdown Cells

In [137]:
# load in the itemsets
no_md_itemsets_original = pd.read_pickle('no_md_frequent_itemsets.pkl')

In [138]:
# copy the original dataframe
no_md_itemsets = no_md_itemsets_original.copy()

In [148]:
# initial look at the data
no_md_itemsets.head()
print("original length is " + str(len(no_md_itemsets_original)) + " itemsets")

original length is 346 itemsets


In [140]:
# filter out itemsets with length 1
no_md_itemsets_small = no_md_itemsets[no_md_itemsets['length'] == 1]
no_md_itemsets = no_md_itemsets.drop(index = no_md_itemsets_small.index)
no_md_itemsets.head()

Unnamed: 0,support,itemsets,length
11,0.669373,"(has_author_F, jupyter_prop_high)",2
12,0.904797,"(has_author_F, num_contrib_lower)",2
13,0.667897,"(has_author_F, image_prop_low)",2
14,0.883395,"(has_author_F, is_education_F)",2
15,0.669373,"(has_author_F, has_comments_T)",2


In [141]:
# drop all sets that are subsets of other sets
num_itemsets = len(no_md_itemsets)
for index in no_md_itemsets.index:
    
    # check if this index is a subset of a later frozenset
    for i in range(index + 1, num_itemsets):
        if no_md_itemsets.loc[index]['itemsets'].issubset(no_md_itemsets.loc[i]['itemsets']):
            no_md_itemsets = no_md_itemsets.drop(index = index)
            break

print("filtered down to " + str(len(no_md_itemsets)) + " itemsets")

filtered down to 94 itemsets


In [142]:
# add back the length 1 itemsets
no_md_itemsets = no_md_itemsets.append(no_md_itemsets_small)
print("added back " + str(len(no_md_itemsets_small)) + " itemsets")

added back 11 itemsets


In [143]:
# look at the filtered itemsets
no_md_itemsets

Unnamed: 0,support,itemsets,length
64,0.524723,"(has_author_F, jupyter_prop_high, num_commits_...",3
78,0.535055,"(has_author_F, image_prop_low, num_commits_lower)",3
110,0.500369,"(jupyter_prop_high, has_error_F, is_education_F)",3
166,0.500369,"(has_error_F, has_comments_T, exec_skips_lower)",3
180,0.529889,"(has_author_F, jupyter_prop_high, exec_skips_l...",4
...,...,...,...
6,0.769004,(num_commits_lower),1
7,0.921771,(non_exec_prop_lower),1
8,0.889299,(exec_inorder_higher),1
9,0.870111,(exec_skips_lower),1


In [144]:
# create a copy of the filtered dataframe we will export
no_md_itemsets_export = no_md_itemsets.copy()

# change itemset column to string
no_md_itemsets_export['itemsets'] = no_md_itemsets_export['itemsets'].apply(lambda x : str(x).replace('frozenset', ''))

In [145]:
# check the export dataframe
no_md_itemsets_export

Unnamed: 0,support,itemsets,length
64,0.524723,"({'has_author_F', 'jupyter_prop_high', 'num_co...",3
78,0.535055,"({'has_author_F', 'image_prop_low', 'num_commi...",3
110,0.500369,"({'jupyter_prop_high', 'has_error_F', 'is_educ...",3
166,0.500369,"({'has_error_F', 'has_comments_T', 'exec_skips...",3
180,0.529889,"({'has_author_F', 'jupyter_prop_high', 'exec_s...",4
...,...,...,...
6,0.769004,({'num_commits_lower'}),1
7,0.921771,({'non_exec_prop_lower'}),1
8,0.889299,({'exec_inorder_higher'}),1
9,0.870111,({'exec_skips_lower'}),1


In [146]:
# export the dataframe
no_md_itemsets_export.to_csv('no_md_itemsets_filtered.csv')