# Imports

In [1]:
import pandas as pd
import numpy
from mlxtend.frequent_patterns import apriori, association_rules

# Loading and Combining the Clustered and Binned Data

## Markdown Cells

In [2]:
# load in the cluster data for the markdown group; its 'nb_id' and 'cluster'
# columns are used below to attach a cluster label to the binned data
md_cluster_original = pd.read_csv('markdown_group_clusters.csv')

In [3]:
# initial look at the data: preview the first rows of the cluster table
md_cluster_original.head()

Unnamed: 0.1,Unnamed: 0,nb_id,longer_beginning,longer_ending,has_author,has_equation,jupyter_prop,output_cell_prop,markdown_prop,num_contrib,...,exec_inorder,exec_skips,has_error,has_export,num_functions,has_test,num_headers,has_param,num_stars,cluster
0,0,594,True,False,False,False,0.507588,1.0,0.044444,3.0,...,1.0,1.809524,False,False,0.0,False,7.0,False,10.0,3
1,1,1222,True,False,False,False,1.0,0.0,0.16129,1.0,...,1.0,0.0,False,False,0.0,False,5.0,False,0.0,3
2,2,1447,True,False,False,False,0.970851,0.011364,0.375887,1.0,...,1.0,0.0,False,True,15.0,False,30.0,False,0.0,2
3,3,2705,True,False,False,False,1.0,0.5,0.461538,1.0,...,0.923077,1.615385,False,False,6.0,False,7.0,False,0.0,3
4,4,2861,True,False,False,True,1.0,0.214286,0.461538,1.0,...,1.0,6.333333,False,False,0.0,False,1.0,False,0.0,3


In [4]:
# copy the cluster column so the labels can later be added to the binned
# dataframe independently of md_cluster_original
md_cluster_col = md_cluster_original['cluster'].copy()

In [5]:
# load in the binned data for the markdown group from the sibling
# binning-data folder
md_binned_original = pd.read_csv('../binning-data/markdown_group_binned.csv')

In [6]:
# initial look at the data: preview the first rows of the binned table
md_binned_original.head()

Unnamed: 0.1,Unnamed: 0,nb_id,longer_beginning,longer_ending,has_author,has_equation,jupyter_prop,output_cell_prop,markdown_prop,num_contrib,...,non_exec_prop,exec_inorder,exec_skips,has_error,has_export,num_functions,has_test,num_headers,has_param,num_stars
0,0,594,True,False,False,False,medium,high,low,lower,...,low,higher,lower,False,False,lower,False,medium,False,higher
1,1,1222,True,False,False,False,high,low,medium,lower,...,high,higher,lower,False,False,lower,False,medium,False,lower
2,2,1447,True,False,False,False,high,low,medium,lower,...,high,higher,lower,False,True,higher,False,high,False,lower
3,3,2705,True,False,False,False,high,medium,medium,lower,...,low,higher,lower,False,False,lower,False,medium,False,lower
4,4,2861,True,False,False,True,high,low,medium,lower,...,low,higher,lower,False,False,lower,False,low,False,lower


In [7]:
# build the working dataframe from the binned data without its leftover
# 'Unnamed: 0' column; drop() hands back a new dataframe, so
# md_binned_original itself is not modified
md_df = md_binned_original.drop(columns=['Unnamed: 0'])

In [8]:
# check that the nb_id columns match up row for row; the cluster labels are
# attached to md_df row by row in the next step, so stop here rather than
# silently mislabel notebooks if any row disagrees
md_nb_id_match = md_df['nb_id'] == md_cluster_original['nb_id']
print(md_nb_id_match.value_counts())
if not md_nb_id_match.all():
    raise ValueError('nb_id values differ between the binned and cluster data (markdown group)')

True    2291
Name: nb_id, dtype: int64


In [9]:
# attach the cluster labels as a new 'cluster' column and, in the same step,
# remove nb_id, which is no longer needed once the labels are in place
md_df = md_df.assign(cluster=md_cluster_col).drop(columns=['nb_id'])

# check the data
md_df.head()

Unnamed: 0,longer_beginning,longer_ending,has_author,has_equation,jupyter_prop,output_cell_prop,markdown_prop,num_contrib,image_prop,is_education,...,exec_inorder,exec_skips,has_error,has_export,num_functions,has_test,num_headers,has_param,num_stars,cluster
0,True,False,False,False,medium,high,low,lower,low,True,...,higher,lower,False,False,lower,False,medium,False,higher,3
1,True,False,False,False,high,low,medium,lower,low,True,...,higher,lower,False,False,lower,False,medium,False,lower,3
2,True,False,False,False,high,low,medium,lower,low,True,...,higher,lower,False,True,higher,False,high,False,lower,2
3,True,False,False,False,high,medium,medium,lower,low,True,...,higher,lower,False,False,lower,False,medium,False,lower,3
4,True,False,False,True,high,low,medium,lower,low,False,...,higher,lower,False,False,lower,False,low,False,lower,3


In [10]:
# check the number of notebooks per cluster by counting each cluster label
md_df['cluster'].value_counts()

3    1736
2     417
0     113
1      25
Name: cluster, dtype: int64

In [11]:
# separate the clusters: one independent dataframe per cluster label 0-3,
# stored so that a dataframe's list position equals its cluster label
md_clusters = [
    md_df[md_df['cluster'] == label].copy()
    for label in range(4)
]

In [12]:
# check that the clusters were separated correctly: every dataframe should
# report a single cluster label
for cluster_df in md_clusters:
    label_counts = cluster_df['cluster'].value_counts()
    print(label_counts)

0    113
Name: cluster, dtype: int64
1    25
Name: cluster, dtype: int64
2    417
Name: cluster, dtype: int64
3    1736
Name: cluster, dtype: int64


In [13]:
# drop the cluster column from each cluster dataframe; the label is already
# implied by the dataframe's position in the list
md_clusters = [cluster_df.drop(columns=['cluster']) for cluster_df in md_clusters]

## No Markdown Cells

In [14]:
# load in the cluster data for the no-markdown group; its 'nb_id' and
# 'cluster' columns are used below to attach a cluster label to the binned data
no_md_cluster_original = pd.read_csv('no_markdown_group_clusters.csv')

In [15]:
# initial look at the data: preview the first rows of the cluster table
no_md_cluster_original.head()

Unnamed: 0.1,Unnamed: 0,nb_id,has_author,jupyter_prop,output_cell_prop,num_contrib,image_prop,is_education,has_comments,num_commits,non_exec_prop,exec_inorder,exec_skips,has_error,has_export,num_functions,has_test,has_param,num_stars,cluster
0,0,1589,False,0.71413,0.538462,1.0,0.142857,False,False,1.0,0.0,1.0,1.0,False,True,2.0,False,False,1.0,1
1,1,1919,False,0.99784,0.780488,1.0,0.1875,False,True,1.0,0.04878,0.789474,10.631579,True,True,5.0,False,False,1.0,1
2,2,2857,False,0.121957,0.461538,5.0,0.0,False,True,11.0,0.0,0.909091,6.181818,False,True,0.0,False,False,4.0,1
3,3,4339,False,1.0,0.190476,1.0,0.0,False,True,1.0,0.0,1.0,1.631579,False,False,1.0,False,False,0.0,1
4,4,4659,False,0.940063,0.125,3.0,0.0,False,True,3.0,0.125,1.0,1.0,True,False,3.0,False,False,0.0,1


In [16]:
# copy the cluster column so the labels can later be added to the binned
# dataframe independently of no_md_cluster_original
no_md_cluster_col = no_md_cluster_original['cluster'].copy()

In [17]:
# load in the binned data for the no-markdown group from the sibling
# binning-data folder
no_md_binned_original = pd.read_csv('../binning-data/no_markdown_group_binned.csv')

In [18]:
# initial look at the data: preview the first rows of the binned table
no_md_binned_original.head()

Unnamed: 0.1,Unnamed: 0,nb_id,has_author,jupyter_prop,output_cell_prop,num_contrib,image_prop,is_education,has_comments,num_commits,non_exec_prop,exec_inorder,exec_skips,has_error,has_export,num_functions,has_test,has_param,num_stars
0,0,1589,False,medium,medium,lower,low,False,False,lower,lower,higher,lower,False,True,lower,False,False,lower
1,1,1919,False,high,high,lower,low,False,True,lower,lower,higher,lower,True,True,lower,False,False,lower
2,2,2857,False,low,medium,higher,low,False,True,higher,lower,higher,lower,False,True,lower,False,False,lower
3,3,4339,False,high,low,lower,low,False,True,lower,lower,higher,lower,False,False,lower,False,False,lower
4,4,4659,False,high,low,lower,low,False,True,higher,lower,higher,lower,True,False,lower,False,False,lower


In [19]:
# build the working dataframe from the binned data without its leftover
# 'Unnamed: 0' column; drop() hands back a new dataframe, so
# no_md_binned_original itself is not modified
no_md_df = no_md_binned_original.drop(columns=['Unnamed: 0'])

In [20]:
# check that the nb_id columns match up row for row; the cluster labels are
# attached to no_md_df row by row in the next step, so stop here rather than
# silently mislabel notebooks if any row disagrees
no_md_nb_id_match = no_md_df['nb_id'] == no_md_cluster_original['nb_id']
print(no_md_nb_id_match.value_counts())
if not no_md_nb_id_match.all():
    raise ValueError('nb_id values differ between the binned and cluster data (no-markdown group)')

True    1354
Name: nb_id, dtype: int64


In [22]:
# attach the cluster labels as a new 'cluster' column and, in the same step,
# remove nb_id, which is no longer needed once the labels are in place
no_md_df = no_md_df.assign(cluster=no_md_cluster_col).drop(columns=['nb_id'])

# check the data
no_md_df.head()

Unnamed: 0,has_author,jupyter_prop,output_cell_prop,num_contrib,image_prop,is_education,has_comments,num_commits,non_exec_prop,exec_inorder,exec_skips,has_error,has_export,num_functions,has_test,has_param,num_stars,cluster
0,False,medium,medium,lower,low,False,False,lower,lower,higher,lower,False,True,lower,False,False,lower,1
1,False,high,high,lower,low,False,True,lower,lower,higher,lower,True,True,lower,False,False,lower,1
2,False,low,medium,higher,low,False,True,higher,lower,higher,lower,False,True,lower,False,False,lower,1
3,False,high,low,lower,low,False,True,lower,lower,higher,lower,False,False,lower,False,False,lower,1
4,False,high,low,lower,low,False,True,higher,lower,higher,lower,True,False,lower,False,False,lower,1


In [23]:
# check the number of notebooks per cluster by counting each cluster label
no_md_df['cluster'].value_counts()

1    1206
3     121
0      15
2      12
Name: cluster, dtype: int64

In [24]:
# separate the clusters: one independent dataframe per cluster label 0-3,
# stored so that a dataframe's list position equals its cluster label
no_md_clusters = [
    no_md_df[no_md_df['cluster'] == label].copy()
    for label in range(4)
]

In [25]:
# check that the clusters were separated correctly: every dataframe should
# report a single cluster label
for cluster_df in no_md_clusters:
    label_counts = cluster_df['cluster'].value_counts()
    print(label_counts)

0    15
Name: cluster, dtype: int64
1    1206
Name: cluster, dtype: int64
2    12
Name: cluster, dtype: int64
3    121
Name: cluster, dtype: int64


In [26]:
# drop the cluster column from each cluster dataframe; the label is already
# implied by the dataframe's position in the list
no_md_clusters = [cluster_df.drop(columns=['cluster']) for cluster_df in no_md_clusters]

# One-Hot Encoding

## Markdown Cells

In [27]:
# create copies of the clusters that we will one-hot encode, so the
# dataframes in md_clusters stay as they are
md_clusters_onehot = [cluster_df.copy() for cluster_df in md_clusters]

In [28]:
# change True/False to T/F and medium to med in every cluster dataframe
# (three replacements applied in the same order: True, False, 'medium')
md_clusters_onehot = [
    cluster_df.replace(True, 'T').replace(False, 'F').replace('medium', 'med')
    for cluster_df in md_clusters_onehot
]

In [29]:
# check the data: preview the relabelled values for the first cluster (index 0)
md_clusters_onehot[0].head()

Unnamed: 0,longer_beginning,longer_ending,has_author,has_equation,jupyter_prop,output_cell_prop,markdown_prop,num_contrib,image_prop,is_education,...,non_exec_prop,exec_inorder,exec_skips,has_error,has_export,num_functions,has_test,num_headers,has_param,num_stars
14,T,F,F,T,med,high,med,lower,low,T,...,low,lower,higher,F,F,higher,F,low,F,lower
18,T,F,F,T,med,med,high,lower,low,F,...,low,higher,higher,F,F,lower,F,med,F,lower
25,T,F,F,T,med,high,med,lower,low,T,...,low,lower,higher,F,F,higher,F,low,F,lower
38,F,T,F,F,high,med,med,lower,med,F,...,low,higher,higher,F,F,lower,F,high,F,lower
45,F,F,F,F,high,med,med,higher,med,T,...,low,higher,higher,F,F,higher,F,med,F,lower


In [30]:
# one-hot encode the clusters, each cluster dataframe encoded on its own
md_clusters_onehot = [pd.get_dummies(cluster_df) for cluster_df in md_clusters_onehot]

In [31]:
# check the data: preview the one-hot encoded columns for the first cluster (index 0)
md_clusters_onehot[0].head()

Unnamed: 0,longer_beginning_F,longer_beginning_T,longer_ending_F,longer_ending_T,has_author_F,has_author_T,has_equation_F,has_equation_T,jupyter_prop_high,jupyter_prop_low,...,num_functions_lower,has_test_F,has_test_T,num_headers_high,num_headers_low,num_headers_med,has_param_F,has_param_T,num_stars_higher,num_stars_lower
14,0,1,1,0,1,0,0,1,0,0,...,0,1,0,0,1,0,1,0,0,1
18,0,1,1,0,1,0,0,1,0,0,...,1,1,0,0,0,1,1,0,0,1
25,0,1,1,0,1,0,0,1,0,0,...,0,1,0,0,1,0,1,0,0,1
38,1,0,0,1,1,0,1,0,1,0,...,1,1,0,1,0,0,1,0,0,1
45,1,0,1,0,1,0,1,0,1,0,...,0,1,0,0,0,1,1,0,0,1


## No Markdown Cells

In [32]:
# create copies that we will one-hot encode, so the dataframes in
# no_md_clusters stay as they are
no_md_clusters_onehot = [cluster_df.copy() for cluster_df in no_md_clusters]

In [33]:
# change True/False to T/F and medium to med in every cluster dataframe
# (three replacements applied in the same order: True, False, 'medium')
no_md_clusters_onehot = [
    cluster_df.replace(True, 'T').replace(False, 'F').replace('medium', 'med')
    for cluster_df in no_md_clusters_onehot
]

In [34]:
# check the data: preview the relabelled values for the first cluster (index 0)
no_md_clusters_onehot[0].head()

Unnamed: 0,has_author,jupyter_prop,output_cell_prop,num_contrib,image_prop,is_education,has_comments,num_commits,non_exec_prop,exec_inorder,exec_skips,has_error,has_export,num_functions,has_test,has_param,num_stars
81,F,med,high,lower,low,F,T,lower,lower,higher,higher,F,F,higher,F,F,lower
82,F,med,high,lower,low,F,T,lower,lower,higher,higher,F,F,higher,F,F,lower
88,F,med,high,lower,high,F,T,lower,lower,higher,higher,F,F,lower,F,F,lower
150,F,high,med,lower,low,F,T,lower,lower,lower,higher,F,T,higher,F,F,lower
249,F,high,low,lower,low,F,T,lower,lower,lower,higher,F,T,lower,F,F,lower


In [35]:
# one-hot encode the clusters, each cluster dataframe encoded on its own
no_md_clusters_onehot = [pd.get_dummies(cluster_df) for cluster_df in no_md_clusters_onehot]

In [36]:
# check the data: preview the one-hot encoded columns for the first cluster (index 0)
no_md_clusters_onehot[0].head()

Unnamed: 0,has_author_F,jupyter_prop_high,jupyter_prop_med,output_cell_prop_high,output_cell_prop_low,output_cell_prop_med,num_contrib_lower,image_prop_high,image_prop_low,image_prop_med,...,exec_skips_higher,has_error_F,has_error_T,has_export_F,has_export_T,num_functions_higher,num_functions_lower,has_test_F,has_param_F,num_stars_lower
81,1,0,1,1,0,0,1,0,1,0,...,1,1,0,1,0,1,0,1,1,1
82,1,0,1,1,0,0,1,0,1,0,...,1,1,0,1,0,1,0,1,1,1
88,1,0,1,1,0,0,1,1,0,0,...,1,1,0,1,0,0,1,1,1,1
150,1,1,0,0,0,1,1,0,1,0,...,1,1,0,0,1,1,0,1,1,1
249,1,1,0,0,1,0,1,0,1,0,...,1,1,0,0,1,0,1,1,1,1


# Performing the Apriori Algorithm

## Markdown Cells

In [37]:
# change to True/False instead of 1/0 by casting every indicator column to
# bool directly. This avoids chaining replace(1, True) / replace(0, False),
# which depends on how ints and bools compare and leaves a mixed-type
# intermediate; the cast is a no-op when the columns are already boolean.
md_clusters_onehot = [cluster_df.astype(bool) for cluster_df in md_clusters_onehot]

In [53]:
# perform the apriori algorithm on each cluster dataframe, with a minimum
# support of 0.8 and column names used as the item labels
md_cluster_itemsets = [
    apriori(cluster_df, min_support=0.8, use_colnames=True)
    for cluster_df in md_clusters_onehot
]

In [54]:
# look at an itemset table: the apriori result for the first cluster (index 0)
md_cluster_itemsets[0]

Unnamed: 0,support,itemsets
0,0.946903,(has_author_F)
1,0.964602,(num_contrib_lower)
2,0.991150,(has_comments_T)
3,0.946903,(non_exec_prop_low)
4,1.000000,(exec_skips_higher)
...,...,...
230,0.805310,"(has_param_F, num_contrib_lower, non_exec_prop..."
231,0.823009,"(has_param_F, non_exec_prop_low, num_stars_low..."
232,0.805310,"(has_comments_T, has_param_F, num_contrib_lowe..."
233,0.814159,"(has_comments_T, has_param_F, num_contrib_lowe..."


In [55]:
# check counts: number of itemset rows found for each cluster, in cluster order
for frequent_sets in md_cluster_itemsets:
    n_itemsets = frequent_sets.shape[0]
    print(n_itemsets)

235
3527
116
233


In [56]:
# add a 'length' column that counts the number of elements in each itemset;
# the column is written onto the dataframes already held in the list
for frequent_sets in md_cluster_itemsets:
    frequent_sets['length'] = frequent_sets['itemsets'].apply(len)

In [57]:
# check the itemset table for the first cluster again, now with its 'length' column
md_cluster_itemsets[0]

Unnamed: 0,support,itemsets,length
0,0.946903,(has_author_F),1
1,0.964602,(num_contrib_lower),1
2,0.991150,(has_comments_T),1
3,0.946903,(non_exec_prop_low),1
4,1.000000,(exec_skips_higher),1
...,...,...,...
230,0.805310,"(has_param_F, num_contrib_lower, non_exec_prop...",6
231,0.823009,"(has_param_F, non_exec_prop_low, num_stars_low...",6
232,0.805310,"(has_comments_T, has_param_F, num_contrib_lowe...",7
233,0.814159,"(has_comments_T, has_param_F, num_contrib_lowe...",7


## No Markdown Cells

In [58]:
# change to True/False instead of 1/0 by casting every indicator column to
# bool directly. This avoids chaining replace(1, True) / replace(0, False),
# which depends on how ints and bools compare and leaves a mixed-type
# intermediate; the cast is a no-op when the columns are already boolean.
no_md_clusters_onehot = [cluster_df.astype(bool) for cluster_df in no_md_clusters_onehot]

In [62]:
# perform the apriori algorithm on each cluster dataframe, with a minimum
# support of 0.8 and column names used as the item labels
no_md_cluster_itemsets = [
    apriori(cluster_df, min_support=0.8, use_colnames=True)
    for cluster_df in no_md_clusters_onehot
]

In [63]:
# look at an itemset table: the apriori result for the first cluster (index 0)
no_md_cluster_itemsets[0]

Unnamed: 0,support,itemsets
0,1.000000,(has_author_F)
1,1.000000,(num_contrib_lower)
2,0.933333,(is_education_F)
3,1.000000,(has_comments_T)
4,1.000000,(non_exec_prop_lower)
...,...,...
1274,0.800000,"(non_exec_prop_lower, has_param_F, num_contrib..."
1275,0.800000,"(non_exec_prop_lower, has_param_F, num_contrib..."
1276,0.800000,"(non_exec_prop_lower, has_param_F, exec_skips_..."
1277,0.800000,"(non_exec_prop_lower, has_param_F, num_contrib..."


In [64]:
# check counts: number of itemset rows found for each cluster, in cluster order
for frequent_sets in no_md_cluster_itemsets:
    n_itemsets = frequent_sets.shape[0]
    print(n_itemsets)

1279
261
639
255


In [65]:
# add a 'length' column that records the number of elements in each itemset;
# the column is written onto the dataframes already held in the list
for frequent_sets in no_md_cluster_itemsets:
    frequent_sets['length'] = frequent_sets['itemsets'].apply(len)

In [66]:
# check the itemset table for the first cluster again, now with its 'length' column
no_md_cluster_itemsets[0]

Unnamed: 0,support,itemsets,length
0,1.000000,(has_author_F),1
1,1.000000,(num_contrib_lower),1
2,0.933333,(is_education_F),1
3,1.000000,(has_comments_T),1
4,1.000000,(non_exec_prop_lower),1
...,...,...,...
1274,0.800000,"(non_exec_prop_lower, has_param_F, num_contrib...",9
1275,0.800000,"(non_exec_prop_lower, has_param_F, num_contrib...",9
1276,0.800000,"(non_exec_prop_lower, has_param_F, exec_skips_...",9
1277,0.800000,"(non_exec_prop_lower, has_param_F, num_contrib...",9


# Extracting Association Rules 

## Markdown Cells

In [70]:
# use itemsets to extract association rules for each cluster, keeping rules
# whose confidence meets the 0.95 threshold
md_cluster_rules = [
    association_rules(frequent_sets, metric='confidence', min_threshold=0.95)
    for frequent_sets in md_cluster_itemsets
]

In [71]:
# look at the sizes of rules: number of rule rows per cluster, in cluster order
for rule_set in md_cluster_rules:
    n_rules = rule_set.shape[0]
    print(n_rules)

1163
105934
238
533


I kept raising the threshold to shrink the first rule set, but this is as small as I could get it.

In [72]:
# add columns that hold the number of items on each side of every rule;
# the columns are written onto the dataframes already held in the list
for rule_set in md_cluster_rules:
    rule_set['antecedent_len'] = rule_set['antecedents'].apply(len)
    rule_set['consequent_len'] = rule_set['consequents'].apply(len)

## No Markdown Cells

In [73]:
# use itemsets to extract association rules for each cluster, keeping rules
# whose confidence meets the 0.95 threshold
no_md_cluster_rules = [
    association_rules(frequent_sets, metric='confidence', min_threshold=0.95)
    for frequent_sets in no_md_cluster_itemsets
]

In [74]:
# look at the sizes of rules: number of rule rows per cluster, in cluster order
for rule_set in no_md_cluster_rules:
    n_rules = rule_set.shape[0]
    print(n_rules)

37831
1202
7316
3991


In [75]:
# add columns that hold the number of items on each side of every rule;
# the columns are written onto the dataframes already held in the list
for rule_set in no_md_cluster_rules:
    rule_set['antecedent_len'] = rule_set['antecedents'].apply(len)
    rule_set['consequent_len'] = rule_set['consequents'].apply(len)

# Exporting Results

In [76]:
# export itemsets for markdown cell group: one pickle and one CSV per cluster,
# named by the cluster's position in the list
from pathlib import Path

md_itemsets_dir = Path('md-rule-learning')
md_itemsets_csv_dir = md_itemsets_dir / 'csv-files'
# make sure both output folders exist before writing (creating the nested
# csv-files folder also creates its parent)
md_itemsets_csv_dir.mkdir(parents=True, exist_ok=True)

for (i, itemset) in enumerate(md_cluster_itemsets):
    itemset.to_pickle(md_itemsets_dir / f'md_cluster{i}_itemsets.pkl')
    itemset.to_csv(md_itemsets_csv_dir / f'md_cluster{i}_itemsets.csv')

In [77]:
# export association rules for markdown cell group: one pickle and one CSV
# per cluster, named by the cluster's position in the list
from pathlib import Path

md_rules_dir = Path('md-rule-learning')
md_rules_csv_dir = md_rules_dir / 'csv-files'
# make sure both output folders exist before writing (creating the nested
# csv-files folder also creates its parent)
md_rules_csv_dir.mkdir(parents=True, exist_ok=True)

for (i, rule) in enumerate(md_cluster_rules):
    rule.to_pickle(md_rules_dir / f'md_cluster{i}_association_rules.pkl')
    rule.to_csv(md_rules_csv_dir / f'md_cluster{i}_association_rules.csv')

In [78]:
# export itemsets for no markdown cell group: one pickle and one CSV per
# cluster, named by the cluster's position in the list
from pathlib import Path

no_md_itemsets_dir = Path('no-md-rule-learning')
no_md_itemsets_csv_dir = no_md_itemsets_dir / 'csv-files'
# make sure both output folders exist before writing (creating the nested
# csv-files folder also creates its parent)
no_md_itemsets_csv_dir.mkdir(parents=True, exist_ok=True)

for (i, itemset) in enumerate(no_md_cluster_itemsets):
    itemset.to_pickle(no_md_itemsets_dir / f'no_md_cluster{i}_itemsets.pkl')
    itemset.to_csv(no_md_itemsets_csv_dir / f'no_md_cluster{i}_itemsets.csv')

In [79]:
# export association rules for no markdown cell group: one pickle and one CSV
# per cluster, named by the cluster's position in the list
from pathlib import Path

no_md_rules_dir = Path('no-md-rule-learning')
no_md_rules_csv_dir = no_md_rules_dir / 'csv-files'
# make sure both output folders exist before writing (creating the nested
# csv-files folder also creates its parent)
no_md_rules_csv_dir.mkdir(parents=True, exist_ok=True)

for (i, rule) in enumerate(no_md_cluster_rules):
    rule.to_pickle(no_md_rules_dir / f'no_md_cluster{i}_association_rules.pkl')
    rule.to_csv(no_md_rules_csv_dir / f'no_md_cluster{i}_association_rules.csv')