# Imports

In [58]:
import pandas as pd
import numpy
from mlxtend.frequent_patterns import apriori, association_rules

# Loading and Combining the Clustered and Binned Data

## Markdown Cells

In [48]:
# Load the clustering results for the markdown group. The 'cluster' column
# of this frame is copied out below and attached to the binned data.
md_cluster_original = pd.read_csv('markdown_group_clusters.csv')

In [49]:
# Preview the first rows of the clustered data (notebook display only).
md_cluster_original.head()

Unnamed: 0.1,Unnamed: 0,nb_id,longer_beginning,longer_ending,has_author,has_equation,jupyter_prop,output_cell_prop,markdown_prop,num_contrib,...,md_format,non_exec_prop,exec_inorder,exec_skips,has_error,has_export,num_functions,has_test,num_headers,cluster
0,0,594,True,False,False,False,0.507588,1.0,0.044444,3.0,...,True,0.0,1.0,1.809524,False,False,0.0,False,7.0,1
1,1,1222,True,False,False,False,1.0,0.0,0.16129,1.0,...,True,0.961538,1.0,0.0,False,False,0.0,False,5.0,1
2,2,1447,True,False,False,False,0.970851,0.011364,0.375887,1.0,...,True,0.988636,1.0,0.0,False,True,15.0,False,30.0,3
3,3,2705,True,False,False,False,1.0,0.5,0.461538,1.0,...,False,0.0,0.923077,1.615385,False,False,6.0,False,7.0,1
4,4,2861,True,False,False,True,1.0,0.214286,0.461538,1.0,...,False,0.0,1.0,6.333333,False,False,0.0,False,1.0,1


In [50]:
# Keep an independent copy of the cluster labels so they can be attached
# to the binned dataframe further down.
md_cluster_col = md_cluster_original['cluster'].copy()

In [51]:
# Load the binned feature table for the markdown group.
md_binned_original = pd.read_csv('../binning-data/markdown_group_binned.csv')

In [52]:
# Preview the first rows of the binned data (notebook display only).
md_binned_original.head()

Unnamed: 0.1,Unnamed: 0,nb_id,longer_beginning,longer_ending,has_author,has_equation,jupyter_prop,output_cell_prop,markdown_prop,num_contrib,...,num_commits,md_format,non_exec_prop,exec_inorder,exec_skips,has_error,has_export,num_functions,has_test,num_headers
0,0,594,True,False,False,False,medium,high,low,lower,...,lower,True,low,higher,lower,False,False,lower,False,medium
1,1,1222,True,False,False,False,high,low,medium,lower,...,lower,True,high,higher,lower,False,False,lower,False,medium
2,2,1447,True,False,False,False,high,low,medium,lower,...,lower,True,high,higher,lower,False,True,higher,False,high
3,3,2705,True,False,False,False,high,medium,medium,lower,...,lower,False,low,higher,lower,False,False,lower,False,medium
4,4,2861,True,False,False,True,high,low,medium,lower,...,lower,False,low,higher,lower,False,False,lower,False,low


In [53]:
# Build the working dataframe from a copy of the binned data, leaving
# md_binned_original untouched, and discard the 'Unnamed: 0' column.
md_df = md_binned_original.copy().drop(columns=['Unnamed: 0'])

In [9]:
# Sanity check: compare nb_id row by row between the binned and clustered
# frames; every row should report True before the labels are attached.
md_nb_id_matches = md_df['nb_id'] == md_cluster_original['nb_id']
print(md_nb_id_matches.value_counts())

True    2291
Name: nb_id, dtype: int64


In [10]:
# Attach the cluster labels as a new 'cluster' column, then drop nb_id.
md_df = md_df.assign(cluster=md_cluster_col).drop(columns=['nb_id'])

# Preview the combined dataframe.
md_df.head()

Unnamed: 0,longer_beginning,longer_ending,has_author,has_equation,jupyter_prop,output_cell_prop,markdown_prop,num_contrib,image_prop,is_education,...,md_format,non_exec_prop,exec_inorder,exec_skips,has_error,has_export,num_functions,has_test,num_headers,cluster
0,True,False,False,False,medium,high,low,lower,low,True,...,True,low,higher,lower,False,False,lower,False,medium,1
1,True,False,False,False,high,low,medium,lower,low,True,...,True,high,higher,lower,False,False,lower,False,medium,1
2,True,False,False,False,high,low,medium,lower,low,True,...,True,high,higher,lower,False,True,higher,False,high,3
3,True,False,False,False,high,medium,medium,lower,low,True,...,False,low,higher,lower,False,False,lower,False,medium,1
4,True,False,False,True,high,low,medium,lower,low,False,...,False,low,higher,lower,False,False,lower,False,low,1


In [11]:
# Count how many notebooks fall into each cluster label.
md_df['cluster'].value_counts()

1    1713
3     411
2     152
0      15
Name: cluster, dtype: int64

In [12]:
# Split md_df into one independent dataframe per cluster label (0 to 3);
# position k in the list holds the rows labelled k.
md_clusters = [md_df[md_df['cluster'] == label].copy() for label in range(4)]

In [13]:
# Each per-cluster frame should contain exactly one distinct label.
for cluster_df in md_clusters:
    label_counts = cluster_df['cluster'].value_counts()
    print(label_counts)

0    15
Name: cluster, dtype: int64
1    1713
Name: cluster, dtype: int64
2    152
Name: cluster, dtype: int64
3    411
Name: cluster, dtype: int64


In [14]:
# The label is constant within each frame, so remove the 'cluster' column
# from every per-cluster dataframe.
md_clusters = [cluster_df.drop(columns=['cluster']) for cluster_df in md_clusters]

## No Markdown Cells

In [39]:
# Load the clustering results for the no-markdown group. The 'cluster'
# column of this frame is copied out below and attached to the binned data.
no_md_cluster_original = pd.read_csv('no_markdown_group_clusters.csv')

In [40]:
# Preview the first rows of the clustered data (notebook display only).
no_md_cluster_original.head()

Unnamed: 0.1,Unnamed: 0,nb_id,has_author,jupyter_prop,output_cell_prop,num_contrib,image_prop,is_education,has_comments,num_commits,non_exec_prop,exec_inorder,exec_skips,has_error,has_export,num_functions,has_test,cluster
0,0,1589,False,0.71413,0.538462,1.0,0.142857,False,False,1.0,0.0,1.0,1.0,False,True,2.0,False,1
1,1,1919,False,0.99784,0.780488,1.0,0.1875,False,True,1.0,0.04878,0.789474,10.631579,True,True,5.0,False,1
2,2,2857,False,0.121957,0.461538,5.0,0.0,False,True,11.0,0.0,0.909091,6.181818,False,True,0.0,False,1
3,3,4339,False,1.0,0.190476,1.0,0.0,False,True,1.0,0.0,1.0,1.631579,False,False,1.0,False,1
4,4,4659,False,0.940063,0.125,3.0,0.0,False,True,3.0,0.125,1.0,1.0,True,False,3.0,False,1


In [41]:
# Keep an independent copy of the cluster labels so they can be attached
# to the binned dataframe further down.
no_md_cluster_col = no_md_cluster_original['cluster'].copy()

In [42]:
# Load the binned feature table for the no-markdown group.
no_md_binned_original = pd.read_csv('../binning-data/no_markdown_group_binned.csv')

In [43]:
# Preview the first rows of the binned data (notebook display only).
no_md_binned_original.head()

Unnamed: 0.1,Unnamed: 0,nb_id,has_author,jupyter_prop,output_cell_prop,num_contrib,image_prop,is_education,has_comments,num_commits,non_exec_prop,exec_inorder,exec_skips,has_error,has_export,num_functions,has_test
0,0,1589,False,medium,medium,lower,low,False,False,lower,lower,higher,lower,False,True,lower,False
1,1,1919,False,high,high,lower,low,False,True,lower,lower,higher,lower,True,True,lower,False
2,2,2857,False,low,medium,higher,low,False,True,higher,lower,higher,lower,False,True,lower,False
3,3,4339,False,high,low,lower,low,False,True,lower,lower,higher,lower,False,False,lower,False
4,4,4659,False,high,low,lower,low,False,True,higher,lower,higher,lower,True,False,lower,False


In [95]:
# Build the working dataframe from a copy of the binned data, leaving
# no_md_binned_original untouched, and discard the 'Unnamed: 0' column.
no_md_df = no_md_binned_original.copy().drop(columns=['Unnamed: 0'])

When clustering the no-markdown-cell group, we filtered out a notebook that ended up in a cluster by itself. We remove the same notebook from the binned data so that the two datasets stay aligned.

In [96]:
# Remove notebook 588262 from the binned data. reset_index() renumbers the
# rows and keeps the old labels in an 'index' column, which a later cell
# drops together with nb_id.
keep_rows = no_md_df['nb_id'] != 588262
no_md_df = no_md_df[keep_rows].reset_index()

In [97]:
# Sanity check: compare nb_id row by row between the binned and clustered
# frames; every row should report True before the labels are attached.
no_md_nb_id_matches = no_md_df['nb_id'] == no_md_cluster_original['nb_id']
print(no_md_nb_id_matches.value_counts())

True    1353
Name: nb_id, dtype: int64


In [98]:
# Attach the cluster labels as a new 'cluster' column, then drop nb_id and
# the 'index' column left behind by the earlier reset_index().
no_md_df = no_md_df.assign(cluster=no_md_cluster_col).drop(columns=['nb_id', 'index'])

# Preview the combined dataframe.
no_md_df.head()

Unnamed: 0,has_author,jupyter_prop,output_cell_prop,num_contrib,image_prop,is_education,has_comments,num_commits,non_exec_prop,exec_inorder,exec_skips,has_error,has_export,num_functions,has_test,cluster
0,False,medium,medium,lower,low,False,False,lower,lower,higher,lower,False,True,lower,False,1
1,False,high,high,lower,low,False,True,lower,lower,higher,lower,True,True,lower,False,1
2,False,low,medium,higher,low,False,True,higher,lower,higher,lower,False,True,lower,False,1
3,False,high,low,lower,low,False,True,lower,lower,higher,lower,False,False,lower,False,1
4,False,high,low,lower,low,False,True,higher,lower,higher,lower,True,False,lower,False,1


In [99]:
# Count how many notebooks fall into each cluster label.
no_md_df['cluster'].value_counts()

1    1116
2     145
0      73
3      19
Name: cluster, dtype: int64

In [100]:
# Split no_md_df into one independent dataframe per cluster label (0 to 3);
# position k in the list holds the rows labelled k.
no_md_clusters = [no_md_df[no_md_df['cluster'] == label].copy() for label in range(4)]

In [101]:
# Each per-cluster frame should contain exactly one distinct label.
for cluster_df in no_md_clusters:
    label_counts = cluster_df['cluster'].value_counts()
    print(label_counts)

0    73
Name: cluster, dtype: int64
1    1116
Name: cluster, dtype: int64
2    145
Name: cluster, dtype: int64
3    19
Name: cluster, dtype: int64


In [102]:
# The label is constant within each frame, so remove the 'cluster' column
# from every per-cluster dataframe.
no_md_clusters = [cluster_df.drop(columns=['cluster']) for cluster_df in no_md_clusters]

# One-Hot Encoding

## Markdown Cells

In [103]:
# Copy the four per-cluster frames so the one-hot encoding steps below do
# not modify md_clusters.
md_clusters_onehot = [md_clusters[k].copy() for k in range(4)]

In [104]:
# Shorten the category values before encoding:
# True -> 'T', False -> 'F', 'medium' -> 'med'.
md_clusters_onehot = [
    cluster_df.replace(True, 'T').replace(False, 'F').replace('medium', 'med')
    for cluster_df in md_clusters_onehot
]

In [105]:
# Preview cluster 0 after relabelling (notebook display only).
md_clusters_onehot[0].head()

Unnamed: 0,longer_beginning,longer_ending,has_author,has_equation,jupyter_prop,output_cell_prop,markdown_prop,num_contrib,image_prop,is_education,...,num_commits,md_format,non_exec_prop,exec_inorder,exec_skips,has_error,has_export,num_functions,has_test,num_headers
38,F,T,F,F,high,med,med,lower,med,F,...,lower,T,low,higher,higher,F,F,lower,F,high
174,F,F,F,F,high,med,med,lower,med,T,...,lower,F,low,higher,higher,F,F,lower,F,med
261,T,F,F,F,high,high,med,lower,med,F,...,lower,F,low,higher,higher,T,F,higher,F,low
262,T,F,F,F,high,high,med,lower,med,F,...,lower,F,low,higher,higher,T,F,higher,F,low
263,T,F,F,F,high,high,med,lower,med,F,...,lower,F,low,higher,higher,T,F,higher,F,low


In [106]:
# One-hot encode every per-cluster frame: each (column, value) pair becomes
# its own '<column>_<value>' indicator column.
md_clusters_onehot = [pd.get_dummies(cluster_df) for cluster_df in md_clusters_onehot]

In [107]:
# Preview cluster 0 after one-hot encoding (notebook display only).
md_clusters_onehot[0].head()

Unnamed: 0,longer_beginning_F,longer_beginning_T,longer_ending_F,longer_ending_T,has_author_F,has_author_T,has_equation_F,has_equation_T,jupyter_prop_high,jupyter_prop_low,...,has_error_F,has_error_T,has_export_F,has_export_T,num_functions_higher,num_functions_lower,has_test_F,num_headers_high,num_headers_low,num_headers_med
38,1,0,0,1,1,0,1,0,1,0,...,1,0,1,0,0,1,1,1,0,0
174,1,0,1,0,1,0,1,0,1,0,...,1,0,1,0,0,1,1,0,0,1
261,0,1,1,0,1,0,1,0,1,0,...,0,1,1,0,1,0,1,0,1,0
262,0,1,1,0,1,0,1,0,1,0,...,0,1,1,0,1,0,1,0,1,0
263,0,1,1,0,1,0,1,0,1,0,...,0,1,1,0,1,0,1,0,1,0


## No Markdown Cells

In [108]:
# Copy the four per-cluster frames so the one-hot encoding steps below do
# not modify no_md_clusters.
no_md_clusters_onehot = [no_md_clusters[k].copy() for k in range(4)]

In [109]:
# Shorten the category values before encoding:
# True -> 'T', False -> 'F', 'medium' -> 'med'.
no_md_clusters_onehot = [
    cluster_df.replace(True, 'T').replace(False, 'F').replace('medium', 'med')
    for cluster_df in no_md_clusters_onehot
]

In [110]:
# Preview cluster 0 after relabelling (notebook display only).
no_md_clusters_onehot[0].head()

Unnamed: 0,has_author,jupyter_prop,output_cell_prop,num_contrib,image_prop,is_education,has_comments,num_commits,non_exec_prop,exec_inorder,exec_skips,has_error,has_export,num_functions,has_test
6,F,low,med,lower,low,F,T,lower,lower,higher,lower,T,T,higher,F
73,F,high,high,lower,low,F,T,lower,lower,higher,lower,F,F,higher,F
101,F,high,med,lower,low,F,T,higher,lower,higher,lower,F,F,higher,F
133,F,high,med,lower,med,F,T,lower,lower,higher,lower,T,T,higher,F
134,F,high,med,lower,med,F,T,lower,lower,higher,lower,T,T,higher,F


In [111]:
# One-hot encode every per-cluster frame: each (column, value) pair becomes
# its own '<column>_<value>' indicator column.
no_md_clusters_onehot = [pd.get_dummies(cluster_df) for cluster_df in no_md_clusters_onehot]

In [112]:
# Preview cluster 0 after one-hot encoding (notebook display only).
no_md_clusters_onehot[0].head()

Unnamed: 0,has_author_F,has_author_T,jupyter_prop_high,jupyter_prop_low,jupyter_prop_med,output_cell_prop_high,output_cell_prop_low,output_cell_prop_med,num_contrib_higher,num_contrib_lower,...,exec_inorder_higher,exec_inorder_lower,exec_skips_higher,exec_skips_lower,has_error_F,has_error_T,has_export_F,has_export_T,num_functions_higher,has_test_F
6,1,0,0,1,0,0,0,1,0,1,...,1,0,0,1,0,1,0,1,1,1
73,1,0,1,0,0,1,0,0,0,1,...,1,0,0,1,1,0,1,0,1,1
101,1,0,1,0,0,0,0,1,0,1,...,1,0,0,1,1,0,1,0,1,1
133,1,0,1,0,0,0,0,1,0,1,...,1,0,0,1,0,1,0,1,1,1
134,1,0,1,0,0,0,0,1,0,1,...,1,0,0,1,0,1,0,1,1,1


# Performing the Apriori Algorithm

## Markdown Cells

In [113]:
# Convert the indicator columns to a real boolean dtype for apriori.
# astype(bool) maps 1 -> True and 0 -> False in one step and always yields
# bool columns, whereas replacing 1/0 with True/False value by value may
# leave object-dtype columns depending on the pandas version.
# NOTE(review): assumes every column is a 0/1 (or already boolean) indicator
# produced by pd.get_dummies - confirm no other numeric column is present,
# since astype(bool) would turn any non-zero value into True.
md_clusters_onehot = [cluster_df.astype(bool) for cluster_df in md_clusters_onehot]

In [114]:
# Mine frequent itemsets separately for each cluster. Only itemsets with a
# support of at least 0.5 are kept, and items are reported by column name.
md_cluster_itemsets = [
    apriori(cluster_df, min_support = 0.5, use_colnames = True)
    for cluster_df in md_clusters_onehot
]

In [115]:
# Display the frequent itemsets found for cluster 0.
md_cluster_itemsets[0]

Unnamed: 0,support,itemsets
0,0.600000,(longer_beginning_F)
1,0.933333,(longer_ending_F)
2,0.933333,(has_author_F)
3,0.933333,(has_equation_F)
4,0.733333,(jupyter_prop_high)
...,...,...
37370,0.533333,"(longer_ending_F, non_exec_prop_low, has_test_..."
37371,0.533333,"(longer_ending_F, non_exec_prop_low, has_test_..."
37372,0.533333,"(longer_ending_F, non_exec_prop_low, has_test_..."
37373,0.533333,"(non_exec_prop_low, has_test_F, num_contrib_lo..."


In [116]:
# Record the size of every itemset in a new 'length' column. Each frame is
# modified in place, so the list entries need no reassignment.
for frequent_itemsets in md_cluster_itemsets:
    frequent_itemsets['length'] = frequent_itemsets['itemsets'].map(len)

In [117]:
# Display cluster 0's itemsets again, now including the 'length' column.
md_cluster_itemsets[0]

Unnamed: 0,support,itemsets,length
0,0.600000,(longer_beginning_F),1
1,0.933333,(longer_ending_F),1
2,0.933333,(has_author_F),1
3,0.933333,(has_equation_F),1
4,0.733333,(jupyter_prop_high),1
...,...,...,...
37370,0.533333,"(longer_ending_F, non_exec_prop_low, has_test_...",13
37371,0.533333,"(longer_ending_F, non_exec_prop_low, has_test_...",13
37372,0.533333,"(longer_ending_F, non_exec_prop_low, has_test_...",13
37373,0.533333,"(non_exec_prop_low, has_test_F, num_contrib_lo...",13


## No Markdown Cells

In [118]:
# Convert the indicator columns to a real boolean dtype for apriori.
# astype(bool) maps 1 -> True and 0 -> False in one step and always yields
# bool columns, whereas replacing 1/0 with True/False value by value may
# leave object-dtype columns depending on the pandas version.
# NOTE(review): assumes every column is a 0/1 (or already boolean) indicator
# produced by pd.get_dummies - confirm no other numeric column is present,
# since astype(bool) would turn any non-zero value into True.
no_md_clusters_onehot = [cluster_df.astype(bool) for cluster_df in no_md_clusters_onehot]

In [119]:
# Mine frequent itemsets separately for each cluster. Only itemsets with a
# support of at least 0.5 are kept, and items are reported by column name.
no_md_cluster_itemsets = [
    apriori(cluster_df, min_support = 0.5, use_colnames = True)
    for cluster_df in no_md_clusters_onehot
]

In [120]:
# Display the frequent itemsets found for cluster 0.
no_md_cluster_itemsets[0]

Unnamed: 0,support,itemsets
0,0.986301,(has_author_F)
1,0.780822,(jupyter_prop_high)
2,0.520548,(output_cell_prop_med)
3,0.890411,(num_contrib_lower)
4,0.602740,(image_prop_low)
...,...,...
2074,0.506849,"(num_functions_higher, has_test_F, num_contrib..."
2075,0.547945,"(num_functions_higher, has_test_F, num_contrib..."
2076,0.506849,"(num_functions_higher, has_test_F, num_contrib..."
2077,0.506849,"(num_functions_higher, has_test_F, num_contrib..."


In [121]:
# Record the size of every itemset in a new 'length' column. Each frame is
# modified in place, so the list entries need no reassignment.
for frequent_itemsets in no_md_cluster_itemsets:
    frequent_itemsets['length'] = frequent_itemsets['itemsets'].map(len)

In [122]:
# Display cluster 0's itemsets again, now including the 'length' column.
no_md_cluster_itemsets[0]

Unnamed: 0,support,itemsets,length
0,0.986301,(has_author_F),1
1,0.780822,(jupyter_prop_high),1
2,0.520548,(output_cell_prop_med),1
3,0.890411,(num_contrib_lower),1
4,0.602740,(image_prop_low),1
...,...,...,...
2074,0.506849,"(num_functions_higher, has_test_F, num_contrib...",9
2075,0.547945,"(num_functions_higher, has_test_F, num_contrib...",9
2076,0.506849,"(num_functions_higher, has_test_F, num_contrib...",9
2077,0.506849,"(num_functions_higher, has_test_F, num_contrib...",9


# Extracting Association Rules 

## Markdown Cells

In [123]:
# Derive association rules from each cluster's frequent itemsets, keeping
# only rules whose confidence is at least 0.95.
md_cluster_rules = [
    association_rules(frequent_itemsets, metric = 'confidence', min_threshold = 0.95)
    for frequent_itemsets in md_cluster_itemsets
]

In [124]:
# Print the number of rules extracted for each cluster, in cluster order.
for rules_df in md_cluster_rules:
    rule_count = len(rules_df)
    print(rule_count)

2060266
15939
6760
2055


I kept raising the confidence threshold (now 0.95) to shrink the first rule set, but this is as small as I could get it.

In [125]:
# Record how many items sit on each side of every rule. The frames are
# modified in place, so the list entries need no reassignment.
for rules_df in md_cluster_rules:
    rules_df['antecedent_len'] = rules_df['antecedents'].map(len)
    rules_df['consequent_len'] = rules_df['consequents'].map(len)

## No Markdown Cells

In [128]:
# Derive association rules from each cluster's frequent itemsets, keeping
# only rules whose confidence is at least 0.95.
no_md_cluster_rules = [
    association_rules(frequent_itemsets, metric = 'confidence', min_threshold = 0.95)
    for frequent_itemsets in no_md_cluster_itemsets
]

In [129]:
# Print the number of rules extracted for each cluster, in cluster order.
for rules_df in no_md_cluster_rules:
    rule_count = len(rules_df)
    print(rule_count)

9515
4964
5526
19670


In [130]:
# Record how many items sit on each side of every rule. The frames are
# modified in place, so the list entries need no reassignment.
for rules_df in no_md_cluster_rules:
    rules_df['antecedent_len'] = rules_df['antecedents'].map(len)
    rules_df['consequent_len'] = rules_df['consequents'].map(len)

# Exporting Results

In [135]:
# Write each markdown-group cluster's itemsets twice: as a pickle under
# md-rule-learning/ and as a CSV under md-rule-learning/csv-files/.
for idx, frequent_itemsets in enumerate(md_cluster_itemsets):
    stem = f'md_cluster{idx}_itemsets'
    frequent_itemsets.to_pickle(f'md-rule-learning/{stem}.pkl')
    frequent_itemsets.to_csv(f'md-rule-learning/csv-files/{stem}.csv')

In [136]:
# Write each markdown-group cluster's association rules twice: as a pickle
# under md-rule-learning/ and as a CSV under md-rule-learning/csv-files/.
for idx, rules_df in enumerate(md_cluster_rules):
    stem = f'md_cluster{idx}_association_rules'
    rules_df.to_pickle(f'md-rule-learning/{stem}.pkl')
    rules_df.to_csv(f'md-rule-learning/csv-files/{stem}.csv')

In [137]:
# Write each no-markdown-group cluster's itemsets twice: as a pickle under
# no-md-rule-learning/ and as a CSV under no-md-rule-learning/csv-files/.
for idx, frequent_itemsets in enumerate(no_md_cluster_itemsets):
    stem = f'no_md_cluster{idx}_itemsets'
    frequent_itemsets.to_pickle(f'no-md-rule-learning/{stem}.pkl')
    frequent_itemsets.to_csv(f'no-md-rule-learning/csv-files/{stem}.csv')

In [138]:
# Write each no-markdown-group cluster's association rules twice: as a
# pickle under no-md-rule-learning/ and as a CSV under
# no-md-rule-learning/csv-files/.
for idx, rules_df in enumerate(no_md_cluster_rules):
    stem = f'no_md_cluster{idx}_association_rules'
    rules_df.to_pickle(f'no-md-rule-learning/{stem}.pkl')
    rules_df.to_csv(f'no-md-rule-learning/csv-files/{stem}.csv')