# Imports

In [1]:
import os

import pandas as pd

# Itemsets

## Markdown Cells

### Exporting

In [2]:
# Load the pickled frequent-itemsets table for the "Markdown Cells" group and preview it.
# NOTE(review): unpickling can execute arbitrary code -- only load pickles from a trusted source.
md_itemsets = pd.read_pickle(filepath_or_buffer='md_frequent_itemsets.pkl')
md_itemsets.head(n=5)

Unnamed: 0,support,itemsets,length
0,0.543867,(longer_beginning_T),1
1,0.741161,(longer_ending_F),1
2,0.953296,(has_author_F),1
3,0.806198,(has_equation_F),1
4,0.787429,(jupyter_prop_high),1


In [3]:
# (number of rows, number of columns) of the loaded itemsets table.
md_itemsets.shape

(1267, 3)

In [4]:
# Order the itemsets from highest to lowest support; the original row labels are kept.
sorted_itemsets = md_itemsets.sort_values('support', ascending=False)
sorted_itemsets.head(n=5)

Unnamed: 0,support,itemsets,length
2,0.953296,(has_author_F),1
16,0.95155,(exec_inorder_higher),1
6,0.93409,(num_contrib_lower),1
48,0.9079,"(exec_inorder_higher, has_author_F)",2
10,0.889568,(has_comments_T),1


In [5]:
# Write the support-sorted itemsets to CSV (the row index is written as the first column).
# to_csv does not create missing directories, so make sure the output folder
# exists first; otherwise a run on a fresh checkout fails with an OSError.
os.makedirs('csv-files', exist_ok=True)
sorted_itemsets.to_csv('csv-files/md_frequent_itemsets.csv')

### Analysis

In [6]:
# Rebind md_itemsets to an independent (deep) copy of the support-sorted frame,
# so the analysis cells below see the rows in descending-support order.
md_itemsets = sorted_itemsets.copy(deep=True)

In [7]:
# Count the itemsets whose `length` is 1 and whose support is at least 0.8.
md_is_single_item = md_itemsets['length'].eq(1)
md_is_high_support = md_itemsets['support'].ge(0.8)
print(len(md_itemsets[md_is_single_item & md_is_high_support]))

10


In [8]:
# Show the itemsets whose `length` is 1 and whose support is at least 0.8.
md_frequent_singletons = md_itemsets['length'].eq(1) & md_itemsets['support'].ge(0.8)
print(md_itemsets.loc[md_frequent_singletons, 'itemsets'])

2            (has_author_F)
16    (exec_inorder_higher)
6       (num_contrib_lower)
10         (has_comments_T)
15      (non_exec_prop_low)
18            (has_error_F)
13      (num_commits_lower)
17       (exec_skips_lower)
5       (markdown_prop_med)
3          (has_equation_F)
Name: itemsets, dtype: object


## No Markdown Cells

### Exporting

In [9]:
# Load the pickled frequent-itemsets table for the "No Markdown Cells" group and preview it.
# NOTE(review): unpickling can execute arbitrary code -- only load pickles from a trusted source.
no_md_itemsets = pd.read_pickle(filepath_or_buffer='no_md_frequent_itemsets.pkl')
no_md_itemsets.head(n=5)

Unnamed: 0,support,itemsets,length
0,0.991144,(has_author_F),1
1,0.671587,(jupyter_prop_high),1
2,0.912915,(num_contrib_lower),1
3,0.676015,(image_prop_low),1
4,0.892251,(is_education_F),1


In [10]:
# (number of rows, number of columns) of the loaded itemsets table.
no_md_itemsets.shape

(346, 3)

In [11]:
# Order the itemsets from highest to lowest support; the original row labels are kept.
# Note: this rebinds `sorted_itemsets`, which previously held the markdown-group result.
sorted_itemsets = no_md_itemsets.sort_values('support', ascending=False)
sorted_itemsets.head(n=5)

Unnamed: 0,support,itemsets,length
0,0.991144,(has_author_F),1
7,0.921771,(non_exec_prop_lower),1
17,0.916605,"(has_author_F, non_exec_prop_lower)",2
2,0.912915,(num_contrib_lower),1
12,0.904797,"(has_author_F, num_contrib_lower)",2


In [12]:
# Write the support-sorted itemsets to CSV (the row index is written as the first column).
# to_csv does not create missing directories, so make sure the output folder
# exists first; otherwise a run on a fresh checkout fails with an OSError.
os.makedirs('csv-files', exist_ok=True)
sorted_itemsets.to_csv('csv-files/no_md_frequent_itemsets.csv')

### Analysis

In [13]:
# Rebind no_md_itemsets to an independent (deep) copy of the support-sorted frame,
# so the analysis cells below see the rows in descending-support order.
no_md_itemsets = sorted_itemsets.copy(deep=True)

In [14]:
# Count the itemsets whose `length` is 1 and whose support is at least 0.8.
no_md_is_single_item = no_md_itemsets['length'].eq(1)
no_md_is_high_support = no_md_itemsets['support'].ge(0.8)
print(len(no_md_itemsets[no_md_is_single_item & no_md_is_high_support]))

7


In [15]:
# Show the itemsets whose `length` is 1 and whose support is at least 0.8.
no_md_frequent_singletons = no_md_itemsets['length'].eq(1) & no_md_itemsets['support'].ge(0.8)
print(no_md_itemsets.loc[no_md_frequent_singletons, 'itemsets'])

0            (has_author_F)
7     (non_exec_prop_lower)
2       (num_contrib_lower)
4          (is_education_F)
8     (exec_inorder_higher)
9        (exec_skips_lower)
10            (has_error_F)
Name: itemsets, dtype: object


# Association Rules

## Markdown Cells

In [16]:
# Load the pickled association-rules table for the "Markdown Cells" group and preview it.
# NOTE(review): unpickling can execute arbitrary code -- only load pickles from a trusted source.
md_association_rules = pd.read_pickle(filepath_or_buffer='md_association_rules.pkl')
md_association_rules.head(n=5)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedent_len,consequent_len
0,(longer_beginning_T),(has_author_F),0.543867,0.953296,0.515932,0.948636,0.995112,-0.002534,0.909278,1,1
1,(longer_beginning_T),(num_contrib_lower),0.543867,0.93409,0.50371,0.926164,0.991515,-0.004311,0.892652,1,1
2,(longer_beginning_T),(has_comments_T),0.543867,0.889568,0.503274,0.925361,1.040237,0.019467,1.479553,1,1
3,(longer_beginning_T),(exec_inorder_higher),0.543867,0.95155,0.516368,0.949438,0.997781,-0.001148,0.958242,1,1
4,(longer_ending_F),(has_author_F),0.741161,0.953296,0.713226,0.962309,1.009455,0.00668,1.239129,1,1


In [17]:
# (number of rows, number of columns) of the loaded rules table.
md_association_rules.shape

(16625, 11)

In [18]:
# Order the rules from highest to lowest confidence; the original row labels are kept.
sorted_rules = md_association_rules.sort_values('confidence', ascending=False)
sorted_rules.head(n=5)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedent_len,consequent_len
15950,"(num_commits_lower, exec_inorder_higher, jupyt...",(num_contrib_lower),0.515059,0.93409,0.506766,0.983898,1.053323,0.025654,4.093363,5,1
11323,"(has_comments_T, jupyter_prop_high, non_exec_p...",(num_contrib_lower),0.538629,0.93409,0.5299,0.983793,1.05321,0.026771,4.066652,4,1
13922,"(has_author_F, num_commits_lower, jupyter_prop...",(num_contrib_lower),0.522479,0.93409,0.513749,0.983292,1.052673,0.025707,3.944718,5,1
14914,"(has_author_F, num_commits_lower, exec_skips_l...",(exec_inorder_higher),0.522916,0.95155,0.513749,0.982471,1.032496,0.016169,2.763983,5,1
10194,"(has_author_F, has_comments_T, exec_skips_lowe...",(exec_inorder_higher),0.617634,0.95155,0.606722,0.982332,1.03235,0.019012,2.742296,4,1


In [19]:
# Build the export frame: the confidence-sorted rules without the three
# support columns and without leverage.
columns_to_drop = ['antecedent support', 'consequent support', 'support', 'leverage']
export = sorted_rules.drop(columns=columns_to_drop)
export.head(n=5)

Unnamed: 0,antecedents,consequents,confidence,lift,conviction,antecedent_len,consequent_len
15950,"(num_commits_lower, exec_inorder_higher, jupyt...",(num_contrib_lower),0.983898,1.053323,4.093363,5,1
11323,"(has_comments_T, jupyter_prop_high, non_exec_p...",(num_contrib_lower),0.983793,1.05321,4.066652,4,1
13922,"(has_author_F, num_commits_lower, jupyter_prop...",(num_contrib_lower),0.983292,1.052673,3.944718,5,1
14914,"(has_author_F, num_commits_lower, exec_skips_l...",(exec_inorder_higher),0.982471,1.032496,2.763983,5,1
10194,"(has_author_F, has_comments_T, exec_skips_lowe...",(exec_inorder_higher),0.982332,1.03235,2.742296,4,1


In [20]:
def _itemset_as_text(itemset):
    """Return str(itemset) with every literal 'frozenset' removed."""
    return str(itemset).replace('frozenset', '')


# Turn both itemset columns into plain strings for the CSV export.
# NOTE(review): element order inside the braces is whatever str() of the set
# yields; it is not sorted.
for rule_side in ('antecedents', 'consequents'):
    export[rule_side] = export[rule_side].apply(_itemset_as_text)
export.head(n=5)

Unnamed: 0,antecedents,consequents,confidence,lift,conviction,antecedent_len,consequent_len
15950,"({'num_commits_lower', 'exec_inorder_higher', ...",({'num_contrib_lower'}),0.983898,1.053323,4.093363,5,1
11323,"({'has_comments_T', 'jupyter_prop_high', 'non_...",({'num_contrib_lower'}),0.983793,1.05321,4.066652,4,1
13922,"({'has_author_F', 'num_commits_lower', 'jupyte...",({'num_contrib_lower'}),0.983292,1.052673,3.944718,5,1
14914,"({'has_author_F', 'num_commits_lower', 'exec_s...",({'exec_inorder_higher'}),0.982471,1.032496,2.763983,5,1
10194,"({'has_author_F', 'has_comments_T', 'exec_skip...",({'exec_inorder_higher'}),0.982332,1.03235,2.742296,4,1


In [21]:
# Write the trimmed, confidence-sorted rules to CSV (the row index is written as the first column).
# to_csv does not create missing directories, so make sure the output folder
# exists first; otherwise a run on a fresh checkout fails with an OSError.
os.makedirs('csv-files', exist_ok=True)
export.to_csv('csv-files/md_association_rules.csv')

## No Markdown Cells

In [22]:
# Load the pickled association-rules table for the "No Markdown Cells" group and preview it.
# NOTE(review): unpickling can execute arbitrary code -- only load pickles from a trusted source.
no_md_association_rules = pd.read_pickle(filepath_or_buffer='no_md_association_rules.pkl')
no_md_association_rules.head(n=5)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedent_len,consequent_len
0,(jupyter_prop_high),(has_author_F),0.671587,0.991144,0.669373,0.996703,1.005609,0.003734,2.686347,1,1
1,(num_contrib_lower),(has_author_F),0.912915,0.991144,0.904797,0.991108,0.999963,-3.3e-05,0.995907,1,1
2,(has_author_F),(num_contrib_lower),0.991144,0.912915,0.904797,0.912882,0.999963,-3.3e-05,0.999615,1,1
3,(image_prop_low),(has_author_F),0.676015,0.991144,0.667897,0.987991,0.996819,-0.002131,0.737471,1,1
4,(has_author_F),(is_education_F),0.991144,0.892251,0.883395,0.891288,0.998921,-0.000954,0.991144,1,1


In [23]:
# (number of rows, number of columns) of the loaded rules table.
no_md_association_rules.shape

(2968, 11)

In [24]:
# Order the rules from highest to lowest confidence; the original row labels are kept.
# Note: this rebinds `sorted_rules`, which previously held the markdown-group result.
sorted_rules = no_md_association_rules.sort_values('confidence', ascending=False)
sorted_rules.head(n=5)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedent_len,consequent_len
0,(jupyter_prop_high),(has_author_F),0.671587,0.991144,0.669373,0.996703,1.005609,0.003734,2.686347,1,1
89,"(jupyter_prop_high, non_exec_prop_lower)",(has_author_F),0.622878,0.991144,0.620664,0.996445,1.005349,0.003302,2.491513,2,1
80,"(jupyter_prop_high, num_contrib_lower)",(has_author_F),0.61476,0.991144,0.612546,0.996399,1.005302,0.00323,2.459041,2,1
83,"(is_education_F, jupyter_prop_high)",(has_author_F),0.610332,0.991144,0.608118,0.996372,1.005275,0.003191,2.441328,2,1
92,"(exec_inorder_higher, jupyter_prop_high)",(has_author_F),0.602214,0.991144,0.6,0.996324,1.005226,0.003119,2.408856,2,1


In [25]:
# Build the export frame: the confidence-sorted rules without the three
# support columns and without leverage.
columns_to_drop = ['antecedent support', 'consequent support', 'support', 'leverage']
export = sorted_rules.drop(columns=columns_to_drop)
export.head(n=5)

Unnamed: 0,antecedents,consequents,confidence,lift,conviction,antecedent_len,consequent_len
0,(jupyter_prop_high),(has_author_F),0.996703,1.005609,2.686347,1,1
89,"(jupyter_prop_high, non_exec_prop_lower)",(has_author_F),0.996445,1.005349,2.491513,2,1
80,"(jupyter_prop_high, num_contrib_lower)",(has_author_F),0.996399,1.005302,2.459041,2,1
83,"(is_education_F, jupyter_prop_high)",(has_author_F),0.996372,1.005275,2.441328,2,1
92,"(exec_inorder_higher, jupyter_prop_high)",(has_author_F),0.996324,1.005226,2.408856,2,1


In [26]:
# Turn both itemset columns into plain strings for the CSV export by taking
# str() of each value and removing the literal 'frozenset'.
# NOTE(review): element order inside the braces is whatever str() of the set
# yields; it is not sorted.
for rule_side in ('antecedents', 'consequents'):
    export[rule_side] = export[rule_side].apply(
        lambda itemset: str(itemset).replace('frozenset', '')
    )
export.head(n=5)

Unnamed: 0,antecedents,consequents,confidence,lift,conviction,antecedent_len,consequent_len
0,({'jupyter_prop_high'}),({'has_author_F'}),0.996703,1.005609,2.686347,1,1
89,"({'jupyter_prop_high', 'non_exec_prop_lower'})",({'has_author_F'}),0.996445,1.005349,2.491513,2,1
80,"({'jupyter_prop_high', 'num_contrib_lower'})",({'has_author_F'}),0.996399,1.005302,2.459041,2,1
83,"({'is_education_F', 'jupyter_prop_high'})",({'has_author_F'}),0.996372,1.005275,2.441328,2,1
92,"({'exec_inorder_higher', 'jupyter_prop_high'})",({'has_author_F'}),0.996324,1.005226,2.408856,2,1


In [27]:
# Write the trimmed, confidence-sorted rules to CSV (the row index is written as the first column).
# to_csv does not create missing directories, so make sure the output folder
# exists first; otherwise a run on a fresh checkout fails with an OSError.
os.makedirs('csv-files', exist_ok=True)
export.to_csv('csv-files/no_md_association_rules.csv')