# Imports

In [1]:
import pandas as pd
import prince
from mlxtend.frequent_patterns import apriori, association_rules

# Loading the Binned Data

## Markdown Cells

In [2]:
md_filepath = 'binning-data/markdown_group_binned.csv'
md_df = pd.read_csv(md_filepath)

In [3]:
# save the original dataframe
md_original = md_df

In [4]:
# clear the first two columns
md_df = md_df.drop(['Unnamed: 0', 'nb_id'], axis = 1)

# change True and False to T and F
md_df = md_df.replace(True, 'T')
md_df = md_df.replace(False, 'F')

In [5]:
# initial look at the data
md_df.head()

Unnamed: 0,longer_beginning,longer_ending,has_author,has_equation,jupyter_prop,output_cell_prop,markdown_prop,num_contrib,image_prop,is_education,...,non_exec_prop,exec_inorder,exec_skips,has_error,has_export,num_functions,has_test,num_headers,has_param,num_stars
0,T,F,F,F,medium,high,low,lower,low,T,...,low,higher,lower,F,F,lower,F,medium,F,higher
1,T,F,F,F,high,low,medium,lower,low,T,...,high,higher,lower,F,F,lower,F,medium,F,lower
2,T,F,F,F,high,low,medium,lower,low,T,...,high,higher,lower,F,T,higher,F,high,F,lower
3,T,F,F,F,high,medium,medium,lower,low,T,...,low,higher,lower,F,F,lower,F,medium,F,lower
4,T,F,F,T,high,low,medium,lower,low,F,...,low,higher,lower,F,F,lower,F,low,F,lower


In [6]:
# extract the column titles
md_vars = list(md_df)
md_vars

['longer_beginning',
 'longer_ending',
 'has_author',
 'has_equation',
 'jupyter_prop',
 'output_cell_prop',
 'markdown_prop',
 'num_contrib',
 'image_prop',
 'is_education',
 'has_links',
 'has_comments',
 'md_frequency',
 'has_title',
 'num_commits',
 'md_format',
 'non_exec_prop',
 'exec_inorder',
 'exec_skips',
 'has_error',
 'has_export',
 'num_functions',
 'has_test',
 'num_headers',
 'has_param',
 'num_stars']

In [7]:
# check the bin sizes
for var in md_vars:
    print(md_df[var].value_counts())

T    1246
F    1045
Name: longer_beginning, dtype: int64
F    1698
T     593
Name: longer_ending, dtype: int64
F    2184
T     107
Name: has_author, dtype: int64
F    1847
T     444
Name: has_equation, dtype: int64
high      1804
medium     346
low        141
Name: jupyter_prop, dtype: int64
medium    1383
high       529
low        379
Name: output_cell_prop, dtype: int64
medium    1879
high       216
low        196
Name: markdown_prop, dtype: int64
lower     2140
higher     151
Name: num_contrib, dtype: int64
low       1241
medium     873
high       177
Name: image_prop, dtype: int64
T    1321
F     970
Name: is_education, dtype: int64
F    1423
T     868
Name: has_links, dtype: int64
T    2038
F     253
Name: has_comments, dtype: int64
medium    1276
high       660
low        355
Name: md_frequency, dtype: int64
T    1478
F     813
Name: has_title, dtype: int64
lower     1981
higher     310
Name: num_commits, dtype: int64
F    1236
T    1055
Name: md_format, dtype: int64
low       20

## No Markdown Cells

In [8]:
no_md_filepath = 'binning-data/no_markdown_group_binned.csv'
no_md_df = pd.read_csv(no_md_filepath)

In [9]:
# save the original dataframe
no_md_original = no_md_df

In [10]:
# clear the first two columns
no_md_df = no_md_df.drop(['Unnamed: 0', 'nb_id'], axis = 1)

# change True and False to T and F (strings)
no_md_df = no_md_df.replace(True, 'T')
no_md_df = no_md_df.replace(False, 'F')

In [11]:
# initial look at the data
no_md_df.head()

Unnamed: 0,has_author,jupyter_prop,output_cell_prop,num_contrib,image_prop,is_education,has_comments,num_commits,non_exec_prop,exec_inorder,exec_skips,has_error,has_export,num_functions,has_test,has_param,num_stars
0,F,medium,medium,lower,low,F,F,lower,lower,higher,lower,F,T,lower,F,F,lower
1,F,high,high,lower,low,F,T,lower,lower,higher,lower,T,T,lower,F,F,lower
2,F,low,medium,higher,low,F,T,higher,lower,higher,lower,F,T,lower,F,F,lower
3,F,high,low,lower,low,F,T,lower,lower,higher,lower,F,F,lower,F,F,lower
4,F,high,low,lower,low,F,T,higher,lower,higher,lower,T,F,lower,F,F,lower


In [12]:
# extract the column titles
no_md_vars = list(no_md_df)
no_md_vars

['has_author',
 'jupyter_prop',
 'output_cell_prop',
 'num_contrib',
 'image_prop',
 'is_education',
 'has_comments',
 'num_commits',
 'non_exec_prop',
 'exec_inorder',
 'exec_skips',
 'has_error',
 'has_export',
 'num_functions',
 'has_test',
 'has_param',
 'num_stars']

In [13]:
# check the bin counts
for var in no_md_vars:
    print(no_md_df[var].value_counts())

F    1342
T      12
Name: has_author, dtype: int64
high      910
medium    286
low       158
Name: jupyter_prop, dtype: int64
medium    794
low       393
high      167
Name: output_cell_prop, dtype: int64
lower     1236
higher     118
Name: num_contrib, dtype: int64
low       915
medium    281
high      158
Name: image_prop, dtype: int64
F    1208
T     146
Name: is_education, dtype: int64
T    919
F    435
Name: has_comments, dtype: int64
lower     1041
higher     313
Name: num_commits, dtype: int64
lower     1248
higher     106
Name: non_exec_prop, dtype: int64
higher    1204
lower      150
Name: exec_inorder, dtype: int64
lower     1178
higher     176
Name: exec_skips, dtype: int64
F    1138
T     216
Name: has_error, dtype: int64
F    935
T    419
Name: has_export, dtype: int64
lower     1153
higher     201
Name: num_functions, dtype: int64
F    1353
T       1
Name: has_test, dtype: int64
F    1339
T      15
Name: has_param, dtype: int64
lower     1299
higher      55
Name: num_star

# Multiple Correspondence Analysis (MCA)

## Markdown Cells

### Performing MCA

In [14]:
# instantiate MCA object and fit to data
md_mca = prince.MCA(n_components = len(md_vars),
                   n_iter = 10,
                   copy = True,
                   check_input = True,
                   engine = 'auto',
                   random_state = 42)
md_mca_fit = md_mca.fit(md_df)

In [15]:
# put the results into dataframe format
md_mca_df = md_mca_fit.row_coordinates(md_df)

# initial look at the results of the MCA
md_mca_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,0.104761,0.644323,0.307194,-0.625962,-0.007008,0.196116,0.00121,-0.386855,0.333464,0.323209,...,-0.18741,-0.196358,0.23592,0.074124,0.213281,-0.214409,0.088561,0.358738,0.027205,0.120131
1,0.361239,0.424621,-0.475722,0.136874,-0.293836,-0.15299,-0.029207,0.400183,-0.123457,0.097232,...,0.040177,0.035331,0.21979,-0.133837,0.008973,-0.038138,0.137083,0.118699,-0.007119,0.021755
2,0.583285,0.27675,-0.30366,0.431084,-0.459641,0.11845,0.077635,0.063464,-0.133858,-0.337261,...,-0.085321,-0.167363,0.222302,-0.002913,0.061706,0.340651,0.025932,0.14384,0.117948,0.29601
3,0.102819,-0.112899,-0.196639,-0.033113,0.011916,-0.204391,-0.164194,-0.118093,0.028476,0.057855,...,-0.065692,-0.059671,0.161772,-0.103717,-0.138679,-0.13169,-0.083492,-0.051379,0.022478,-0.013527
4,-0.204907,0.087305,-0.339214,-0.038374,-0.134997,-0.004047,0.1246,0.340208,-0.057509,0.10461,...,0.064064,-0.091665,0.042883,-0.074493,-0.211037,0.090563,-0.329692,0.147235,-0.084661,-0.127858


### Analyzing the Results of MCA

In [16]:
# extract the explained variance
md_mca_fit.explained_inertia_

[0.10366066788079904,
 0.06659469494726254,
 0.05596107956999043,
 0.05276296602331989,
 0.04835770447287777,
 0.04191877225012664,
 0.03942795896349528,
 0.03642380121174885,
 0.0345596118775763,
 0.03352209427601275,
 0.03257533597171628,
 0.031186818179125553,
 0.030955186660771263,
 0.029067271697875062,
 0.028039773097052946,
 0.027331483004840142,
 0.026115547410060457,
 0.025150338310286933,
 0.024234609141912933,
 0.02313152366832897,
 0.02251948664559783,
 0.021279688628040187,
 0.02048466257757141,
 0.019972406415316297,
 0.018785385349996072,
 0.01732935638015986]

## No Markdown Cells

### Performing MCA

In [17]:
# instantiate MCA object and fit to data
no_md_mca = prince.MCA(n_components = len(no_md_vars),
                   n_iter = 10,
                   copy = True,
                   check_input = True,
                   engine = 'auto',
                   random_state = 42)
no_md_mca_fit = no_md_mca.fit(no_md_df)

In [18]:
# put the results into dataframe format
no_md_mca_df = no_md_mca_fit.row_coordinates(no_md_df)

# initial look at the results of the MCA
no_md_mca_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,0.071878,-0.041989,0.207998,-0.257161,0.02279,-0.3458,0.007461,0.015444,0.110829,0.07553,0.004104,-0.196289,0.13252,-0.226388,-0.348948,0.061324,0.043172
1,-0.181089,0.156364,0.04056,-0.200757,0.041777,-0.001351,0.002797,0.018711,-0.037327,-0.018664,-0.422003,0.743458,0.114726,0.264213,-0.150586,-0.006942,-0.007778
2,0.241798,-0.063569,-0.139433,0.080374,-0.730477,-0.205654,0.654276,0.265628,0.404117,-0.14173,0.157238,0.210295,-0.092136,-0.161394,0.108582,-0.133529,-0.078609
3,0.266695,-0.044443,-0.149762,0.013133,0.121795,0.020449,-0.113414,0.035554,-0.152233,0.02883,-0.053409,0.075277,-0.060606,0.039541,0.026221,0.053938,-0.245973
4,0.010225,0.12631,-0.223177,0.080633,0.027651,0.143545,0.200565,0.113072,-0.144118,-0.180148,0.001066,0.28912,-0.292884,0.371053,-0.246335,-0.050527,0.16473


### Analyzing the Results of MCA

In [19]:
# extract the explained variance
no_md_mca_fit.explained_inertia_

[0.09267785239908402,
 0.08136544980093442,
 0.06941702809830358,
 0.06325397164577062,
 0.06081185804222062,
 0.056872766139445814,
 0.055193935251100466,
 0.05276650804440892,
 0.04925926168552797,
 0.04845714443492029,
 0.04628625752390116,
 0.044921773837535164,
 0.04182052206901991,
 0.04009612088187051,
 0.03800600134362607,
 0.035749700522665954,
 0.03472077053608834]

# Association Rule Mining

## One-Hot Encoding

We must put our categorical data into a format that can be input into the Apriori algorithm

### Markdown Cells

In [128]:
# create a copy of the dataframe that we will one-hot encode
md_one_hot = md_original.copy()

In [129]:
# drop the first two columns (since we took the original dataframe)
md_one_hot = md_one_hot.drop(['Unnamed: 0', 'nb_id'], axis = 1)

In [130]:
# extract the datatypes so we can isolate the variables that need dummies
list(enumerate(zip(md_vars, md_one_hot.dtypes)))

[(0, ('longer_beginning', dtype('bool'))),
 (1, ('longer_ending', dtype('bool'))),
 (2, ('has_author', dtype('bool'))),
 (3, ('has_equation', dtype('bool'))),
 (4, ('jupyter_prop', dtype('O'))),
 (5, ('markdown_prop', dtype('O'))),
 (6, ('num_contrib', dtype('O'))),
 (7, ('image_prop', dtype('O'))),
 (8, ('is_education', dtype('bool'))),
 (9, ('has_links', dtype('bool'))),
 (10, ('has_comments', dtype('bool'))),
 (11, ('md_frequency', dtype('O'))),
 (12, ('has_title', dtype('bool'))),
 (13, ('num_commits', dtype('O'))),
 (14, ('md_format', dtype('bool'))),
 (15, ('non_exec_prop', dtype('O'))),
 (16, ('exec_inorder', dtype('O'))),
 (17, ('exec_skips', dtype('O'))),
 (18, ('has_error', dtype('bool')))]

In [131]:
# extract t/f vars
md_tf_vars = [md_vars[i] for i in [0, 1, 2, 3, 8, 9, 10, 12, 14, 18]]
md_tf_vars

['longer_beginning',
 'longer_ending',
 'has_author',
 'has_equation',
 'is_education',
 'has_links',
 'has_comments',
 'has_title',
 'md_format',
 'has_error']

In [132]:
# extract non-t/f vars
md_s_vars = [md_vars[i] for i in [4, 5, 6, 7, 11, 13, 15, 16, 17]]
md_s_vars

['jupyter_prop',
 'markdown_prop',
 'num_contrib',
 'image_prop',
 'md_frequency',
 'num_commits',
 'non_exec_prop',
 'exec_inorder',
 'exec_skips']

In [133]:
# extract value counts so we can isolate variables that have 3 bins
for (i, var) in enumerate(md_s_vars):
    print(i, md_one_hot[var].value_counts())

0 high      1804
medium     346
low        141
Name: jupyter_prop, dtype: int64
1 medium    1879
high       216
low        196
Name: markdown_prop, dtype: int64
2 lower     2140
higher     151
Name: num_contrib, dtype: int64
3 low       1241
medium     873
high       177
Name: image_prop, dtype: int64
4 medium    1276
high       660
low        355
Name: md_frequency, dtype: int64
5 lower     1981
higher     310
Name: num_commits, dtype: int64
6 low       2003
high       173
medium     115
Name: non_exec_prop, dtype: int64
7 higher    2180
lower      111
Name: exec_inorder, dtype: int64
8 lower     1925
higher     366
Name: exec_skips, dtype: int64


In [134]:
# extract lower/higher variables
md_lh_vars = [md_s_vars[i] for i in [2, 5, 7, 8]]

In [135]:
# extract low/medium/high variables
md_lmh_vars = [md_s_vars[i] for i in [0, 1, 3, 4, 6]]

In [136]:
# turn t/f vars into 1/0
for tf_var in md_tf_vars:
    md_one_hot[tf_var] = md_one_hot[tf_var].replace(True, 1)
    md_one_hot[tf_var] = md_one_hot[tf_var].replace(False, 0)

In [137]:
# turn lower/higher vars into 1/0
for lh_var in md_lh_vars:
    md_one_hot[lh_var] = md_one_hot[lh_var].replace('higher', 1)
    md_one_hot[lh_var] = md_one_hot[lh_var].replace('lower', 0)

In [138]:
# create dataframe that will hold all of the dummy variables
md_one_hot_cols = pd.get_dummies(md_one_hot[md_lmh_vars])

# initial look at the dummy variables
md_one_hot_cols.head()

Unnamed: 0,jupyter_prop_high,jupyter_prop_low,jupyter_prop_medium,markdown_prop_high,markdown_prop_low,markdown_prop_medium,image_prop_high,image_prop_low,image_prop_medium,md_frequency_high,md_frequency_low,md_frequency_medium,non_exec_prop_high,non_exec_prop_low,non_exec_prop_medium
0,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0
1,1,0,0,0,0,1,0,1,0,0,0,1,1,0,0
2,1,0,0,0,0,1,0,1,0,0,0,1,1,0,0
3,1,0,0,0,0,1,0,1,0,0,0,1,0,1,0
4,1,0,0,0,0,1,0,1,0,0,0,1,0,1,0


In [139]:
# delete the original columns from the one-hot dataframe
md_one_hot = md_one_hot.drop(md_lmh_vars, axis = 1)

# join the one hot dataframe with the dummy dataframe
md_one_hot = pd.concat([md_one_hot, md_one_hot_cols], axis = 1)

In [140]:
# initial look at the one-hot encoded data
md_one_hot.head()

Unnamed: 0,longer_beginning,longer_ending,has_author,has_equation,num_contrib,is_education,has_links,has_comments,has_title,num_commits,...,markdown_prop_medium,image_prop_high,image_prop_low,image_prop_medium,md_frequency_high,md_frequency_low,md_frequency_medium,non_exec_prop_high,non_exec_prop_low,non_exec_prop_medium
0,1.0,0.0,0.0,0.0,0,1.0,1.0,0.0,1.0,0,...,0,0,1,0,0,1,0,0,1,0
1,1.0,0.0,0.0,0.0,0,1.0,1.0,1.0,0.0,0,...,1,0,1,0,0,0,1,1,0,0
2,1.0,0.0,0.0,0.0,0,1.0,1.0,1.0,1.0,0,...,1,0,1,0,0,0,1,1,0,0
3,1.0,0.0,0.0,0.0,0,1.0,1.0,1.0,1.0,0,...,1,0,1,0,0,0,1,0,1,0
4,1.0,0.0,0.0,1.0,0,0.0,0.0,1.0,0.0,0,...,1,0,1,0,0,0,1,0,1,0


In [141]:
# convert everything to an int type
for var in list(md_one_hot):
    md_one_hot[var] = md_one_hot[var].astype(int)

In [142]:
# check the datatypes
md_one_hot.dtypes

longer_beginning        int64
longer_ending           int64
has_author              int64
has_equation            int64
num_contrib             int64
is_education            int64
has_links               int64
has_comments            int64
has_title               int64
num_commits             int64
md_format               int64
exec_inorder            int64
exec_skips              int64
has_error               int64
jupyter_prop_high       int64
jupyter_prop_low        int64
jupyter_prop_medium     int64
markdown_prop_high      int64
markdown_prop_low       int64
markdown_prop_medium    int64
image_prop_high         int64
image_prop_low          int64
image_prop_medium       int64
md_frequency_high       int64
md_frequency_low        int64
md_frequency_medium     int64
non_exec_prop_high      int64
non_exec_prop_low       int64
non_exec_prop_medium    int64
dtype: object

### No Markdown Cells

In [143]:
# copy the original dataframe
no_md_one_hot = no_md_original.copy()

In [144]:
# drop the first two columns
no_md_one_hot = no_md_one_hot.drop(['Unnamed: 0', 'nb_id'], axis = 1)

In [145]:
# extract datatypes
list(enumerate(zip(no_md_vars, no_md_one_hot.dtypes)))

[(0, ('has_author', dtype('bool'))),
 (1, ('jupyter_prop', dtype('O'))),
 (2, ('num_contrib', dtype('O'))),
 (3, ('image_prop', dtype('O'))),
 (4, ('is_education', dtype('bool'))),
 (5, ('has_comments', dtype('bool'))),
 (6, ('num_commits', dtype('O'))),
 (7, ('non_exec_prop', dtype('O'))),
 (8, ('exec_inorder', dtype('O'))),
 (9, ('exec_skips', dtype('O'))),
 (10, ('has_error', dtype('bool')))]

In [146]:
# extract t/f vars
no_md_tf_vars = [no_md_vars[i] for i in [0, 4, 5, 10]]
no_md_tf_vars

['has_author', 'is_education', 'has_comments', 'has_error']

In [147]:
# extract non-tf vars
no_md_s_vars = [no_md_vars[i] for i in [1, 2, 3, 6, 7, 8, 9]]
no_md_s_vars

['jupyter_prop',
 'num_contrib',
 'image_prop',
 'num_commits',
 'non_exec_prop',
 'exec_inorder',
 'exec_skips']

In [148]:
# extract value counts
for (i, var) in enumerate(no_md_s_vars):
    print(i, no_md_one_hot[var].value_counts())

0 high      910
medium    286
low       159
Name: jupyter_prop, dtype: int64
1 lower     1237
higher     118
Name: num_contrib, dtype: int64
2 low       916
medium    281
high      158
Name: image_prop, dtype: int64
3 lower     1042
higher     313
Name: num_commits, dtype: int64
4 lower     1249
higher     106
Name: non_exec_prop, dtype: int64
5 higher    1205
lower      150
Name: exec_inorder, dtype: int64
6 lower     1179
higher     176
Name: exec_skips, dtype: int64


In [149]:
# extract lower/higher vars
no_md_lh_vars = [no_md_s_vars[i] for i in [1, 3, 4, 5, 6]]

In [150]:
# extract low/medium/high vars
no_md_lmh_vars = [no_md_s_vars[i] for i in [0, 2]]

In [151]:
# turn t/f vars into 1/0
for tf_var in no_md_tf_vars:
    no_md_one_hot[tf_var] = no_md_one_hot[tf_var].replace(True, 1)
    no_md_one_hot[tf_var] = no_md_one_hot[tf_var].replace(False, 0)

In [152]:
# turn lower/higher vars into 1/0
for lh_var in no_md_lh_vars:
    no_md_one_hot[lh_var] = no_md_one_hot[lh_var].replace('higher', 1)
    no_md_one_hot[lh_var] = no_md_one_hot[lh_var].replace('lower', 0)

In [153]:
# create dataframe that will hold dummy vars
no_md_one_hot_cols = pd.get_dummies(no_md_one_hot[no_md_lmh_vars])

# initial look at the dummy variables
no_md_one_hot_cols.head()

Unnamed: 0,jupyter_prop_high,jupyter_prop_low,jupyter_prop_medium,image_prop_high,image_prop_low,image_prop_medium
0,0,0,1,0,1,0
1,1,0,0,0,1,0
2,0,1,0,0,1,0
3,1,0,0,0,1,0
4,1,0,0,0,1,0


In [154]:
# delete the original columns from the one-hot dataframe
no_md_one_hot = no_md_one_hot.drop(no_md_lmh_vars, axis = 1)

# join the one hot dataframe w the dummy vars
no_md_one_hot = pd.concat([no_md_one_hot, no_md_one_hot_cols], axis = 1)

In [155]:
# initial look at the one-hot encoded data
no_md_one_hot.head()

Unnamed: 0,has_author,num_contrib,is_education,has_comments,num_commits,non_exec_prop,exec_inorder,exec_skips,has_error,jupyter_prop_high,jupyter_prop_low,jupyter_prop_medium,image_prop_high,image_prop_low,image_prop_medium
0,0.0,0,0.0,0.0,0,0,1,0,0.0,0,0,1,0,1,0
1,0.0,0,0.0,1.0,0,0,1,0,1.0,1,0,0,0,1,0
2,0.0,1,0.0,1.0,1,0,1,0,0.0,0,1,0,0,1,0
3,0.0,0,0.0,1.0,0,0,1,0,0.0,1,0,0,0,1,0
4,0.0,0,0.0,1.0,1,0,1,0,1.0,1,0,0,0,1,0


In [156]:
# convert everything to an int type
for var in list(no_md_one_hot):
    no_md_one_hot[var] = no_md_one_hot[var].astype(int)

In [157]:
# check the datatypes
no_md_one_hot.dtypes

has_author             int64
num_contrib            int64
is_education           int64
has_comments           int64
num_commits            int64
non_exec_prop          int64
exec_inorder           int64
exec_skips             int64
has_error              int64
jupyter_prop_high      int64
jupyter_prop_low       int64
jupyter_prop_medium    int64
image_prop_high        int64
image_prop_low         int64
image_prop_medium      int64
dtype: object

## Performing the Apriori Algorithm

### Markdown Cells

In [158]:
# requires True/False instead of 1/0
md_one_hot = md_one_hot.replace(1, True)
md_one_hot = md_one_hot.replace(0, False)

In [159]:
# initial look at the data
md_one_hot.head()

Unnamed: 0,longer_beginning,longer_ending,has_author,has_equation,num_contrib,is_education,has_links,has_comments,has_title,num_commits,...,markdown_prop_medium,image_prop_high,image_prop_low,image_prop_medium,md_frequency_high,md_frequency_low,md_frequency_medium,non_exec_prop_high,non_exec_prop_low,non_exec_prop_medium
0,True,False,False,False,False,True,True,False,True,False,...,False,False,True,False,False,True,False,False,True,False
1,True,False,False,False,False,True,True,True,False,False,...,True,False,True,False,False,False,True,True,False,False
2,True,False,False,False,False,True,True,True,True,False,...,True,False,True,False,False,False,True,True,False,False
3,True,False,False,False,False,True,True,True,True,False,...,True,False,True,False,False,False,True,False,True,False
4,True,False,False,True,False,False,False,True,False,False,...,True,False,True,False,False,False,True,False,True,False


In [160]:
# perform the apriori algorithm on the dataframe
md_itemsets = apriori(md_one_hot, min_support = 0.5, use_colnames = True)

# look at the itemsets
md_itemsets

Unnamed: 0,support,itemsets
0,0.543867,(longer_beginning)
1,0.576604,(is_education)
2,0.889568,(has_comments)
3,0.645133,(has_title)
4,0.95155,(exec_inorder)
5,0.787429,(jupyter_prop_high)
6,0.820166,(markdown_prop_medium)
7,0.541685,(image_prop_low)
8,0.556962,(md_frequency_medium)
9,0.874291,(non_exec_prop_low)


In [161]:
# add a column that counts the number of elements in the itemset
md_itemsets['length'] = md_itemsets['itemsets'].apply(lambda x : len(x))
md_itemsets

Unnamed: 0,support,itemsets,length
0,0.543867,(longer_beginning),1
1,0.576604,(is_education),1
2,0.889568,(has_comments),1
3,0.645133,(has_title),1
4,0.95155,(exec_inorder),1
5,0.787429,(jupyter_prop_high),1
6,0.820166,(markdown_prop_medium),1
7,0.541685,(image_prop_low),1
8,0.556962,(md_frequency_medium),1
9,0.874291,(non_exec_prop_low),1


In [162]:
# filter down to itemsets that have at least 2 items and at least 0.7 support
md_itemsets[(md_itemsets['length'] >= 2) & (md_itemsets['support'] >= 0.7)]

Unnamed: 0,support,itemsets,length
15,0.848101,"(has_comments, exec_inorder)",2
16,0.700131,"(has_comments, jupyter_prop_high)",2
17,0.736796,"(markdown_prop_medium, has_comments)",2
18,0.774771,"(has_comments, non_exec_prop_low)",2
23,0.750764,"(exec_inorder, jupyter_prop_high)",2
24,0.780882,"(markdown_prop_medium, exec_inorder)",2
27,0.829769,"(exec_inorder, non_exec_prop_low)",2
30,0.72021,"(markdown_prop_medium, non_exec_prop_low)",2
34,0.703623,"(markdown_prop_medium, has_comments, exec_inor...",3
35,0.737233,"(has_comments, exec_inorder, non_exec_prop_low)",3


### No Markdown Cells

In [163]:
# replace 1/0 with True/False
no_md_one_hot = no_md_one_hot.replace(1, True)
no_md_one_hot = no_md_one_hot.replace(0, False)

In [164]:
# initial look at the data
no_md_one_hot.head()

Unnamed: 0,has_author,num_contrib,is_education,has_comments,num_commits,non_exec_prop,exec_inorder,exec_skips,has_error,jupyter_prop_high,jupyter_prop_low,jupyter_prop_medium,image_prop_high,image_prop_low,image_prop_medium
0,False,False,False,False,False,False,True,False,False,False,False,True,False,True,False
1,False,False,False,True,False,False,True,False,True,True,False,False,False,True,False
2,False,True,False,True,True,False,True,False,False,False,True,False,False,True,False
3,False,False,False,True,False,False,True,False,False,True,False,False,False,True,False
4,False,False,False,True,True,False,True,False,True,True,False,False,False,True,False


In [165]:
# perform apriori algorithm on the dataframe
no_md_itemsets = apriori(no_md_one_hot, min_support = 0.5, use_colnames = True)

# look at the itemsets
no_md_itemsets

Unnamed: 0,support,itemsets
0,0.678229,(has_comments)
1,0.889299,(exec_inorder)
2,0.671587,(jupyter_prop_high)
3,0.676015,(image_prop_low)
4,0.597048,"(has_comments, exec_inorder)"
5,0.602214,"(exec_inorder, jupyter_prop_high)"
6,0.61107,"(image_prop_low, exec_inorder)"


Not a lot of itemsets (probably because not a lot of variables), maybe reduce `min_support`?

In [166]:
# add a column that counts the number of elements in the itemset
no_md_itemsets['length'] = no_md_itemsets['itemsets'].apply(lambda x : len(x))
no_md_itemsets

Unnamed: 0,support,itemsets,length
0,0.678229,(has_comments),1
1,0.889299,(exec_inorder),1
2,0.671587,(jupyter_prop_high),1
3,0.676015,(image_prop_low),1
4,0.597048,"(has_comments, exec_inorder)",2
5,0.602214,"(exec_inorder, jupyter_prop_high)",2
6,0.61107,"(image_prop_low, exec_inorder)",2


In [167]:
# filter down to itemsets that have at least 2 items
no_md_itemsets[no_md_itemsets['length'] >= 2]

Unnamed: 0,support,itemsets,length
4,0.597048,"(has_comments, exec_inorder)",2
5,0.602214,"(exec_inorder, jupyter_prop_high)",2
6,0.61107,"(image_prop_low, exec_inorder)",2


## Extracting Association Rules

### Markdown Cells

In [168]:
# use the itemsets to extract association rules
md_association_rules = association_rules(md_itemsets, metric = 'confidence', min_threshold = 0.7)
md_association_rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(longer_beginning),(has_comments),0.543867,0.889568,0.503274,0.925361,1.040237,0.019467,1.479553
1,(longer_beginning),(exec_inorder),0.543867,0.951550,0.516368,0.949438,0.997781,-0.001148,0.958242
2,(is_education),(has_comments),0.576604,0.889568,0.516805,0.896291,1.007557,0.003876,1.064824
3,(is_education),(exec_inorder),0.576604,0.951550,0.548669,0.951552,1.000002,0.000001,1.000048
4,(has_title),(has_comments),0.645133,0.889568,0.574422,0.890392,1.000927,0.000532,1.007523
...,...,...,...,...,...,...,...,...,...
143,"(markdown_prop_medium, exec_inorder)","(non_exec_prop_low, jupyter_prop_high)",0.780882,0.686163,0.552597,0.707658,1.031326,0.016785,1.073526
144,"(markdown_prop_medium, non_exec_prop_low)","(exec_inorder, jupyter_prop_high)",0.720210,0.750764,0.552597,0.767273,1.021989,0.011890,1.070937
145,"(exec_inorder, jupyter_prop_high)","(markdown_prop_medium, non_exec_prop_low)",0.750764,0.720210,0.552597,0.736047,1.021989,0.011890,1.059999
146,"(non_exec_prop_low, jupyter_prop_high)","(markdown_prop_medium, exec_inorder)",0.686163,0.780882,0.552597,0.805344,1.031326,0.016785,1.125667


In [169]:
# add columns that hold length
md_association_rules['antecedent_len'] = md_association_rules['antecedents'].apply(len)
md_association_rules['consequent_len'] = md_association_rules['consequents'].apply(len)

In [170]:
# filter down to rules that have 1 antecedent and 1 consequent
md_association_rules[(md_association_rules['antecedent_len'] == 1) &
                    (md_association_rules['consequent_len'] == 1) &
                    (md_association_rules['confidence'] >= 0.8)]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedent_len,consequent_len
0,(longer_beginning),(has_comments),0.543867,0.889568,0.503274,0.925361,1.040237,0.019467,1.479553,1,1
1,(longer_beginning),(exec_inorder),0.543867,0.95155,0.516368,0.949438,0.997781,-0.001148,0.958242,1,1
2,(is_education),(has_comments),0.576604,0.889568,0.516805,0.896291,1.007557,0.003876,1.064824,1,1
3,(is_education),(exec_inorder),0.576604,0.95155,0.548669,0.951552,1.000002,1e-06,1.000048,1,1
4,(has_title),(has_comments),0.645133,0.889568,0.574422,0.890392,1.000927,0.000532,1.007523,1,1
5,(has_comments),(exec_inorder),0.889568,0.95155,0.848101,0.953386,1.00193,0.001633,1.03939,1,1
6,(exec_inorder),(has_comments),0.95155,0.889568,0.848101,0.891284,1.00193,0.001633,1.015789,1,1
8,(jupyter_prop_high),(has_comments),0.787429,0.889568,0.700131,0.889135,0.999514,-0.000341,0.996098,1,1
9,(markdown_prop_medium),(has_comments),0.820166,0.889568,0.736796,0.89835,1.009873,0.007203,1.086398,1,1
10,(has_comments),(markdown_prop_medium),0.889568,0.820166,0.736796,0.828263,1.009873,0.007203,1.047148,1,1


### No Markdown Cells

In [171]:
# use the itemsets to extract association rules
no_md_association_rules = association_rules(no_md_itemsets, metric = 'confidence', min_threshold = 0.5)
no_md_association_rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(has_comments),(exec_inorder),0.678229,0.889299,0.597048,0.880305,0.989886,-0.0061,0.924857
1,(exec_inorder),(has_comments),0.889299,0.678229,0.597048,0.671369,0.989886,-0.0061,0.979127
2,(exec_inorder),(jupyter_prop_high),0.889299,0.671587,0.602214,0.677178,1.008326,0.004973,1.017321
3,(jupyter_prop_high),(exec_inorder),0.671587,0.889299,0.602214,0.896703,1.008326,0.004973,1.071681
4,(image_prop_low),(exec_inorder),0.676015,0.889299,0.61107,0.90393,1.016453,0.009891,1.152298
5,(exec_inorder),(image_prop_low),0.889299,0.676015,0.61107,0.687137,1.016453,0.009891,1.03555


# Association Rule Mining (with all dummies)

## One Hot Encoding

We create dummy variables for every single variable, instead of those with low/medium/high

### Markdown Cells

In [20]:
# create a copy of the dataframe we will one-hot encode
md_one_hot = md_original.copy()

In [21]:
# drop the first two columns 
md_one_hot = md_one_hot.drop(['Unnamed: 0', 'nb_id'], axis = 1)

In [22]:
# initial look at the data
md_one_hot.head()

Unnamed: 0,longer_beginning,longer_ending,has_author,has_equation,jupyter_prop,output_cell_prop,markdown_prop,num_contrib,image_prop,is_education,...,non_exec_prop,exec_inorder,exec_skips,has_error,has_export,num_functions,has_test,num_headers,has_param,num_stars
0,True,False,False,False,medium,high,low,lower,low,True,...,low,higher,lower,False,False,lower,False,medium,False,higher
1,True,False,False,False,high,low,medium,lower,low,True,...,high,higher,lower,False,False,lower,False,medium,False,lower
2,True,False,False,False,high,low,medium,lower,low,True,...,high,higher,lower,False,True,higher,False,high,False,lower
3,True,False,False,False,high,medium,medium,lower,low,True,...,low,higher,lower,False,False,lower,False,medium,False,lower
4,True,False,False,True,high,low,medium,lower,low,False,...,low,higher,lower,False,False,lower,False,low,False,lower


In [23]:
# change True/False to T/F for simplicity
md_one_hot = md_one_hot.replace(True, 'T')
md_one_hot = md_one_hot.replace(False, 'F')

In [24]:
# change medium to med for simplicity
md_one_hot = md_one_hot.replace('medium', 'med')

In [25]:
# check the data again
md_one_hot.head()

Unnamed: 0,longer_beginning,longer_ending,has_author,has_equation,jupyter_prop,output_cell_prop,markdown_prop,num_contrib,image_prop,is_education,...,non_exec_prop,exec_inorder,exec_skips,has_error,has_export,num_functions,has_test,num_headers,has_param,num_stars
0,T,F,F,F,med,high,low,lower,low,T,...,low,higher,lower,F,F,lower,F,med,F,higher
1,T,F,F,F,high,low,med,lower,low,T,...,high,higher,lower,F,F,lower,F,med,F,lower
2,T,F,F,F,high,low,med,lower,low,T,...,high,higher,lower,F,T,higher,F,high,F,lower
3,T,F,F,F,high,med,med,lower,low,T,...,low,higher,lower,F,F,lower,F,med,F,lower
4,T,F,F,T,high,low,med,lower,low,F,...,low,higher,lower,F,F,lower,F,low,F,lower


In [26]:
# create dummy variables for all variables
md_one_hot = pd.get_dummies(md_one_hot)

In [27]:
# look at the one-hot encoded data
md_one_hot.head()

Unnamed: 0,longer_beginning_F,longer_beginning_T,longer_ending_F,longer_ending_T,has_author_F,has_author_T,has_equation_F,has_equation_T,jupyter_prop_high,jupyter_prop_low,...,num_functions_lower,has_test_F,has_test_T,num_headers_high,num_headers_low,num_headers_med,has_param_F,has_param_T,num_stars_higher,num_stars_lower
0,0,1,1,0,1,0,1,0,0,0,...,1,1,0,0,0,1,1,0,1,0
1,0,1,1,0,1,0,1,0,1,0,...,1,1,0,0,0,1,1,0,0,1
2,0,1,1,0,1,0,1,0,1,0,...,0,1,0,1,0,0,1,0,0,1
3,0,1,1,0,1,0,1,0,1,0,...,1,1,0,0,0,1,1,0,0,1
4,0,1,1,0,1,0,0,1,1,0,...,1,1,0,0,1,0,1,0,0,1


In [28]:
# check the fields
md_one_hot.columns

Index(['longer_beginning_F', 'longer_beginning_T', 'longer_ending_F',
       'longer_ending_T', 'has_author_F', 'has_author_T', 'has_equation_F',
       'has_equation_T', 'jupyter_prop_high', 'jupyter_prop_low',
       'jupyter_prop_med', 'output_cell_prop_high', 'output_cell_prop_low',
       'output_cell_prop_med', 'markdown_prop_high', 'markdown_prop_low',
       'markdown_prop_med', 'num_contrib_higher', 'num_contrib_lower',
       'image_prop_high', 'image_prop_low', 'image_prop_med', 'is_education_F',
       'is_education_T', 'has_links_F', 'has_links_T', 'has_comments_F',
       'has_comments_T', 'md_frequency_high', 'md_frequency_low',
       'md_frequency_med', 'has_title_F', 'has_title_T', 'num_commits_higher',
       'num_commits_lower', 'md_format_F', 'md_format_T', 'non_exec_prop_high',
       'non_exec_prop_low', 'non_exec_prop_med', 'exec_inorder_higher',
       'exec_inorder_lower', 'exec_skips_higher', 'exec_skips_lower',
       'has_error_F', 'has_error_T', 'has_expor

### No Markdown Cells

In [30]:
# create a copy of the dataframe we will one-hot encode
no_md_one_hot = no_md_original.copy()

In [31]:
# drop the first two columns
no_md_one_hot = no_md_one_hot.drop(['Unnamed: 0', 'nb_id'], axis = 1)

In [32]:
# initial look at the data
no_md_one_hot.head()

Unnamed: 0,has_author,jupyter_prop,output_cell_prop,num_contrib,image_prop,is_education,has_comments,num_commits,non_exec_prop,exec_inorder,exec_skips,has_error,has_export,num_functions,has_test,has_param,num_stars
0,False,medium,medium,lower,low,False,False,lower,lower,higher,lower,False,True,lower,False,False,lower
1,False,high,high,lower,low,False,True,lower,lower,higher,lower,True,True,lower,False,False,lower
2,False,low,medium,higher,low,False,True,higher,lower,higher,lower,False,True,lower,False,False,lower
3,False,high,low,lower,low,False,True,lower,lower,higher,lower,False,False,lower,False,False,lower
4,False,high,low,lower,low,False,True,higher,lower,higher,lower,True,False,lower,False,False,lower


In [33]:
# change True/False to T/F 
no_md_one_hot = no_md_one_hot.replace(True, 'T')
no_md_one_hot = no_md_one_hot.replace(False, 'F')

# change medium to med
no_md_one_hot = no_md_one_hot.replace('medium', 'med')

In [34]:
# check the data again
no_md_one_hot.head()

Unnamed: 0,has_author,jupyter_prop,output_cell_prop,num_contrib,image_prop,is_education,has_comments,num_commits,non_exec_prop,exec_inorder,exec_skips,has_error,has_export,num_functions,has_test,has_param,num_stars
0,F,med,med,lower,low,F,F,lower,lower,higher,lower,F,T,lower,F,F,lower
1,F,high,high,lower,low,F,T,lower,lower,higher,lower,T,T,lower,F,F,lower
2,F,low,med,higher,low,F,T,higher,lower,higher,lower,F,T,lower,F,F,lower
3,F,high,low,lower,low,F,T,lower,lower,higher,lower,F,F,lower,F,F,lower
4,F,high,low,lower,low,F,T,higher,lower,higher,lower,T,F,lower,F,F,lower


In [35]:
# create all dummy variables
no_md_one_hot = pd.get_dummies(no_md_one_hot)

In [36]:
# look at the one-hot encoded data
no_md_one_hot.head()

Unnamed: 0,has_author_F,has_author_T,jupyter_prop_high,jupyter_prop_low,jupyter_prop_med,output_cell_prop_high,output_cell_prop_low,output_cell_prop_med,num_contrib_higher,num_contrib_lower,...,has_export_F,has_export_T,num_functions_higher,num_functions_lower,has_test_F,has_test_T,has_param_F,has_param_T,num_stars_higher,num_stars_lower
0,1,0,0,0,1,0,0,1,0,1,...,0,1,0,1,1,0,1,0,0,1
1,1,0,1,0,0,1,0,0,0,1,...,0,1,0,1,1,0,1,0,0,1
2,1,0,0,1,0,0,0,1,1,0,...,0,1,0,1,1,0,1,0,0,1
3,1,0,1,0,0,0,1,0,0,1,...,1,0,0,1,1,0,1,0,0,1
4,1,0,1,0,0,0,1,0,0,1,...,1,0,0,1,1,0,1,0,0,1


In [37]:
# check the fields
no_md_one_hot.columns

Index(['has_author_F', 'has_author_T', 'jupyter_prop_high', 'jupyter_prop_low',
       'jupyter_prop_med', 'output_cell_prop_high', 'output_cell_prop_low',
       'output_cell_prop_med', 'num_contrib_higher', 'num_contrib_lower',
       'image_prop_high', 'image_prop_low', 'image_prop_med', 'is_education_F',
       'is_education_T', 'has_comments_F', 'has_comments_T',
       'num_commits_higher', 'num_commits_lower', 'non_exec_prop_higher',
       'non_exec_prop_lower', 'exec_inorder_higher', 'exec_inorder_lower',
       'exec_skips_higher', 'exec_skips_lower', 'has_error_F', 'has_error_T',
       'has_export_F', 'has_export_T', 'num_functions_higher',
       'num_functions_lower', 'has_test_F', 'has_test_T', 'has_param_F',
       'has_param_T', 'num_stars_higher', 'num_stars_lower'],
      dtype='object')

## Performing the Apriori Algorithm

### Markdown Cells

In [38]:
# requires True/False instead of 1/0
md_one_hot = md_one_hot.replace(1, True)
md_one_hot = md_one_hot.replace(0, False)

In [39]:
# initial look at the data
md_one_hot.head()

Unnamed: 0,longer_beginning_F,longer_beginning_T,longer_ending_F,longer_ending_T,has_author_F,has_author_T,has_equation_F,has_equation_T,jupyter_prop_high,jupyter_prop_low,...,num_functions_lower,has_test_F,has_test_T,num_headers_high,num_headers_low,num_headers_med,has_param_F,has_param_T,num_stars_higher,num_stars_lower
0,False,True,True,False,True,False,True,False,False,False,...,True,True,False,False,False,True,True,False,True,False
1,False,True,True,False,True,False,True,False,True,False,...,True,True,False,False,False,True,True,False,False,True
2,False,True,True,False,True,False,True,False,True,False,...,False,True,False,True,False,False,True,False,False,True
3,False,True,True,False,True,False,True,False,True,False,...,True,True,False,False,False,True,True,False,False,True
4,False,True,True,False,True,False,False,True,True,False,...,True,True,False,False,True,False,True,False,False,True


In [40]:
# perform apriori algorithm on the dataframe
md_itemsets = apriori(md_one_hot, min_support = 0.5, use_colnames = True)

# look at the itemsets
md_itemsets

Unnamed: 0,support,itemsets
0,0.543867,(longer_beginning_T)
1,0.741161,(longer_ending_F)
2,0.953296,(has_author_F)
3,0.806198,(has_equation_F)
4,0.787429,(jupyter_prop_high)
...,...,...
10511,0.513313,"(exec_inorder_higher, num_stars_lower, exec_sk..."
10512,0.517678,"(num_contrib_lower, exec_inorder_higher, num_s..."
10513,0.532082,"(num_contrib_lower, exec_inorder_higher, num_s..."
10514,0.528154,"(num_contrib_lower, exec_inorder_higher, num_s..."


In [41]:
# add a column that counts the number of elements in the itemset
md_itemsets['length'] = md_itemsets['itemsets'].apply(len)
md_itemsets

Unnamed: 0,support,itemsets,length
0,0.543867,(longer_beginning_T),1
1,0.741161,(longer_ending_F),1
2,0.953296,(has_author_F),1
3,0.806198,(has_equation_F),1
4,0.787429,(jupyter_prop_high),1
...,...,...,...
10511,0.513313,"(exec_inorder_higher, num_stars_lower, exec_sk...",9
10512,0.517678,"(num_contrib_lower, exec_inorder_higher, num_s...",9
10513,0.532082,"(num_contrib_lower, exec_inorder_higher, num_s...",9
10514,0.528154,"(num_contrib_lower, exec_inorder_higher, num_s...",9


In [43]:
# filter the itemsets
md_itemsets[(md_itemsets['length'] >= 2) &
           (md_itemsets['support'] >= 0.9)]

Unnamed: 0,support,itemsets,length
64,0.9079,"(has_author_F, exec_inorder_higher)",2
69,0.919686,"(has_test_F, has_author_F)",2
71,0.939328,"(has_param_F, has_author_F)",2
142,0.90048,"(has_test_F, num_contrib_lower)",2
144,0.921432,"(has_param_F, num_contrib_lower)",2
215,0.919686,"(has_test_F, exec_inorder_higher)",2
217,0.937145,"(has_param_F, exec_inorder_higher)",2
240,0.951986,"(has_test_F, has_param_F)",2
244,0.918376,"(num_stars_lower, has_param_F)",2
513,0.905718,"(has_test_F, has_param_F, has_author_F)",3


### No Markdown Cells

In [44]:
# replace 1/0 with True/False
no_md_one_hot = no_md_one_hot.replace(1, True)
no_md_one_hot = no_md_one_hot.replace(0, False)

In [45]:
# initial look at the data
no_md_one_hot.head()

Unnamed: 0,has_author_F,has_author_T,jupyter_prop_high,jupyter_prop_low,jupyter_prop_med,output_cell_prop_high,output_cell_prop_low,output_cell_prop_med,num_contrib_higher,num_contrib_lower,...,has_export_F,has_export_T,num_functions_higher,num_functions_lower,has_test_F,has_test_T,has_param_F,has_param_T,num_stars_higher,num_stars_lower
0,True,False,False,False,True,False,False,True,False,True,...,False,True,False,True,True,False,True,False,False,True
1,True,False,True,False,False,True,False,False,False,True,...,False,True,False,True,True,False,True,False,False,True
2,True,False,False,True,False,False,False,True,True,False,...,False,True,False,True,True,False,True,False,False,True
3,True,False,True,False,False,False,True,False,False,True,...,True,False,False,True,True,False,True,False,False,True
4,True,False,True,False,False,False,True,False,False,True,...,True,False,False,True,True,False,True,False,False,True


In [46]:
# perform apriori algorithm on the dataframe
no_md_itemsets = apriori(no_md_one_hot, min_support = 0.5, use_colnames = True)

# look at the itemsets
no_md_itemsets

Unnamed: 0,support,itemsets
0,0.991137,(has_author_F)
1,0.672083,(jupyter_prop_high)
2,0.586411,(output_cell_prop_med)
3,0.912851,(num_contrib_lower)
4,0.675775,(image_prop_low)
...,...,...
4520,0.537666,"(non_exec_prop_lower, is_education_F, exec_ino..."
4521,0.511817,"(non_exec_prop_lower, is_education_F, exec_ino..."
4522,0.507386,"(non_exec_prop_lower, is_education_F, num_star..."
4523,0.513294,"(is_education_F, exec_inorder_higher, num_star..."


In [47]:
# add a column that counts the number of elements in the itemset
no_md_itemsets['length'] = no_md_itemsets['itemsets'].apply(len)
no_md_itemsets

Unnamed: 0,support,itemsets,length
0,0.991137,(has_author_F),1
1,0.672083,(jupyter_prop_high),1
2,0.586411,(output_cell_prop_med),1
3,0.912851,(num_contrib_lower),1
4,0.675775,(image_prop_low),1
...,...,...,...
4520,0.537666,"(non_exec_prop_lower, is_education_F, exec_ino...",9
4521,0.511817,"(non_exec_prop_lower, is_education_F, exec_ino...",9
4522,0.507386,"(non_exec_prop_lower, is_education_F, num_star...",9
4523,0.513294,"(is_education_F, exec_inorder_higher, num_star...",9


In [49]:
# filter the itemsets
no_md_itemsets[(no_md_itemsets['length'] >= 2) &
              (no_md_itemsets['support'] >= 0.9)]

Unnamed: 0,support,itemsets,length
19,0.904727,"(num_contrib_lower, has_author_F)",2
24,0.916544,"(non_exec_prop_lower, has_author_F)",2
30,0.990399,"(has_test_F, has_author_F)",2
31,0.980059,"(has_param_F, has_author_F)",2
32,0.950517,"(num_stars_lower, has_author_F)",2
62,0.912112,"(has_test_F, num_contrib_lower)",2
63,0.901773,"(has_param_F, num_contrib_lower)",2
108,0.921713,"(non_exec_prop_lower, has_test_F)",2
109,0.910635,"(non_exec_prop_lower, has_param_F)",2
136,0.988183,"(has_test_F, has_param_F)",2


## Extracting Association Rules

### Markdown Cells

In [52]:
# use the itemsets to extract association rules
md_association_rules = association_rules(md_itemsets, metric = 'confidence', min_threshold = 0.95)
md_association_rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(longer_beginning_T),(has_test_F),0.543867,0.966390,0.529463,0.973515,1.007373,0.003875,1.269024
1,(longer_beginning_T),(has_param_F),0.543867,0.985596,0.538193,0.989567,1.004029,0.002160,1.380586
2,(longer_beginning_T),(num_stars_lower),0.543867,0.931034,0.520733,0.957464,1.028387,0.014374,1.621340
3,(longer_ending_F),(has_author_F),0.741161,0.953296,0.713226,0.962309,1.009455,0.006680,1.239129
4,(longer_ending_F),(has_test_F),0.741161,0.966390,0.718027,0.968787,1.002480,0.001776,1.076781
...,...,...,...,...,...,...,...,...,...
28878,"(non_exec_prop_low, num_stars_lower, exec_skip...",(exec_inorder_higher),0.525971,0.951550,0.510694,0.970954,1.020393,0.010206,1.668080
28879,"(non_exec_prop_low, exec_inorder_higher, num_s...",(num_contrib_lower),0.528590,0.934090,0.510694,0.966144,1.034316,0.016943,1.946759
28880,"(non_exec_prop_low, exec_inorder_higher, num_s...","(has_test_F, has_param_F)",0.534701,0.951986,0.510694,0.955102,1.003273,0.001666,1.069402
28881,"(non_exec_prop_low, num_stars_lower, exec_skip...","(has_param_F, exec_inorder_higher)",0.531646,0.937145,0.510694,0.960591,1.025018,0.012465,1.594937


In [53]:
# add columns that hold length
md_association_rules['antecedent_len'] = md_association_rules['antecedents'].apply(len)
md_association_rules['consequent_len'] = md_association_rules['consequents'].apply(len)

In [54]:
# filter the rules
md_association_rules[(md_association_rules['antecedent_len'] == 1) &
                    (md_association_rules['consequent_len'] == 1) &
                    (md_association_rules['confidence'] >= 0.95)]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedent_len,consequent_len
0,(longer_beginning_T),(has_test_F),0.543867,0.966390,0.529463,0.973515,1.007373,0.003875,1.269024,1,1
1,(longer_beginning_T),(has_param_F),0.543867,0.985596,0.538193,0.989567,1.004029,0.002160,1.380586,1,1
2,(longer_beginning_T),(num_stars_lower),0.543867,0.931034,0.520733,0.957464,1.028387,0.014374,1.621340,1,1
3,(longer_ending_F),(has_author_F),0.741161,0.953296,0.713226,0.962309,1.009455,0.006680,1.239129,1,1
4,(longer_ending_F),(has_test_F),0.741161,0.966390,0.718027,0.968787,1.002480,0.001776,1.076781,1,1
...,...,...,...,...,...,...,...,...,...,...,...
89,(has_test_F),(has_param_F),0.966390,0.985596,0.951986,0.985095,0.999492,-0.000484,0.966390,1,1
90,(has_param_F),(has_test_F),0.985596,0.966390,0.951986,0.965899,0.999492,-0.000484,0.985596,1,1
91,(num_stars_lower),(has_test_F),0.931034,0.966390,0.897425,0.963901,0.997424,-0.002318,0.931034,1,1
92,(num_headers_med),(has_param_F),0.607595,0.985596,0.595373,0.979885,0.994206,-0.003470,0.716094,1,1


### No Markdown Cells

In [57]:
# use itemsets to extract association rules
no_md_association_rules = association_rules(no_md_itemsets, metric = 'confidence', min_threshold = 0.95)
no_md_association_rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(jupyter_prop_high),(has_author_F),0.672083,0.991137,0.669867,0.996703,1.005616,0.003741,2.688331
1,(output_cell_prop_med),(has_author_F),0.586411,0.991137,0.582718,0.993703,1.002588,0.001504,1.407386
2,(num_contrib_lower),(has_author_F),0.912851,0.991137,0.904727,0.991100,0.999963,-0.000034,0.995837
3,(image_prop_low),(has_author_F),0.675775,0.991137,0.667651,0.987978,0.996813,-0.002135,0.737210
4,(is_education_F),(has_author_F),0.892171,0.991137,0.883309,0.990066,0.998919,-0.000956,0.892171
...,...,...,...,...,...,...,...,...,...
15481,"(non_exec_prop_lower, exec_inorder_higher, num...","(has_param_F, has_author_F)",0.530281,0.980059,0.525111,0.990251,1.010399,0.005404,2.045368
15482,"(non_exec_prop_lower, exec_inorder_higher, num...","(has_test_F, has_author_F)",0.526588,0.990399,0.525111,0.997195,1.006862,0.003579,3.422821
15483,"(non_exec_prop_lower, exec_inorder_higher, num...","(has_test_F, has_param_F)",0.528804,0.988183,0.525111,0.993017,1.004891,0.002556,1.692171
15484,"(non_exec_prop_lower, exec_inorder_higher, num...","(num_stars_lower, has_test_F)",0.552437,0.958641,0.525111,0.950535,0.991544,-0.004478,0.836121


In [58]:
# add columns that hold length
no_md_association_rules['antecedent_len'] = no_md_association_rules['antecedents'].apply(len)
no_md_association_rules['consequent_len'] = no_md_association_rules['consequents'].apply(len)

In [59]:
# filter the rules
no_md_association_rules[(no_md_association_rules['antecedent_len'] == 1) &
                       (no_md_association_rules['consequent_len'] == 1) &
                       (no_md_association_rules['confidence'] >= 0.9)]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedent_len,consequent_len
0,(jupyter_prop_high),(has_author_F),0.672083,0.991137,0.669867,0.996703,1.005616,0.003741,2.688331,1,1
1,(output_cell_prop_med),(has_author_F),0.586411,0.991137,0.582718,0.993703,1.002588,0.001504,1.407386,1,1
2,(num_contrib_lower),(has_author_F),0.912851,0.991137,0.904727,0.991100,0.999963,-0.000034,0.995837,1,1
3,(image_prop_low),(has_author_F),0.675775,0.991137,0.667651,0.987978,0.996813,-0.002135,0.737210,1,1
4,(is_education_F),(has_author_F),0.892171,0.991137,0.883309,0.990066,0.998919,-0.000956,0.892171,1,1
...,...,...,...,...,...,...,...,...,...,...,...
60,(has_param_F),(has_test_F),0.988922,0.999261,0.988183,0.999253,0.999992,-0.000008,0.988922,1,1
61,(num_stars_lower),(has_test_F),0.959380,0.999261,0.958641,0.999230,0.999969,-0.000030,0.959380,1,1
62,(has_test_F),(num_stars_lower),0.999261,0.959380,0.958641,0.959350,0.999969,-0.000030,0.999261,1,1
63,(num_stars_lower),(has_param_F),0.959380,0.988922,0.948301,0.988453,0.999526,-0.000450,0.959380,1,1


# Export Itemsets and Rules

## Markdown Cells

In [60]:
# check the data
md_itemsets.head()

Unnamed: 0,support,itemsets,length
0,0.543867,(longer_beginning_T),1
1,0.741161,(longer_ending_F),1
2,0.953296,(has_author_F),1
3,0.806198,(has_equation_F),1
4,0.787429,(jupyter_prop_high),1


In [61]:
# extract itemsets to a pickle
md_itemsets.to_pickle('rule-mining/md_frequent_itemsets.pkl')

# extract itemsets to a csv 
md_itemsets.to_csv('rule-mining/csv-files/md_frequent_itemsets.csv')

In [62]:
# check the data
md_association_rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedent_len,consequent_len
0,(longer_beginning_T),(has_test_F),0.543867,0.96639,0.529463,0.973515,1.007373,0.003875,1.269024,1,1
1,(longer_beginning_T),(has_param_F),0.543867,0.985596,0.538193,0.989567,1.004029,0.00216,1.380586,1,1
2,(longer_beginning_T),(num_stars_lower),0.543867,0.931034,0.520733,0.957464,1.028387,0.014374,1.62134,1,1
3,(longer_ending_F),(has_author_F),0.741161,0.953296,0.713226,0.962309,1.009455,0.00668,1.239129,1,1
4,(longer_ending_F),(has_test_F),0.741161,0.96639,0.718027,0.968787,1.00248,0.001776,1.076781,1,1


In [63]:
# extract rules to a pickle
md_association_rules.to_pickle('rule-mining/md_association_rules.pkl')

# extract rules to a csv
md_association_rules.to_csv('rule-mining/csv-files/md_association_rules.csv')

## No Markdown Cells

In [64]:
# check the data
no_md_itemsets.head()

Unnamed: 0,support,itemsets,length
0,0.991137,(has_author_F),1
1,0.672083,(jupyter_prop_high),1
2,0.586411,(output_cell_prop_med),1
3,0.912851,(num_contrib_lower),1
4,0.675775,(image_prop_low),1


In [65]:
# extract itemsets to a pickle
no_md_itemsets.to_pickle('rule-mining/no_md_frequent_itemsets.pkl')

# extract itemsets to a csv
no_md_itemsets.to_csv('rule-mining/csv-files/no_md_frequent_itemsets.csv')

In [66]:
# check the data
no_md_association_rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedent_len,consequent_len
0,(jupyter_prop_high),(has_author_F),0.672083,0.991137,0.669867,0.996703,1.005616,0.003741,2.688331,1,1
1,(output_cell_prop_med),(has_author_F),0.586411,0.991137,0.582718,0.993703,1.002588,0.001504,1.407386,1,1
2,(num_contrib_lower),(has_author_F),0.912851,0.991137,0.904727,0.9911,0.999963,-3.4e-05,0.995837,1,1
3,(image_prop_low),(has_author_F),0.675775,0.991137,0.667651,0.987978,0.996813,-0.002135,0.73721,1,1
4,(is_education_F),(has_author_F),0.892171,0.991137,0.883309,0.990066,0.998919,-0.000956,0.892171,1,1


In [67]:
# extract rules to a pickle
no_md_association_rules.to_pickle('rule-mining/no_md_association_rules.pkl')

# extract rules to a csv
no_md_association_rules.to_csv('rule-mining/csv-files/no_md_association_rules.csv')