# Imports

In [1]:
import pandas as pd
import prince
from mlxtend.frequent_patterns import apriori, association_rules

# Loading the Binned Data

## Markdown Cells

In [2]:
# path (relative to the working directory) of the binned data for the markdown group
md_filepath = 'binning-data/markdown_group_binned.csv'
# read the CSV into a dataframe
md_df = pd.read_csv(md_filepath)

In [3]:
# save the original dataframe as an independent snapshot; .copy() ensures that
# no later in-place change to md_df can leak into the saved original
md_original = md_df.copy()

In [4]:
# remove the two identifier columns, then recode booleans as the strings
# 'T' / 'F' in one chained expression
md_df = (
    md_df
    .drop(columns=['Unnamed: 0', 'nb_id'])
    .replace(True, 'T')
    .replace(False, 'F')
)

In [5]:
# initial look at the data: show the first five rows of the recoded frame
md_df.head()

Unnamed: 0,longer_beginning,longer_ending,has_author,has_equation,jupyter_prop,output_cell_prop,markdown_prop,num_contrib,image_prop,is_education,...,num_commits,md_format,non_exec_prop,exec_inorder,exec_skips,has_error,has_export,num_functions,has_test,num_headers
0,T,F,F,F,medium,high,low,lower,low,T,...,lower,T,low,higher,lower,F,F,lower,F,medium
1,T,F,F,F,high,low,medium,lower,low,T,...,lower,T,high,higher,lower,F,F,lower,F,medium
2,T,F,F,F,high,low,medium,lower,low,T,...,lower,T,high,higher,lower,F,T,higher,F,high
3,T,F,F,F,high,medium,medium,lower,low,T,...,lower,F,low,higher,lower,F,F,lower,F,medium
4,T,F,F,T,high,low,medium,lower,low,F,...,lower,F,low,higher,lower,F,F,lower,F,low


In [6]:
# collect the column labels of the cleaned frame as a plain list
md_vars = md_df.columns.tolist()
md_vars

['longer_beginning',
 'longer_ending',
 'has_author',
 'has_equation',
 'jupyter_prop',
 'output_cell_prop',
 'markdown_prop',
 'num_contrib',
 'image_prop',
 'is_education',
 'has_links',
 'has_comments',
 'md_frequency',
 'has_title',
 'num_commits',
 'md_format',
 'non_exec_prop',
 'exec_inorder',
 'exec_skips',
 'has_error',
 'has_export',
 'num_functions',
 'has_test',
 'num_headers']

In [7]:
# check the bin sizes: for every variable, print how many rows fall into each
# of its categories
for var in md_vars:
    print(md_df[var].value_counts())

T    1246
F    1045
Name: longer_beginning, dtype: int64
F    1698
T     593
Name: longer_ending, dtype: int64
F    2184
T     107
Name: has_author, dtype: int64
F    1847
T     444
Name: has_equation, dtype: int64
high      1804
medium     346
low        141
Name: jupyter_prop, dtype: int64
medium    1383
high       529
low        379
Name: output_cell_prop, dtype: int64
medium    1879
high       216
low        196
Name: markdown_prop, dtype: int64
lower     2140
higher     151
Name: num_contrib, dtype: int64
low       1241
medium     873
high       177
Name: image_prop, dtype: int64
T    1321
F     970
Name: is_education, dtype: int64
F    1423
T     868
Name: has_links, dtype: int64
T    2038
F     253
Name: has_comments, dtype: int64
medium    1276
high       660
low        355
Name: md_frequency, dtype: int64
T    1478
F     813
Name: has_title, dtype: int64
lower     1981
higher     310
Name: num_commits, dtype: int64
F    1236
T    1055
Name: md_format, dtype: int64
low       20

## No Markdown Cells

In [8]:
# path (relative to the working directory) of the binned data for the no-markdown group
no_md_filepath = 'binning-data/no_markdown_group_binned.csv'
# read the CSV into a dataframe
no_md_df = pd.read_csv(no_md_filepath)

In [9]:
# save the original dataframe as an independent snapshot; .copy() ensures that
# no later in-place change to no_md_df can leak into the saved original
no_md_original = no_md_df.copy()

In [10]:
# remove the two identifier columns, then recode booleans as the strings
# 'T' / 'F' in one chained expression
no_md_df = (
    no_md_df
    .drop(columns=['Unnamed: 0', 'nb_id'])
    .replace(True, 'T')
    .replace(False, 'F')
)

In [11]:
# initial look at the data: show the first five rows of the recoded frame
no_md_df.head()

Unnamed: 0,has_author,jupyter_prop,output_cell_prop,num_contrib,image_prop,is_education,has_comments,num_commits,non_exec_prop,exec_inorder,exec_skips,has_error,has_export,num_functions,has_test
0,F,medium,medium,lower,low,F,F,lower,lower,higher,lower,F,T,lower,F
1,F,high,high,lower,low,F,T,lower,lower,higher,lower,T,T,lower,F
2,F,low,medium,higher,low,F,T,higher,lower,higher,lower,F,T,lower,F
3,F,high,low,lower,low,F,T,lower,lower,higher,lower,F,F,lower,F
4,F,high,low,lower,low,F,T,higher,lower,higher,lower,T,F,lower,F


In [12]:
# collect the column labels of the cleaned frame as a plain list
no_md_vars = no_md_df.columns.tolist()
no_md_vars

['has_author',
 'jupyter_prop',
 'output_cell_prop',
 'num_contrib',
 'image_prop',
 'is_education',
 'has_comments',
 'num_commits',
 'non_exec_prop',
 'exec_inorder',
 'exec_skips',
 'has_error',
 'has_export',
 'num_functions',
 'has_test']

In [13]:
# check the bin counts: for every variable, print how many rows fall into each
# of its categories
for var in no_md_vars:
    print(no_md_df[var].value_counts())

F    1342
T      12
Name: has_author, dtype: int64
high      910
medium    286
low       158
Name: jupyter_prop, dtype: int64
medium    794
low       393
high      167
Name: output_cell_prop, dtype: int64
lower     1236
higher     118
Name: num_contrib, dtype: int64
low       915
medium    281
high      158
Name: image_prop, dtype: int64
F    1208
T     146
Name: is_education, dtype: int64
T    919
F    435
Name: has_comments, dtype: int64
lower     1041
higher     313
Name: num_commits, dtype: int64
lower     1248
higher     106
Name: non_exec_prop, dtype: int64
higher    1204
lower      150
Name: exec_inorder, dtype: int64
lower     1178
higher     176
Name: exec_skips, dtype: int64
F    1138
T     216
Name: has_error, dtype: int64
F    935
T    419
Name: has_export, dtype: int64
lower     1153
higher     201
Name: num_functions, dtype: int64
F    1353
T       1
Name: has_test, dtype: int64


# Multiple Correspondence Analysis (MCA)

## Markdown Cells

### Performing MCA

In [14]:
# configure an MCA that keeps one component per input variable (random_state
# fixed at 42), then fit it to the recoded markdown-group data
md_mca = prince.MCA(
    n_components=len(md_vars),
    n_iter=10,
    copy=True,
    check_input=True,
    engine='auto',
    random_state=42,
)
md_mca_fit = md_mca.fit(md_df)

In [15]:
# put the results into dataframe format: the coordinates of each row of md_df
# on the fitted MCA components
md_mca_df = md_mca_fit.row_coordinates(md_df)

# initial look at the results of the MCA
md_mca_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,0.085894,0.407556,0.292207,-0.630892,-0.13999,-0.013242,0.090349,-0.310684,0.13838,-0.061571,...,0.237576,-0.212875,0.148948,-0.138957,-0.11388,-0.41175,0.055629,-0.158954,-0.327211,-0.073843
1,0.375768,0.572554,-0.431578,0.083251,-0.130029,-0.154141,0.023085,0.42335,-0.185051,0.009917,...,0.011516,0.13961,0.098499,-0.182442,0.020486,-0.09321,0.133077,0.022438,-0.129524,-0.045917
2,0.608899,0.450343,-0.240972,0.373581,-0.431201,0.056716,0.056419,0.0341,-0.022858,0.345997,...,0.152091,-0.032119,0.328724,-0.029688,0.094009,0.179149,0.164627,-0.185841,-0.120644,0.021495
3,0.109274,-0.087451,-0.209047,-0.030134,0.062675,-0.278073,-0.033848,-0.105245,-0.00509,-0.064212,...,0.086069,-0.024698,0.043007,-0.243954,-0.082701,-0.015256,-0.126657,0.091235,0.020544,0.036561
4,-0.211912,0.157911,-0.334089,-0.057425,-0.08007,0.0786,0.087869,0.366703,-0.068836,-0.157941,...,-0.078166,0.027297,0.076231,-0.181204,-0.011951,0.264857,-0.263027,-0.217114,-0.042865,-0.083393


### Analyzing the Results of MCA

In [16]:
# extract the explained inertia of each component (the MCA counterpart of
# explained variance), one value per fitted component
md_mca_fit.explained_inertia_

[0.110297480389293,
 0.06695957613487369,
 0.05947126885572312,
 0.055780426600956425,
 0.04810467089302917,
 0.042473410385134754,
 0.04166416169389522,
 0.03857831805509997,
 0.03643605501058089,
 0.03489676938002524,
 0.03362323366997474,
 0.032941003447213006,
 0.030940618664617163,
 0.029378163303986536,
 0.028880038970692107,
 0.027184411626224664,
 0.02651926241677011,
 0.025832852343581014,
 0.0244855706531415,
 0.02350963799726628,
 0.022000306705282156,
 0.021796854128485612,
 0.020954403137497406,
 0.019557787216297216]

## No Markdown Cells

### Performing MCA

In [17]:
# configure an MCA that keeps one component per input variable (random_state
# fixed at 42), then fit it to the recoded no-markdown-group data
no_md_mca = prince.MCA(
    n_components=len(no_md_vars),
    n_iter=10,
    copy=True,
    check_input=True,
    engine='auto',
    random_state=42,
)
no_md_mca_fit = no_md_mca.fit(no_md_df)

In [18]:
# put the results into dataframe format: the coordinates of each row of
# no_md_df on the fitted MCA components
no_md_mca_df = no_md_mca_fit.row_coordinates(no_md_df)

# initial look at the results of the MCA
no_md_mca_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,0.072613,-0.042273,0.303299,-0.194919,-0.092127,-0.368078,-0.028675,0.044076,0.108576,0.146116,-0.00921,-0.308557,-0.027019,-0.331987,-0.248639
1,-0.193938,0.164606,0.110198,-0.182384,-0.066001,0.031582,0.061932,0.03999,-0.017043,-0.777202,0.440694,0.305979,0.172416,-0.150989,-0.154622
2,0.25385,-0.075955,-0.360813,-0.091875,0.831681,-0.627923,0.272558,0.259882,-0.406762,-0.096204,-0.019651,0.012453,-0.210745,0.11335,-0.105403
3,0.288713,-0.05214,-0.096386,0.008557,-0.205076,0.121164,0.068893,-0.073697,0.060838,-0.068045,0.017921,0.1272,-0.04476,-0.076487,-0.038316
4,0.015937,0.124354,-0.196104,0.060283,0.050586,0.162846,0.253747,-0.003392,-0.27909,-0.281067,-0.202056,0.367695,0.416719,-0.067264,0.025486


### Analyzing the Results of MCA

In [19]:
# extract the explained inertia of each component (the MCA counterpart of
# explained variance), one value per fitted component
no_md_mca_fit.explained_inertia_

[0.10280194616042125,
 0.09031258672838312,
 0.07263021118485391,
 0.06990227267085264,
 0.0633294491491154,
 0.06311957747985415,
 0.05862260050646977,
 0.05569720473623258,
 0.052206867786633075,
 0.0510001006103156,
 0.047353948858145894,
 0.046266648507304914,
 0.04389788805365344,
 0.04125982276182712,
 0.040605465578059684]

# Association Rule Mining

## One-Hot Encoding

We must convert our categorical data into a one-hot (binary) format that the Apriori algorithm can take as input.

### Markdown Cells

In [128]:
# create a copy of the dataframe that we will one-hot encode, so that
# md_original itself is never modified by the encoding steps
md_one_hot = md_original.copy()

In [129]:
# the copy comes from the original frame, so it still carries the two
# identifier columns; remove them by name
md_one_hot = md_one_hot.drop(columns=['Unnamed: 0', 'nb_id'])

In [130]:
# list every column with its position and dtype so the variables that need
# dummies can be identified; names and dtypes are both read from md_one_hot
# itself, so they cannot be mis-paired (or silently truncated) the way a zip
# against a name list built from another frame can
list(enumerate(md_one_hot.dtypes.items()))

[(0, ('longer_beginning', dtype('bool'))),
 (1, ('longer_ending', dtype('bool'))),
 (2, ('has_author', dtype('bool'))),
 (3, ('has_equation', dtype('bool'))),
 (4, ('jupyter_prop', dtype('O'))),
 (5, ('markdown_prop', dtype('O'))),
 (6, ('num_contrib', dtype('O'))),
 (7, ('image_prop', dtype('O'))),
 (8, ('is_education', dtype('bool'))),
 (9, ('has_links', dtype('bool'))),
 (10, ('has_comments', dtype('bool'))),
 (11, ('md_frequency', dtype('O'))),
 (12, ('has_title', dtype('bool'))),
 (13, ('num_commits', dtype('O'))),
 (14, ('md_format', dtype('bool'))),
 (15, ('non_exec_prop', dtype('O'))),
 (16, ('exec_inorder', dtype('O'))),
 (17, ('exec_skips', dtype('O'))),
 (18, ('has_error', dtype('bool')))]

In [131]:
# extract t/f vars: pick the boolean columns by dtype instead of by hard-coded
# position, so the list stays correct when columns are added or reordered
md_tf_vars = md_one_hot.select_dtypes(include='bool').columns.tolist()
md_tf_vars

['longer_beginning',
 'longer_ending',
 'has_author',
 'has_equation',
 'is_education',
 'has_links',
 'has_comments',
 'has_title',
 'md_format',
 'has_error']

In [132]:
# extract non-t/f vars: every column that is not boolean, selected by dtype
# instead of by hard-coded position
md_s_vars = md_one_hot.select_dtypes(exclude='bool').columns.tolist()
md_s_vars

['jupyter_prop',
 'markdown_prop',
 'num_contrib',
 'image_prop',
 'md_frequency',
 'num_commits',
 'non_exec_prop',
 'exec_inorder',
 'exec_skips']

In [133]:
# extract value counts so we can isolate variables that have 3 bins
# (low/medium/high) from those with 2 (lower/higher); the printed number is
# the variable's position in md_s_vars
for (i, var) in enumerate(md_s_vars):
    print(i, md_one_hot[var].value_counts())

0 high      1804
medium     346
low        141
Name: jupyter_prop, dtype: int64
1 medium    1879
high       216
low        196
Name: markdown_prop, dtype: int64
2 lower     2140
higher     151
Name: num_contrib, dtype: int64
3 low       1241
medium     873
high       177
Name: image_prop, dtype: int64
4 medium    1276
high       660
low        355
Name: md_frequency, dtype: int64
5 lower     1981
higher     310
Name: num_commits, dtype: int64
6 low       2003
high       173
medium     115
Name: non_exec_prop, dtype: int64
7 higher    2180
lower      111
Name: exec_inorder, dtype: int64
8 lower     1925
higher     366
Name: exec_skips, dtype: int64


In [134]:
# extract lower/higher variables: keep the columns whose observed levels are
# all within {'lower', 'higher'}, rather than relying on hard-coded positions
md_lh_vars = [var for var in md_s_vars
              if set(md_one_hot[var].unique()) <= {'lower', 'higher'}]

In [135]:
# extract low/medium/high variables: keep the columns whose observed levels are
# all within {'low', 'medium', 'high'}, rather than relying on hard-coded positions
md_lmh_vars = [var for var in md_s_vars
               if set(md_one_hot[var].unique()) <= {'low', 'medium', 'high'}]

In [136]:
# recode each t/f variable: True becomes 1, then False becomes 0
for tf_var in md_tf_vars:
    md_one_hot[tf_var] = md_one_hot[tf_var].replace(True, 1).replace(False, 0)

In [137]:
# recode each two-level variable: 'higher' becomes 1, then 'lower' becomes 0
for lh_var in md_lh_vars:
    md_one_hot[lh_var] = md_one_hot[lh_var].replace('higher', 1).replace('lower', 0)

In [138]:
# create dataframe that will hold all of the dummy variables: one indicator
# column per (variable, level) pair of the low/medium/high variables
md_one_hot_cols = pd.get_dummies(md_one_hot[md_lmh_vars])

# initial look at the dummy variables
md_one_hot_cols.head()

Unnamed: 0,jupyter_prop_high,jupyter_prop_low,jupyter_prop_medium,markdown_prop_high,markdown_prop_low,markdown_prop_medium,image_prop_high,image_prop_low,image_prop_medium,md_frequency_high,md_frequency_low,md_frequency_medium,non_exec_prop_high,non_exec_prop_low,non_exec_prop_medium
0,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0
1,1,0,0,0,0,1,0,1,0,0,0,1,1,0,0
2,1,0,0,0,0,1,0,1,0,0,0,1,1,0,0
3,1,0,0,0,0,1,0,1,0,0,0,1,0,1,0
4,1,0,0,0,0,1,0,1,0,0,0,1,0,1,0


In [139]:
# swap the original low/medium/high columns for their dummy columns: drop the
# originals and append the dummies side by side
md_one_hot = pd.concat(
    [md_one_hot.drop(columns=md_lmh_vars), md_one_hot_cols],
    axis=1,
)

In [140]:
# initial look at the one-hot encoded data (first five rows)
md_one_hot.head()

Unnamed: 0,longer_beginning,longer_ending,has_author,has_equation,num_contrib,is_education,has_links,has_comments,has_title,num_commits,...,markdown_prop_medium,image_prop_high,image_prop_low,image_prop_medium,md_frequency_high,md_frequency_low,md_frequency_medium,non_exec_prop_high,non_exec_prop_low,non_exec_prop_medium
0,1.0,0.0,0.0,0.0,0,1.0,1.0,0.0,1.0,0,...,0,0,1,0,0,1,0,0,1,0
1,1.0,0.0,0.0,0.0,0,1.0,1.0,1.0,0.0,0,...,1,0,1,0,0,0,1,1,0,0
2,1.0,0.0,0.0,0.0,0,1.0,1.0,1.0,1.0,0,...,1,0,1,0,0,0,1,1,0,0
3,1.0,0.0,0.0,0.0,0,1.0,1.0,1.0,1.0,0,...,1,0,1,0,0,0,1,0,1,0
4,1.0,0.0,0.0,1.0,0,0.0,0.0,1.0,0.0,0,...,1,0,1,0,0,0,1,0,1,0


In [141]:
# convert everything to an int type
for var in list(md_one_hot):
    md_one_hot[var] = md_one_hot[var].astype(int)

In [142]:
# check the datatypes: every column should now be an integer type
md_one_hot.dtypes

longer_beginning        int64
longer_ending           int64
has_author              int64
has_equation            int64
num_contrib             int64
is_education            int64
has_links               int64
has_comments            int64
has_title               int64
num_commits             int64
md_format               int64
exec_inorder            int64
exec_skips              int64
has_error               int64
jupyter_prop_high       int64
jupyter_prop_low        int64
jupyter_prop_medium     int64
markdown_prop_high      int64
markdown_prop_low       int64
markdown_prop_medium    int64
image_prop_high         int64
image_prop_low          int64
image_prop_medium       int64
md_frequency_high       int64
md_frequency_low        int64
md_frequency_medium     int64
non_exec_prop_high      int64
non_exec_prop_low       int64
non_exec_prop_medium    int64
dtype: object

### No Markdown Cells

In [143]:
# copy the original dataframe, so that no_md_original itself is never
# modified by the encoding steps
no_md_one_hot = no_md_original.copy()

In [144]:
# the copy comes from the original frame, so it still carries the two
# identifier columns; remove them by name
no_md_one_hot = no_md_one_hot.drop(columns=['Unnamed: 0', 'nb_id'])

In [145]:
# list every column with its position and dtype; names and dtypes are both
# read from no_md_one_hot itself, so they cannot be mis-paired (or silently
# truncated) the way a zip against a name list built from another frame can
list(enumerate(no_md_one_hot.dtypes.items()))

[(0, ('has_author', dtype('bool'))),
 (1, ('jupyter_prop', dtype('O'))),
 (2, ('num_contrib', dtype('O'))),
 (3, ('image_prop', dtype('O'))),
 (4, ('is_education', dtype('bool'))),
 (5, ('has_comments', dtype('bool'))),
 (6, ('num_commits', dtype('O'))),
 (7, ('non_exec_prop', dtype('O'))),
 (8, ('exec_inorder', dtype('O'))),
 (9, ('exec_skips', dtype('O'))),
 (10, ('has_error', dtype('bool')))]

In [146]:
# extract t/f vars: pick the boolean columns by dtype instead of by hard-coded
# position, so the list stays correct when columns are added or reordered
no_md_tf_vars = no_md_one_hot.select_dtypes(include='bool').columns.tolist()
no_md_tf_vars

['has_author', 'is_education', 'has_comments', 'has_error']

In [147]:
# extract non-tf vars: every column that is not boolean, selected by dtype
# instead of by hard-coded position
no_md_s_vars = no_md_one_hot.select_dtypes(exclude='bool').columns.tolist()
no_md_s_vars

['jupyter_prop',
 'num_contrib',
 'image_prop',
 'num_commits',
 'non_exec_prop',
 'exec_inorder',
 'exec_skips']

In [148]:
# extract value counts so we can tell 3-bin variables (low/medium/high) from
# 2-bin ones (lower/higher); the printed number is the variable's position in
# no_md_s_vars
for (i, var) in enumerate(no_md_s_vars):
    print(i, no_md_one_hot[var].value_counts())

0 high      910
medium    286
low       159
Name: jupyter_prop, dtype: int64
1 lower     1237
higher     118
Name: num_contrib, dtype: int64
2 low       916
medium    281
high      158
Name: image_prop, dtype: int64
3 lower     1042
higher     313
Name: num_commits, dtype: int64
4 lower     1249
higher     106
Name: non_exec_prop, dtype: int64
5 higher    1205
lower      150
Name: exec_inorder, dtype: int64
6 lower     1179
higher     176
Name: exec_skips, dtype: int64


In [149]:
# extract lower/higher vars: keep the columns whose observed levels are all
# within {'lower', 'higher'}, rather than relying on hard-coded positions
no_md_lh_vars = [var for var in no_md_s_vars
                 if set(no_md_one_hot[var].unique()) <= {'lower', 'higher'}]

In [150]:
# extract low/medium/high vars: keep the columns whose observed levels are all
# within {'low', 'medium', 'high'}, rather than relying on hard-coded positions
no_md_lmh_vars = [var for var in no_md_s_vars
                  if set(no_md_one_hot[var].unique()) <= {'low', 'medium', 'high'}]

In [151]:
# recode each t/f variable: True becomes 1, then False becomes 0
for tf_var in no_md_tf_vars:
    no_md_one_hot[tf_var] = no_md_one_hot[tf_var].replace(True, 1).replace(False, 0)

In [152]:
# recode each two-level variable: 'higher' becomes 1, then 'lower' becomes 0
for lh_var in no_md_lh_vars:
    no_md_one_hot[lh_var] = no_md_one_hot[lh_var].replace('higher', 1).replace('lower', 0)

In [153]:
# create dataframe that will hold dummy vars: one indicator column per
# (variable, level) pair of the low/medium/high variables
no_md_one_hot_cols = pd.get_dummies(no_md_one_hot[no_md_lmh_vars])

# initial look at the dummy variables
no_md_one_hot_cols.head()

Unnamed: 0,jupyter_prop_high,jupyter_prop_low,jupyter_prop_medium,image_prop_high,image_prop_low,image_prop_medium
0,0,0,1,0,1,0
1,1,0,0,0,1,0
2,0,1,0,0,1,0
3,1,0,0,0,1,0
4,1,0,0,0,1,0


In [154]:
# swap the original low/medium/high columns for their dummy columns: drop the
# originals and append the dummies side by side
no_md_one_hot = pd.concat(
    [no_md_one_hot.drop(columns=no_md_lmh_vars), no_md_one_hot_cols],
    axis=1,
)

In [155]:
# initial look at the one-hot encoded data (first five rows)
no_md_one_hot.head()

Unnamed: 0,has_author,num_contrib,is_education,has_comments,num_commits,non_exec_prop,exec_inorder,exec_skips,has_error,jupyter_prop_high,jupyter_prop_low,jupyter_prop_medium,image_prop_high,image_prop_low,image_prop_medium
0,0.0,0,0.0,0.0,0,0,1,0,0.0,0,0,1,0,1,0
1,0.0,0,0.0,1.0,0,0,1,0,1.0,1,0,0,0,1,0
2,0.0,1,0.0,1.0,1,0,1,0,0.0,0,1,0,0,1,0
3,0.0,0,0.0,1.0,0,0,1,0,0.0,1,0,0,0,1,0
4,0.0,0,0.0,1.0,1,0,1,0,1.0,1,0,0,0,1,0


In [156]:
# cast every column of the encoded frame to int in a single call
no_md_one_hot = no_md_one_hot.astype(int)

In [157]:
# check the datatypes: every column should now be an integer type
no_md_one_hot.dtypes

has_author             int64
num_contrib            int64
is_education           int64
has_comments           int64
num_commits            int64
non_exec_prop          int64
exec_inorder           int64
exec_skips             int64
has_error              int64
jupyter_prop_high      int64
jupyter_prop_low       int64
jupyter_prop_medium    int64
image_prop_high        int64
image_prop_low         int64
image_prop_medium      int64
dtype: object

## Performing the Apriori Algorithm

### Markdown Cells

In [158]:
# apriori requires True/False instead of 1/0; every column was cast to int
# above, so cast the whole frame to bool directly. This guarantees a real bool
# dtype, whereas value-based replace(1, True) / replace(0, False) may leave
# columns with a non-bool dtype
md_one_hot = md_one_hot.astype(bool)

In [159]:
# initial look at the data after the conversion to True/False
md_one_hot.head()

Unnamed: 0,longer_beginning,longer_ending,has_author,has_equation,num_contrib,is_education,has_links,has_comments,has_title,num_commits,...,markdown_prop_medium,image_prop_high,image_prop_low,image_prop_medium,md_frequency_high,md_frequency_low,md_frequency_medium,non_exec_prop_high,non_exec_prop_low,non_exec_prop_medium
0,True,False,False,False,False,True,True,False,True,False,...,False,False,True,False,False,True,False,False,True,False
1,True,False,False,False,False,True,True,True,False,False,...,True,False,True,False,False,False,True,True,False,False
2,True,False,False,False,False,True,True,True,True,False,...,True,False,True,False,False,False,True,True,False,False
3,True,False,False,False,False,True,True,True,True,False,...,True,False,True,False,False,False,True,False,True,False
4,True,False,False,True,False,False,False,True,False,False,...,True,False,True,False,False,False,True,False,True,False


In [160]:
# mine frequent itemsets with apriori, using a minimum support of 0.5 and
# labelling the items by column name
md_itemsets = apriori(
    md_one_hot,
    min_support=0.5,
    use_colnames=True,
)

# show the itemsets that were found
md_itemsets

Unnamed: 0,support,itemsets
0,0.543867,(longer_beginning)
1,0.576604,(is_education)
2,0.889568,(has_comments)
3,0.645133,(has_title)
4,0.95155,(exec_inorder)
5,0.787429,(jupyter_prop_high)
6,0.820166,(markdown_prop_medium)
7,0.541685,(image_prop_low)
8,0.556962,(md_frequency_medium)
9,0.874291,(non_exec_prop_low)


In [161]:
# record how many items each itemset contains
md_itemsets['length'] = md_itemsets['itemsets'].apply(len)
md_itemsets

Unnamed: 0,support,itemsets,length
0,0.543867,(longer_beginning),1
1,0.576604,(is_education),1
2,0.889568,(has_comments),1
3,0.645133,(has_title),1
4,0.95155,(exec_inorder),1
5,0.787429,(jupyter_prop_high),1
6,0.820166,(markdown_prop_medium),1
7,0.541685,(image_prop_low),1
8,0.556962,(md_frequency_medium),1
9,0.874291,(non_exec_prop_low),1


In [162]:
# keep only itemsets with 2 or more items whose support is 0.7 or higher
md_itemsets[md_itemsets['length'].ge(2) & md_itemsets['support'].ge(0.7)]

Unnamed: 0,support,itemsets,length
15,0.848101,"(has_comments, exec_inorder)",2
16,0.700131,"(has_comments, jupyter_prop_high)",2
17,0.736796,"(markdown_prop_medium, has_comments)",2
18,0.774771,"(has_comments, non_exec_prop_low)",2
23,0.750764,"(exec_inorder, jupyter_prop_high)",2
24,0.780882,"(markdown_prop_medium, exec_inorder)",2
27,0.829769,"(exec_inorder, non_exec_prop_low)",2
30,0.72021,"(markdown_prop_medium, non_exec_prop_low)",2
34,0.703623,"(markdown_prop_medium, has_comments, exec_inor...",3
35,0.737233,"(has_comments, exec_inorder, non_exec_prop_low)",3


### No Markdown Cells

In [163]:
# replace 1/0 with True/False for apriori; every column was cast to int above,
# so cast the whole frame to bool directly. This guarantees a real bool dtype,
# whereas value-based replace(1, True) / replace(0, False) may leave columns
# with a non-bool dtype
no_md_one_hot = no_md_one_hot.astype(bool)

In [164]:
# initial look at the data after the conversion to True/False
no_md_one_hot.head()

Unnamed: 0,has_author,num_contrib,is_education,has_comments,num_commits,non_exec_prop,exec_inorder,exec_skips,has_error,jupyter_prop_high,jupyter_prop_low,jupyter_prop_medium,image_prop_high,image_prop_low,image_prop_medium
0,False,False,False,False,False,False,True,False,False,False,False,True,False,True,False
1,False,False,False,True,False,False,True,False,True,True,False,False,False,True,False
2,False,True,False,True,True,False,True,False,False,False,True,False,False,True,False
3,False,False,False,True,False,False,True,False,False,True,False,False,False,True,False
4,False,False,False,True,True,False,True,False,True,True,False,False,False,True,False


In [165]:
# mine frequent itemsets with apriori, using a minimum support of 0.5 and
# labelling the items by column name
no_md_itemsets = apriori(
    no_md_one_hot,
    min_support=0.5,
    use_colnames=True,
)

# show the itemsets that were found
no_md_itemsets

Unnamed: 0,support,itemsets
0,0.678229,(has_comments)
1,0.889299,(exec_inorder)
2,0.671587,(jupyter_prop_high)
3,0.676015,(image_prop_low)
4,0.597048,"(has_comments, exec_inorder)"
5,0.602214,"(exec_inorder, jupyter_prop_high)"
6,0.61107,"(image_prop_low, exec_inorder)"


Only a few itemsets were found (probably because this group has few variables); consider reducing `min_support`.

In [166]:
# record how many items each itemset contains
no_md_itemsets['length'] = no_md_itemsets['itemsets'].apply(len)
no_md_itemsets

Unnamed: 0,support,itemsets,length
0,0.678229,(has_comments),1
1,0.889299,(exec_inorder),1
2,0.671587,(jupyter_prop_high),1
3,0.676015,(image_prop_low),1
4,0.597048,"(has_comments, exec_inorder)",2
5,0.602214,"(exec_inorder, jupyter_prop_high)",2
6,0.61107,"(image_prop_low, exec_inorder)",2


In [167]:
# keep only the itemsets that contain 2 or more items
no_md_itemsets[no_md_itemsets['length'].ge(2)]

Unnamed: 0,support,itemsets,length
4,0.597048,"(has_comments, exec_inorder)",2
5,0.602214,"(exec_inorder, jupyter_prop_high)",2
6,0.61107,"(image_prop_low, exec_inorder)",2


## Extracting Association Rules

### Markdown Cells

In [168]:
# derive association rules from the frequent itemsets, keeping rules whose
# confidence is at least 0.7
md_association_rules = association_rules(
    md_itemsets,
    metric='confidence',
    min_threshold=0.7,
)
md_association_rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(longer_beginning),(has_comments),0.543867,0.889568,0.503274,0.925361,1.040237,0.019467,1.479553
1,(longer_beginning),(exec_inorder),0.543867,0.951550,0.516368,0.949438,0.997781,-0.001148,0.958242
2,(is_education),(has_comments),0.576604,0.889568,0.516805,0.896291,1.007557,0.003876,1.064824
3,(is_education),(exec_inorder),0.576604,0.951550,0.548669,0.951552,1.000002,0.000001,1.000048
4,(has_title),(has_comments),0.645133,0.889568,0.574422,0.890392,1.000927,0.000532,1.007523
...,...,...,...,...,...,...,...,...,...
143,"(markdown_prop_medium, exec_inorder)","(non_exec_prop_low, jupyter_prop_high)",0.780882,0.686163,0.552597,0.707658,1.031326,0.016785,1.073526
144,"(markdown_prop_medium, non_exec_prop_low)","(exec_inorder, jupyter_prop_high)",0.720210,0.750764,0.552597,0.767273,1.021989,0.011890,1.070937
145,"(exec_inorder, jupyter_prop_high)","(markdown_prop_medium, non_exec_prop_low)",0.750764,0.720210,0.552597,0.736047,1.021989,0.011890,1.059999
146,"(non_exec_prop_low, jupyter_prop_high)","(markdown_prop_medium, exec_inorder)",0.686163,0.780882,0.552597,0.805344,1.031326,0.016785,1.125667


In [169]:
# store the number of items on each side of every rule
md_association_rules['antecedent_len'] = md_association_rules['antecedents'].map(len)
md_association_rules['consequent_len'] = md_association_rules['consequents'].map(len)

In [170]:
# keep one-to-one rules (single antecedent, single consequent) whose
# confidence is 0.8 or higher
md_association_rules[
    md_association_rules['antecedent_len'].eq(1)
    & md_association_rules['consequent_len'].eq(1)
    & md_association_rules['confidence'].ge(0.8)
]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedent_len,consequent_len
0,(longer_beginning),(has_comments),0.543867,0.889568,0.503274,0.925361,1.040237,0.019467,1.479553,1,1
1,(longer_beginning),(exec_inorder),0.543867,0.95155,0.516368,0.949438,0.997781,-0.001148,0.958242,1,1
2,(is_education),(has_comments),0.576604,0.889568,0.516805,0.896291,1.007557,0.003876,1.064824,1,1
3,(is_education),(exec_inorder),0.576604,0.95155,0.548669,0.951552,1.000002,1e-06,1.000048,1,1
4,(has_title),(has_comments),0.645133,0.889568,0.574422,0.890392,1.000927,0.000532,1.007523,1,1
5,(has_comments),(exec_inorder),0.889568,0.95155,0.848101,0.953386,1.00193,0.001633,1.03939,1,1
6,(exec_inorder),(has_comments),0.95155,0.889568,0.848101,0.891284,1.00193,0.001633,1.015789,1,1
8,(jupyter_prop_high),(has_comments),0.787429,0.889568,0.700131,0.889135,0.999514,-0.000341,0.996098,1,1
9,(markdown_prop_medium),(has_comments),0.820166,0.889568,0.736796,0.89835,1.009873,0.007203,1.086398,1,1
10,(has_comments),(markdown_prop_medium),0.889568,0.820166,0.736796,0.828263,1.009873,0.007203,1.047148,1,1


### No Markdown Cells

In [171]:
# derive association rules from the frequent itemsets, keeping rules whose
# confidence is at least 0.5
no_md_association_rules = association_rules(
    no_md_itemsets,
    metric='confidence',
    min_threshold=0.5,
)
no_md_association_rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(has_comments),(exec_inorder),0.678229,0.889299,0.597048,0.880305,0.989886,-0.0061,0.924857
1,(exec_inorder),(has_comments),0.889299,0.678229,0.597048,0.671369,0.989886,-0.0061,0.979127
2,(exec_inorder),(jupyter_prop_high),0.889299,0.671587,0.602214,0.677178,1.008326,0.004973,1.017321
3,(jupyter_prop_high),(exec_inorder),0.671587,0.889299,0.602214,0.896703,1.008326,0.004973,1.071681
4,(image_prop_low),(exec_inorder),0.676015,0.889299,0.61107,0.90393,1.016453,0.009891,1.152298
5,(exec_inorder),(image_prop_low),0.889299,0.676015,0.61107,0.687137,1.016453,0.009891,1.03555


# Association Rule Mining (with all dummies)

## One Hot Encoding

We create dummy variables for every variable, instead of only for those with low/medium/high bins.

### Markdown Cells

In [20]:
# create a copy of the dataframe we will one-hot encode; this rebinds
# md_one_hot, replacing the encoded frame built in the previous section
md_one_hot = md_original.copy()

In [21]:
# remove the two identifier columns by name
md_one_hot = md_one_hot.drop(columns=['Unnamed: 0', 'nb_id'])

In [22]:
# initial look at the data (first five rows, before any recoding)
md_one_hot.head()

Unnamed: 0,longer_beginning,longer_ending,has_author,has_equation,jupyter_prop,output_cell_prop,markdown_prop,num_contrib,image_prop,is_education,...,num_commits,md_format,non_exec_prop,exec_inorder,exec_skips,has_error,has_export,num_functions,has_test,num_headers
0,True,False,False,False,medium,high,low,lower,low,True,...,lower,True,low,higher,lower,False,False,lower,False,medium
1,True,False,False,False,high,low,medium,lower,low,True,...,lower,True,high,higher,lower,False,False,lower,False,medium
2,True,False,False,False,high,low,medium,lower,low,True,...,lower,True,high,higher,lower,False,True,higher,False,high
3,True,False,False,False,high,medium,medium,lower,low,True,...,lower,False,low,higher,lower,False,False,lower,False,medium
4,True,False,False,True,high,low,medium,lower,low,False,...,lower,False,low,higher,lower,False,False,lower,False,low


In [23]:
# recode booleans as the short strings 'T' / 'F' for simplicity
md_one_hot = md_one_hot.replace(True, 'T').replace(False, 'F')

In [24]:
# change medium to med for simplicity (this shortens the dummy column names
# created below, e.g. num_headers_med)
md_one_hot = md_one_hot.replace('medium', 'med')

In [25]:
# check the data again: values should now read T/F and low/med/high
md_one_hot.head()

Unnamed: 0,longer_beginning,longer_ending,has_author,has_equation,jupyter_prop,output_cell_prop,markdown_prop,num_contrib,image_prop,is_education,...,num_commits,md_format,non_exec_prop,exec_inorder,exec_skips,has_error,has_export,num_functions,has_test,num_headers
0,T,F,F,F,med,high,low,lower,low,T,...,lower,T,low,higher,lower,F,F,lower,F,med
1,T,F,F,F,high,low,med,lower,low,T,...,lower,T,high,higher,lower,F,F,lower,F,med
2,T,F,F,F,high,low,med,lower,low,T,...,lower,T,high,higher,lower,F,T,higher,F,high
3,T,F,F,F,high,med,med,lower,low,T,...,lower,F,low,higher,lower,F,F,lower,F,med
4,T,F,F,T,high,low,med,lower,low,F,...,lower,F,low,higher,lower,F,F,lower,F,low


In [26]:
# create dummy variables for all variables: every column becomes one
# indicator column per category, named <column>_<category>
md_one_hot = pd.get_dummies(md_one_hot)

In [27]:
# look at the one-hot encoded data (first five rows)
md_one_hot.head()

Unnamed: 0,longer_beginning_F,longer_beginning_T,longer_ending_F,longer_ending_T,has_author_F,has_author_T,has_equation_F,has_equation_T,jupyter_prop_high,jupyter_prop_low,...,has_error_T,has_export_F,has_export_T,num_functions_higher,num_functions_lower,has_test_F,has_test_T,num_headers_high,num_headers_low,num_headers_med
0,0,1,1,0,1,0,1,0,0,0,...,0,1,0,0,1,1,0,0,0,1
1,0,1,1,0,1,0,1,0,1,0,...,0,1,0,0,1,1,0,0,0,1
2,0,1,1,0,1,0,1,0,1,0,...,0,0,1,1,0,1,0,1,0,0
3,0,1,1,0,1,0,1,0,1,0,...,0,1,0,0,1,1,0,0,0,1
4,0,1,1,0,1,0,0,1,1,0,...,0,1,0,0,1,1,0,0,1,0


In [28]:
# check the fields: list the names of all generated dummy columns
md_one_hot.columns

Index(['longer_beginning_F', 'longer_beginning_T', 'longer_ending_F',
       'longer_ending_T', 'has_author_F', 'has_author_T', 'has_equation_F',
       'has_equation_T', 'jupyter_prop_high', 'jupyter_prop_low',
       'jupyter_prop_med', 'output_cell_prop_high', 'output_cell_prop_low',
       'output_cell_prop_med', 'markdown_prop_high', 'markdown_prop_low',
       'markdown_prop_med', 'num_contrib_higher', 'num_contrib_lower',
       'image_prop_high', 'image_prop_low', 'image_prop_med', 'is_education_F',
       'is_education_T', 'has_links_F', 'has_links_T', 'has_comments_F',
       'has_comments_T', 'md_frequency_high', 'md_frequency_low',
       'md_frequency_med', 'has_title_F', 'has_title_T', 'num_commits_higher',
       'num_commits_lower', 'md_format_F', 'md_format_T', 'non_exec_prop_high',
       'non_exec_prop_low', 'non_exec_prop_med', 'exec_inorder_higher',
       'exec_inorder_lower', 'exec_skips_higher', 'exec_skips_lower',
       'has_error_F', 'has_error_T', 'has_expor

### No Markdown Cells

In [29]:
# start from an independent copy so the original no-markdown frame stays untouched
no_md_one_hot = no_md_original.copy(deep=True)

In [30]:
# remove the two leading columns ('Unnamed: 0' and 'nb_id') before encoding
no_md_one_hot = no_md_one_hot.drop(columns=['Unnamed: 0', 'nb_id'])

In [31]:
# preview the first five rows after dropping the leading columns
no_md_one_hot.head(n=5)

Unnamed: 0,has_author,jupyter_prop,output_cell_prop,num_contrib,image_prop,is_education,has_comments,num_commits,non_exec_prop,exec_inorder,exec_skips,has_error,has_export,num_functions,has_test
0,False,medium,medium,lower,low,False,False,lower,lower,higher,lower,False,True,lower,False
1,False,high,high,lower,low,False,True,lower,lower,higher,lower,True,True,lower,False
2,False,low,medium,higher,low,False,True,higher,lower,higher,lower,False,True,lower,False
3,False,high,low,lower,low,False,True,lower,lower,higher,lower,False,False,lower,False
4,False,high,low,lower,low,False,True,higher,lower,higher,lower,True,False,lower,False


In [32]:
# relabel values so the encoded column suffixes match the markdown frame:
# True/False become T/F and 'medium' is shortened to 'med'
no_md_one_hot = (
    no_md_one_hot
    .replace(True, 'T')
    .replace(False, 'F')
    .replace('medium', 'med')
)

In [33]:
# confirm the relabelled values on the first five rows
no_md_one_hot.head(n=5)

Unnamed: 0,has_author,jupyter_prop,output_cell_prop,num_contrib,image_prop,is_education,has_comments,num_commits,non_exec_prop,exec_inorder,exec_skips,has_error,has_export,num_functions,has_test
0,F,med,med,lower,low,F,F,lower,lower,higher,lower,F,T,lower,F
1,F,high,high,lower,low,F,T,lower,lower,higher,lower,T,T,lower,F
2,F,low,med,higher,low,F,T,higher,lower,higher,lower,F,T,lower,F
3,F,high,low,lower,low,F,T,lower,lower,higher,lower,F,F,lower,F
4,F,high,low,lower,low,F,T,higher,lower,higher,lower,T,F,lower,F


In [34]:
# one-hot encode the categorical columns into <column>_<value> indicator columns
no_md_one_hot = pd.get_dummies(data=no_md_one_hot)

In [35]:
# preview the first five rows of the encoded no-markdown frame
no_md_one_hot.head(n=5)

Unnamed: 0,has_author_F,has_author_T,jupyter_prop_high,jupyter_prop_low,jupyter_prop_med,output_cell_prop_high,output_cell_prop_low,output_cell_prop_med,num_contrib_higher,num_contrib_lower,...,exec_skips_higher,exec_skips_lower,has_error_F,has_error_T,has_export_F,has_export_T,num_functions_higher,num_functions_lower,has_test_F,has_test_T
0,1,0,0,0,1,0,0,1,0,1,...,0,1,1,0,0,1,0,1,1,0
1,1,0,1,0,0,1,0,0,0,1,...,0,1,0,1,0,1,0,1,1,0
2,1,0,0,1,0,0,0,1,1,0,...,0,1,1,0,0,1,0,1,1,0
3,1,0,1,0,0,0,1,0,0,1,...,0,1,1,0,1,0,0,1,1,0
4,1,0,1,0,0,0,1,0,0,1,...,0,1,0,1,1,0,0,1,1,0


In [36]:
# list the indicator column names produced by the one-hot encoding
no_md_one_hot.columns

Index(['has_author_F', 'has_author_T', 'jupyter_prop_high', 'jupyter_prop_low',
       'jupyter_prop_med', 'output_cell_prop_high', 'output_cell_prop_low',
       'output_cell_prop_med', 'num_contrib_higher', 'num_contrib_lower',
       'image_prop_high', 'image_prop_low', 'image_prop_med', 'is_education_F',
       'is_education_T', 'has_comments_F', 'has_comments_T',
       'num_commits_higher', 'num_commits_lower', 'non_exec_prop_higher',
       'non_exec_prop_lower', 'exec_inorder_higher', 'exec_inorder_lower',
       'exec_skips_higher', 'exec_skips_lower', 'has_error_F', 'has_error_T',
       'has_export_F', 'has_export_T', 'num_functions_higher',
       'num_functions_lower', 'has_test_F', 'has_test_T'],
      dtype='object')

## Performing the Apriori Algorithm

### Markdown Cells

In [37]:
# apriori requires True/False instead of 1/0.
# Cast the whole frame in one step: astype(bool) guarantees a bool dtype for
# every indicator column, whereas value-based replace(1, True)/replace(0, False)
# leaves the resulting dtype up to pandas' inference. The frame holds only 0/1
# indicators at this point, so the values map to False/True exactly as before.
md_one_hot = md_one_hot.astype(bool)

In [38]:
# preview the first five rows of the boolean markdown frame
md_one_hot.head(n=5)

Unnamed: 0,longer_beginning_F,longer_beginning_T,longer_ending_F,longer_ending_T,has_author_F,has_author_T,has_equation_F,has_equation_T,jupyter_prop_high,jupyter_prop_low,...,has_error_T,has_export_F,has_export_T,num_functions_higher,num_functions_lower,has_test_F,has_test_T,num_headers_high,num_headers_low,num_headers_med
0,False,True,True,False,True,False,True,False,False,False,...,False,True,False,False,True,True,False,False,False,True
1,False,True,True,False,True,False,True,False,True,False,...,False,True,False,False,True,True,False,False,False,True
2,False,True,True,False,True,False,True,False,True,False,...,False,False,True,True,False,True,False,True,False,False
3,False,True,True,False,True,False,True,False,True,False,...,False,True,False,False,True,True,False,False,False,True
4,False,True,True,False,True,False,False,True,True,False,...,False,True,False,False,True,True,False,False,True,False


In [39]:
# mine frequent itemsets from the markdown frame: keep itemsets whose support
# is at least 0.5 and report them by column name rather than column index
md_itemsets = apriori(
    md_one_hot,
    min_support=0.5,
    use_colnames=True,
)

# display the mined itemsets
md_itemsets

Unnamed: 0,support,itemsets
0,0.543867,(longer_beginning_T)
1,0.741161,(longer_ending_F)
2,0.953296,(has_author_F)
3,0.806198,(has_equation_F)
4,0.787429,(jupyter_prop_high)
...,...,...
3236,0.507639,"(exec_inorder_higher, has_author_F, has_error_..."
3237,0.511131,"(exec_inorder_higher, has_author_F, non_exec_p..."
3238,0.517678,"(exec_inorder_higher, num_contrib_lower, non_e..."
3239,0.505456,"(exec_inorder_higher, num_contrib_lower, has_e..."


In [40]:
# record how many items each itemset contains in a new 'length' column
md_itemsets = md_itemsets.assign(length=md_itemsets['itemsets'].map(len))
md_itemsets

Unnamed: 0,support,itemsets,length
0,0.543867,(longer_beginning_T),1
1,0.741161,(longer_ending_F),1
2,0.953296,(has_author_F),1
3,0.806198,(has_equation_F),1
4,0.787429,(jupyter_prop_high),1
...,...,...,...
3236,0.507639,"(exec_inorder_higher, has_author_F, has_error_...",7
3237,0.511131,"(exec_inorder_higher, has_author_F, non_exec_p...",7
3238,0.517678,"(exec_inorder_higher, num_contrib_lower, non_e...",7
3239,0.505456,"(exec_inorder_higher, num_contrib_lower, has_e...",7


In [41]:
# keep only itemsets with at least two items and support of 0.8 or more
md_itemsets.query('length >= 2 and support >= 0.8')

Unnamed: 0,support,itemsets,length
48,0.888695,"(num_contrib_lower, has_author_F)",2
52,0.846355,"(has_author_F, has_comments_T)",2
55,0.831951,"(has_author_F, num_commits_lower)",2
57,0.833697,"(non_exec_prop_low, has_author_F)",2
58,0.9079,"(exec_inorder_higher, has_author_F)",2
59,0.806635,"(exec_skips_lower, has_author_F)",2
60,0.824094,"(has_error_F, has_author_F)",2
63,0.919686,"(has_test_F, has_author_F)",2
115,0.835443,"(num_contrib_lower, has_comments_T)",2
118,0.828023,"(num_contrib_lower, num_commits_lower)",2


### No Markdown Cells

In [42]:
# apriori works on True/False rather than 1/0.
# Cast the whole frame in one step: astype(bool) guarantees a bool dtype for
# every indicator column, whereas value-based replace(1, True)/replace(0, False)
# leaves the resulting dtype up to pandas' inference. The frame holds only 0/1
# indicators at this point, so the values map to False/True exactly as before.
no_md_one_hot = no_md_one_hot.astype(bool)

In [43]:
# preview the first five rows of the boolean no-markdown frame
no_md_one_hot.head(n=5)

Unnamed: 0,has_author_F,has_author_T,jupyter_prop_high,jupyter_prop_low,jupyter_prop_med,output_cell_prop_high,output_cell_prop_low,output_cell_prop_med,num_contrib_higher,num_contrib_lower,...,exec_skips_higher,exec_skips_lower,has_error_F,has_error_T,has_export_F,has_export_T,num_functions_higher,num_functions_lower,has_test_F,has_test_T
0,True,False,False,False,True,False,False,True,False,True,...,False,True,True,False,False,True,False,True,True,False
1,True,False,True,False,False,True,False,False,False,True,...,False,True,False,True,False,True,False,True,True,False
2,True,False,False,True,False,False,False,True,True,False,...,False,True,True,False,False,True,False,True,True,False
3,True,False,True,False,False,False,True,False,False,True,...,False,True,True,False,True,False,False,True,True,False
4,True,False,True,False,False,False,True,False,False,True,...,False,True,False,True,True,False,False,True,True,False


In [44]:
# mine frequent itemsets from the no-markdown frame: keep itemsets whose support
# is at least 0.5 and report them by column name rather than column index
no_md_itemsets = apriori(
    no_md_one_hot,
    min_support=0.5,
    use_colnames=True,
)

# display the mined itemsets
no_md_itemsets

Unnamed: 0,support,itemsets
0,0.991137,(has_author_F)
1,0.672083,(jupyter_prop_high)
2,0.586411,(output_cell_prop_med)
3,0.912851,(num_contrib_lower)
4,0.675775,(image_prop_low)
...,...,...
1259,0.560561,"(exec_inorder_higher, is_education_F, has_auth..."
1260,0.538405,"(exec_inorder_higher, is_education_F, has_auth..."
1261,0.530281,"(is_education_F, has_author_F, non_exec_prop_l..."
1262,0.535451,"(exec_inorder_higher, is_education_F, has_auth..."


In [45]:
# record how many items each itemset contains in a new 'length' column
no_md_itemsets = no_md_itemsets.assign(length=no_md_itemsets['itemsets'].map(len))
no_md_itemsets

Unnamed: 0,support,itemsets,length
0,0.991137,(has_author_F),1
1,0.672083,(jupyter_prop_high),1
2,0.586411,(output_cell_prop_med),1
3,0.912851,(num_contrib_lower),1
4,0.675775,(image_prop_low),1
...,...,...,...
1259,0.560561,"(exec_inorder_higher, is_education_F, has_auth...",7
1260,0.538405,"(exec_inorder_higher, is_education_F, has_auth...",7
1261,0.530281,"(is_education_F, has_author_F, non_exec_prop_l...",7
1262,0.535451,"(exec_inorder_higher, is_education_F, has_auth...",7


In [46]:
# keep only itemsets with at least two items and support of 0.8 or more
no_md_itemsets.query('length >= 2 and support >= 0.8')

Unnamed: 0,support,itemsets,length
17,0.904727,"(num_contrib_lower, has_author_F)",2
19,0.883309,"(is_education_F, has_author_F)",2
22,0.916544,"(has_author_F, non_exec_prop_lower)",2
23,0.881093,"(exec_inorder_higher, has_author_F)",2
24,0.861891,"(exec_skips_lower, has_author_F)",2
25,0.833826,"(has_error_F, has_author_F)",2
27,0.849335,"(num_functions_lower, has_author_F)",2
28,0.990399,"(has_test_F, has_author_F)",2
45,0.810192,"(is_education_F, num_contrib_lower)",2
48,0.839734,"(num_contrib_lower, non_exec_prop_lower)",2


## Extracting Association Rules

### Markdown Cells

In [47]:
# derive association rules from the markdown itemsets, keeping rules whose
# confidence is at least 0.7
md_association_rules = association_rules(
    md_itemsets,
    metric='confidence',
    min_threshold=0.7,
)
md_association_rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(longer_beginning_T),(has_author_F),0.543867,0.953296,0.515932,0.948636,0.995112,-0.002534,0.909278
1,(longer_beginning_T),(num_contrib_lower),0.543867,0.934090,0.503710,0.926164,0.991515,-0.004311,0.892652
2,(longer_beginning_T),(has_comments_T),0.543867,0.889568,0.503274,0.925361,1.040237,0.019467,1.479553
3,(longer_beginning_T),(exec_inorder_higher),0.543867,0.951550,0.516368,0.949438,0.997781,-0.001148,0.958242
4,(longer_beginning_T),(has_test_F),0.543867,0.966390,0.529463,0.973515,1.007373,0.003875,1.269024
...,...,...,...,...,...,...,...,...,...
54829,"(non_exec_prop_low, exec_skips_lower, num_comm...","(exec_inorder_higher, has_error_F, has_test_F,...",0.633348,0.747272,0.507639,0.801516,1.072590,0.034355,1.273293
54830,"(has_error_F, has_test_F, exec_skips_lower)","(non_exec_prop_low, exec_inorder_higher, num_c...",0.717591,0.697076,0.507639,0.707421,1.014841,0.007424,1.035359
54831,"(has_error_F, exec_skips_lower, num_commits_lo...","(non_exec_prop_low, exec_inorder_higher, has_t...",0.645570,0.756438,0.507639,0.786342,1.039532,0.019305,1.139962
54832,"(exec_skips_lower, has_test_F, num_commits_lower)","(non_exec_prop_low, exec_inorder_higher, has_e...",0.707115,0.683544,0.507639,0.717901,1.050263,0.024294,1.121790


In [48]:
# store the number of items on each side of every rule
md_association_rules = md_association_rules.assign(
    antecedent_len=md_association_rules['antecedents'].map(len),
    consequent_len=md_association_rules['consequents'].map(len),
)

In [49]:
# show one-to-one rules (single antecedent, single consequent) whose
# confidence is at least 0.95
md_association_rules.query(
    'antecedent_len == 1 and consequent_len == 1 and confidence >= 0.95'
)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedent_len,consequent_len
4,(longer_beginning_T),(has_test_F),0.543867,0.96639,0.529463,0.973515,1.007373,0.003875,1.269024,1,1
6,(longer_ending_F),(has_author_F),0.741161,0.953296,0.713226,0.962309,1.009455,0.00668,1.239129,1,1
34,(longer_ending_F),(has_test_F),0.741161,0.96639,0.718027,0.968787,1.00248,0.001776,1.076781,1,1
35,(has_equation_F),(has_author_F),0.806198,0.953296,0.773461,0.959394,1.006397,0.004916,1.150176,1,1
37,(jupyter_prop_high),(has_author_F),0.787429,0.953296,0.756438,0.960643,1.007707,0.005786,1.186689,1,1
39,(output_cell_prop_med),(has_author_F),0.603667,0.953296,0.576604,0.95517,1.001966,0.001132,1.041812,1,1
41,(markdown_prop_med),(has_author_F),0.820166,0.953296,0.785247,0.957424,1.004331,0.003386,1.096972,1,1
42,(num_contrib_lower),(has_author_F),0.93409,0.953296,0.888695,0.951402,0.998014,-0.001769,0.961035,1,1
44,(image_prop_low),(has_author_F),0.541685,0.953296,0.516805,0.954069,1.000812,0.000419,1.016847,1,1
46,(has_links_F),(has_author_F),0.621126,0.953296,0.60454,0.973296,1.02098,0.012423,1.74896,1,1


### No Markdown Cells

In [50]:
# derive association rules from the no-markdown itemsets, keeping rules whose
# confidence is at least 0.7
no_md_association_rules = association_rules(
    no_md_itemsets,
    metric='confidence',
    min_threshold=0.7,
)
no_md_association_rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(jupyter_prop_high),(has_author_F),0.672083,0.991137,0.669867,0.996703,1.005616,0.003741,2.688331
1,(output_cell_prop_med),(has_author_F),0.586411,0.991137,0.582718,0.993703,1.002588,0.001504,1.407386
2,(num_contrib_lower),(has_author_F),0.912851,0.991137,0.904727,0.991100,0.999963,-0.000034,0.995837
3,(has_author_F),(num_contrib_lower),0.991137,0.912851,0.904727,0.912817,0.999963,-0.000034,0.999609
4,(image_prop_low),(has_author_F),0.675775,0.991137,0.667651,0.987978,0.996813,-0.002135,0.737210
...,...,...,...,...,...,...,...,...,...
18832,"(has_error_F, non_exec_prop_lower)","(exec_inorder_higher, has_author_F, has_test_F...",0.764402,0.687592,0.556130,0.727536,1.058092,0.030533,1.146603
18833,"(exec_skips_lower, non_exec_prop_lower)","(exec_inorder_higher, has_author_F, has_error_...",0.791728,0.664697,0.556130,0.702425,1.056760,0.029870,1.126786
18834,"(num_functions_lower, has_error_F)","(exec_inorder_higher, has_author_F, non_exec_p...",0.731167,0.718612,0.556130,0.760606,1.058438,0.030705,1.175420
18835,"(has_error_F, exec_skips_lower)","(exec_inorder_higher, has_author_F, non_exec_p...",0.751108,0.702363,0.556130,0.740413,1.054174,0.028579,1.146577


In [51]:
# store the number of items on each side of every rule
no_md_association_rules = no_md_association_rules.assign(
    antecedent_len=no_md_association_rules['antecedents'].map(len),
    consequent_len=no_md_association_rules['consequents'].map(len),
)

In [52]:
# show one-to-one rules (single antecedent, single consequent) whose
# confidence is at least 0.9
no_md_association_rules.query(
    'antecedent_len == 1 and consequent_len == 1 and confidence >= 0.9'
)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedent_len,consequent_len
0,(jupyter_prop_high),(has_author_F),0.672083,0.991137,0.669867,0.996703,1.005616,0.003741,2.688331,1,1
1,(output_cell_prop_med),(has_author_F),0.586411,0.991137,0.582718,0.993703,1.002588,0.001504,1.407386,1,1
2,(num_contrib_lower),(has_author_F),0.912851,0.991137,0.904727,0.9911,0.999963,-3.4e-05,0.995837,1,1
3,(has_author_F),(num_contrib_lower),0.991137,0.912851,0.904727,0.912817,0.999963,-3.4e-05,0.999609,1,1
4,(image_prop_low),(has_author_F),0.675775,0.991137,0.667651,0.987978,0.996813,-0.002135,0.73721,1,1
5,(is_education_F),(has_author_F),0.892171,0.991137,0.883309,0.990066,0.998919,-0.000956,0.892171,1,1
7,(has_comments_T),(has_author_F),0.67873,0.991137,0.669867,0.986942,0.995767,-0.002847,0.67873,1,1
9,(num_commits_lower),(has_author_F),0.768833,0.991137,0.761448,0.990394,0.99925,-0.000572,0.9226,1,1
10,(has_author_F),(non_exec_prop_lower),0.991137,0.921713,0.916544,0.924739,1.003283,0.002999,1.040204,1,1
11,(non_exec_prop_lower),(has_author_F),0.921713,0.991137,0.916544,0.994391,1.003283,0.002999,1.58008,1,1


# Export Itemsets and Rules

## Markdown Cells

In [53]:
# sanity-check the markdown itemsets before exporting (first five rows)
md_itemsets.head(n=5)

Unnamed: 0,support,itemsets,length
0,0.543867,(longer_beginning_T),1
1,0.741161,(longer_ending_F),1
2,0.953296,(has_author_F),1
3,0.806198,(has_equation_F),1
4,0.787429,(jupyter_prop_high),1


In [68]:
# write the markdown itemsets to disk twice: once as a pickle, once as a csv
md_itemsets.to_pickle(path='rule-mining/md_frequent_itemsets.pkl')
md_itemsets.to_csv(path_or_buf='rule-mining/csv-files/md_frequent_itemsets.csv')

In [55]:
# sanity-check the markdown rules before exporting (first five rows)
md_association_rules.head(n=5)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedent_len,consequent_len
0,(longer_beginning_T),(has_author_F),0.543867,0.953296,0.515932,0.948636,0.995112,-0.002534,0.909278,1,1
1,(longer_beginning_T),(num_contrib_lower),0.543867,0.93409,0.50371,0.926164,0.991515,-0.004311,0.892652,1,1
2,(longer_beginning_T),(has_comments_T),0.543867,0.889568,0.503274,0.925361,1.040237,0.019467,1.479553,1,1
3,(longer_beginning_T),(exec_inorder_higher),0.543867,0.95155,0.516368,0.949438,0.997781,-0.001148,0.958242,1,1
4,(longer_beginning_T),(has_test_F),0.543867,0.96639,0.529463,0.973515,1.007373,0.003875,1.269024,1,1


In [67]:
# write the markdown rules to disk twice: once as a pickle, once as a csv
md_association_rules.to_pickle(path='rule-mining/md_association_rules.pkl')
md_association_rules.to_csv(path_or_buf='rule-mining/csv-files/md_association_rules.csv')

## No Markdown Cells

In [57]:
# sanity-check the no-markdown itemsets before exporting (first five rows)
no_md_itemsets.head(n=5)

Unnamed: 0,support,itemsets,length
0,0.991137,(has_author_F),1
1,0.672083,(jupyter_prop_high),1
2,0.586411,(output_cell_prop_med),1
3,0.912851,(num_contrib_lower),1
4,0.675775,(image_prop_low),1


In [66]:
# write the no-markdown itemsets to disk twice: once as a pickle, once as a csv
no_md_itemsets.to_pickle(path='rule-mining/no_md_frequent_itemsets.pkl')
no_md_itemsets.to_csv(path_or_buf='rule-mining/csv-files/no_md_frequent_itemsets.csv')

In [59]:
# sanity-check the no-markdown rules before exporting (first five rows)
no_md_association_rules.head(n=5)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedent_len,consequent_len
0,(jupyter_prop_high),(has_author_F),0.672083,0.991137,0.669867,0.996703,1.005616,0.003741,2.688331,1,1
1,(output_cell_prop_med),(has_author_F),0.586411,0.991137,0.582718,0.993703,1.002588,0.001504,1.407386,1,1
2,(num_contrib_lower),(has_author_F),0.912851,0.991137,0.904727,0.9911,0.999963,-3.4e-05,0.995837,1,1
3,(has_author_F),(num_contrib_lower),0.991137,0.912851,0.904727,0.912817,0.999963,-3.4e-05,0.999609,1,1
4,(image_prop_low),(has_author_F),0.675775,0.991137,0.667651,0.987978,0.996813,-0.002135,0.73721,1,1


In [65]:
# write the no-markdown rules to disk twice: once as a pickle, once as a csv
no_md_association_rules.to_pickle(path='rule-mining/no_md_association_rules.pkl')
no_md_association_rules.to_csv(path_or_buf='rule-mining/csv-files/no_md_association_rules.csv')