# Imports

In [2]:
import pandas as pd
import prince
from mlxtend.frequent_patterns import apriori, association_rules

# Loading the Binned Data

## Markdown Cells

In [3]:
# binned features for the markdown group; the path is relative to the
# directory the notebook is run from
md_filepath = 'binning-data/markdown_group_binned.csv'
md_df = pd.read_csv(md_filepath)

In [4]:
# save the original dataframe
# take a real copy: plain assignment would only alias md_df, so any later
# in-place change to md_df would silently alter the "original" as well
md_original = md_df.copy()

In [5]:
# clear the first two columns and change True and False to T and F
# built from md_original rather than from md_df itself, so the cell can be
# re-run without a KeyError from dropping columns that are already gone
md_df = (
    md_original
    .drop(['Unnamed: 0', 'nb_id'], axis = 1)
    .replace(True, 'T')
    .replace(False, 'F')
)

In [6]:
# initial look at the data: preview the first rows to confirm the
# True/False -> T/F recoding and the dropped columns
md_df.head()

Unnamed: 0,longer_beginning,longer_ending,has_author,has_equation,jupyter_prop,markdown_prop,num_contrib,image_prop,is_education,has_links,has_comments,md_frequency,has_title,num_commits,md_format,non_exec_prop,exec_inorder,exec_skips,has_error
0,T,F,F,F,medium,low,lower,low,T,T,F,low,T,lower,T,low,higher,lower,F
1,T,F,F,F,high,medium,lower,low,T,T,T,medium,F,lower,T,high,higher,lower,F
2,T,F,F,F,high,medium,lower,low,T,T,T,medium,T,lower,T,high,higher,lower,F
3,T,F,F,F,high,medium,lower,low,T,T,T,medium,T,lower,F,low,higher,lower,F
4,T,F,F,T,high,medium,lower,low,F,F,T,medium,F,lower,F,low,higher,lower,F


In [7]:
# collect the column names as a plain list
md_vars = md_df.columns.tolist()
md_vars

['longer_beginning',
 'longer_ending',
 'has_author',
 'has_equation',
 'jupyter_prop',
 'markdown_prop',
 'num_contrib',
 'image_prop',
 'is_education',
 'has_links',
 'has_comments',
 'md_frequency',
 'has_title',
 'num_commits',
 'md_format',
 'non_exec_prop',
 'exec_inorder',
 'exec_skips',
 'has_error']

In [8]:
# print how many notebooks fall into each bin, one variable at a time
for var in md_vars:
    bin_sizes = md_df[var].value_counts()
    print(bin_sizes)

T    1246
F    1045
Name: longer_beginning, dtype: int64
F    1698
T     593
Name: longer_ending, dtype: int64
F    2184
T     107
Name: has_author, dtype: int64
F    1847
T     444
Name: has_equation, dtype: int64
high      1804
medium     346
low        141
Name: jupyter_prop, dtype: int64
medium    1879
high       216
low        196
Name: markdown_prop, dtype: int64
lower     2140
higher     151
Name: num_contrib, dtype: int64
low       1241
medium     873
high       177
Name: image_prop, dtype: int64
T    1321
F     970
Name: is_education, dtype: int64
F    1423
T     868
Name: has_links, dtype: int64
T    2038
F     253
Name: has_comments, dtype: int64
medium    1276
high       660
low        355
Name: md_frequency, dtype: int64
T    1478
F     813
Name: has_title, dtype: int64
lower     1981
higher     310
Name: num_commits, dtype: int64
F    1236
T    1055
Name: md_format, dtype: int64
low       2003
high       173
medium     115
Name: non_exec_prop, dtype: int64
higher    2180


## No Markdown Cells

In [9]:
# binned features for the no-markdown group; the path is relative to the
# directory the notebook is run from
no_md_filepath = 'binning-data/no_markdown_group_binned.csv'
no_md_df = pd.read_csv(no_md_filepath)

In [10]:
# save the original dataframe
# take a real copy: plain assignment would only alias no_md_df, so any later
# in-place change to no_md_df would silently alter the "original" as well
no_md_original = no_md_df.copy()

In [11]:
# clear the first two columns and change True and False to T and F (strings)
# built from no_md_original rather than from no_md_df itself, so the cell can
# be re-run without a KeyError from dropping columns that are already gone
no_md_df = (
    no_md_original
    .drop(['Unnamed: 0', 'nb_id'], axis = 1)
    .replace(True, 'T')
    .replace(False, 'F')
)

In [12]:
# initial look at the data: preview the first rows to confirm the
# True/False -> T/F recoding and the dropped columns
no_md_df.head()

Unnamed: 0,has_author,jupyter_prop,num_contrib,image_prop,is_education,has_comments,num_commits,non_exec_prop,exec_inorder,exec_skips,has_error
0,F,medium,lower,low,F,F,lower,lower,higher,lower,F
1,F,high,lower,low,F,T,lower,lower,higher,lower,T
2,F,low,higher,low,F,T,higher,lower,higher,lower,F
3,F,high,lower,low,F,T,lower,lower,higher,lower,F
4,F,high,lower,low,F,T,higher,lower,higher,lower,T


In [13]:
# collect the column names as a plain list
no_md_vars = no_md_df.columns.tolist()
no_md_vars

['has_author',
 'jupyter_prop',
 'num_contrib',
 'image_prop',
 'is_education',
 'has_comments',
 'num_commits',
 'non_exec_prop',
 'exec_inorder',
 'exec_skips',
 'has_error']

In [14]:
# print how many notebooks fall into each bin, one variable at a time
for var in no_md_vars:
    bin_sizes = no_md_df[var].value_counts()
    print(bin_sizes)

F    1343
T      12
Name: has_author, dtype: int64
high      910
medium    286
low       159
Name: jupyter_prop, dtype: int64
lower     1237
higher     118
Name: num_contrib, dtype: int64
low       916
medium    281
high      158
Name: image_prop, dtype: int64
F    1209
T     146
Name: is_education, dtype: int64
T    919
F    436
Name: has_comments, dtype: int64
lower     1042
higher     313
Name: num_commits, dtype: int64
lower     1249
higher     106
Name: non_exec_prop, dtype: int64
higher    1205
lower      150
Name: exec_inorder, dtype: int64
lower     1179
higher     176
Name: exec_skips, dtype: int64
F    1139
T     216
Name: has_error, dtype: int64


# Multiple Correspondence Analysis (MCA)

## Markdown Cells

### Performing MCA

In [15]:
# configure the MCA (one component per input variable, fixed seed) and fit it
md_mca_options = {
    'n_components': len(md_vars),
    'n_iter': 10,
    'copy': True,
    'check_input': True,
    'engine': 'auto',
    'random_state': 42,
}
md_mca = prince.MCA(**md_mca_options)
md_mca_fit = md_mca.fit(md_df)

In [16]:
# put the results into dataframe format: one row per notebook, one column
# per MCA component
md_mca_df = md_mca_fit.row_coordinates(md_df)

# initial look at the results of the MCA
md_mca_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,0.03895,0.76415,0.123466,-0.573001,0.159412,-0.204294,0.304377,0.201377,-0.190293,0.129577,-0.185345,0.197234,-0.352195,-0.169324,-0.125456,0.142135,-0.402566,0.068305,-0.162917
1,0.379564,0.01533,0.418419,-0.186083,-0.246143,0.184833,-0.517825,0.400313,0.171027,-0.027436,-0.090247,-0.123177,-0.118447,0.201818,0.160676,0.159336,0.12433,0.098163,-0.293007
2,0.460085,-0.070535,0.408262,-0.26588,-0.238258,0.137878,-0.251149,0.324552,0.269962,-0.056728,-0.141259,-0.048566,0.010673,0.370292,0.096294,0.021593,0.07848,0.111308,-0.235359
3,0.154882,-0.208383,0.083654,-0.174085,-0.063831,0.149867,0.030867,0.029345,0.131944,-0.062887,-0.063892,-0.01268,-0.239903,0.131121,0.038304,0.005465,-0.221667,-0.075274,0.161511
4,-0.177197,-0.204644,0.06972,0.044641,0.069887,0.080545,-0.319025,-0.109154,-0.073152,0.13563,-0.120507,-0.000196,0.051243,0.148198,-0.058387,0.373835,-0.085053,-0.225825,0.103342


### Analyzing the Results of MCA

In [17]:
# extract the explained inertia, one value per component in component order
# NOTE(review): `explained_inertia_` may be specific to the installed prince
# version; confirm it still exists before upgrading
md_mca_fit.explained_inertia_

[0.1264117646830074,
 0.07712189891928822,
 0.06526424818252895,
 0.05840604317218444,
 0.0565107247007416,
 0.04972328585655356,
 0.04611778026252358,
 0.043746856640539476,
 0.04271778486206097,
 0.04008643490669374,
 0.03888754395770909,
 0.03642054625134442,
 0.03497393600902581,
 0.03457026936060691,
 0.03392026746091107,
 0.032169249866233006,
 0.02933996114039189,
 0.028167206290571938,
 0.026366212871817868]

## No Markdown Cells

### Performing MCA

In [18]:
# configure the MCA (one component per input variable, fixed seed) and fit it
no_md_mca_options = {
    'n_components': len(no_md_vars),
    'n_iter': 10,
    'copy': True,
    'check_input': True,
    'engine': 'auto',
    'random_state': 42,
}
no_md_mca = prince.MCA(**no_md_mca_options)
no_md_mca_fit = no_md_mca.fit(no_md_df)

In [19]:
# put the results into dataframe format: one row per notebook, one column
# per MCA component
no_md_mca_df = no_md_mca_fit.row_coordinates(no_md_df)

# initial look at the results of the MCA
no_md_mca_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,-0.261743,0.296846,-0.257493,-0.137606,0.353145,-0.39161,-0.227178,0.040868,-0.173026,-0.182239,-0.107518
1,0.17825,0.041815,-0.228766,-0.33524,-0.239304,0.423842,0.295788,0.039632,0.048768,-0.29318,-0.113312
2,-0.40653,-0.122694,0.352174,0.712612,0.732707,0.570806,0.49862,0.231289,0.119264,-0.033058,0.198903
3,-0.131782,-0.086343,0.02892,-0.23108,-0.1224,0.103071,-0.011793,-0.073935,0.028024,-0.192888,-0.095841
4,0.28524,-0.023131,0.126216,-0.004104,-0.11384,0.465679,0.443982,0.076053,-0.161783,-0.182193,-0.188592


### Analyzing the Results of MCA

In [20]:
# extract the explained inertia, one value per component in component order
# NOTE(review): `explained_inertia_` may be specific to the installed prince
# version; confirm it still exists before upgrading
no_md_mca_fit.explained_inertia_

[0.12370870341785964,
 0.09849552031983039,
 0.09734374536610015,
 0.08658594297883986,
 0.08400283383718782,
 0.07838450195841805,
 0.07407927250811161,
 0.07013650754422476,
 0.06622192784246521,
 0.05862806239954164,
 0.05692710063661963]

# Association Rule Mining

## One-Hot Encoding

The Apriori algorithm takes one-hot (binary indicator) data as input, so we must first convert our categorical variables into that format.

### Markdown Cells

In [21]:
# create a copy of the dataframe that we will one-hot encode
# starts from md_original, i.e. the frame as loaded (booleans are still
# True/False here, not the 'T'/'F' strings prepared for the MCA)
md_one_hot = md_original.copy()

In [22]:
# the copy came from the original frame, so it still carries the saved index
# column and the notebook id; remove both
md_one_hot = md_one_hot.drop(columns = ['Unnamed: 0', 'nb_id'])

In [23]:
# list (position, (variable, dtype)) so bool columns can be told apart from
# the string-valued ones that need dummies
[(position, name_and_dtype)
 for position, name_and_dtype in enumerate(zip(md_vars, md_one_hot.dtypes))]

[(0, ('longer_beginning', dtype('bool'))),
 (1, ('longer_ending', dtype('bool'))),
 (2, ('has_author', dtype('bool'))),
 (3, ('has_equation', dtype('bool'))),
 (4, ('jupyter_prop', dtype('O'))),
 (5, ('markdown_prop', dtype('O'))),
 (6, ('num_contrib', dtype('O'))),
 (7, ('image_prop', dtype('O'))),
 (8, ('is_education', dtype('bool'))),
 (9, ('has_links', dtype('bool'))),
 (10, ('has_comments', dtype('bool'))),
 (11, ('md_frequency', dtype('O'))),
 (12, ('has_title', dtype('bool'))),
 (13, ('num_commits', dtype('O'))),
 (14, ('md_format', dtype('bool'))),
 (15, ('non_exec_prop', dtype('O'))),
 (16, ('exec_inorder', dtype('O'))),
 (17, ('exec_skips', dtype('O'))),
 (18, ('has_error', dtype('bool')))]

In [24]:
# extract t/f vars
# select by dtype instead of hand-copied positions, so the list stays correct
# if columns are added or reordered; md_original is used because its boolean
# columns are never recoded
md_tf_vars = [var for var in md_vars
              if pd.api.types.is_bool_dtype(md_original[var])]
md_tf_vars

['longer_beginning',
 'longer_ending',
 'has_author',
 'has_equation',
 'is_education',
 'has_links',
 'has_comments',
 'has_title',
 'md_format',
 'has_error']

In [25]:
# extract non-t/f vars
# everything that is not a t/f variable, in the original column order
# (replaces a hard-coded list of positions)
md_s_vars = [var for var in md_vars if var not in md_tf_vars]
md_s_vars

['jupyter_prop',
 'markdown_prop',
 'num_contrib',
 'image_prop',
 'md_frequency',
 'num_commits',
 'non_exec_prop',
 'exec_inorder',
 'exec_skips']

In [26]:
# print the level counts of each string-valued variable, prefixed with its
# position, to see which ones have 3 bins
for (i, var) in enumerate(md_s_vars):
    level_counts = md_one_hot[var].value_counts()
    print(i, level_counts)

0 high      1804
medium     346
low        141
Name: jupyter_prop, dtype: int64
1 medium    1879
high       216
low        196
Name: markdown_prop, dtype: int64
2 lower     2140
higher     151
Name: num_contrib, dtype: int64
3 low       1241
medium     873
high       177
Name: image_prop, dtype: int64
4 medium    1276
high       660
low        355
Name: md_frequency, dtype: int64
5 lower     1981
higher     310
Name: num_commits, dtype: int64
6 low       2003
high       173
medium     115
Name: non_exec_prop, dtype: int64
7 higher    2180
lower      111
Name: exec_inorder, dtype: int64
8 lower     1925
higher     366
Name: exec_skips, dtype: int64


In [27]:
# extract lower/higher variables
# detected from the levels actually present rather than from hard-coded
# positions; md_original is used because its values are never recoded
md_lh_vars = [var for var in md_s_vars
              if md_original[var].isin(['lower', 'higher']).all()]

In [28]:
# extract low/medium/high variables
# detected from the levels actually present rather than from hard-coded
# positions; md_original is used because its values are never recoded
md_lmh_vars = [var for var in md_s_vars
               if md_original[var].isin(['low', 'medium', 'high']).all()]

In [29]:
# turn t/f vars into 1/0
# cast the boolean columns directly: the chained replace(True, 1) /
# replace(False, 0) version left these columns displaying as 1.0/0.0
md_one_hot[md_tf_vars] = md_one_hot[md_tf_vars].astype(int)

In [30]:
# recode the two-level variables: 'higher' becomes 1, 'lower' becomes 0
for lh_var in md_lh_vars:
    md_one_hot[lh_var] = (
        md_one_hot[lh_var]
        .replace('higher', 1)
        .replace('lower', 0)
    )

In [31]:
# create dataframe that will hold all of the dummy variables
# (one indicator column per level of each low/medium/high variable,
# named <variable>_<level>)
md_one_hot_cols = pd.get_dummies(md_one_hot[md_lmh_vars])

# initial look at the dummy variables
md_one_hot_cols.head()

Unnamed: 0,jupyter_prop_high,jupyter_prop_low,jupyter_prop_medium,markdown_prop_high,markdown_prop_low,markdown_prop_medium,image_prop_high,image_prop_low,image_prop_medium,md_frequency_high,md_frequency_low,md_frequency_medium,non_exec_prop_high,non_exec_prop_low,non_exec_prop_medium
0,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0
1,1,0,0,0,0,1,0,1,0,0,0,1,1,0,0
2,1,0,0,0,0,1,0,1,0,0,0,1,1,0,0
3,1,0,0,0,0,1,0,1,0,0,0,1,0,1,0
4,1,0,0,0,0,1,0,1,0,0,0,1,0,1,0


In [32]:
# swap the low/medium/high columns for their dummy columns: drop the
# originals, then append the indicators side by side
md_one_hot = pd.concat(
    [md_one_hot.drop(columns = md_lmh_vars), md_one_hot_cols],
    axis = 1,
)

In [33]:
# initial look at the one-hot encoded data (every column should now be a
# 0/1 indicator)
md_one_hot.head()

Unnamed: 0,longer_beginning,longer_ending,has_author,has_equation,num_contrib,is_education,has_links,has_comments,has_title,num_commits,...,markdown_prop_medium,image_prop_high,image_prop_low,image_prop_medium,md_frequency_high,md_frequency_low,md_frequency_medium,non_exec_prop_high,non_exec_prop_low,non_exec_prop_medium
0,1.0,0.0,0.0,0.0,0,1.0,1.0,0.0,1.0,0,...,0,0,1,0,0,1,0,0,1,0
1,1.0,0.0,0.0,0.0,0,1.0,1.0,1.0,0.0,0,...,1,0,1,0,0,0,1,1,0,0
2,1.0,0.0,0.0,0.0,0,1.0,1.0,1.0,1.0,0,...,1,0,1,0,0,0,1,1,0,0
3,1.0,0.0,0.0,0.0,0,1.0,1.0,1.0,1.0,0,...,1,0,1,0,0,0,1,0,1,0
4,1.0,0.0,0.0,1.0,0,0.0,0.0,1.0,0.0,0,...,1,0,1,0,0,0,1,0,1,0


In [34]:
# convert everything to an int type
# one whole-frame cast replaces the column-by-column assignment loop
md_one_hot = md_one_hot.astype(int)

In [35]:
# check the datatypes: every column should report an integer dtype after the cast
md_one_hot.dtypes

longer_beginning        int64
longer_ending           int64
has_author              int64
has_equation            int64
num_contrib             int64
is_education            int64
has_links               int64
has_comments            int64
has_title               int64
num_commits             int64
md_format               int64
exec_inorder            int64
exec_skips              int64
has_error               int64
jupyter_prop_high       int64
jupyter_prop_low        int64
jupyter_prop_medium     int64
markdown_prop_high      int64
markdown_prop_low       int64
markdown_prop_medium    int64
image_prop_high         int64
image_prop_low          int64
image_prop_medium       int64
md_frequency_high       int64
md_frequency_low        int64
md_frequency_medium     int64
non_exec_prop_high      int64
non_exec_prop_low       int64
non_exec_prop_medium    int64
dtype: object

### No Markdown Cells

In [36]:
# copy the original dataframe
# starts from no_md_original, i.e. the frame as loaded (booleans are still
# True/False here, not the 'T'/'F' strings prepared for the MCA)
no_md_one_hot = no_md_original.copy()

In [37]:
# the copy came from the original frame, so it still carries the saved index
# column and the notebook id; remove both
no_md_one_hot = no_md_one_hot.drop(columns = ['Unnamed: 0', 'nb_id'])

In [38]:
# list (position, (variable, dtype)) so bool columns can be told apart from
# the string-valued ones that need dummies
[(position, name_and_dtype)
 for position, name_and_dtype in enumerate(zip(no_md_vars, no_md_one_hot.dtypes))]

[(0, ('has_author', dtype('bool'))),
 (1, ('jupyter_prop', dtype('O'))),
 (2, ('num_contrib', dtype('O'))),
 (3, ('image_prop', dtype('O'))),
 (4, ('is_education', dtype('bool'))),
 (5, ('has_comments', dtype('bool'))),
 (6, ('num_commits', dtype('O'))),
 (7, ('non_exec_prop', dtype('O'))),
 (8, ('exec_inorder', dtype('O'))),
 (9, ('exec_skips', dtype('O'))),
 (10, ('has_error', dtype('bool')))]

In [39]:
# extract t/f vars
# select by dtype instead of hand-copied positions, so the list stays correct
# if columns are added or reordered; no_md_original is used because its
# boolean columns are never recoded
no_md_tf_vars = [var for var in no_md_vars
                 if pd.api.types.is_bool_dtype(no_md_original[var])]
no_md_tf_vars

['has_author', 'is_education', 'has_comments', 'has_error']

In [40]:
# extract non-tf vars
# everything that is not a t/f variable, in the original column order
# (replaces a hard-coded list of positions)
no_md_s_vars = [var for var in no_md_vars if var not in no_md_tf_vars]
no_md_s_vars

['jupyter_prop',
 'num_contrib',
 'image_prop',
 'num_commits',
 'non_exec_prop',
 'exec_inorder',
 'exec_skips']

In [41]:
# print the level counts of each string-valued variable, prefixed with its
# position, to see which ones have 3 bins
for (i, var) in enumerate(no_md_s_vars):
    level_counts = no_md_one_hot[var].value_counts()
    print(i, level_counts)

0 high      910
medium    286
low       159
Name: jupyter_prop, dtype: int64
1 lower     1237
higher     118
Name: num_contrib, dtype: int64
2 low       916
medium    281
high      158
Name: image_prop, dtype: int64
3 lower     1042
higher     313
Name: num_commits, dtype: int64
4 lower     1249
higher     106
Name: non_exec_prop, dtype: int64
5 higher    1205
lower      150
Name: exec_inorder, dtype: int64
6 lower     1179
higher     176
Name: exec_skips, dtype: int64


In [42]:
# extract lower/higher vars
# detected from the levels actually present rather than from hard-coded
# positions; no_md_original is used because its values are never recoded
no_md_lh_vars = [var for var in no_md_s_vars
                 if no_md_original[var].isin(['lower', 'higher']).all()]

In [43]:
# extract low/medium/high vars
# detected from the levels actually present rather than from hard-coded
# positions; no_md_original is used because its values are never recoded
no_md_lmh_vars = [var for var in no_md_s_vars
                  if no_md_original[var].isin(['low', 'medium', 'high']).all()]

In [44]:
# turn t/f vars into 1/0
# cast the boolean columns directly: the chained replace(True, 1) /
# replace(False, 0) version left these columns displaying as 1.0/0.0
no_md_one_hot[no_md_tf_vars] = no_md_one_hot[no_md_tf_vars].astype(int)

In [45]:
# recode the two-level variables: 'higher' becomes 1, 'lower' becomes 0
for lh_var in no_md_lh_vars:
    no_md_one_hot[lh_var] = (
        no_md_one_hot[lh_var]
        .replace('higher', 1)
        .replace('lower', 0)
    )

In [46]:
# create dataframe that will hold dummy vars
# (one indicator column per level of each low/medium/high variable,
# named <variable>_<level>)
no_md_one_hot_cols = pd.get_dummies(no_md_one_hot[no_md_lmh_vars])

# initial look at the dummy variables
no_md_one_hot_cols.head()

Unnamed: 0,jupyter_prop_high,jupyter_prop_low,jupyter_prop_medium,image_prop_high,image_prop_low,image_prop_medium
0,0,0,1,0,1,0
1,1,0,0,0,1,0
2,0,1,0,0,1,0
3,1,0,0,0,1,0
4,1,0,0,0,1,0


In [47]:
# swap the low/medium/high columns for their dummy columns: drop the
# originals, then append the indicators side by side
no_md_one_hot = pd.concat(
    [no_md_one_hot.drop(columns = no_md_lmh_vars), no_md_one_hot_cols],
    axis = 1,
)

In [48]:
# initial look at the one-hot encoded data (every column should now be a
# 0/1 indicator)
no_md_one_hot.head()

Unnamed: 0,has_author,num_contrib,is_education,has_comments,num_commits,non_exec_prop,exec_inorder,exec_skips,has_error,jupyter_prop_high,jupyter_prop_low,jupyter_prop_medium,image_prop_high,image_prop_low,image_prop_medium
0,0.0,0,0.0,0.0,0,0,1,0,0.0,0,0,1,0,1,0
1,0.0,0,0.0,1.0,0,0,1,0,1.0,1,0,0,0,1,0
2,0.0,1,0.0,1.0,1,0,1,0,0.0,0,1,0,0,1,0
3,0.0,0,0.0,1.0,0,0,1,0,0.0,1,0,0,0,1,0
4,0.0,0,0.0,1.0,1,0,1,0,1.0,1,0,0,0,1,0


In [49]:
# convert everything to an int type
# one whole-frame cast replaces the column-by-column assignment loop
no_md_one_hot = no_md_one_hot.astype(int)

In [50]:
# check the datatypes: every column should report an integer dtype after the cast
no_md_one_hot.dtypes

has_author             int64
num_contrib            int64
is_education           int64
has_comments           int64
num_commits            int64
non_exec_prop          int64
exec_inorder           int64
exec_skips             int64
has_error              int64
jupyter_prop_high      int64
jupyter_prop_low       int64
jupyter_prop_medium    int64
image_prop_high        int64
image_prop_low         int64
image_prop_medium      int64
dtype: object

## Performing the Apriori Algorithm

### Markdown Cells

In [51]:
# requires True/False instead of 1/0
# every column is a 0/1 integer at this point, so a dtype cast gives a
# genuinely boolean frame; value-based replace(1, True)/replace(0, False)
# does not guarantee the resulting dtype
md_one_hot = md_one_hot.astype(bool)

In [52]:
# initial look at the data: confirm the indicators are now True/False
md_one_hot.head()

Unnamed: 0,longer_beginning,longer_ending,has_author,has_equation,num_contrib,is_education,has_links,has_comments,has_title,num_commits,...,markdown_prop_medium,image_prop_high,image_prop_low,image_prop_medium,md_frequency_high,md_frequency_low,md_frequency_medium,non_exec_prop_high,non_exec_prop_low,non_exec_prop_medium
0,True,False,False,False,False,True,True,False,True,False,...,False,False,True,False,False,True,False,False,True,False
1,True,False,False,False,False,True,True,True,False,False,...,True,False,True,False,False,False,True,True,False,False
2,True,False,False,False,False,True,True,True,True,False,...,True,False,True,False,False,False,True,True,False,False
3,True,False,False,False,False,True,True,True,True,False,...,True,False,True,False,False,False,True,False,True,False
4,True,False,False,True,False,False,False,True,False,False,...,True,False,True,False,False,False,True,False,True,False


In [53]:
# perform the apriori algorithm on the dataframe
# min_support = 0.5 sets the minimum support an itemset needs in order to be
# returned; use_colnames = True labels items with the column names
md_itemsets = apriori(md_one_hot, min_support = 0.5, use_colnames = True)

# look at the itemsets
md_itemsets

Unnamed: 0,support,itemsets
0,0.543867,(longer_beginning)
1,0.576604,(is_education)
2,0.889568,(has_comments)
3,0.645133,(has_title)
4,0.95155,(exec_inorder)
5,0.787429,(jupyter_prop_high)
6,0.820166,(markdown_prop_medium)
7,0.541685,(image_prop_low)
8,0.556962,(md_frequency_medium)
9,0.874291,(non_exec_prop_low)


In [54]:
# record how many items each itemset contains
md_itemsets['length'] = md_itemsets['itemsets'].apply(len)
md_itemsets

Unnamed: 0,support,itemsets,length
0,0.543867,(longer_beginning),1
1,0.576604,(is_education),1
2,0.889568,(has_comments),1
3,0.645133,(has_title),1
4,0.95155,(exec_inorder),1
5,0.787429,(jupyter_prop_high),1
6,0.820166,(markdown_prop_medium),1
7,0.541685,(image_prop_low),1
8,0.556962,(md_frequency_medium),1
9,0.874291,(non_exec_prop_low),1


In [55]:
# keep only multi-item itemsets (2 or more items) whose support is at least 0.7
md_itemsets.query('length >= 2 and support >= 0.7')

Unnamed: 0,support,itemsets,length
15,0.848101,"(exec_inorder, has_comments)",2
16,0.700131,"(jupyter_prop_high, has_comments)",2
17,0.736796,"(has_comments, markdown_prop_medium)",2
18,0.774771,"(has_comments, non_exec_prop_low)",2
23,0.750764,"(exec_inorder, jupyter_prop_high)",2
24,0.780882,"(exec_inorder, markdown_prop_medium)",2
27,0.829769,"(exec_inorder, non_exec_prop_low)",2
30,0.72021,"(markdown_prop_medium, non_exec_prop_low)",2
34,0.703623,"(exec_inorder, has_comments, markdown_prop_med...",3
35,0.737233,"(exec_inorder, has_comments, non_exec_prop_low)",3


### No Markdown Cells

In [56]:
# replace 1/0 with True/False
# every column is a 0/1 integer at this point, so a dtype cast gives a
# genuinely boolean frame; value-based replace(1, True)/replace(0, False)
# does not guarantee the resulting dtype
no_md_one_hot = no_md_one_hot.astype(bool)

In [57]:
# initial look at the data: confirm the indicators are now True/False
no_md_one_hot.head()

Unnamed: 0,has_author,num_contrib,is_education,has_comments,num_commits,non_exec_prop,exec_inorder,exec_skips,has_error,jupyter_prop_high,jupyter_prop_low,jupyter_prop_medium,image_prop_high,image_prop_low,image_prop_medium
0,False,False,False,False,False,False,True,False,False,False,False,True,False,True,False
1,False,False,False,True,False,False,True,False,True,True,False,False,False,True,False
2,False,True,False,True,True,False,True,False,False,False,True,False,False,True,False
3,False,False,False,True,False,False,True,False,False,True,False,False,False,True,False
4,False,False,False,True,True,False,True,False,True,True,False,False,False,True,False


In [58]:
# perform apriori algorithm on the dataframe
# min_support = 0.5 sets the minimum support an itemset needs in order to be
# returned (same threshold as the markdown group); use_colnames = True labels
# items with the column names
no_md_itemsets = apriori(no_md_one_hot, min_support = 0.5, use_colnames = True)

# look at the itemsets
no_md_itemsets

Unnamed: 0,support,itemsets
0,0.678229,(has_comments)
1,0.889299,(exec_inorder)
2,0.671587,(jupyter_prop_high)
3,0.676015,(image_prop_low)
4,0.597048,"(exec_inorder, has_comments)"
5,0.602214,"(exec_inorder, jupyter_prop_high)"
6,0.61107,"(exec_inorder, image_prop_low)"


Only a few itemsets clear the threshold, probably because this group has far fewer variables than the markdown group. Consider lowering `min_support`.

In [59]:
# record how many items each itemset contains
no_md_itemsets['length'] = no_md_itemsets['itemsets'].apply(len)
no_md_itemsets

Unnamed: 0,support,itemsets,length
0,0.678229,(has_comments),1
1,0.889299,(exec_inorder),1
2,0.671587,(jupyter_prop_high),1
3,0.676015,(image_prop_low),1
4,0.597048,"(exec_inorder, has_comments)",2
5,0.602214,"(exec_inorder, jupyter_prop_high)",2
6,0.61107,"(exec_inorder, image_prop_low)",2


In [60]:
# keep only the itemsets made up of 2 or more items
no_md_itemsets.query('length >= 2')

Unnamed: 0,support,itemsets,length
4,0.597048,"(exec_inorder, has_comments)",2
5,0.602214,"(exec_inorder, jupyter_prop_high)",2
6,0.61107,"(exec_inorder, image_prop_low)",2


## Extracting Association Rules

### Markdown Cells

In [61]:
# use the itemsets to extract association rules
# rules are selected on the confidence metric with a threshold of 0.7
md_association_rules = association_rules(md_itemsets, metric = 'confidence', min_threshold = 0.7)
md_association_rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(longer_beginning),(has_comments),0.543867,0.889568,0.503274,0.925361,1.040237,0.019467,1.479553
1,(longer_beginning),(exec_inorder),0.543867,0.951550,0.516368,0.949438,0.997781,-0.001148,0.958242
2,(is_education),(has_comments),0.576604,0.889568,0.516805,0.896291,1.007557,0.003876,1.064824
3,(is_education),(exec_inorder),0.576604,0.951550,0.548669,0.951552,1.000002,0.000001,1.000048
4,(has_title),(has_comments),0.645133,0.889568,0.574422,0.890392,1.000927,0.000532,1.007523
...,...,...,...,...,...,...,...,...,...
143,"(markdown_prop_medium, non_exec_prop_low)","(exec_inorder, jupyter_prop_high)",0.720210,0.750764,0.552597,0.767273,1.021989,0.011890,1.070937
144,"(exec_inorder, jupyter_prop_high)","(markdown_prop_medium, non_exec_prop_low)",0.750764,0.720210,0.552597,0.736047,1.021989,0.011890,1.059999
145,"(jupyter_prop_high, markdown_prop_medium)","(exec_inorder, non_exec_prop_low)",0.664339,0.829769,0.552597,0.831800,1.002448,0.001350,1.012079
146,"(exec_inorder, markdown_prop_medium)","(jupyter_prop_high, non_exec_prop_low)",0.780882,0.686163,0.552597,0.707658,1.031326,0.016785,1.073526


In [62]:
# store the number of items on each side of every rule
md_association_rules['antecedent_len'] = md_association_rules['antecedents'].map(len)
md_association_rules['consequent_len'] = md_association_rules['consequents'].map(len)

In [63]:
# keep one-to-one rules (a single antecedent and a single consequent) whose
# confidence is at least 0.8
md_association_rules.query(
    'antecedent_len == 1 and consequent_len == 1 and confidence >= 0.8'
)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedent_len,consequent_len
0,(longer_beginning),(has_comments),0.543867,0.889568,0.503274,0.925361,1.040237,0.019467,1.479553,1,1
1,(longer_beginning),(exec_inorder),0.543867,0.95155,0.516368,0.949438,0.997781,-0.001148,0.958242,1,1
2,(is_education),(has_comments),0.576604,0.889568,0.516805,0.896291,1.007557,0.003876,1.064824,1,1
3,(is_education),(exec_inorder),0.576604,0.95155,0.548669,0.951552,1.000002,1e-06,1.000048,1,1
4,(has_title),(has_comments),0.645133,0.889568,0.574422,0.890392,1.000927,0.000532,1.007523,1,1
5,(exec_inorder),(has_comments),0.95155,0.889568,0.848101,0.891284,1.00193,0.001633,1.015789,1,1
6,(has_comments),(exec_inorder),0.889568,0.95155,0.848101,0.953386,1.00193,0.001633,1.03939,1,1
7,(jupyter_prop_high),(has_comments),0.787429,0.889568,0.700131,0.889135,0.999514,-0.000341,0.996098,1,1
9,(has_comments),(markdown_prop_medium),0.889568,0.820166,0.736796,0.828263,1.009873,0.007203,1.047148,1,1
10,(markdown_prop_medium),(has_comments),0.820166,0.889568,0.736796,0.89835,1.009873,0.007203,1.086398,1,1


### No Markdown Cells

In [64]:
# use the itemsets to extract association rules
# NOTE: the confidence threshold here is 0.5, whereas the markdown group used
# 0.7, so the two rule tables are not filtered identically
no_md_association_rules = association_rules(no_md_itemsets, metric = 'confidence', min_threshold = 0.5)
no_md_association_rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(exec_inorder),(has_comments),0.889299,0.678229,0.597048,0.671369,0.989886,-0.0061,0.979127
1,(has_comments),(exec_inorder),0.678229,0.889299,0.597048,0.880305,0.989886,-0.0061,0.924857
2,(exec_inorder),(jupyter_prop_high),0.889299,0.671587,0.602214,0.677178,1.008326,0.004973,1.017321
3,(jupyter_prop_high),(exec_inorder),0.671587,0.889299,0.602214,0.896703,1.008326,0.004973,1.071681
4,(exec_inorder),(image_prop_low),0.889299,0.676015,0.61107,0.687137,1.016453,0.009891,1.03555
5,(image_prop_low),(exec_inorder),0.676015,0.889299,0.61107,0.90393,1.016453,0.009891,1.152298


**Note (TODO): cross-reference this with the markdown cell group**

# Association Rule Mining (with all dummies)

## One Hot Encoding

Here we create dummy variables for every level of every variable (including the True/False and lower/higher ones), instead of only for the low/medium/high variables.

### Markdown Cells

In [65]:
# create a copy of the dataframe we will one-hot encode
# NOTE: this rebinds md_one_hot, discarding the boolean frame built for the
# previous Apriori run
md_one_hot = md_original.copy()

In [66]:
# remove the saved index column and the notebook id
md_one_hot = md_one_hot.drop(columns = ['Unnamed: 0', 'nb_id'])

In [67]:
# initial look at the data (values are still True/False and low/medium/high here)
md_one_hot.head()

Unnamed: 0,longer_beginning,longer_ending,has_author,has_equation,jupyter_prop,markdown_prop,num_contrib,image_prop,is_education,has_links,has_comments,md_frequency,has_title,num_commits,md_format,non_exec_prop,exec_inorder,exec_skips,has_error
0,True,False,False,False,medium,low,lower,low,True,True,False,low,True,lower,True,low,higher,lower,False
1,True,False,False,False,high,medium,lower,low,True,True,True,medium,False,lower,True,high,higher,lower,False
2,True,False,False,False,high,medium,lower,low,True,True,True,medium,True,lower,True,high,higher,lower,False
3,True,False,False,False,high,medium,lower,low,True,True,True,medium,True,lower,False,low,higher,lower,False
4,True,False,False,True,high,medium,lower,low,False,False,True,medium,False,lower,False,low,higher,lower,False


In [68]:
# shorten the boolean values to the strings 'T' and 'F'
md_one_hot = md_one_hot.replace(True, 'T').replace(False, 'F')

In [69]:
# shorten the 'medium' level to 'med'
md_one_hot = md_one_hot.replace({'medium': 'med'})

In [70]:
# check the data again: values should now read T/F and low/med/high
md_one_hot.head()

Unnamed: 0,longer_beginning,longer_ending,has_author,has_equation,jupyter_prop,markdown_prop,num_contrib,image_prop,is_education,has_links,has_comments,md_frequency,has_title,num_commits,md_format,non_exec_prop,exec_inorder,exec_skips,has_error
0,T,F,F,F,med,low,lower,low,T,T,F,low,T,lower,T,low,higher,lower,F
1,T,F,F,F,high,med,lower,low,T,T,T,med,F,lower,T,high,higher,lower,F
2,T,F,F,F,high,med,lower,low,T,T,T,med,T,lower,T,high,higher,lower,F
3,T,F,F,F,high,med,lower,low,T,T,T,med,T,lower,F,low,higher,lower,F
4,T,F,F,T,high,med,lower,low,F,F,T,med,F,lower,F,low,higher,lower,F


In [71]:
# create dummy variables for all variables
# (one indicator column per level of every column, named <variable>_<level>)
# NOTE(review): whether the indicators come back as 0/1 or True/False may
# depend on the installed pandas version; confirm before feeding apriori
md_one_hot = pd.get_dummies(md_one_hot)

In [72]:
# look at the one-hot encoded data (one indicator column per variable level)
md_one_hot.head()

Unnamed: 0,longer_beginning_F,longer_beginning_T,longer_ending_F,longer_ending_T,has_author_F,has_author_T,has_equation_F,has_equation_T,jupyter_prop_high,jupyter_prop_low,...,md_format_T,non_exec_prop_high,non_exec_prop_low,non_exec_prop_med,exec_inorder_higher,exec_inorder_lower,exec_skips_higher,exec_skips_lower,has_error_F,has_error_T
0,0,1,1,0,1,0,1,0,0,0,...,1,0,1,0,1,0,0,1,1,0
1,0,1,1,0,1,0,1,0,1,0,...,1,1,0,0,1,0,0,1,1,0
2,0,1,1,0,1,0,1,0,1,0,...,1,1,0,0,1,0,0,1,1,0
3,0,1,1,0,1,0,1,0,1,0,...,0,0,1,0,1,0,0,1,1,0
4,0,1,1,0,1,0,0,1,1,0,...,0,0,1,0,1,0,0,1,1,0


In [73]:
# check the fields: list every generated indicator column name
md_one_hot.columns

Index(['longer_beginning_F', 'longer_beginning_T', 'longer_ending_F',
       'longer_ending_T', 'has_author_F', 'has_author_T', 'has_equation_F',
       'has_equation_T', 'jupyter_prop_high', 'jupyter_prop_low',
       'jupyter_prop_med', 'markdown_prop_high', 'markdown_prop_low',
       'markdown_prop_med', 'num_contrib_higher', 'num_contrib_lower',
       'image_prop_high', 'image_prop_low', 'image_prop_med', 'is_education_F',
       'is_education_T', 'has_links_F', 'has_links_T', 'has_comments_F',
       'has_comments_T', 'md_frequency_high', 'md_frequency_low',
       'md_frequency_med', 'has_title_F', 'has_title_T', 'num_commits_higher',
       'num_commits_lower', 'md_format_F', 'md_format_T', 'non_exec_prop_high',
       'non_exec_prop_low', 'non_exec_prop_med', 'exec_inorder_higher',
       'exec_inorder_lower', 'exec_skips_higher', 'exec_skips_lower',
       'has_error_F', 'has_error_T'],
      dtype='object')

### No Markdown Cells

In [75]:
# create a copy of the dataframe we will one-hot encode
# NOTE: this rebinds no_md_one_hot, discarding the boolean frame built for
# the previous Apriori run
no_md_one_hot = no_md_original.copy()

In [76]:
# remove the saved index column and the notebook id
no_md_one_hot = no_md_one_hot.drop(columns = ['Unnamed: 0', 'nb_id'])

In [77]:
# initial look at the data (values are still True/False and low/medium/high here)
no_md_one_hot.head()

Unnamed: 0,has_author,jupyter_prop,num_contrib,image_prop,is_education,has_comments,num_commits,non_exec_prop,exec_inorder,exec_skips,has_error
0,False,medium,lower,low,False,False,lower,lower,higher,lower,False
1,False,high,lower,low,False,True,lower,lower,higher,lower,True
2,False,low,higher,low,False,True,higher,lower,higher,lower,False
3,False,high,lower,low,False,True,lower,lower,higher,lower,False
4,False,high,lower,low,False,True,higher,lower,higher,lower,True


In [79]:
# shorten the labels: booleans become 'T'/'F', and 'medium' becomes 'med'
no_md_one_hot = (
    no_md_one_hot
    .replace(True, 'T')
    .replace(False, 'F')
    .replace('medium', 'med')
)

In [80]:
# check the data again: values should now read T/F and low/med/high
no_md_one_hot.head()

Unnamed: 0,has_author,jupyter_prop,num_contrib,image_prop,is_education,has_comments,num_commits,non_exec_prop,exec_inorder,exec_skips,has_error
0,F,med,lower,low,F,F,lower,lower,higher,lower,F
1,F,high,lower,low,F,T,lower,lower,higher,lower,T
2,F,low,higher,low,F,T,higher,lower,higher,lower,F
3,F,high,lower,low,F,T,lower,lower,higher,lower,F
4,F,high,lower,low,F,T,higher,lower,higher,lower,T


In [81]:
# create all dummy variables
# (one indicator column per level of every column, named <variable>_<level>)
# NOTE(review): whether the indicators come back as 0/1 or True/False may
# depend on the installed pandas version; confirm before feeding apriori
no_md_one_hot = pd.get_dummies(no_md_one_hot)

In [82]:
# look at the one-hot encoded data (one indicator column per variable level)
no_md_one_hot.head()

Unnamed: 0,has_author_F,has_author_T,jupyter_prop_high,jupyter_prop_low,jupyter_prop_med,num_contrib_higher,num_contrib_lower,image_prop_high,image_prop_low,image_prop_med,...,num_commits_higher,num_commits_lower,non_exec_prop_higher,non_exec_prop_lower,exec_inorder_higher,exec_inorder_lower,exec_skips_higher,exec_skips_lower,has_error_F,has_error_T
0,1,0,0,0,1,0,1,0,1,0,...,0,1,0,1,1,0,0,1,1,0
1,1,0,1,0,0,0,1,0,1,0,...,0,1,0,1,1,0,0,1,0,1
2,1,0,0,1,0,1,0,0,1,0,...,1,0,0,1,1,0,0,1,1,0
3,1,0,1,0,0,0,1,0,1,0,...,0,1,0,1,1,0,0,1,1,0
4,1,0,1,0,0,0,1,0,1,0,...,1,0,0,1,1,0,0,1,0,1


In [83]:
# check the fields: list every generated indicator column name
no_md_one_hot.columns

Index(['has_author_F', 'has_author_T', 'jupyter_prop_high', 'jupyter_prop_low',
       'jupyter_prop_med', 'num_contrib_higher', 'num_contrib_lower',
       'image_prop_high', 'image_prop_low', 'image_prop_med', 'is_education_F',
       'is_education_T', 'has_comments_F', 'has_comments_T',
       'num_commits_higher', 'num_commits_lower', 'non_exec_prop_higher',
       'non_exec_prop_lower', 'exec_inorder_higher', 'exec_inorder_lower',
       'exec_skips_higher', 'exec_skips_lower', 'has_error_F', 'has_error_T'],
      dtype='object')