# Imports

In [109]:
import pandas as pd
import prince

# Loading the Data

## Markdown Cells

In [110]:
# file holding the markdown group, read into a dataframe
md_filepath = 'markdown_group.csv'
md_df = pd.read_csv(filepath_or_buffer = md_filepath)

In [111]:
# save an independent copy of the original dataframe; a plain assignment would
# only alias md_df, so any in-place change to md_df would also change the "original"
md_original = md_df.copy()

In [112]:
# drop the first two columns ('Unnamed: 0' and 'nb_id');
# start from md_original rather than md_df so that re-running this cell does not
# fail with a KeyError on columns that were already dropped
md_df = md_original.drop(columns = ['Unnamed: 0', 'nb_id'])

In [113]:
# preview the first five rows of the data
md_df.head(n = 5)

Unnamed: 0,longer_beginning,longer_ending,has_author,has_equation,jupyter_prop,markdown_prop,num_contrib,image_prop,is_education,has_links,has_comments,md_frequency,has_title,num_commits,md_format,non_exec_prop,exec_inorder,exec_skips,has_error
0,True,False,False,False,0.507588,0.044444,3.0,0.0,True,True,False,0.068182,True,1.0,True,0.0,1.0,1.809524,False
1,True,False,False,False,1.0,0.16129,1.0,0.0,True,True,True,0.333333,False,2.0,True,0.961538,1.0,0.0,False
2,True,False,False,False,0.970851,0.375887,1.0,0.0,True,True,True,0.621429,True,1.0,True,0.988636,1.0,0.0,False
3,True,False,False,False,1.0,0.461538,1.0,0.0,True,True,True,0.64,True,1.0,False,0.0,0.923077,1.615385,False
4,True,False,False,True,1.0,0.461538,1.0,0.0,False,False,True,0.52,False,1.0,False,0.0,1.0,6.333333,False


In [114]:
# collect the column titles as a plain list and display them
md_vars = md_df.columns.tolist()
md_vars

['longer_beginning',
 'longer_ending',
 'has_author',
 'has_equation',
 'jupyter_prop',
 'markdown_prop',
 'num_contrib',
 'image_prop',
 'is_education',
 'has_links',
 'has_comments',
 'md_frequency',
 'has_title',
 'num_commits',
 'md_format',
 'non_exec_prop',
 'exec_inorder',
 'exec_skips',
 'has_error']

## No Markdown Cells

In [115]:
# file holding the no-markdown group, read into a dataframe
no_md_filepath = 'no_markdown_group.csv'
no_md_df = pd.read_csv(filepath_or_buffer = no_md_filepath)

In [116]:
# save an independent copy of the original dataframe; a plain assignment would
# only alias no_md_df, so any in-place change to no_md_df would also change the "original"
no_md_original = no_md_df.copy()

In [117]:
# drop the first two columns ('Unnamed: 0' and 'nb_id');
# start from no_md_original rather than no_md_df so that re-running this cell does
# not fail with a KeyError on columns that were already dropped
no_md_df = no_md_original.drop(columns = ['Unnamed: 0', 'nb_id'])

In [118]:
# preview the first five rows of the data
no_md_df.head(n = 5)

Unnamed: 0,has_author,jupyter_prop,num_contrib,image_prop,is_education,has_comments,num_commits,non_exec_prop,exec_inorder,exec_skips,has_error
0,False,0.71413,1.0,0.142857,False,False,1.0,0.0,1.0,1.0,False
1,False,0.99784,1.0,0.1875,False,True,1.0,0.04878,0.789474,10.631579,True
2,False,0.121957,5.0,0.0,False,True,11.0,0.0,0.909091,6.181818,False
3,False,1.0,1.0,0.0,False,True,1.0,0.0,1.0,1.631579,False
4,False,0.940063,3.0,0.0,False,True,3.0,0.125,1.0,1.0,True


In [119]:
# collect the column titles as a plain list and display them
no_md_vars = no_md_df.columns.tolist()
no_md_vars

['has_author',
 'jupyter_prop',
 'num_contrib',
 'image_prop',
 'is_education',
 'has_comments',
 'num_commits',
 'non_exec_prop',
 'exec_inorder',
 'exec_skips',
 'has_error']

# Binning the Data

Since we want to analyze categorical variables, we convert our quantitative variables into categorical ones by binning them.

## Markdown Cells

In [120]:
# number each (column name, dtype) pair so the quantitative variables can be picked out
[(position, name_and_dtype) for position, name_and_dtype in enumerate(zip(md_vars, md_df.dtypes))]

[(0, ('longer_beginning', dtype('bool'))),
 (1, ('longer_ending', dtype('bool'))),
 (2, ('has_author', dtype('bool'))),
 (3, ('has_equation', dtype('bool'))),
 (4, ('jupyter_prop', dtype('float64'))),
 (5, ('markdown_prop', dtype('float64'))),
 (6, ('num_contrib', dtype('float64'))),
 (7, ('image_prop', dtype('float64'))),
 (8, ('is_education', dtype('bool'))),
 (9, ('has_links', dtype('bool'))),
 (10, ('has_comments', dtype('bool'))),
 (11, ('md_frequency', dtype('float64'))),
 (12, ('has_title', dtype('bool'))),
 (13, ('num_commits', dtype('float64'))),
 (14, ('md_format', dtype('bool'))),
 (15, ('non_exec_prop', dtype('float64'))),
 (16, ('exec_inorder', dtype('float64'))),
 (17, ('exec_skips', dtype('float64'))),
 (18, ('has_error', dtype('bool')))]

In [121]:
# extract the quantitative variables: every numeric column, in column order.
# Selecting by dtype (booleans are not matched by 'number') replaces the
# hard-coded positions, which would silently pick the wrong columns if the
# column order of the CSV ever changed.
md_quant_vars = md_df.select_dtypes(include = 'number').columns.tolist()
md_quant_vars

['jupyter_prop',
 'markdown_prop',
 'num_contrib',
 'image_prop',
 'md_frequency',
 'num_commits',
 'non_exec_prop',
 'exec_inorder',
 'exec_skips']

In [122]:
# labels for a four-way split of each quantitative variable; pd.cut with an
# integer bin count makes equal-width bins (not quartiles), and medium1 and
# medium2 are later combined into a single 'medium' bin
four_labels = [
    'low',
    'medium1',
    'medium2',
    'high',
]

In [123]:
# work on a separate copy of the dataframe; this copy is the one turned into categorical data
md_cat_df = md_df.copy(deep = True)

In [124]:
# for each quantitative field in the data
for quant_var in md_quant_vars:

    # bin the column into four equal-width bins; read the values from md_df
    # (not md_cat_df) so the cell can be re-run after the column has been replaced
    four_way = pd.cut(md_df[quant_var], 4, labels = four_labels)

    # combine medium1 and medium2 into medium, leaving low / medium / high.
    # The merge is done on plain string labels (object dtype) because replacing
    # values of a categorical is deprecated in recent pandas, and the column is
    # assigned with df[col] = ... because df.loc[:, col] = ... tries to write
    # into the existing float column in place instead of swapping the column.
    md_cat_df[quant_var] = four_way.astype(object).replace(['medium1', 'medium2'], 'medium')

In [125]:
# preview the first five rows of the categorical data
md_cat_df.head(n = 5)

Unnamed: 0,longer_beginning,longer_ending,has_author,has_equation,jupyter_prop,markdown_prop,num_contrib,image_prop,is_education,has_links,has_comments,md_frequency,has_title,num_commits,md_format,non_exec_prop,exec_inorder,exec_skips,has_error
0,True,False,False,False,medium,low,low,low,True,True,False,low,True,low,True,low,high,low,False
1,True,False,False,False,high,low,low,low,True,True,True,medium,False,low,True,high,high,low,False
2,True,False,False,False,high,medium,low,low,True,True,True,medium,True,low,True,high,high,low,False
3,True,False,False,False,high,medium,low,low,True,True,True,medium,True,low,False,low,high,low,False
4,True,False,False,True,high,medium,low,low,False,False,True,medium,False,low,False,low,high,low,False


In [126]:
# check the bin sizes: print the number of rows per bin for each binned variable
for quant_var in md_quant_vars:
    bin_counts = md_cat_df[quant_var].value_counts()
    print(bin_counts)

high      1804
medium     346
low        141
Name: jupyter_prop, dtype: int64
medium    1672
low        544
high        75
Name: markdown_prop, dtype: int64
low       2218
medium      61
high        12
Name: num_contrib, dtype: int64
low       1241
medium     873
high       177
Name: image_prop, dtype: int64
medium    1276
high       660
low        355
Name: md_frequency, dtype: int64
low       2192
medium      90
high         9
Name: num_commits, dtype: int64
low       2003
high       173
medium     115
Name: non_exec_prop, dtype: int64
high      2078
medium     209
low          4
Name: exec_inorder, dtype: int64
low       2255
medium      34
high         2
Name: exec_skips, dtype: int64


Based on this, I think having **two bins** for each quantitative variable is actually best, since in most cases the low/high group is much smaller than the other bins, and analysis might be skewed if we have bins with a very low number of notebooks

In [127]:
# labels used when each quantitative variable is split into only two bins
two_labels = [
    'lower',
    'higher',
]

In [128]:
# start over from a fresh copy of md_df, discarding the earlier binning
md_cat_df = md_df.copy(deep = True)

In [129]:
# for each quantitative field in the data
for quant_var in md_quant_vars:

    # bin the column into two equal-width bins labelled 'lower' and 'higher'.
    # Values are read from md_df so the cell can be re-run, and the column is
    # assigned with df[col] = ... because df.loc[:, col] = ... tries to write
    # into the existing float column in place instead of swapping the column.
    md_cat_df[quant_var] = pd.cut(md_df[quant_var], 2, labels = two_labels)

In [130]:
# preview the first five rows of the new categorical data
md_cat_df.head(n = 5)

Unnamed: 0,longer_beginning,longer_ending,has_author,has_equation,jupyter_prop,markdown_prop,num_contrib,image_prop,is_education,has_links,has_comments,md_frequency,has_title,num_commits,md_format,non_exec_prop,exec_inorder,exec_skips,has_error
0,True,False,False,False,higher,lower,lower,lower,True,True,False,lower,True,lower,True,lower,higher,lower,False
1,True,False,False,False,higher,lower,lower,lower,True,True,True,lower,False,lower,True,higher,higher,lower,False
2,True,False,False,False,higher,lower,lower,lower,True,True,True,higher,True,lower,True,higher,higher,lower,False
3,True,False,False,False,higher,lower,lower,lower,True,True,True,higher,True,lower,False,lower,higher,lower,False
4,True,False,False,True,higher,lower,lower,lower,False,False,True,higher,False,lower,False,lower,higher,lower,False


In [149]:
# check the bin sizes again: print the number of rows per bin for each binned variable
for quant_var in md_quant_vars:
    bin_counts = md_cat_df[quant_var].value_counts()
    print(bin_counts)

higher    2003
lower      288
Name: jupyter_prop, dtype: int64
lower     1763
higher     528
Name: markdown_prop, dtype: int64
lower     2255
higher      36
Name: num_contrib, dtype: int64
lower     1721
higher     570
Name: image_prop, dtype: int64
higher    1434
lower      857
Name: md_frequency, dtype: int64
lower     2266
higher      25
Name: num_commits, dtype: int64
lower     2081
higher     210
Name: non_exec_prop, dtype: int64
higher    2268
lower       23
Name: exec_inorder, dtype: int64
lower     2284
higher       7
Name: exec_skips, dtype: int64


## No Markdown Cells

In [131]:
# number each (column name, dtype) pair so the quantitative variables can be picked out
[(position, name_and_dtype) for position, name_and_dtype in enumerate(zip(no_md_vars, no_md_df.dtypes))]

[(0, ('has_author', dtype('bool'))),
 (1, ('jupyter_prop', dtype('float64'))),
 (2, ('num_contrib', dtype('float64'))),
 (3, ('image_prop', dtype('float64'))),
 (4, ('is_education', dtype('bool'))),
 (5, ('has_comments', dtype('bool'))),
 (6, ('num_commits', dtype('float64'))),
 (7, ('non_exec_prop', dtype('float64'))),
 (8, ('exec_inorder', dtype('float64'))),
 (9, ('exec_skips', dtype('float64'))),
 (10, ('has_error', dtype('bool')))]

In [132]:
# extract the quantitative variables: every numeric column, in column order.
# Selecting by dtype (booleans are not matched by 'number') replaces the
# hard-coded positions, which would silently pick the wrong columns if the
# column order of the CSV ever changed.
no_md_quant_vars = no_md_df.select_dtypes(include = 'number').columns.tolist()
no_md_quant_vars

['jupyter_prop',
 'num_contrib',
 'image_prop',
 'num_commits',
 'non_exec_prop',
 'exec_inorder',
 'exec_skips']

In [133]:
# work on a separate copy of the dataframe; this copy is the one turned into categorical data
no_md_cat_df = no_md_df.copy(deep = True)

In [134]:
# for each quantitative field in the data
for quant_var in no_md_quant_vars:

    # bin the column into four equal-width bins; read the values from no_md_df
    # (not no_md_cat_df) so the cell can be re-run after the column has been replaced
    four_way = pd.cut(no_md_df[quant_var], 4, labels = four_labels)

    # combine medium1 and medium2 into medium, leaving low / medium / high.
    # The merge is done on plain string labels (object dtype) because replacing
    # values of a categorical is deprecated in recent pandas, and the column is
    # assigned with df[col] = ... because df.loc[:, col] = ... tries to write
    # into the existing float column in place instead of swapping the column.
    no_md_cat_df[quant_var] = four_way.astype(object).replace(['medium1', 'medium2'], 'medium')

In [135]:
# preview the first five rows of the categorical data
no_md_cat_df.head(n = 5)

Unnamed: 0,has_author,jupyter_prop,num_contrib,image_prop,is_education,has_comments,num_commits,non_exec_prop,exec_inorder,exec_skips,has_error
0,False,medium,low,low,False,False,low,low,high,low,False
1,False,high,low,low,False,True,low,low,high,low,True
2,False,low,low,low,False,True,medium,low,high,low,False
3,False,high,low,low,False,True,low,low,high,low,False
4,False,high,low,low,False,True,low,low,high,low,True


In [136]:
# check the bin sizes: print the number of rows per bin for each binned variable
for quant_var in no_md_quant_vars:
    bin_counts = no_md_cat_df[quant_var].value_counts()
    print(bin_counts)

high      1040
medium     254
low         61
Name: jupyter_prop, dtype: int64
low       1323
medium      29
high         3
Name: num_contrib, dtype: int64
low       916
medium    281
high      158
Name: image_prop, dtype: int64
low       1338
medium      16
high         1
Name: num_commits, dtype: int64
low       1178
medium      93
high        84
Name: non_exec_prop, dtype: int64
high      1165
medium     169
low         21
Name: exec_inorder, dtype: int64
low       1348
medium       6
high         1
Name: exec_skips, dtype: int64


Again, we decide to have **two bins** for each quantitative variable

In [145]:
# start over from a fresh copy of no_md_df, discarding the earlier binning
no_md_cat_df = no_md_df.copy(deep = True)

In [146]:
# for each quantitative field in the data
for quant_var in no_md_quant_vars:

    # bin the column into two equal-width bins labelled 'lower' and 'higher'.
    # Uses two_labels: the name `labels` is never defined in this notebook, so
    # the previous code raised a NameError on a fresh kernel.
    # Values are read from no_md_df so the cell can be re-run, and the column is
    # assigned with df[col] = ... because df.loc[:, col] = ... tries to write
    # into the existing float column in place instead of swapping the column.
    no_md_cat_df[quant_var] = pd.cut(no_md_df[quant_var], 2, labels = two_labels)

In [147]:
# preview the first five rows of the new categorical data
no_md_cat_df.head(n = 5)

Unnamed: 0,has_author,jupyter_prop,num_contrib,image_prop,is_education,has_comments,num_commits,non_exec_prop,exec_inorder,exec_skips,has_error
0,False,higher,lower,lower,False,False,lower,lower,higher,lower,False
1,False,higher,lower,lower,False,True,lower,lower,higher,lower,True
2,False,lower,lower,lower,False,True,lower,lower,higher,lower,False
3,False,higher,lower,lower,False,True,lower,lower,higher,lower,False
4,False,higher,lower,lower,False,True,lower,lower,higher,lower,True


In [148]:
# check the bin sizes again: print the number of rows per bin for each binned variable
for quant_var in no_md_quant_vars:
    bin_counts = no_md_cat_df[quant_var].value_counts()
    print(bin_counts)

higher    1263
lower       92
Name: jupyter_prop, dtype: int64
lower     1348
higher       7
Name: num_contrib, dtype: int64
lower     1102
higher     253
Name: image_prop, dtype: int64
lower     1354
higher       1
Name: num_commits, dtype: int64
lower     1249
higher     106
Name: non_exec_prop, dtype: int64
higher    1290
lower       65
Name: exec_inorder, dtype: int64
lower     1354
higher       1
Name: exec_skips, dtype: int64


# Multiple Correspondence Analysis (MCA)

# Association Rule Mining

## One-Hot Encoding

We must put our categorical data into a format that can be input into the Apriori algorithm

# To-Do

<input type="checkbox"> Look at the outliers that are being placed by themselves in a bin 

<input type="checkbox"> Determine whether you want to keep 2 or 3 bins for the quantitative variables

<input type="checkbox"> MCA

<input type="checkbox"> one-hot encoding 

<input type="checkbox"> plug into apriori algorithm to get frequent itemsets

<input type="checkbox"> analyze association rules from frequent itemsets