# Imports

In [2]:
import pandas as pd
import prince

# Loading the Binned Data

## Markdown Cells

In [22]:
# location of the binned data for the markdown group
md_filepath = 'binning-data/markdown_group_binned.csv'
# read the binned CSV into a dataframe
md_df = pd.read_csv(filepath_or_buffer = md_filepath)

In [23]:
# save the original dataframe as an independent copy, so that any later
# in-place change to md_df cannot silently alter the saved original
md_original = md_df.copy()

In [24]:
# Build the cleaned frame from the saved original instead of from md_df itself,
# so re-running this cell does not raise a KeyError on already-dropped columns.
# clear the first two columns ('Unnamed: 0' and 'nb_id')
md_df = md_original.drop(columns = ['Unnamed: 0', 'nb_id'])

# change True and False to the strings 'T' and 'F'
md_df = md_df.replace(True, 'T')
md_df = md_df.replace(False, 'F')

In [25]:
# preview the first five rows of the cleaned data
md_df.head(n = 5)

Unnamed: 0,longer_beginning,longer_ending,has_author,has_equation,jupyter_prop,markdown_prop,num_contrib,image_prop,is_education,has_links,has_comments,md_frequency,has_title,num_commits,md_format,non_exec_prop,exec_inorder,exec_skips,has_error
0,T,F,F,F,medium,low,lower,low,T,T,F,low,T,lower,T,low,higher,lower,F
1,T,F,F,F,high,medium,lower,low,T,T,T,medium,F,lower,T,high,higher,lower,F
2,T,F,F,F,high,medium,lower,low,T,T,T,medium,T,lower,T,high,higher,lower,F
3,T,F,F,F,high,medium,lower,low,T,T,T,medium,T,lower,F,low,higher,lower,F
4,T,F,F,T,high,medium,lower,low,F,F,T,medium,F,lower,F,low,higher,lower,F


In [26]:
# collect the column names into a plain list and display it
md_vars = md_df.columns.tolist()
md_vars

['longer_beginning',
 'longer_ending',
 'has_author',
 'has_equation',
 'jupyter_prop',
 'markdown_prop',
 'num_contrib',
 'image_prop',
 'is_education',
 'has_links',
 'has_comments',
 'md_frequency',
 'has_title',
 'num_commits',
 'md_format',
 'non_exec_prop',
 'exec_inorder',
 'exec_skips',
 'has_error']

In [27]:
# print how many rows fall into each bin, one variable at a time
for var in md_vars:
    bin_sizes = md_df[var].value_counts()
    print(bin_sizes)

T    1246
F    1045
Name: longer_beginning, dtype: int64
F    1698
T     593
Name: longer_ending, dtype: int64
F    2184
T     107
Name: has_author, dtype: int64
F    1847
T     444
Name: has_equation, dtype: int64
high      1804
medium     346
low        141
Name: jupyter_prop, dtype: int64
medium    1879
high       216
low        196
Name: markdown_prop, dtype: int64
lower     2140
higher     151
Name: num_contrib, dtype: int64
low       1241
medium     873
high       177
Name: image_prop, dtype: int64
T    1321
F     970
Name: is_education, dtype: int64
F    1423
T     868
Name: has_links, dtype: int64
T    2038
F     253
Name: has_comments, dtype: int64
medium    1276
high       660
low        355
Name: md_frequency, dtype: int64
T    1478
F     813
Name: has_title, dtype: int64
lower     1981
higher     310
Name: num_commits, dtype: int64
F    1236
T    1055
Name: md_format, dtype: int64
low       2003
high       173
medium     115
Name: non_exec_prop, dtype: int64
higher    2180


## No Markdown Cells

In [16]:
# location of the binned data for the no-markdown group
no_md_filepath = 'binning-data/no_markdown_group_binned.csv'
# read the binned CSV into a dataframe
no_md_df = pd.read_csv(filepath_or_buffer = no_md_filepath)

In [17]:
# save the original dataframe as an independent copy, so that any later
# in-place change to no_md_df cannot silently alter the saved original
no_md_original = no_md_df.copy()

In [18]:
# Build the cleaned frame from the saved original instead of from no_md_df itself,
# so re-running this cell does not raise a KeyError on already-dropped columns.
# clear the first two columns ('Unnamed: 0' and 'nb_id')
no_md_df = no_md_original.drop(columns = ['Unnamed: 0', 'nb_id'])

# change True and False to T and F (strings)
no_md_df = no_md_df.replace(True, 'T')
no_md_df = no_md_df.replace(False, 'F')

In [19]:
# preview the first five rows of the cleaned data
no_md_df.head(n = 5)

Unnamed: 0,has_author,jupyter_prop,num_contrib,image_prop,is_education,has_comments,num_commits,non_exec_prop,exec_inorder,exec_skips,has_error
0,F,medium,lower,low,F,F,lower,lower,higher,lower,F
1,F,high,lower,low,F,T,lower,lower,higher,lower,T
2,F,low,higher,low,F,T,higher,lower,higher,lower,F
3,F,high,lower,low,F,T,lower,lower,higher,lower,F
4,F,high,lower,low,F,T,higher,lower,higher,lower,T


In [20]:
# collect the column names into a plain list and display it
no_md_vars = no_md_df.columns.tolist()
no_md_vars

['has_author',
 'jupyter_prop',
 'num_contrib',
 'image_prop',
 'is_education',
 'has_comments',
 'num_commits',
 'non_exec_prop',
 'exec_inorder',
 'exec_skips',
 'has_error']

In [21]:
# print how many rows fall into each bin, one variable at a time
for var in no_md_vars:
    bin_counts = no_md_df[var].value_counts()
    print(bin_counts)

F    1343
T      12
Name: has_author, dtype: int64
high      910
medium    286
low       159
Name: jupyter_prop, dtype: int64
lower     1237
higher     118
Name: num_contrib, dtype: int64
low       916
medium    281
high      158
Name: image_prop, dtype: int64
F    1209
T     146
Name: is_education, dtype: int64
T    919
F    436
Name: has_comments, dtype: int64
lower     1042
higher     313
Name: num_commits, dtype: int64
lower     1249
higher     106
Name: non_exec_prop, dtype: int64
higher    1205
lower      150
Name: exec_inorder, dtype: int64
lower     1179
higher     176
Name: exec_skips, dtype: int64
F    1139
T     216
Name: has_error, dtype: int64


# Multiple Correspondence Analysis (MCA)

## Markdown Cells

### Performing MCA

In [28]:
# configure the MCA with one component per input variable and a fixed random_state
md_mca = prince.MCA(
    n_components = len(md_vars),
    n_iter = 10,
    copy = True,
    check_input = True,
    engine = 'auto',
    random_state = 42,
)

# fit the configured MCA to the binned markdown data
md_mca_fit = md_mca.fit(md_df)

In [29]:
# compute the coordinates of every row on the fitted components (as a dataframe)
md_mca_df = md_mca_fit.row_coordinates(md_df)

# preview the first five rows of the MCA results
md_mca_df.head(n = 5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,0.03895,0.76415,0.123466,-0.573001,0.159412,-0.204294,0.304377,0.201377,-0.190293,0.129577,-0.185345,0.197234,-0.352195,-0.169324,-0.125456,0.142135,-0.402566,0.068305,-0.162917
1,0.379564,0.01533,0.418419,-0.186083,-0.246143,0.184833,-0.517825,0.400313,0.171027,-0.027436,-0.090247,-0.123177,-0.118447,0.201818,0.160676,0.159336,0.12433,0.098163,-0.293007
2,0.460085,-0.070535,0.408262,-0.26588,-0.238258,0.137878,-0.251149,0.324552,0.269962,-0.056728,-0.141259,-0.048566,0.010673,0.370292,0.096294,0.021593,0.07848,0.111308,-0.235359
3,0.154882,-0.208383,0.083654,-0.174085,-0.063831,0.149867,0.030867,0.029345,0.131944,-0.062887,-0.063892,-0.01268,-0.239903,0.131121,0.038304,0.005465,-0.221667,-0.075274,0.161511
4,-0.177197,-0.204644,0.06972,0.044641,0.069887,0.080545,-0.319025,-0.109154,-0.073152,0.13563,-0.120507,-0.000196,0.051243,0.148198,-0.058387,0.373835,-0.085053,-0.225825,0.103342


### Analyzing the Results of MCA

In [37]:
# extract the explained variance (inertia): one value per fitted MCA component
md_mca_fit.explained_inertia_

[0.1264117646830074,
 0.07712189891928822,
 0.06526424818252895,
 0.05840604317218444,
 0.0565107247007416,
 0.04972328585655356,
 0.04611778026252358,
 0.043746856640539476,
 0.04271778486206097,
 0.04008643490669374,
 0.03888754395770909,
 0.03642054625134442,
 0.03497393600902581,
 0.03457026936060691,
 0.03392026746091107,
 0.032169249866233006,
 0.02933996114039189,
 0.028167206290571938,
 0.026366212871817868]

In [42]:
# examine where each variable category (e.g. longer_beginning_F) falls on the components;
# these are coordinates rather than correlations, so values can lie outside [-1, 1]
md_mca_fit.column_coordinates(md_df)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
longer_beginning_F,-0.501642,0.444638,0.095962,0.126237,0.254459,0.19793,-0.045684,-0.135847,0.198687,-0.292268,0.215025,-0.031523,0.373521,-0.119315,0.005951,-0.092144,0.010672,-0.062968,0.095803
longer_beginning_T,0.420719,-0.37291,-0.080482,-0.105873,-0.213411,-0.166001,0.038314,0.113933,-0.166635,0.24512,-0.180338,0.026438,-0.313266,0.100067,-0.004991,0.07728,-0.008951,0.052811,-0.080349
longer_ending_F,-0.319238,-0.07631,0.01457,-0.00022,-0.128835,0.032059,-0.001481,-0.049057,0.04594,0.115785,-0.230736,-0.059781,-0.135868,0.163877,-0.072235,0.191716,-0.107107,-0.02363,-0.071063
longer_ending_T,0.914109,0.218507,-0.04172,0.000629,0.368907,-0.091798,0.004239,0.14047,-0.131546,-0.33154,0.660691,0.171177,0.389045,-0.469247,0.206837,-0.548962,0.30669,0.067662,0.203483
has_author_F,-0.037588,-0.038748,0.045978,-0.01657,-0.002381,0.028142,-0.016798,0.019799,-0.143904,-0.10731,-0.030546,-0.052161,0.002923,0.028043,-0.037419,-0.043706,-0.033017,-0.014016,-0.018561
has_author_T,0.767213,0.790902,-0.938462,0.338221,0.048596,-0.574407,0.342869,-0.404116,2.937261,2.190336,0.623482,1.064669,-0.059653,-0.572385,0.763765,0.892094,0.673915,0.286075,0.378848
has_equation_F,-0.168698,-0.049979,0.087811,-0.06268,-0.086274,0.056066,0.156559,0.15861,0.115709,-0.041754,0.125659,-0.13852,-0.206771,-0.079648,0.042135,-0.147445,0.025538,0.070294,-0.024792
has_equation_T,0.701769,0.207908,-0.365287,0.260743,0.358892,-0.233232,-0.65127,-0.659803,-0.481338,0.173693,-0.522731,0.57623,0.860149,0.331329,-0.175277,0.613358,-0.106237,-0.292415,0.103132
jupyter_prop_high,-0.128079,-0.162226,-0.018441,-0.153899,-0.046789,0.145331,-0.182521,0.000542,0.110188,0.03011,-0.218918,0.083345,0.02947,-0.013851,-0.038764,-0.213566,0.054825,0.054573,0.08718
jupyter_prop_low,-0.138341,0.432019,1.158335,1.582347,-0.745427,-0.104266,1.532623,-1.01728,-0.61085,-0.916968,0.175315,0.516031,-0.169728,0.186297,2.046899,0.760829,0.27877,-0.447243,-0.752502


## No Markdown Cells

### Performing MCA

In [31]:
# configure the MCA with one component per input variable and a fixed random_state
no_md_mca = prince.MCA(
    n_components = len(no_md_vars),
    n_iter = 10,
    copy = True,
    check_input = True,
    engine = 'auto',
    random_state = 42,
)

# fit the configured MCA to the binned no-markdown data
no_md_mca_fit = no_md_mca.fit(no_md_df)

In [32]:
# compute the coordinates of every row on the fitted components (as a dataframe)
no_md_mca_df = no_md_mca_fit.row_coordinates(no_md_df)

# preview the first five rows of the MCA results
no_md_mca_df.head(n = 5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,-0.261743,0.296846,-0.257493,-0.137606,0.353145,-0.39161,-0.227178,0.040868,-0.173026,-0.182239,-0.107518
1,0.17825,0.041815,-0.228766,-0.33524,-0.239304,0.423842,0.295788,0.039632,0.048768,-0.29318,-0.113312
2,-0.40653,-0.122694,0.352174,0.712612,0.732707,0.570806,0.49862,0.231289,0.119264,-0.033058,0.198903
3,-0.131782,-0.086343,0.02892,-0.23108,-0.1224,0.103071,-0.011793,-0.073935,0.028024,-0.192888,-0.095841
4,0.28524,-0.023131,0.126216,-0.004104,-0.11384,0.465679,0.443982,0.076053,-0.161783,-0.182193,-0.188592


### Analyzing the Results of MCA

In [33]:
# extract the explained variance (inertia): one value per fitted MCA component
no_md_mca_fit.explained_inertia_

[0.12370870341785964,
 0.09849552031983039,
 0.09734374536610015,
 0.08658594297883986,
 0.08400283383718782,
 0.07838450195841805,
 0.07407927250811161,
 0.07013650754422476,
 0.06622192784246521,
 0.05862806239954164,
 0.05692710063661963]

In [41]:
# examine where each variable category (e.g. has_author_T) falls on the components;
# these are coordinates rather than correlations, so values can lie outside [-1, 1]
no_md_mca_fit.column_coordinates(no_md_df)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
has_author_F,0.00565,-0.0516,-0.024969,0.014281,0.017453,0.002794,-0.001172,-0.058902,-0.028048,-0.025372,0.003697
has_author_T,-0.632321,5.77485,2.794447,-1.598309,-1.953268,-0.312659,0.131142,6.592143,3.139046,2.839528,-0.413748
jupyter_prop_high,0.091038,-0.328076,0.078405,-0.354606,-0.203238,0.228355,-0.006763,-0.170982,0.097016,0.265061,-0.080002
jupyter_prop_low,-0.700465,-0.19953,-0.851629,1.603471,-0.00559,0.067867,1.131312,0.984586,-0.372883,-0.165388,0.664374
jupyter_prop_medium,0.099755,1.154805,0.223987,0.236853,0.649775,-0.764315,-0.607429,-0.003342,-0.101385,-0.751429,-0.114803
num_contrib_higher,-0.74319,-0.019378,0.740974,0.180185,2.128201,1.448237,0.037298,-0.277785,1.276802,0.522074,0.284251
num_contrib_lower,0.070894,0.001848,-0.070683,-0.017188,-0.203013,-0.13815,-0.003558,0.026499,-0.121797,-0.049802,-0.027115
image_prop_high,0.168111,-1.497701,0.969834,-0.329853,-0.234827,-1.451802,0.604853,0.470135,0.574472,0.099628,0.029067
image_prop_low,-0.299096,0.317588,-0.150137,-0.056396,0.098019,0.091621,0.288857,-0.203674,0.014493,-0.112212,-0.220509
image_prop_medium,0.880463,-0.193145,-0.0559,0.369307,-0.187484,0.51765,-1.281709,0.399589,-0.370257,0.309769,0.702468


# Association Rule Mining

## One-Hot Encoding

We must one-hot encode our categorical data so that it is in a format that can be passed to the Apriori algorithm.

# To-Do

<input type="checkbox"> Look at the outliers that are being placed by themselves in a bin 

<input type="checkbox"> Determine whether you want to keep 2 or 3 bins for the quantitative variables

<input type="checkbox"> MCA

<input type="checkbox"> one-hot encoding 

<input type="checkbox"> plug into apriori algorithm to get frequent itemsets

<input type="checkbox"> analyze association rules from frequent itemsets