# Imports

In [1]:
import pandas as pd
import prince
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt

# Loading the Data

## Markdown Cells

In [2]:
# location of the CSV file holding the markdown cell group
md_filepath = 'markdown_group.csv'
# read the CSV into a dataframe
md_df = pd.read_csv(md_filepath)

In [3]:
# initial look at the data: preview the first rows, including the index and ID columns
md_df.head()

Unnamed: 0.1,Unnamed: 0,nb_id,longer_beginning,longer_ending,has_author,has_equation,jupyter_prop,output_cell_prop,markdown_prop,num_contrib,...,num_commits,md_format,non_exec_prop,exec_inorder,exec_skips,has_error,has_export,num_functions,has_test,num_headers
0,3,594,True,False,False,False,0.507588,1.0,0.044444,3.0,...,1.0,True,0.0,1.0,1.809524,False,False,0.0,False,7.0
1,6,1222,True,False,False,False,1.0,0.0,0.16129,1.0,...,2.0,True,0.961538,1.0,0.0,False,False,0.0,False,5.0
2,7,1447,True,False,False,False,0.970851,0.011364,0.375887,1.0,...,1.0,True,0.988636,1.0,0.0,False,True,15.0,False,30.0
3,12,2705,True,False,False,False,1.0,0.5,0.461538,1.0,...,1.0,False,0.0,0.923077,1.615385,False,False,6.0,False,7.0
4,15,2861,True,False,False,True,1.0,0.214286,0.461538,1.0,...,1.0,False,0.0,1.0,6.333333,False,False,0.0,False,1.0


In [4]:
# Clear the first two columns, which hold indexes and notebook IDs.
# `md_df` is overwritten by this cell, so on a second run the columns are
# already gone; `errors = 'ignore'` makes the cell safe to re-run instead of
# raising a KeyError.
md_df = md_df.drop(columns = ['Unnamed: 0', 'nb_id'], errors = 'ignore')

In [5]:
# check the data again: the index and notebook ID columns should no longer appear
md_df.head()

Unnamed: 0,longer_beginning,longer_ending,has_author,has_equation,jupyter_prop,output_cell_prop,markdown_prop,num_contrib,image_prop,is_education,...,num_commits,md_format,non_exec_prop,exec_inorder,exec_skips,has_error,has_export,num_functions,has_test,num_headers
0,True,False,False,False,0.507588,1.0,0.044444,3.0,0.0,True,...,1.0,True,0.0,1.0,1.809524,False,False,0.0,False,7.0
1,True,False,False,False,1.0,0.0,0.16129,1.0,0.0,True,...,2.0,True,0.961538,1.0,0.0,False,False,0.0,False,5.0
2,True,False,False,False,0.970851,0.011364,0.375887,1.0,0.0,True,...,1.0,True,0.988636,1.0,0.0,False,True,15.0,False,30.0
3,True,False,False,False,1.0,0.5,0.461538,1.0,0.0,True,...,1.0,False,0.0,0.923077,1.615385,False,False,6.0,False,7.0
4,True,False,False,True,1.0,0.214286,0.461538,1.0,0.0,False,...,1.0,False,0.0,1.0,6.333333,False,False,0.0,False,1.0


In [6]:
# collect the column titles (iterating over a dataframe yields its column labels)
md_vars = [column for column in md_df]
md_vars

['longer_beginning',
 'longer_ending',
 'has_author',
 'has_equation',
 'jupyter_prop',
 'output_cell_prop',
 'markdown_prop',
 'num_contrib',
 'image_prop',
 'is_education',
 'has_links',
 'has_comments',
 'md_frequency',
 'has_title',
 'num_commits',
 'md_format',
 'non_exec_prop',
 'exec_inorder',
 'exec_skips',
 'has_error',
 'has_export',
 'num_functions',
 'has_test',
 'num_headers']

## No Markdown Cells

In [7]:
# location of the CSV file holding the no markdown cell group
no_md_filepath = 'no_markdown_group.csv'
# read the CSV into a dataframe
no_md_df = pd.read_csv(no_md_filepath)

In [8]:
# Clear the first two columns, which hold indexes and notebook IDs.
# `no_md_df` is overwritten by this cell, so on a second run the columns are
# already gone; `errors = 'ignore'` makes the cell safe to re-run instead of
# raising a KeyError.
no_md_df = no_md_df.drop(columns = ['Unnamed: 0', 'nb_id'], errors = 'ignore')

In [9]:
# initial look at the data: preview the first rows of the no markdown cell group
no_md_df.head()

Unnamed: 0,has_author,jupyter_prop,output_cell_prop,num_contrib,image_prop,is_education,has_comments,num_commits,non_exec_prop,exec_inorder,exec_skips,has_error,has_export,num_functions,has_test
0,False,0.71413,0.538462,1.0,0.142857,False,False,1.0,0.0,1.0,1.0,False,True,2.0,False
1,False,0.99784,0.780488,1.0,0.1875,False,True,1.0,0.04878,0.789474,10.631579,True,True,5.0,False
2,False,0.121957,0.461538,5.0,0.0,False,True,11.0,0.0,0.909091,6.181818,False,True,0.0,False
3,False,1.0,0.190476,1.0,0.0,False,True,1.0,0.0,1.0,1.631579,False,False,1.0,False
4,False,0.940063,0.125,3.0,0.0,False,True,3.0,0.125,1.0,1.0,True,False,3.0,False


In [10]:
# collect the column titles (iterating over a dataframe yields its column labels)
no_md_vars = [column for column in no_md_df]
no_md_vars

['has_author',
 'jupyter_prop',
 'output_cell_prop',
 'num_contrib',
 'image_prop',
 'is_education',
 'has_comments',
 'num_commits',
 'non_exec_prop',
 'exec_inorder',
 'exec_skips',
 'has_error',
 'has_export',
 'num_functions',
 'has_test']

# Principal Component Analysis (PCA)

## Adjust the Data

Since PCA is performed on quantitative variables, we change `True` and `False` in the data to `1` and `0`, respectively.

In [217]:
# Turn the boolean flags into numbers in both groups with one explicit cast.
# Every column is either bool or float64, so `astype(float)` maps True -> 1.0
# (and False -> 0.0) while leaving the float columns unchanged. This avoids
# `replace(True, 1)`, which depends on pandas implicitly downcasting the
# result dtype -- behaviour that recent pandas releases deprecate.
md_adjusted = md_df.astype(float)
no_md_adjusted = no_md_df.astype(float)

In [218]:
# Make sure every column in both groups is numeric: casting to float turns any
# remaining False into 0.0 (and True / 1 into 1.0) and leaves float columns
# unchanged. An explicit cast is used instead of `replace(False, 0)`, which
# depends on pandas implicitly downcasting the result dtype -- behaviour that
# recent pandas releases deprecate.
md_adjusted = md_adjusted.astype(float)
no_md_adjusted = no_md_adjusted.astype(float)

In [219]:
# check the markdown cell group: the boolean columns should now hold numbers
md_adjusted.head()

Unnamed: 0,longer_beginning,longer_ending,has_author,has_equation,jupyter_prop,markdown_prop,num_contrib,image_prop,is_education,has_links,has_comments,md_frequency,has_title,num_commits,md_format,non_exec_prop,exec_inorder,exec_skips,has_error
0,1.0,0.0,0.0,0.0,0.507588,0.044444,3.0,0.0,1.0,1.0,0.0,0.068182,1.0,1.0,1.0,0.0,1.0,1.809524,0.0
1,1.0,0.0,0.0,0.0,1.0,0.16129,1.0,0.0,1.0,1.0,1.0,0.333333,0.0,2.0,1.0,0.961538,1.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.970851,0.375887,1.0,0.0,1.0,1.0,1.0,0.621429,1.0,1.0,1.0,0.988636,1.0,0.0,0.0
3,1.0,0.0,0.0,0.0,1.0,0.461538,1.0,0.0,1.0,1.0,1.0,0.64,1.0,1.0,0.0,0.0,0.923077,1.615385,0.0
4,1.0,0.0,0.0,1.0,1.0,0.461538,1.0,0.0,0.0,0.0,1.0,0.52,0.0,1.0,0.0,0.0,1.0,6.333333,0.0


In [220]:
# check the no markdown cell group: the boolean columns should now hold numbers
no_md_adjusted.head()

Unnamed: 0,has_author,jupyter_prop,num_contrib,image_prop,is_education,has_comments,num_commits,non_exec_prop,exec_inorder,exec_skips,has_error
0,0.0,0.71413,1.0,0.142857,0.0,0.0,1.0,0.0,1.0,1.0,0.0
1,0.0,0.99784,1.0,0.1875,0.0,1.0,1.0,0.04878,0.789474,10.631579,1.0
2,0.0,0.121957,5.0,0.0,0.0,1.0,11.0,0.0,0.909091,6.181818,0.0
3,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.631579,0.0
4,0.0,0.940063,3.0,0.0,0.0,1.0,3.0,0.125,1.0,1.0,1.0


## Standardizing the Data

In [221]:
# standardize every variable of the markdown cell group:
# pull the columns out as a plain array and rescale them in one step
md_adjusted_stand = StandardScaler().fit_transform(md_adjusted[md_vars].values)

In [222]:
# check standardization: prints the mean and standard deviation taken over the
# whole array (all columns pooled together), which should be close to 0 and 1
print(np.mean(md_adjusted_stand), np.std(md_adjusted_stand))

7.602639141268273e-17 1.0


In [223]:
# wrap the standardized array in a dataframe again, restoring the column titles
md_adjusted_stand_df = pd.DataFrame(md_adjusted_stand, columns = md_vars)

# preview the standardized data
md_adjusted_stand_df.head()

Unnamed: 0,longer_beginning,longer_ending,has_author,has_equation,jupyter_prop,markdown_prop,num_contrib,image_prop,is_education,has_links,has_comments,md_frequency,has_title,num_commits,md_format,non_exec_prop,exec_inorder,exec_skips,has_error
0,0.915797,-0.590961,-0.221343,-0.490296,-1.293088,-1.94205,0.504838,-0.960204,0.856909,1.280391,-2.838192,-1.964232,0.741665,-0.399981,1.082388,-0.410421,0.690767,-0.337021,-0.393367
1,0.915797,-0.590961,-0.221343,-0.490296,0.570932,-1.28607,-0.201473,-0.960204,0.856909,1.280391,0.352337,-0.934795,-1.348317,-0.05639,1.082388,3.185141,0.690767,-0.487356,-0.393367
2,0.915797,-0.590961,-0.221343,-0.490296,0.460589,-0.081313,-0.201473,-0.960204,0.856909,1.280391,0.352337,0.18372,0.741665,-0.399981,1.082388,3.28647,0.690767,-0.487356,-0.393367
3,0.915797,-0.590961,-0.221343,-0.490296,0.570932,0.399542,-0.201473,-0.960204,0.856909,1.280391,0.352337,0.255823,0.741665,-0.399981,-0.923883,-0.410421,0.003778,-0.35315,-0.393367
4,0.915797,-0.590961,-0.221343,2.039586,0.570932,0.399542,-0.201473,-0.960204,-1.166986,-0.781011,0.352337,-0.210071,-1.348317,-0.399981,-0.923883,-0.410421,0.690767,0.038819,-0.393367


In [224]:
# standardize every variable of the no markdown cell group:
# pull the columns out as a plain array and rescale them in one step
no_md_adjusted_stand = StandardScaler().fit_transform(no_md_adjusted[no_md_vars].values)

In [225]:
# check standardization: prints the mean and standard deviation taken over the
# whole array (all columns pooled together), which should be close to 0 and 1
print(np.mean(no_md_adjusted_stand), np.std(no_md_adjusted_stand))

-3.909057653963651e-17 1.0


In [226]:
# wrap the standardized array in a dataframe again, restoring the column titles
no_md_adjusted_stand_df = pd.DataFrame(no_md_adjusted_stand, columns = no_md_vars)

# preview the standardized data
no_md_adjusted_stand_df.head()

Unnamed: 0,has_author,jupyter_prop,num_contrib,image_prop,is_education,has_comments,num_commits,non_exec_prop,exec_inorder,exec_skips,has_error
0,-0.094526,-0.68694,-0.256338,-0.263196,-0.347507,-1.451826,-0.336957,-0.42506,0.52516,-0.371956,-0.435477
1,-0.094526,0.546025,-0.256338,-0.131354,-0.347507,0.688788,-0.336957,-0.23371,-0.681648,0.365723,2.296334
2,-0.094526,-3.260437,1.489081,-0.685091,-0.347507,0.688788,6.280101,-0.42506,0.004038,0.024918,-0.435477
3,-0.094526,0.55541,-0.256338,-0.685091,-0.347507,0.688788,-0.336957,-0.42506,0.52516,-0.323583,-0.435477
4,-0.094526,0.294933,0.616371,-0.685091,-0.347507,0.688788,0.986454,0.065274,0.52516,-0.371956,2.296334


## Markdown Cells

### Performing PCA

In [227]:
# call PCA on the dataset, requesting as many components as there are variables
pca_md = PCA(n_components = len(md_vars))
# fit the model on the standardized markdown cell group and get its coordinates on the components
pc_md = pca_md.fit_transform(md_adjusted_stand)

In [228]:
# store the component scores in a dataframe, one labelled column per principal component
pca_md_df = pd.DataFrame(
    pc_md,
    columns = [f'principal component {idx}' for idx in range(len(md_vars))],
)

# preview the results of the PCA
pca_md_df.head()

Unnamed: 0,principal component 0,principal component 1,principal component 2,principal component 3,principal component 4,principal component 5,principal component 6,principal component 7,principal component 8,principal component 9,principal component 10,principal component 11,principal component 12,principal component 13,principal component 14,principal component 15,principal component 16,principal component 17,principal component 18
0,0.325632,2.486135,-0.434603,-0.662086,-0.871493,2.313732,0.464941,-0.085904,1.41165,-2.071667,-1.24266,-0.325391,0.337758,0.112486,0.04997,0.648099,-1.139459,-0.547465,-1.359421
1,0.850372,1.964054,-0.791791,-2.924954,0.506385,-0.983145,-0.74338,1.794124,-0.508036,-1.007058,0.321607,-0.258157,0.998064,0.158485,-0.05322,0.187462,-0.580699,0.106422,-0.437267
2,1.989086,0.797023,-1.404567,-2.214956,1.127977,-0.188968,-0.073963,1.062631,-0.271357,-0.914284,1.193776,0.387692,1.237828,0.432216,-0.342647,-0.053309,-0.392262,0.449585,-0.385573
3,0.807969,-0.854003,-0.589295,-0.710216,0.724741,0.58588,0.295259,0.118766,-0.033199,-0.930153,-0.321548,0.055024,-1.164403,0.325895,0.229711,0.325803,-0.758573,1.032983,0.246042
4,-0.685577,-0.632618,-0.671905,-0.009599,-0.206052,-1.158051,-0.66403,0.57325,1.185572,-0.06996,0.821988,-1.176089,-1.239776,-1.165998,0.851267,0.976175,0.247722,-0.617327,0.810952


### Analyze the Results of PCA

In [229]:
# extract the explained variance ratios (one value per principal component)
print(pca_md.explained_variance_ratio_)

[0.16405735 0.09253741 0.08574525 0.07068464 0.06554028 0.05693957
 0.05393053 0.04959195 0.04624602 0.04467132 0.04339441 0.04109365
 0.03401479 0.03016097 0.0294863  0.02710162 0.02583591 0.02240953
 0.01655849]


In [230]:
# examine each principal component with each feature:
# the fitted components are tabulated with one column per original variable
md_pca_var = pd.DataFrame(pca_md.components_, columns = md_vars)
md_pca_var

Unnamed: 0,longer_beginning,longer_ending,has_author,has_equation,jupyter_prop,markdown_prop,num_contrib,image_prop,is_education,has_links,has_comments,md_frequency,has_title,num_commits,md_format,non_exec_prop,exec_inorder,exec_skips,has_error
0,0.285941,0.307101,0.084662,0.192641,-0.116514,0.414784,0.078877,-0.140132,0.363885,0.382156,0.036146,0.228698,0.186034,0.096104,0.381265,0.143621,0.110146,-0.094329,-0.072382
1,-0.288496,0.050112,0.072435,-0.029529,-0.335399,-0.262235,0.430237,-0.264251,0.070177,0.090323,-0.136037,-0.444716,-0.155943,0.273586,0.124651,0.288457,0.184996,-0.091546,0.045496
2,-0.005633,0.05723,0.196835,0.142991,-0.101358,-0.077816,0.001696,0.185093,0.132622,0.178538,0.164829,-0.162013,-0.118717,0.234797,0.148224,-0.199409,-0.521322,0.578203,0.216741
3,-0.002933,0.058328,0.024464,0.077071,-0.432527,0.225516,0.450558,0.340499,-0.190698,-0.232117,-0.141803,0.248416,0.108025,0.251062,-0.215664,-0.264335,-0.024007,-0.00205,-0.263521
4,0.347004,-0.238179,-0.084076,-0.355716,-0.111945,0.040386,0.200143,-0.345431,-0.194304,-0.047419,0.287275,0.246118,0.163192,0.228308,-0.147761,0.223769,-0.135978,0.069582,0.398266
5,-0.085344,0.106391,-0.087946,-0.320845,-0.066453,0.126454,0.024065,-0.329605,0.022425,0.085199,-0.637554,-0.051721,0.290083,-0.284571,-0.02757,-0.115401,-0.31159,0.216895,-0.029096
6,-0.024561,-0.086823,0.786485,-0.153616,0.046541,-0.048024,-0.152149,-0.022773,-0.185179,0.063425,0.046788,-0.136122,0.447683,0.107736,0.002924,-0.137838,0.137075,-0.093467,-0.062997
7,0.247751,-0.195482,0.226244,-0.068564,0.379966,0.041954,0.065023,-0.027564,-0.018023,-0.011351,-0.295672,0.080887,-0.363774,0.302278,-0.020847,0.331163,-0.242417,0.030638,-0.453285
8,0.18788,-0.458557,-0.056694,0.567682,0.149459,0.028491,0.094952,-0.011902,-0.01779,-0.061789,-0.391835,-0.125206,0.195191,0.10843,0.089609,-0.11645,0.025002,-0.12332,0.368804
9,-0.278342,0.477272,0.24408,0.102523,0.191244,0.136159,-0.096159,-0.046555,0.054506,-0.287274,-0.204121,0.25398,-0.134804,0.200284,-0.234026,0.16098,-0.021766,-0.075286,0.470473


- principal component 0: `markdown_prop`
- principal component 1: `md_frequency`, `num_contrib`
- principal component 2: `exec_inorder`, `exec_skips`

## No Markdown Cells

### Performing PCA

In [231]:
# call PCA on the dataset, requesting as many components as there are variables
pca_no_md = PCA(n_components = len(no_md_vars))
# fit the model on the standardized no markdown cell group and get its coordinates on the components
pc_no_md = pca_no_md.fit_transform(no_md_adjusted_stand)

In [232]:
# store the component scores in a dataframe, one labelled column per principal component
pca_no_md_df = pd.DataFrame(
    pc_no_md,
    columns = [f'principal component {idx}' for idx in range(len(no_md_vars))],
)

# preview the results of the PCA
pca_no_md_df.head()

Unnamed: 0,principal component 0,principal component 1,principal component 2,principal component 3,principal component 4,principal component 5,principal component 6,principal component 7,principal component 8,principal component 9,principal component 10
0,-0.893334,-0.524358,-0.55799,1.153832,-0.612463,0.477258,0.286244,0.188093,-0.550187,0.010195,-0.060608
1,1.796207,-0.522384,1.105356,0.051893,-0.33725,-0.253991,-1.346783,-0.182888,0.32491,-0.092964,0.401822
2,-0.766702,6.184324,-0.229399,0.522036,0.28679,-0.26411,-1.5266,1.184071,-3.0719,0.078415,-1.116162
3,-0.299909,-0.53915,-0.087177,-0.537409,-0.241197,-0.28183,-0.20065,-0.757863,-0.052443,0.510133,-0.787259
4,0.379913,0.570896,1.053284,-0.151507,-0.585976,-0.573586,-2.347405,0.228461,0.061454,0.475055,0.371588


### Analyze the Results of PCA

In [233]:
# extract the explained variance ratios (one value per principal component)
print(pca_no_md.explained_variance_ratio_)

[0.14487747 0.11721992 0.1108202  0.10747086 0.09345702 0.0832355
 0.07703527 0.07193561 0.06973362 0.06435767 0.05985687]


In [234]:
# examine each principal component with each feature:
# the fitted components are tabulated with one column per original variable
no_md_pca_var = pd.DataFrame(pca_no_md.components_, columns = no_md_vars)
no_md_pca_var

Unnamed: 0,has_author,jupyter_prop,num_contrib,image_prop,is_education,has_comments,num_commits,non_exec_prop,exec_inorder,exec_skips,has_error
0,-0.07796,0.218722,-0.201551,0.275545,-0.130421,0.193203,0.041966,-0.334509,-0.51103,0.503892,0.38273
1,0.152516,-0.406227,0.55243,0.122769,-0.013507,0.249644,0.620293,-0.039759,-0.10004,0.150655,-0.099567
2,0.422296,-0.09567,0.008768,-0.590601,0.019222,0.155617,-0.106086,0.449732,-0.242202,0.152586,0.378923
3,-0.281288,-0.497047,0.194897,-0.268845,0.129516,-0.557427,-0.196965,-0.346389,-0.157662,0.125616,0.191423
4,-0.223756,-0.087359,-0.230465,0.009006,0.885108,0.223421,0.076566,0.11867,-0.099391,0.110357,-0.117352
5,0.770579,-0.21974,-0.284887,0.260221,0.214538,-0.173985,-0.037504,-0.339071,0.09106,-0.080071,0.04087
6,0.157259,-0.042017,0.014937,0.080415,-0.10119,-0.198097,-0.275393,0.231249,-0.272875,0.481311,-0.694186
7,0.06399,0.393428,0.002759,0.069811,0.136451,-0.655305,0.506356,0.278243,-0.203352,-0.052168,0.101516
8,0.165593,0.385746,0.676924,0.166861,0.30379,0.045157,-0.417174,-0.096796,-0.167529,-0.160298,0.078855
9,0.089319,0.344508,0.123094,-0.327837,0.121861,-0.043609,0.117353,-0.266636,0.581166,0.557669,-0.018433


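These numbers are bad :( In both groups the first principal component explains only about 15% of the variance (0.164 with markdown cells, 0.145 without), so no small set of components summarizes the data well.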

# Factor Analysis for Mixed Data (FAMD)

## Markdown Cells

### Performing FAMD

In [11]:
# set up FAMD with one component per variable, then fit it to the
# markdown cell group (the original dataframe, booleans included)
md_famd = prince.FAMD(
    n_components = len(md_vars),
    n_iter = 10,
    copy = True,
    check_input = True,
    engine = 'auto',
    random_state = 42,
)
md_famd_fit = md_famd.fit(md_df)

In [12]:
# put the results into dataframe format: coordinates of each row on the FAMD components
md_famd_df = md_famd_fit.row_coordinates(md_df)

# initial look at the results of the FAMD
md_famd_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,0.151264,-0.016238,1.488211,0.45845,-1.589972,0.706413,-1.179598,0.197602,-0.548496,-0.086474,...,-0.316891,0.13637,-0.525482,-0.016051,-0.224745,0.39955,-0.211204,0.354389,-0.031304,0.011708
1,0.920734,2.031507,1.786297,-1.099412,0.36402,-0.325544,0.303062,-0.528449,0.088303,0.016506,...,-0.448427,0.096455,-0.059392,-0.09508,-0.090741,0.069583,-0.028761,-0.077988,-0.02847,0.003315
2,2.394507,1.62903,0.637295,-0.847658,1.335335,0.595721,-0.11062,-0.592877,0.648553,0.408211,...,-0.154546,0.320923,0.010213,0.105593,-0.272322,0.264775,-0.007857,-0.059822,-0.033959,-0.059553
3,0.983406,-0.371839,0.411791,-0.736242,-0.034379,-0.056684,-0.256872,0.242391,-0.603736,0.078825,...,-0.200589,-0.057553,-0.064881,-0.311742,-0.085537,0.268807,0.280837,-0.154268,-0.037737,0.044633
4,0.35139,0.311371,0.546583,-0.860767,-0.15327,-0.396782,-0.128402,0.152495,-0.315516,0.012931,...,-0.467354,-0.333976,0.434454,0.159937,0.533116,0.077951,-0.038284,-0.111949,-0.025086,-0.010907


### Analyzing the Results of FAMD

In [13]:
# extract the explained variance (inertia) of each FAMD component
md_famd_fit.explained_inertia_

[0.16165843053673734,
 0.1321790759875022,
 0.1041149855964393,
 0.10112067694191446,
 0.08242453652892333,
 0.06562781875633061,
 0.05778213647460086,
 0.05520837536384379,
 0.052383826681710745,
 0.03755124187834275,
 0.033493926213676556,
 0.025037180995498776,
 0.020799460215050542,
 0.012671838475843075,
 0.00993050530211492,
 0.00812257285670816,
 0.007742686697860873,
 0.006771507712111673,
 0.006115921095084764,
 0.005705199862300251,
 0.005095700391373615,
 0.005006891203333915,
 0.002088711775235872,
 0.0013667924574620037]

In [14]:
# examine correlation between variables and components
md_famd_fit.column_correlations(md_df)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
exec_inorder,0.234929,0.449313,-0.390067,-0.301679,-0.341185,0.174854,0.124916,-0.040062,-0.22441,-0.217556,...,0.090516,-0.168625,-0.059331,-0.06242,0.079139,-0.429122,0.409531,-0.145562,0.075952,-0.178558
exec_skips,-0.33402,-0.952876,0.737493,0.761515,0.833969,-0.214797,-0.387349,0.061947,0.654457,0.664741,...,-0.334968,0.230276,0.126971,-0.041021,-0.061052,0.858486,-0.933464,0.150006,-0.248628,0.2455
has_author,0.113798,-0.03387,-0.048306,0.127744,0.057367,0.062384,-0.08256,-0.148603,0.059003,0.052922,...,-0.160672,-0.101538,0.062769,-0.139335,0.155837,0.021844,-0.097428,-0.128944,-0.168193,-0.116108
has_comments,0.074202,-0.118541,-0.021382,0.09321,0.170974,0.148487,-0.021848,0.035859,0.144555,0.047954,...,-0.061227,-0.144036,-0.128495,-0.129577,0.017611,0.058411,-0.102379,-0.09405,-0.063929,-0.103199
has_equation,0.167054,-0.013117,-0.113965,0.1096,0.055566,0.112763,-0.117945,-0.138181,0.106301,0.097052,...,-0.170366,-0.139593,0.059664,-0.165369,0.173697,-0.038731,-0.057228,-0.17654,-0.169669,-0.165536
has_error,0.016425,-0.11815,0.046579,0.110685,0.143899,0.062444,-0.040835,0.014703,0.101525,0.072277,...,-0.061035,-0.064612,-0.048536,-0.073358,0.018977,0.084379,-0.12141,-0.030489,-0.060949,-0.035606
has_export,0.083738,-0.054923,-0.028551,0.106715,0.139116,0.155437,0.041635,0.067452,0.092885,0.060151,...,-0.034664,-0.147845,-0.168732,-0.13244,-0.000916,0.029142,-0.060233,-0.082321,-0.044283,-0.080366
has_links,0.316549,-0.003326,-0.21009,0.19482,0.097475,0.207678,-0.234494,-0.274738,0.161064,0.148825,...,-0.322487,-0.249592,0.128768,-0.315411,0.335069,-0.06685,-0.117754,-0.340473,-0.330681,-0.315823
has_test,0.134574,-0.01999,-0.103533,0.055622,0.078492,0.146218,-0.069817,-0.040031,0.107365,0.04822,...,-0.09826,-0.153678,-0.051258,-0.147962,0.093003,-0.026304,-0.038376,-0.138616,-0.092707,-0.136202
has_title,0.209921,0.023085,-0.180861,0.053025,0.035125,0.157213,-0.150063,-0.136079,0.107321,0.071043,...,-0.165864,-0.170878,0.04493,-0.189105,0.186133,-0.096203,-0.018026,-0.215527,-0.154399,-0.218087


The components are mostly correlated with `exec_skips`, `exec_inorder`, and `num_headers`.

## No Markdown Cells

### Performing FAMD

In [15]:
# set up FAMD with one component per variable, then fit it to the
# no markdown cell group (the original dataframe, booleans included)
no_md_famd = prince.FAMD(
    n_components = len(no_md_vars),
    n_iter = 10,
    copy = True,
    check_input = True,
    engine = 'auto',
    random_state = 42,
)
no_md_famd_fit = no_md_famd.fit(no_md_df)

In [16]:
# put the results into dataframe format: coordinates of each row on the FAMD components
no_md_famd_df = no_md_famd_fit.row_coordinates(no_md_df)

# initial look at the results of the FAMD
no_md_famd_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,0.238341,-0.041101,0.318639,-0.36706,-0.820038,0.18818,-0.273049,-0.02116,0.153062,-0.359486,0.890841,-0.135701,-0.026194,-0.02136,0.00064
1,1.602259,0.978673,-0.326747,-0.036254,-0.621872,0.068524,0.295055,0.366063,-0.203034,0.353563,0.73831,0.524489,-0.082758,-0.036208,0.000515
2,1.29551,0.433946,4.438051,1.262301,1.351632,0.66979,-2.135724,1.329833,-0.258067,-0.784256,0.297577,-0.06458,-0.210691,-0.053966,-0.0007
3,0.870027,-0.535298,-0.130196,-0.605251,-0.386296,-0.347709,0.115578,0.386044,0.117077,-0.402222,-0.726199,-0.069342,-0.175426,0.005678,-0.000746
4,1.207722,-0.695831,0.646973,-0.072401,0.236245,-0.010193,0.16219,0.64697,-0.022693,-0.286542,-0.635886,1.05558,-0.131429,-0.009032,-0.000672


### Analyzing the Results of FAMD

In [17]:
# extract the explained variance (inertia) of each FAMD component
no_md_famd_fit.explained_inertia_

[0.1571846512526245,
 0.1524392015586379,
 0.11311098216869997,
 0.10010037859397894,
 0.08807454085849782,
 0.07942396066839362,
 0.06765275009484199,
 0.06326025016079549,
 0.06089663648831785,
 0.044728928182224695,
 0.032520569707825575,
 0.0215741399734718,
 0.01735194848556167,
 0.0015473312968644344,
 0.00013373050926396754]

In [18]:
# examine correlation between variables and components
no_md_famd_fit.column_correlations(no_md_df)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
exec_inorder,-0.263037,-0.306972,0.241029,-0.299762,-0.238256,0.112387,0.132399,0.166809,-0.279348,-0.219583,0.266619,0.267588,-0.183239,-0.239765,0.042609
exec_skips,0.79614,0.996072,-0.701203,0.963491,0.761972,-0.47977,-0.458082,-0.484293,0.999725,0.865353,-0.888774,-0.654988,0.519608,0.870655,-0.310356
has_author,0.024424,-0.022037,1.6e-05,0.003246,0.019394,0.058682,0.031744,-0.009038,-0.016876,-0.026287,-0.006875,-0.023745,0.040189,0.026133,0.054629
has_comments,0.180278,0.085923,-0.113812,0.125926,0.182861,0.097326,-0.010126,-0.090017,0.0851,-0.006217,-0.154493,-0.14727,0.176428,-0.006141,0.123079
has_error,0.193111,0.148359,-0.199865,0.167752,0.090065,0.043228,-0.053252,-0.142257,0.140348,0.079375,-0.181087,-0.194735,0.168833,0.073021,0.058683
has_export,0.134911,0.047062,-0.099539,0.092877,0.089145,0.084962,0.012373,-0.086857,0.059039,-0.036942,-0.109766,-0.119064,0.137907,-0.009138,0.098111
has_test,-0.011583,-0.007423,0.005769,-0.008841,-0.011429,-0.009329,-0.008905,0.001648,-0.005361,0.00776,0.012117,0.013163,-0.017436,-0.002442,0.193489
image_prop,0.021244,0.106598,-0.016936,0.02411,0.190988,-0.066878,-0.106354,-0.088574,0.061262,0.057674,0.001321,0.05803,0.005149,0.108985,-0.064078
is_education,-0.073007,-0.032174,0.049082,-0.05416,-0.051962,-0.056162,-0.05462,0.074674,-0.032948,0.007536,0.062764,0.072897,-0.070116,0.005962,-0.0803
jupyter_prop,0.064159,0.023196,-0.169869,0.004555,0.026098,0.029216,-0.062274,-0.00016,0.018104,-0.00319,-0.039454,-0.091282,0.041919,0.036744,0.036136


Seems to mostly be `exec_skips`, `exec_inorder`, and `num_functions`

# Multiple Factor Analysis (MFA)

## Markdown Cells

### Grouping Variables

In [19]:
# look at the current overall variables we have and their types,
# to tell the qualitative (bool) variables from the quantitative (float64) ones
md_df.dtypes

longer_beginning       bool
longer_ending          bool
has_author             bool
has_equation           bool
jupyter_prop        float64
output_cell_prop    float64
markdown_prop       float64
num_contrib         float64
image_prop          float64
is_education           bool
has_links              bool
has_comments           bool
md_frequency        float64
has_title              bool
num_commits         float64
md_format              bool
non_exec_prop       float64
exec_inorder        float64
exec_skips          float64
has_error              bool
has_export             bool
num_functions       float64
has_test               bool
num_headers         float64
dtype: object

We divide as follows (groups must be **either** quantitative or qualitative):
- Group 1 (markdown analysis - qualitative):
    - `longer_beginning`
    - `longer_ending`
    - `has_equation`
    - `has_links`
    - `md_format`
    - `has_title`
- Group 2 (markdown analysis - quantitative):
    - `markdown_prop`
    - `md_frequency`
    - `num_headers`
- Group 3 (repo analysis - quantitative)
    - `jupyter_prop`
    - `num_contrib`
    - `num_commits`
- Group 4 (code analysis - quantitative)
    - `image_prop`
    - `non_exec_prop`
    - `exec_inorder`
    - `exec_skips`
    - `output_cell_prop`
    - `num_functions`
- Group 5 (code analysis - qualitative)
    - `has_comments`
    - `has_error`
    - `has_export`
    - `has_test`
- Group 6 (notebook analysis - qualitative)
    - `has_author`
    - `is_education`

In [20]:
# variable groups for the markdown cell group, keyed by group name;
# each group holds either only qualitative or only quantitative variables
md_groups = {
    'md_qual' : [
        'longer_beginning',
        'longer_ending',
        'has_equation',
        'has_links',
        'md_format',
        'has_title',
    ],
    'md_quant' : [
        'markdown_prop',
        'md_frequency',
        'num_headers',
    ],
    'repo_quant' : [
        'jupyter_prop',
        'num_contrib',
        'num_commits',
    ],
    'exec_quant' : [
        'image_prop',
        'non_exec_prop',
        'exec_inorder',
        'exec_skips',
        'output_cell_prop',
        'num_functions',
    ],
    'code_qual' : [
        'has_comments',
        'has_error',
        'has_export',
        'has_test',
    ],
    'nb_qual' : [
        'has_author',
        'is_education',
    ],
}

### Performing MFA

In [21]:
# set up MFA over the variable groups with one component per variable,
# then fit it to the markdown cell group
md_mfa = prince.MFA(
    groups = md_groups,
    n_components = len(md_vars),
    n_iter = 10,
    copy = True,
    check_input = True,
    normalize = True,
    engine = 'auto',
    random_state = 42,
)
md_mfa_fit = md_mfa.fit(md_df)

In [22]:
# put the results into dataframe format: coordinates of each row on the MFA components
md_mfa_df = md_mfa_fit.row_coordinates(md_df)

# initial look at the results of the MFA
md_mfa_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,1.329479,-0.261022,1.59824,0.715299,-1.300145,-0.258437,-1.475033,-0.075163,-1.043445,-0.354667,...,0.338224,0.055552,-0.365722,-0.885537,0.667769,0.202704,0.195459,-0.19344,0.206925,0.023263
1,1.739494,0.455736,2.243607,-1.961575,0.607371,0.049031,0.174679,-0.762567,0.334262,-0.015004,...,0.324494,-0.427176,-0.076135,-0.343653,-0.035284,-0.097125,-0.074294,-0.033785,-0.036606,0.012629
2,2.656255,1.388076,0.680538,-1.910793,1.629674,-0.147676,-0.598535,0.316445,0.590572,0.629562,...,0.396023,0.111631,-0.024762,-0.70285,0.057599,0.066177,-0.140887,0.024488,-0.140719,-0.090184
3,1.93643,-0.46054,-0.12789,-0.63328,-0.105054,0.104784,0.037845,-0.109388,-0.771514,-0.061863,...,-0.329748,-0.116029,-0.055168,-0.395268,-0.071723,-0.313318,0.107377,0.018621,-0.484851,0.099922
4,0.747744,-0.382896,0.451277,-0.945464,-0.276526,0.27877,0.36756,-0.128137,-0.273839,-0.157842,...,-0.276123,-0.772197,0.320142,0.249498,-0.705959,0.293919,0.495561,-0.212273,-0.04677,-0.026722


### Analyzing the Results of MFA

In [23]:
# extract the explained variance (inertia) of each MFA component
md_mfa_fit.explained_inertia_

[0.24580185486678413,
 0.1102355850641012,
 0.09630085328239772,
 0.08606298095312297,
 0.06317366334741077,
 0.05269770916226406,
 0.05221764561153017,
 0.04746344598601411,
 0.03989281715107279,
 0.034485674923231326,
 0.02534492318819397,
 0.023386698792497513,
 0.020380335721019067,
 0.018014514450558433,
 0.01275212577967654,
 0.011666607349645025,
 0.010365809239595016,
 0.00979780688471967,
 0.009175916481371848,
 0.008286260222896193,
 0.007173262154560836,
 0.006650875510819488,
 0.006129444230448947,
 0.0025431896460671698]

In [24]:
# examine correlation between variables and components of the fitted MFA
md_mfa_fit.column_correlations(md_df)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
exec_inorder,0.103403,0.280708,-0.112073,-0.376729,-0.302922,-0.377453,-0.03532,0.073308,-0.128499,-0.105407,...,-0.153503,0.047823,-0.13479,0.318769,0.021865,-0.124574,-0.100807,0.132591,-0.448599,-0.180892
exec_skips,-0.04718,-0.447719,0.109189,0.875469,0.767174,0.756173,-0.082469,-0.045743,0.28876,0.45688,...,0.187088,-0.203405,0.162038,-0.789594,-0.197548,0.090164,0.231652,-0.369603,0.936662,0.25168
has_author,0.141812,0.109313,-0.100828,0.108026,0.094253,-0.069055,-0.120319,-0.052345,0.138818,0.05435,...,-0.144285,-0.154906,-0.123558,-0.12975,-0.157533,-0.139023,0.104987,-0.169579,0.000453,-0.115511
has_comments,0.102802,0.010605,-0.128596,0.087936,0.177543,-0.028638,-0.075085,0.145953,0.046227,0.079562,...,-0.09035,-0.003985,-0.135678,-0.107929,-0.080059,-0.078736,-0.0611,-0.068555,0.041987,-0.101629
has_equation,0.186213,0.148915,-0.165614,0.076141,0.090474,-0.092785,-0.167439,-0.003264,0.124983,0.121148,...,-0.176302,-0.158175,-0.149247,-0.110319,-0.182261,-0.173149,0.070781,-0.159785,-0.05754,-0.164961
has_error,0.049959,-0.016846,-0.044213,0.113932,0.146117,0.031726,-0.046504,0.063501,0.024715,0.070777,...,-0.039989,-0.023041,-0.053178,-0.109973,-0.056948,-0.036032,0.013075,-0.073295,0.091696,-0.034386
has_export,0.093334,0.058541,-0.074405,0.093554,0.148637,-0.056251,-0.029476,0.161672,-0.0292,0.097715,...,-0.091942,0.012993,-0.144236,-0.068081,-0.046167,-0.086189,-0.082386,-0.043039,0.009586,-0.0773
has_links,0.350994,0.283911,-0.299004,0.127646,0.166501,-0.181719,-0.320126,-0.022373,0.211383,0.187436,...,-0.330127,-0.298962,-0.283688,-0.214087,-0.357585,-0.333474,0.131347,-0.311894,-0.104785,-0.314709
has_test,0.141402,0.094791,-0.154939,0.029296,0.098001,-0.085601,-0.127418,0.089412,0.056379,0.088681,...,-0.140728,-0.069815,-0.159646,-0.0712,-0.116968,-0.130751,0.001578,-0.088479,-0.044968,-0.136241
has_title,0.21462,0.171366,-0.226018,0.007815,0.069999,-0.131321,-0.208967,0.033927,0.100597,0.114709,...,-0.208394,-0.152591,-0.189584,-0.083915,-0.197342,-0.20502,0.047319,-0.141933,-0.10926,-0.218888


Again, the components are mostly correlated with `num_headers`.

## No Markdown Cells

### Grouping Variables

In [25]:
# look at the current overall variables we have and their types,
# to tell the qualitative (bool) variables from the quantitative (float64) ones
no_md_df.dtypes

has_author             bool
jupyter_prop        float64
output_cell_prop    float64
num_contrib         float64
image_prop          float64
is_education           bool
has_comments           bool
num_commits         float64
non_exec_prop       float64
exec_inorder        float64
exec_skips          float64
has_error              bool
has_export             bool
num_functions       float64
has_test               bool
dtype: object

We simply take the groups from the previous section, but exclude the groups pertaining to markdown cells:

- Group 1 (repo analysis - quantitative)
    - `jupyter_prop`
    - `num_contrib`
    - `num_commits`
- Group 2 (code analysis - quantitative)
    - `image_prop`
    - `non_exec_prop`
    - `exec_inorder`
    - `exec_skips`
    - `output_cell_prop`
    - `num_functions`
- Group 3 (code analysis - qualitative)
    - `has_comments`
    - `has_error`
    - `has_export`
    - `has_test`
- Group 4 (notebook analysis - qualitative)
    - `has_author`
    - `is_education`

In [26]:
# variable groups for the no markdown cell group, keyed by group name;
# each group holds either only qualitative or only quantitative variables
no_md_groups = {
    'repo_quant' : [
        'jupyter_prop',
        'num_contrib',
        'num_commits',
    ],
    'exec_quant' : [
        'image_prop',
        'non_exec_prop',
        'exec_inorder',
        'exec_skips',
        'output_cell_prop',
        'num_functions',
    ],
    'code_qual' : [
        'has_comments',
        'has_error',
        'has_export',
        'has_test',
    ],
    'nb_qual' : [
        'has_author',
        'is_education',
    ],
}

### Performing MFA

In [27]:
# set up MFA over the variable groups with one component per variable,
# then fit it to the no markdown cell group
no_md_mfa = prince.MFA(
    groups = no_md_groups,
    n_components = len(no_md_vars),
    n_iter = 10,
    copy = True,
    check_input = True,
    normalize = True,
    engine = 'auto',
    random_state = 42,
)
no_md_mfa_fit = no_md_mfa.fit(no_md_df)

In [28]:
# put the results into dataframe format: coordinates of each row on the MFA components
no_md_mfa_df = no_md_mfa_fit.row_coordinates(no_md_df)

# initial look at the results of the MFA
no_md_mfa_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,0.273758,-0.124665,0.099973,-0.310792,-0.37935,0.513309,-0.768452,-0.14824,-0.242596,0.138349,-0.322996,0.881373,-0.135131,-0.081498,0.000649
1,1.033297,1.363095,-0.230777,0.531385,-0.66395,0.651212,-0.081414,0.067931,-0.512763,-0.243069,0.382403,0.700562,0.528272,-0.12485,0.000525
2,1.367534,-0.96786,5.008987,2.411495,0.805714,-1.839648,-1.920956,-0.680271,-1.185417,-0.127854,-0.62719,0.264342,-0.087028,-0.163303,-0.000691
3,0.714002,-0.140062,-0.723054,0.27923,0.122498,0.640705,-0.138096,-0.379614,-0.520976,0.073633,-0.397577,-0.750476,-0.075268,0.036293,-0.00075
4,0.967324,-0.52932,0.18281,1.188262,0.319462,0.249464,-0.053151,-0.075824,-0.606483,-0.045285,-0.273479,-0.663439,1.053611,-0.005809,-0.000673


### Analyzing the Results of MFA

In [29]:
# extract the explained variance (inertia) of each MFA component
no_md_mfa_fit.explained_inertia_

[0.15451707993628053,
 0.12538323721562156,
 0.12307883248474871,
 0.1037088761724126,
 0.0871152705569648,
 0.07744971710740069,
 0.07343022789469669,
 0.0607415396764694,
 0.05568101229405508,
 0.04927433032569942,
 0.03620413848442848,
 0.026328517935834194,
 0.017509553230600976,
 0.009469179988239616,
 0.00010848669654696081]

In [30]:
# examine correlation between variables and components of the fitted MFA
no_md_mfa_fit.column_correlations(no_md_df)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
exec_inorder,-0.131857,-0.308189,-0.135321,-0.193027,0.289684,0.296567,-0.254441,0.085144,-0.154277,-0.280175,-0.185471,0.270017,0.270483,-0.25742,0.04309
exec_skips,0.364125,0.974447,0.520736,0.573786,-0.920157,-0.931283,0.892563,-0.393976,0.413607,0.99989,0.764914,-0.911711,-0.660137,0.923906,-0.311906
has_author,0.065242,-0.017485,-0.007594,0.049702,-0.001347,-0.00094,-0.027475,0.060198,0.031711,-0.016,-0.032223,-0.002516,-0.023826,0.018132,0.054518
has_comments,0.251339,0.103962,0.026426,0.204337,-0.110145,-0.136879,0.02774,0.105957,0.152618,0.087952,-0.036842,-0.148607,-0.147268,0.018302,0.12288
has_error,0.141704,0.172063,-0.019919,0.142655,-0.185667,-0.168515,0.07472,0.055437,0.149624,0.142502,0.047294,-0.178612,-0.194582,0.092165,0.058424
has_export,0.182471,0.064733,-0.015877,0.143867,-0.095259,-0.090238,0.014148,0.093375,0.119495,0.060988,-0.063716,-0.103984,-0.11862,0.009477,0.097968
has_test,-0.006677,-0.007619,-0.00615,-0.015967,0.007619,0.006398,-0.003873,-0.01171,-0.008076,-0.005509,0.011167,0.011235,0.013137,-0.004293,0.193373
image_prop,-0.057954,0.100538,0.086403,0.011709,0.010672,-0.067425,0.044386,-0.079519,0.0981,0.061493,0.063493,0.003385,0.059776,0.100866,-0.064213
is_education,0.02874,-0.037551,-0.006356,-0.095287,0.058718,0.033133,-0.019627,-0.066809,-0.090337,-0.033524,0.025352,0.060314,0.073247,-0.00471,-0.080207
jupyter_prop,0.036538,0.07499,-0.202671,0.00318,-0.014271,-0.051478,-0.023656,0.02521,0.030681,0.019542,-0.010082,-0.037363,-0.089563,0.037956,0.036023


Again, the components are mostly correlated with `num_functions`.