# Imports

In [2]:
import pandas as pd
import prince
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt

# Loading the Data

## Markdown Cells

In [3]:
# path to the CSV holding the markdown cell group
md_filepath = 'markdown_group.csv'
# read the file into a dataframe
md_df = pd.read_csv(filepath_or_buffer = md_filepath)

In [4]:
# preview the first five rows of the raw data
md_df.head(n = 5)

Unnamed: 0.1,Unnamed: 0,nb_id,longer_beginning,longer_ending,has_author,has_equation,jupyter_prop,output_cell_prop,markdown_prop,num_contrib,...,non_exec_prop,exec_inorder,exec_skips,has_error,has_export,num_functions,has_test,num_headers,has_param,num_stars
0,3,594,True,False,False,False,0.507588,1.0,0.044444,3.0,...,0.0,1.0,1.809524,False,False,0.0,False,7.0,False,10.0
1,6,1222,True,False,False,False,1.0,0.0,0.16129,1.0,...,0.961538,1.0,0.0,False,False,0.0,False,5.0,False,0.0
2,7,1447,True,False,False,False,0.970851,0.011364,0.375887,1.0,...,0.988636,1.0,0.0,False,True,15.0,False,30.0,False,0.0
3,12,2705,True,False,False,False,1.0,0.5,0.461538,1.0,...,0.0,0.923077,1.615385,False,False,6.0,False,7.0,False,0.0
4,15,2861,True,False,False,True,1.0,0.214286,0.461538,1.0,...,0.0,1.0,6.333333,False,False,0.0,False,1.0,False,0.0


In [5]:
# drop the saved index column and the notebook ID column, which are not features;
# errors = 'ignore' keeps this cell safe to re-run once the columns are already gone
md_df = md_df.drop(columns = ['Unnamed: 0', 'nb_id'], errors = 'ignore')

In [6]:
# confirm that the index and notebook ID columns are gone
md_df.head(n = 5)

Unnamed: 0,longer_beginning,longer_ending,has_author,has_equation,jupyter_prop,output_cell_prop,markdown_prop,num_contrib,image_prop,is_education,...,non_exec_prop,exec_inorder,exec_skips,has_error,has_export,num_functions,has_test,num_headers,has_param,num_stars
0,True,False,False,False,0.507588,1.0,0.044444,3.0,0.0,True,...,0.0,1.0,1.809524,False,False,0.0,False,7.0,False,10.0
1,True,False,False,False,1.0,0.0,0.16129,1.0,0.0,True,...,0.961538,1.0,0.0,False,False,0.0,False,5.0,False,0.0
2,True,False,False,False,0.970851,0.011364,0.375887,1.0,0.0,True,...,0.988636,1.0,0.0,False,True,15.0,False,30.0,False,0.0
3,True,False,False,False,1.0,0.5,0.461538,1.0,0.0,True,...,0.0,0.923077,1.615385,False,False,6.0,False,7.0,False,0.0
4,True,False,False,True,1.0,0.214286,0.461538,1.0,0.0,False,...,0.0,1.0,6.333333,False,False,0.0,False,1.0,False,0.0


In [7]:
# collect the column names of the markdown cell group as a plain list
md_vars = md_df.columns.tolist()
md_vars

['longer_beginning',
 'longer_ending',
 'has_author',
 'has_equation',
 'jupyter_prop',
 'output_cell_prop',
 'markdown_prop',
 'num_contrib',
 'image_prop',
 'is_education',
 'has_links',
 'has_comments',
 'md_frequency',
 'has_title',
 'num_commits',
 'md_format',
 'non_exec_prop',
 'exec_inorder',
 'exec_skips',
 'has_error',
 'has_export',
 'num_functions',
 'has_test',
 'num_headers',
 'has_param',
 'num_stars']

## No Markdown Cells

In [8]:
# path to the CSV holding the no markdown cell group
no_md_filepath = 'no_markdown_group.csv'
# read the file into a dataframe
no_md_df = pd.read_csv(filepath_or_buffer = no_md_filepath)

In [9]:
# drop the saved index column and the notebook ID column, which are not features;
# errors = 'ignore' keeps this cell safe to re-run once the columns are already gone
no_md_df = no_md_df.drop(columns = ['Unnamed: 0', 'nb_id'], errors = 'ignore')

In [10]:
# preview the first five rows of the no markdown cell group
no_md_df.head(n = 5)

Unnamed: 0,has_author,jupyter_prop,output_cell_prop,num_contrib,image_prop,is_education,has_comments,num_commits,non_exec_prop,exec_inorder,exec_skips,has_error,has_export,num_functions,has_test,has_param,num_stars
0,False,0.71413,0.538462,1.0,0.142857,False,False,1.0,0.0,1.0,1.0,False,True,2.0,False,False,1.0
1,False,0.99784,0.780488,1.0,0.1875,False,True,1.0,0.04878,0.789474,10.631579,True,True,5.0,False,False,1.0
2,False,0.121957,0.461538,5.0,0.0,False,True,11.0,0.0,0.909091,6.181818,False,True,0.0,False,False,4.0
3,False,1.0,0.190476,1.0,0.0,False,True,1.0,0.0,1.0,1.631579,False,False,1.0,False,False,0.0
4,False,0.940063,0.125,3.0,0.0,False,True,3.0,0.125,1.0,1.0,True,False,3.0,False,False,0.0


In [11]:
# collect the column names of the no markdown cell group as a plain list
no_md_vars = no_md_df.columns.tolist()
no_md_vars

['has_author',
 'jupyter_prop',
 'output_cell_prop',
 'num_contrib',
 'image_prop',
 'is_education',
 'has_comments',
 'num_commits',
 'non_exec_prop',
 'exec_inorder',
 'exec_skips',
 'has_error',
 'has_export',
 'num_functions',
 'has_test',
 'has_param',
 'num_stars']

# Principal Component Analysis (PCA)

## Adjust the Data

Since PCA is performed on quantitative variables, we change `True` and `False` in the data to `1` and `0`, respectively.

In [217]:
# cast every column to float so that the boolean flags become numeric
# (True -> 1.0, False -> 0.0); columns that are already float are unchanged.
# a dtype cast converts each column in one step, instead of relying on
# value-based replace() calls that leave mixed-type columns in between
md_adjusted = md_df.astype(float)
no_md_adjusted = no_md_df.astype(float)

In [218]:
# swap every False for 0 in the markdown and no markdown groups
md_adjusted = md_adjusted.replace(to_replace = False, value = 0)
no_md_adjusted = no_md_adjusted.replace(to_replace = False, value = 0)

In [219]:
# preview the numeric version of the markdown cell group
md_adjusted.head(n = 5)

Unnamed: 0,longer_beginning,longer_ending,has_author,has_equation,jupyter_prop,markdown_prop,num_contrib,image_prop,is_education,has_links,has_comments,md_frequency,has_title,num_commits,md_format,non_exec_prop,exec_inorder,exec_skips,has_error
0,1.0,0.0,0.0,0.0,0.507588,0.044444,3.0,0.0,1.0,1.0,0.0,0.068182,1.0,1.0,1.0,0.0,1.0,1.809524,0.0
1,1.0,0.0,0.0,0.0,1.0,0.16129,1.0,0.0,1.0,1.0,1.0,0.333333,0.0,2.0,1.0,0.961538,1.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.970851,0.375887,1.0,0.0,1.0,1.0,1.0,0.621429,1.0,1.0,1.0,0.988636,1.0,0.0,0.0
3,1.0,0.0,0.0,0.0,1.0,0.461538,1.0,0.0,1.0,1.0,1.0,0.64,1.0,1.0,0.0,0.0,0.923077,1.615385,0.0
4,1.0,0.0,0.0,1.0,1.0,0.461538,1.0,0.0,0.0,0.0,1.0,0.52,0.0,1.0,0.0,0.0,1.0,6.333333,0.0


In [220]:
# preview the numeric version of the no markdown cell group
no_md_adjusted.head(n = 5)

Unnamed: 0,has_author,jupyter_prop,num_contrib,image_prop,is_education,has_comments,num_commits,non_exec_prop,exec_inorder,exec_skips,has_error
0,0.0,0.71413,1.0,0.142857,0.0,0.0,1.0,0.0,1.0,1.0,0.0
1,0.0,0.99784,1.0,0.1875,0.0,1.0,1.0,0.04878,0.789474,10.631579,1.0
2,0.0,0.121957,5.0,0.0,0.0,1.0,11.0,0.0,0.909091,6.181818,0.0
3,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.631579,0.0
4,0.0,0.940063,3.0,0.0,0.0,1.0,3.0,0.125,1.0,1.0,1.0


## Standardizing the Data

In [221]:
# standardize every feature of the markdown cell group:
# pull the feature columns out as an array and rescale them with StandardScaler
md_adjusted_stand = StandardScaler().fit_transform(md_adjusted[md_vars].to_numpy())

In [222]:
# sanity check: the overall mean should be ~0 and the overall standard deviation 1
print(md_adjusted_stand.mean(), md_adjusted_stand.std())

7.602639141268273e-17 1.0


In [223]:
# wrap the standardized array in a dataframe, restoring the column names
md_adjusted_stand_df = pd.DataFrame(md_adjusted_stand, columns = md_vars)

# preview the standardized data
md_adjusted_stand_df.head(n = 5)

Unnamed: 0,longer_beginning,longer_ending,has_author,has_equation,jupyter_prop,markdown_prop,num_contrib,image_prop,is_education,has_links,has_comments,md_frequency,has_title,num_commits,md_format,non_exec_prop,exec_inorder,exec_skips,has_error
0,0.915797,-0.590961,-0.221343,-0.490296,-1.293088,-1.94205,0.504838,-0.960204,0.856909,1.280391,-2.838192,-1.964232,0.741665,-0.399981,1.082388,-0.410421,0.690767,-0.337021,-0.393367
1,0.915797,-0.590961,-0.221343,-0.490296,0.570932,-1.28607,-0.201473,-0.960204,0.856909,1.280391,0.352337,-0.934795,-1.348317,-0.05639,1.082388,3.185141,0.690767,-0.487356,-0.393367
2,0.915797,-0.590961,-0.221343,-0.490296,0.460589,-0.081313,-0.201473,-0.960204,0.856909,1.280391,0.352337,0.18372,0.741665,-0.399981,1.082388,3.28647,0.690767,-0.487356,-0.393367
3,0.915797,-0.590961,-0.221343,-0.490296,0.570932,0.399542,-0.201473,-0.960204,0.856909,1.280391,0.352337,0.255823,0.741665,-0.399981,-0.923883,-0.410421,0.003778,-0.35315,-0.393367
4,0.915797,-0.590961,-0.221343,2.039586,0.570932,0.399542,-0.201473,-0.960204,-1.166986,-0.781011,0.352337,-0.210071,-1.348317,-0.399981,-0.923883,-0.410421,0.690767,0.038819,-0.393367


In [224]:
# standardize every feature of the no markdown cell group:
# pull the feature columns out as an array and rescale them with StandardScaler
no_md_adjusted_stand = StandardScaler().fit_transform(no_md_adjusted[no_md_vars].to_numpy())

In [225]:
# sanity check: the overall mean should be ~0 and the overall standard deviation 1
print(no_md_adjusted_stand.mean(), no_md_adjusted_stand.std())

-3.909057653963651e-17 1.0


In [226]:
# wrap the standardized array in a dataframe, restoring the column names
no_md_adjusted_stand_df = pd.DataFrame(no_md_adjusted_stand, columns = no_md_vars)

# preview the standardized data
no_md_adjusted_stand_df.head(n = 5)

Unnamed: 0,has_author,jupyter_prop,num_contrib,image_prop,is_education,has_comments,num_commits,non_exec_prop,exec_inorder,exec_skips,has_error
0,-0.094526,-0.68694,-0.256338,-0.263196,-0.347507,-1.451826,-0.336957,-0.42506,0.52516,-0.371956,-0.435477
1,-0.094526,0.546025,-0.256338,-0.131354,-0.347507,0.688788,-0.336957,-0.23371,-0.681648,0.365723,2.296334
2,-0.094526,-3.260437,1.489081,-0.685091,-0.347507,0.688788,6.280101,-0.42506,0.004038,0.024918,-0.435477
3,-0.094526,0.55541,-0.256338,-0.685091,-0.347507,0.688788,-0.336957,-0.42506,0.52516,-0.323583,-0.435477
4,-0.094526,0.294933,0.616371,-0.685091,-0.347507,0.688788,0.986454,0.065274,0.52516,-0.371956,2.296334


## Markdown Cells

### Performing PCA

In [227]:
# build a PCA with one component per variable, then fit it on the
# standardized markdown data and keep the projected coordinates
pca_md = PCA(len(md_vars))
pc_md = pca_md.fit_transform(md_adjusted_stand)

In [228]:
# store the projected coordinates in a dataframe whose columns are
# labelled 'principal component 0', 'principal component 1', ...
pca_md_df = pd.DataFrame(
    pc_md,
    columns = ['principal component ' + str(idx) for idx, _ in enumerate(md_vars)],
)

# preview the PCA coordinates
pca_md_df.head(n = 5)

Unnamed: 0,principal component 0,principal component 1,principal component 2,principal component 3,principal component 4,principal component 5,principal component 6,principal component 7,principal component 8,principal component 9,principal component 10,principal component 11,principal component 12,principal component 13,principal component 14,principal component 15,principal component 16,principal component 17,principal component 18
0,0.325632,2.486135,-0.434603,-0.662086,-0.871493,2.313732,0.464941,-0.085904,1.41165,-2.071667,-1.24266,-0.325391,0.337758,0.112486,0.04997,0.648099,-1.139459,-0.547465,-1.359421
1,0.850372,1.964054,-0.791791,-2.924954,0.506385,-0.983145,-0.74338,1.794124,-0.508036,-1.007058,0.321607,-0.258157,0.998064,0.158485,-0.05322,0.187462,-0.580699,0.106422,-0.437267
2,1.989086,0.797023,-1.404567,-2.214956,1.127977,-0.188968,-0.073963,1.062631,-0.271357,-0.914284,1.193776,0.387692,1.237828,0.432216,-0.342647,-0.053309,-0.392262,0.449585,-0.385573
3,0.807969,-0.854003,-0.589295,-0.710216,0.724741,0.58588,0.295259,0.118766,-0.033199,-0.930153,-0.321548,0.055024,-1.164403,0.325895,0.229711,0.325803,-0.758573,1.032983,0.246042
4,-0.685577,-0.632618,-0.671905,-0.009599,-0.206052,-1.158051,-0.66403,0.57325,1.185572,-0.06996,0.821988,-1.176089,-1.239776,-1.165998,0.851267,0.976175,0.247722,-0.617327,0.810952


### Analyze the Results of PCA

In [229]:
# print the fraction of the total variance explained by each principal component
# of the markdown cell group
print(pca_md.explained_variance_ratio_)

[0.16405735 0.09253741 0.08574525 0.07068464 0.06554028 0.05693957
 0.05393053 0.04959195 0.04624602 0.04467132 0.04339441 0.04109365
 0.03401479 0.03016097 0.0294863  0.02710162 0.02583591 0.02240953
 0.01655849]


In [230]:
# tabulate the fitted component matrix, labelling its columns with the variable names
md_pca_var = pd.DataFrame(data = pca_md.components_, columns = md_vars)
md_pca_var

Unnamed: 0,longer_beginning,longer_ending,has_author,has_equation,jupyter_prop,markdown_prop,num_contrib,image_prop,is_education,has_links,has_comments,md_frequency,has_title,num_commits,md_format,non_exec_prop,exec_inorder,exec_skips,has_error
0,0.285941,0.307101,0.084662,0.192641,-0.116514,0.414784,0.078877,-0.140132,0.363885,0.382156,0.036146,0.228698,0.186034,0.096104,0.381265,0.143621,0.110146,-0.094329,-0.072382
1,-0.288496,0.050112,0.072435,-0.029529,-0.335399,-0.262235,0.430237,-0.264251,0.070177,0.090323,-0.136037,-0.444716,-0.155943,0.273586,0.124651,0.288457,0.184996,-0.091546,0.045496
2,-0.005633,0.05723,0.196835,0.142991,-0.101358,-0.077816,0.001696,0.185093,0.132622,0.178538,0.164829,-0.162013,-0.118717,0.234797,0.148224,-0.199409,-0.521322,0.578203,0.216741
3,-0.002933,0.058328,0.024464,0.077071,-0.432527,0.225516,0.450558,0.340499,-0.190698,-0.232117,-0.141803,0.248416,0.108025,0.251062,-0.215664,-0.264335,-0.024007,-0.00205,-0.263521
4,0.347004,-0.238179,-0.084076,-0.355716,-0.111945,0.040386,0.200143,-0.345431,-0.194304,-0.047419,0.287275,0.246118,0.163192,0.228308,-0.147761,0.223769,-0.135978,0.069582,0.398266
5,-0.085344,0.106391,-0.087946,-0.320845,-0.066453,0.126454,0.024065,-0.329605,0.022425,0.085199,-0.637554,-0.051721,0.290083,-0.284571,-0.02757,-0.115401,-0.31159,0.216895,-0.029096
6,-0.024561,-0.086823,0.786485,-0.153616,0.046541,-0.048024,-0.152149,-0.022773,-0.185179,0.063425,0.046788,-0.136122,0.447683,0.107736,0.002924,-0.137838,0.137075,-0.093467,-0.062997
7,0.247751,-0.195482,0.226244,-0.068564,0.379966,0.041954,0.065023,-0.027564,-0.018023,-0.011351,-0.295672,0.080887,-0.363774,0.302278,-0.020847,0.331163,-0.242417,0.030638,-0.453285
8,0.18788,-0.458557,-0.056694,0.567682,0.149459,0.028491,0.094952,-0.011902,-0.01779,-0.061789,-0.391835,-0.125206,0.195191,0.10843,0.089609,-0.11645,0.025002,-0.12332,0.368804
9,-0.278342,0.477272,0.24408,0.102523,0.191244,0.136159,-0.096159,-0.046555,0.054506,-0.287274,-0.204121,0.25398,-0.134804,0.200284,-0.234026,0.16098,-0.021766,-0.075286,0.470473


- principal component 0: `markdown_prop`
- principal component 1: `md_frequency`, `num_contrib`
- principal component 2: `exec_inorder`, `exec_skips`

## No Markdown Cells

### Performing PCA

In [231]:
# build a PCA with one component per variable, then fit it on the
# standardized no markdown data and keep the projected coordinates
pca_no_md = PCA(len(no_md_vars))
pc_no_md = pca_no_md.fit_transform(no_md_adjusted_stand)

In [232]:
# store the projected coordinates in a dataframe whose columns are
# labelled 'principal component 0', 'principal component 1', ...
pca_no_md_df = pd.DataFrame(
    pc_no_md,
    columns = ['principal component ' + str(idx) for idx, _ in enumerate(no_md_vars)],
)

# preview the PCA coordinates
pca_no_md_df.head(n = 5)

Unnamed: 0,principal component 0,principal component 1,principal component 2,principal component 3,principal component 4,principal component 5,principal component 6,principal component 7,principal component 8,principal component 9,principal component 10
0,-0.893334,-0.524358,-0.55799,1.153832,-0.612463,0.477258,0.286244,0.188093,-0.550187,0.010195,-0.060608
1,1.796207,-0.522384,1.105356,0.051893,-0.33725,-0.253991,-1.346783,-0.182888,0.32491,-0.092964,0.401822
2,-0.766702,6.184324,-0.229399,0.522036,0.28679,-0.26411,-1.5266,1.184071,-3.0719,0.078415,-1.116162
3,-0.299909,-0.53915,-0.087177,-0.537409,-0.241197,-0.28183,-0.20065,-0.757863,-0.052443,0.510133,-0.787259
4,0.379913,0.570896,1.053284,-0.151507,-0.585976,-0.573586,-2.347405,0.228461,0.061454,0.475055,0.371588


### Analyze the Results of PCA

In [233]:
# print the fraction of the total variance explained by each principal component
# of the no markdown cell group
print(pca_no_md.explained_variance_ratio_)

[0.14487747 0.11721992 0.1108202  0.10747086 0.09345702 0.0832355
 0.07703527 0.07193561 0.06973362 0.06435767 0.05985687]


In [234]:
# tabulate the fitted component matrix, labelling its columns with the variable names
no_md_pca_var = pd.DataFrame(data = pca_no_md.components_, columns = no_md_vars)
no_md_pca_var

Unnamed: 0,has_author,jupyter_prop,num_contrib,image_prop,is_education,has_comments,num_commits,non_exec_prop,exec_inorder,exec_skips,has_error
0,-0.07796,0.218722,-0.201551,0.275545,-0.130421,0.193203,0.041966,-0.334509,-0.51103,0.503892,0.38273
1,0.152516,-0.406227,0.55243,0.122769,-0.013507,0.249644,0.620293,-0.039759,-0.10004,0.150655,-0.099567
2,0.422296,-0.09567,0.008768,-0.590601,0.019222,0.155617,-0.106086,0.449732,-0.242202,0.152586,0.378923
3,-0.281288,-0.497047,0.194897,-0.268845,0.129516,-0.557427,-0.196965,-0.346389,-0.157662,0.125616,0.191423
4,-0.223756,-0.087359,-0.230465,0.009006,0.885108,0.223421,0.076566,0.11867,-0.099391,0.110357,-0.117352
5,0.770579,-0.21974,-0.284887,0.260221,0.214538,-0.173985,-0.037504,-0.339071,0.09106,-0.080071,0.04087
6,0.157259,-0.042017,0.014937,0.080415,-0.10119,-0.198097,-0.275393,0.231249,-0.272875,0.481311,-0.694186
7,0.06399,0.393428,0.002759,0.069811,0.136451,-0.655305,0.506356,0.278243,-0.203352,-0.052168,0.101516
8,0.165593,0.385746,0.676924,0.166861,0.30379,0.045157,-0.417174,-0.096796,-0.167529,-0.160298,0.078855
9,0.089319,0.344508,0.123094,-0.327837,0.121861,-0.043609,0.117353,-0.266636,0.581166,0.557669,-0.018433


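These numbers are bad :( The first principal component explains only about 16% of the variance in the markdown group and about 14% in the no markdown group, and the remaining variance is spread thinly over the other components, so PCA does not give a useful low-dimensional summary of either group.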

# Factor Analysis for Mixed Data (FAMD)

## Markdown Cells

### Performing FAMD

In [12]:
# configure a FAMD with one component per variable and a fixed random seed,
# then fit it on the markdown cell group (boolean and float columns together)
md_famd = prince.FAMD(
    n_components = len(md_vars),
    n_iter = 10,
    copy = True,
    check_input = True,
    engine = 'auto',
    random_state = 42,
)
md_famd_fit = md_famd.fit(md_df)

In [13]:
# compute the row coordinates of the markdown cell group on the fitted FAMD
md_famd_df = md_famd_fit.row_coordinates(md_df)

# preview the FAMD coordinates
md_famd_df.head(n = 5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,0.215619,0.199043,1.462193,-0.700761,-1.45073,0.777237,-0.96799,-0.702297,0.145261,-0.51928,...,0.095209,-0.524307,-0.026219,-0.227167,0.399082,-0.262078,0.319301,-0.03192,0.012367,-0.001587
1,0.93429,1.810674,0.036827,-2.203585,0.423893,-0.166698,0.672928,-0.199408,0.174447,0.065143,...,0.062795,-0.049922,-0.086471,-0.095315,0.069468,-0.023475,-0.095093,-0.023917,-0.000812,-0.008565
2,2.358721,1.387319,-0.617634,-1.191236,1.366885,0.575668,0.160355,-0.327866,0.086821,0.769913,...,0.317542,-0.00859,0.09695,-0.274791,0.263944,-0.001506,-0.065913,-0.032497,-0.060816,-0.010933
3,0.992929,-0.450898,-0.183815,-0.751875,-0.101203,-0.153192,-0.430943,0.043736,0.219263,-0.498767,...,-0.07167,-0.052852,-0.311093,-0.08088,0.268709,0.3012,-0.109842,-0.037289,0.045047,-0.009
4,0.359657,0.190099,-0.268256,-0.991836,-0.20511,-0.402321,-0.116393,-0.048718,0.030793,-0.34579,...,-0.323593,0.458284,0.188447,0.521721,0.079391,-0.026368,-0.123698,-0.022121,-0.012504,-0.025019


### Analyzing the Results of FAMD

In [14]:
# show the explained inertia (share of variance) of each FAMD component
# for the markdown cell group
md_famd_fit.explained_inertia_

[0.15313878956637544,
 0.12829224067590747,
 0.10012740464303764,
 0.09718277341682649,
 0.0776524356492517,
 0.062439731727354494,
 0.05418729861375089,
 0.053815924756407146,
 0.05017085156459898,
 0.04754356581484439,
 0.03366434598040732,
 0.031158282248140302,
 0.023274612484252902,
 0.019490267014121612,
 0.011943890588787793,
 0.009443928022961085,
 0.007730246969253294,
 0.007381430065571195,
 0.006462392673667498,
 0.00584649621866735,
 0.005459556286913409,
 0.004871220304870025,
 0.004766396647243685,
 0.00199621705179581,
 0.0013051396851040825,
 0.0006545613298875217]

In [15]:
# examine the correlation between each variable and each FAMD component
md_famd_fit.column_correlations(md_df)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
exec_inorder,0.232137,0.254819,-0.29981,-0.014961,-0.332107,0.137788,0.000331,0.073229,0.100586,-0.032943,...,-0.070842,-0.078498,-0.081626,0.096642,-0.42378,0.419055,-0.024043,0.037418,-0.115061,0.044774
exec_skips,-0.316614,-0.45156,0.716341,0.222568,0.78755,-0.230753,-0.158455,-0.109047,-0.132789,0.242525,...,0.133874,0.116055,-0.027008,-0.062595,0.863869,-0.889882,-0.005348,-0.203368,0.192586,-0.215128
has_author,0.117786,0.014196,0.06078,0.144099,0.055172,0.043327,-0.006157,-0.055842,0.10257,0.081499,...,-0.048312,0.041107,-0.131692,0.150764,0.030667,-0.061178,-0.085168,-0.160385,-0.077043,-0.051668
has_comments,0.072941,-0.077196,0.027303,0.083199,0.165816,0.112042,-0.044615,0.022445,-0.038002,0.045178,...,-0.105361,-0.090945,-0.106516,0.015754,0.060183,-0.083547,-0.084755,-0.051844,-0.088054,0.006537
has_equation,0.171193,0.029157,0.029887,0.171099,0.050159,0.075547,-0.040657,-0.050174,0.100381,0.10759,...,-0.065969,0.033912,-0.159243,0.171934,-0.028342,-0.013545,-0.108532,-0.166953,-0.111151,-0.056841
has_error,0.012791,-0.094269,0.047282,0.046088,0.14963,0.066093,0.004288,-0.028259,-0.052108,0.001444,...,-0.07734,-0.006778,-0.037989,-0.002351,0.08069,-0.123811,-0.065128,-0.031309,-0.053496,0.023553
has_export,0.080976,-0.046257,0.021059,0.090387,0.142311,0.128589,-0.02175,0.045609,-0.058038,0.008998,...,-0.120366,-0.113905,-0.100623,-0.007399,0.027917,-0.052001,-0.082055,-0.027244,-0.07842,0.026256
has_links,0.315182,0.003115,0.001522,0.27563,0.109493,0.186637,0.001034,-0.164018,0.138024,0.11945,...,-0.172499,0.125916,-0.259379,0.293982,-0.057351,-0.062847,-0.26483,-0.285465,-0.25766,-0.031273
has_param,-0.0016,0.075036,0.087574,0.047965,-0.01255,-0.075513,-0.094818,0.063072,0.074939,0.093158,...,0.083474,-0.045061,-0.047181,0.046683,0.030494,0.012417,0.068642,-0.048652,0.069159,-0.089432
has_test,0.12996,-0.038868,-0.038201,0.090543,0.086315,0.133059,-0.005993,-0.042046,-0.009216,0.02606,...,-0.12535,-0.01505,-0.10915,0.071381,-0.026309,-0.025938,-0.123982,-0.067277,-0.126107,0.026422


The components are mostly correlated with `exec_skips`, `exec_inorder`, `num_headers`, and `num_stars`.

## No Markdown Cells

### Performing FAMD

In [16]:
# configure a FAMD with one component per variable and a fixed random seed,
# then fit it on the no markdown cell group (boolean and float columns together)
no_md_famd = prince.FAMD(
    n_components = len(no_md_vars),
    n_iter = 10,
    copy = True,
    check_input = True,
    engine = 'auto',
    random_state = 42,
)
no_md_famd_fit = no_md_famd.fit(no_md_df)

In [17]:
# compute the row coordinates of the no markdown cell group on the fitted FAMD
no_md_famd_df = no_md_famd_fit.row_coordinates(no_md_df)

# preview the FAMD coordinates
no_md_famd_df.head(n = 5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,0.241451,-0.028969,0.214683,-0.407954,-0.819815,0.235226,-0.109712,-0.261547,0.012072,0.14538,-0.359774,0.88899,-0.135161,-0.024829,-0.001791,-0.021386,0.000638
1,1.583855,0.988358,-0.387829,-0.018508,-0.54689,0.24278,0.25525,0.263382,0.312823,-0.2316,0.353116,0.737947,0.522074,-0.091569,-0.017373,-0.036099,0.000517
2,1.225617,0.74197,4.149854,0.910854,0.656601,-0.557845,-2.047834,-2.020079,1.546516,-0.593085,-0.69754,0.281404,-0.09048,-0.17396,0.025995,-0.055131,-0.000753
3,0.890367,-0.533191,-0.158509,-0.59947,-0.354109,-0.188582,0.32716,0.087423,0.373343,0.078749,-0.397251,-0.727297,-0.072982,-0.173055,-0.021079,0.005686,-0.00075
4,1.219395,-0.633438,0.615289,-0.12067,0.132493,-0.175228,-0.199369,0.16251,0.696241,-0.143041,-0.255339,-0.641777,1.045086,-0.137332,-0.020855,-0.009234,-0.000687


### Analyzing the Results of FAMD

In [18]:
# show the explained inertia (share of variance) of each FAMD component
# for the no markdown cell group
no_md_famd_fit.explained_inertia_

[0.145122395757816,
 0.14067180344400765,
 0.112990666758662,
 0.09211243509148188,
 0.08159430793597336,
 0.07428142739261921,
 0.06750136057929759,
 0.06209386058921017,
 0.05767403392691925,
 0.05578233469203489,
 0.041002066094891085,
 0.030001824235793534,
 0.019878640450033954,
 0.01590825314912561,
 0.0018324400722709662,
 0.0014286552345348692,
 0.0001234945953287226]

In [19]:
# examine the correlation between each variable and each FAMD component
no_md_famd_fit.column_correlations(no_md_df)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
exec_inorder,-0.255108,-0.299838,0.15214,-0.298674,-0.126712,0.114394,0.008181,0.110719,-0.07624,-0.278385,-0.184545,0.264581,0.221669,-0.137489,0.07123,-0.227311,0.041447
exec_skips,0.767727,0.974889,-0.453018,0.955854,0.364851,-0.473723,-0.00172,-0.386127,0.344328,0.995012,0.734106,-0.881546,-0.53384,0.383267,-0.396054,0.826886,-0.303148
has_author,0.027125,-0.022936,-0.001284,0.003302,0.019158,0.043383,-0.028752,0.035425,-0.003391,-0.018112,-0.016971,-0.008162,-0.02589,0.038623,0.071636,0.02085,0.05211
has_comments,0.177651,0.089536,-0.039508,0.129863,0.1622,0.081519,-0.029664,-0.003405,-0.008319,0.086253,-0.020431,-0.151689,-0.110592,0.120884,0.112022,0.001563,0.124637
has_error,0.196354,0.136395,-0.141534,0.168157,0.050754,0.014805,-0.064136,-0.029908,0.030582,0.137074,0.080951,-0.183458,-0.179345,0.150282,0.023293,0.058166,0.052861
has_export,0.139005,0.04176,-0.065462,0.09361,0.066399,0.056132,-0.058531,0.026864,0.007938,0.05662,-0.02607,-0.111657,-0.109787,0.119398,0.078677,-0.015514,0.093472
has_param,-0.003443,0.007686,-0.024996,-0.002003,-0.022655,-0.02564,-0.005947,-0.02194,0.010192,0.008674,0.015719,-0.002288,-0.003885,-0.004286,0.0876,0.023141,-0.030491
has_test,-0.010877,-0.008318,-0.000622,-0.009252,-0.012557,-0.009072,0.000166,-0.0084,-0.000645,-0.005572,0.009308,0.011676,0.008777,-0.010724,-0.010033,-0.003661,0.189745
image_prop,0.013767,0.108853,-0.006741,0.024802,0.123521,-0.061206,0.015782,-0.102404,-0.032596,0.063474,0.040859,0.00339,0.060155,-0.008674,-0.114507,0.108702,-0.060462
is_education,-0.082541,-0.017066,0.081032,-0.049661,0.008493,-0.005861,0.099093,-0.07999,-0.0426,-0.025367,-0.030972,0.070661,0.10136,-0.104241,-0.037488,0.030986,-0.067406


The components seem to be mostly correlated with `exec_skips`, `exec_inorder`, `num_functions`, and `num_stars`.

# Multiple Factor Analysis (MFA)

## Markdown Cells

### Grouping Variables

In [20]:
# list the dtype of every variable in the markdown cell group, to tell the
# boolean (qualitative) columns apart from the float (quantitative) ones
md_df.dtypes

longer_beginning       bool
longer_ending          bool
has_author             bool
has_equation           bool
jupyter_prop        float64
output_cell_prop    float64
markdown_prop       float64
num_contrib         float64
image_prop          float64
is_education           bool
has_links              bool
has_comments           bool
md_frequency        float64
has_title              bool
num_commits         float64
md_format              bool
non_exec_prop       float64
exec_inorder        float64
exec_skips          float64
has_error              bool
has_export             bool
num_functions       float64
has_test               bool
num_headers         float64
has_param              bool
num_stars           float64
dtype: object

We divide as follows (groups must be **either** quantitative or qualitative):
- Group 1 (markdown analysis - qualitative):
    - `longer_beginning`
    - `longer_ending`
    - `has_equation`
    - `has_links`
    - `md_format`
    - `has_title`
- Group 2 (markdown analysis - quantitative):
    - `markdown_prop`
    - `md_frequency`
    - `num_headers`
- Group 3 (repo analysis - quantitative):
    - `jupyter_prop`
    - `num_contrib`
    - `num_commits`
    - `num_stars`
- Group 4 (code analysis - quantitative):
    - `image_prop`
    - `non_exec_prop`
    - `exec_inorder`
    - `exec_skips`
    - `output_cell_prop`
    - `num_functions`
- Group 5 (code analysis - qualitative):
    - `has_comments`
    - `has_error`
    - `has_export`
    - `has_test`
    - `has_param`
- Group 6 (notebook analysis - qualitative):
    - `has_author`
    - `is_education`

In [21]:
# variable groups for the markdown cell group, keyed by group name;
# each group holds only qualitative ('_qual') or only quantitative ('_quant') variables
md_groups = dict(
    md_qual = [
        'longer_beginning', 'longer_ending', 'has_equation',
        'has_links', 'md_format', 'has_title',
    ],
    md_quant = ['markdown_prop', 'md_frequency', 'num_headers'],
    repo_quant = ['jupyter_prop', 'num_contrib', 'num_commits', 'num_stars'],
    exec_quant = [
        'image_prop', 'non_exec_prop', 'exec_inorder',
        'exec_skips', 'output_cell_prop', 'num_functions',
    ],
    code_qual = ['has_comments', 'has_error', 'has_export', 'has_test', 'has_param'],
    nb_qual = ['has_author', 'is_education'],
)

### Performing MFA

In [22]:
# configure an MFA over the variable groups, with one component per variable
# and a fixed random seed, then fit it on the markdown cell group
md_mfa = prince.MFA(
    groups = md_groups,
    n_components = len(md_vars),
    n_iter = 10,
    copy = True,
    check_input = True,
    normalize = True,
    engine = 'auto',
    random_state = 42,
)
md_mfa_fit = md_mfa.fit(md_df)

In [23]:
# compute the row coordinates of the markdown cell group on the fitted MFA
md_mfa_df = md_mfa_fit.row_coordinates(md_df)

# preview the MFA coordinates
md_mfa_df.head(n = 5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,1.325292,0.096224,1.711163,0.601874,-1.135405,-1.106592,-1.114856,-0.125886,0.424318,-1.003181,...,0.023467,-0.364791,-0.895554,0.657751,0.172117,0.195351,-0.204241,0.20111,0.024267,-0.002515
1,1.742505,0.955239,1.843608,-2.123991,0.662747,0.075954,-0.112824,0.292931,-0.839682,0.164968,...,-0.46153,-0.076464,-0.349126,-0.088746,-0.063273,-0.085251,-0.016815,-0.028085,0.00436,-0.013971
2,2.659186,1.537812,0.111147,-1.875473,1.718017,-0.440642,-0.212546,-0.04191,0.24323,0.744629,...,0.098822,-0.022573,-0.702802,0.047235,0.070363,-0.144437,0.035099,-0.136678,-0.092745,-0.018105
3,1.942002,-0.430158,-0.071178,-0.617181,-0.140694,0.21163,-0.158972,0.114134,0.15546,-0.727712,...,-0.120056,-0.054852,-0.397068,-0.109491,-0.294902,0.106541,0.023873,-0.48514,0.100811,-0.015874
4,0.752943,-0.240007,0.437617,-0.990699,-0.285648,0.386115,0.086308,-0.10105,-0.177167,-0.349388,...,-0.736879,0.319949,0.256193,-0.714873,0.365801,0.472202,-0.200081,-0.040976,-0.030308,-0.042549


### Analyzing the Results of MFA

In [24]:
# show the explained inertia (share of variance) of each MFA component
# for the markdown cell group
md_mfa_fit.explained_inertia_

[0.2361205495836892,
 0.1061963693228391,
 0.09536410761890167,
 0.08091068151030965,
 0.060363731100334245,
 0.05200038795848923,
 0.0479353663794169,
 0.04502864652583242,
 0.043753173400086276,
 0.037337855397904754,
 0.029611378846893317,
 0.024097386279311208,
 0.022431449778045297,
 0.019356699935510337,
 0.0172919795923872,
 0.012242948776472746,
 0.011102264974587963,
 0.009959068185720444,
 0.009412688331437708,
 0.008739829740642599,
 0.007921287626788176,
 0.006885292004563716,
 0.00637306207248795,
 0.005889500671614773,
 0.0024356240084366797,
 0.0012386703772965572]

In [25]:
# examine the correlation between each variable and each MFA component
md_mfa_fit.column_correlations(md_df)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
exec_inorder,0.103778,0.207506,-0.118598,-0.287359,-0.283977,-0.16485,0.111514,0.110421,0.065648,-0.04601,...,0.078603,-0.133208,0.322637,0.056803,-0.135803,-0.004338,0.094399,-0.42797,-0.105866,0.044153
exec_skips,-0.048163,-0.281169,0.226456,0.733208,0.682349,0.36424,-0.351918,-0.152276,0.034811,0.27558,...,-0.18149,0.163422,-0.783983,-0.163283,0.106628,0.096292,-0.323203,0.855479,0.1839,-0.213565
has_author,0.140723,0.081514,-0.049763,0.110846,0.073439,-0.034167,-0.098449,0.072833,0.020957,0.092104,...,-0.101136,-0.122654,-0.125074,-0.095913,-0.098724,0.076304,-0.164103,-0.012849,-0.071083,-0.05145
has_comments,0.103266,-0.015775,-0.075178,0.072671,0.164709,-0.036277,0.016667,-0.027371,0.077387,0.059552,...,-0.00698,-0.135867,-0.107624,-0.062307,-0.039589,-0.048689,-0.057686,0.039885,-0.083463,0.006866
has_equation,0.185998,0.109761,-0.087918,0.094278,0.069415,-0.053125,-0.113827,0.072271,0.061492,0.119877,...,-0.096269,-0.147447,-0.104015,-0.105721,-0.128742,0.064334,-0.159174,-0.069245,-0.102659,-0.056665
has_error,0.050005,-0.049295,-0.042861,0.076305,0.148433,-0.02641,0.002557,-0.055196,0.005415,0.007329,...,-0.047321,-0.054767,-0.114021,-0.072009,0.017889,-0.034571,-0.045112,0.099283,-0.053735,0.023891
has_export,0.092416,0.00799,-0.06565,0.067071,0.144731,-0.051279,0.07753,-0.037204,0.065802,0.018463,...,-0.004805,-0.145331,-0.069994,-0.050088,-0.035011,-0.073009,-0.02756,0.016633,-0.074175,0.026552
has_links,0.350917,0.151582,-0.215257,0.126593,0.157939,-0.166148,-0.17881,0.06859,0.037867,0.13532,...,-0.234003,-0.283448,-0.21153,-0.263582,-0.182114,0.051775,-0.272518,-0.096579,-0.245505,-0.030808
has_param,-0.007737,0.071891,0.083729,0.070705,-0.028088,0.092395,-0.069974,0.082522,0.079895,0.089404,...,0.049115,0.017025,-0.007606,0.062738,-0.073718,0.093444,-0.051141,-0.011506,0.07317,-0.089578
has_test,0.142132,0.023675,-0.123477,0.019274,0.103593,-0.095889,-0.013991,-0.021485,0.037591,0.036912,...,-0.07127,-0.160484,-0.073485,-0.103798,-0.053947,-0.029139,-0.064942,-0.031281,-0.122356,0.026576


Again, the components are mostly correlated with `num_headers` and `num_stars`.

## No Markdown Cells

### Grouping Variables

In [26]:
# list the dtype of every variable in the no markdown cell group, to tell the
# boolean (qualitative) columns apart from the float (quantitative) ones
no_md_df.dtypes

has_author             bool
jupyter_prop        float64
output_cell_prop    float64
num_contrib         float64
image_prop          float64
is_education           bool
has_comments           bool
num_commits         float64
non_exec_prop       float64
exec_inorder        float64
exec_skips          float64
has_error              bool
has_export             bool
num_functions       float64
has_test               bool
has_param              bool
num_stars           float64
dtype: object

We simply take the groups from the previous section, but exclude the groups pertaining to markdown cells:

- Group 1 (repo analysis - quantitative)
    - `jupyter_prop`
    - `num_contrib`
    - `num_commits`
    - `num_stars`
- Group 2 (code analysis - quantitative)
    - `image_prop`
    - `non_exec_prop`
    - `exec_inorder`
    - `exec_skips`
    - `output_cell_prop`
    - `num_functions`
- Group 3 (code analysis - qualitative)
    - `has_comments`
    - `has_error`
    - `has_export`
    - `has_test`
    - `has_param`
- Group 4 (notebook analysis - qualitative)
    - `has_author`
    - `is_education`

In [27]:
# variable groups for the notebooks without markdown cells:
# group name -> list of dataframe columns belonging to that group
no_md_groups = {
    # Group 1: repo analysis - quantitative
    'repo_quant': [
        'jupyter_prop',
        'num_contrib',
        'num_commits',
        'num_stars',
    ],
    # Group 2: code analysis - quantitative
    'exec_quant': [
        'image_prop',
        'non_exec_prop',
        'exec_inorder',
        'exec_skips',
        'output_cell_prop',
        'num_functions',
    ],
    # Group 3: code analysis - qualitative
    'code_qual': [
        'has_comments',
        'has_error',
        'has_export',
        'has_test',
        'has_param',
    ],
    # Group 4: notebook analysis - qualitative
    'nb_qual': [
        'has_author',
        'is_education',
    ],
}

### Performing MFA

In [28]:
# configure the MFA over the variable groups defined above, requesting one
# component per entry of no_md_vars; random_state is fixed so reruns match
no_md_mfa = prince.MFA(
    groups=no_md_groups,
    n_components=len(no_md_vars),
    n_iter=10,
    copy=True,
    check_input=True,
    normalize=True,
    engine='auto',
    random_state=42,
)

# fit on the no-markdown dataframe; the object returned by fit() is what the
# analysis cells below query
no_md_mfa_fit = no_md_mfa.fit(no_md_df)

In [29]:
# coordinates of each row of no_md_df on the fitted MFA components
no_md_mfa_df = no_md_mfa_fit.row_coordinates(no_md_df)

# preview the first five rows of the component coordinates
no_md_mfa_df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,0.270875,-0.004379,0.098358,-0.209989,-0.312053,-0.650078,0.531311,-0.462411,-0.251289,0.165144,0.134147,-0.323135,0.881075,-0.134458,-0.081636,-0.001774,0.000646
1,1.013241,0.341982,-1.397644,0.570687,-0.380888,-0.316318,0.671948,0.182728,-0.049705,0.462497,-0.240089,0.383902,0.70072,0.527917,-0.124213,-0.0174,0.000527
2,1.497606,3.192224,2.56486,2.637452,-0.192951,-0.739534,-2.314456,-2.405582,-0.628297,1.29945,-0.207195,-0.607018,0.257824,-0.101315,-0.168285,0.026049,-0.000745
3,0.677828,-0.737519,-0.209235,0.214709,0.267934,-0.489621,0.376687,0.204775,-0.427001,0.45228,0.076045,-0.398568,-0.749794,-0.075274,0.036333,-0.021176,-0.000754
4,0.95289,-0.308072,0.403222,1.121432,0.26916,-0.434909,-0.193974,0.068202,-0.039824,0.639657,-0.060426,-0.261637,-0.664312,1.047863,-0.006204,-0.020968,-0.000688


### Analyzing the Results of MFA

In [30]:
# share of the total inertia (variance) explained by each MFA component;
# the values are fractions, one entry per component
no_md_mfa_fit.explained_inertia_

[0.14566435269620975,
 0.11926041473034475,
 0.11538771165279108,
 0.09529416841522907,
 0.07959330430661983,
 0.0722078316507278,
 0.0689055228214948,
 0.0646652467520292,
 0.0554964669349955,
 0.05147777785386469,
 0.04632229567803794,
 0.03399737863469325,
 0.024766276666489694,
 0.01643869897587332,
 0.008906240444965803,
 0.0015142608438289,
 0.00010205094180431982]

In [31]:
# correlation between each original variable (rows) and each MFA component
# (columns); large absolute values mark the variables tied to a component
no_md_mfa_fit.column_correlations(no_md_df)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
exec_inorder,-0.032179,-0.171197,0.252341,-0.204306,0.263509,-0.201409,0.128113,-0.145616,0.059078,0.167613,-0.279961,-0.16947,0.269763,0.22689,-0.243699,0.071622,0.041921
exec_skips,0.095909,0.579127,-0.783027,0.618164,-0.853438,0.628843,-0.448274,0.582856,-0.294924,-0.484992,0.999419,0.702705,-0.910514,-0.547944,0.87639,-0.397345,-0.304677
has_author,0.013129,-0.019153,0.007463,0.041837,-0.000287,-0.004498,0.006721,-0.044299,0.059558,-0.007942,-0.016509,-0.025556,-0.002888,-0.026177,0.013134,0.071472,0.052003
has_comments,0.100099,0.063015,-0.078648,0.195054,-0.075921,0.111448,-0.000563,-0.056245,0.094575,-0.124672,0.088044,-0.044731,-0.148192,-0.113692,0.024397,0.111785,0.124444
has_error,-0.001303,0.032298,-0.174746,0.139255,-0.16632,0.097204,-0.058615,-0.020776,0.070416,-0.115371,0.141101,0.054689,-0.179533,-0.181289,0.076134,0.023165,0.052609
has_export,0.034318,-0.000443,-0.074047,0.132283,-0.08119,0.052628,-0.019539,-0.052245,0.098591,-0.079116,0.05994,-0.053694,-0.104616,-0.111137,0.001937,0.078557,0.093336
has_param,-0.026179,-0.011534,-0.022721,-0.026549,-0.01063,-0.009316,-0.018582,0.007481,-0.01875,0.003988,0.009165,0.017102,-0.002958,-0.003371,0.020918,0.087563,-0.030503
has_test,-0.006679,-0.00845,0.003674,-0.015694,0.003935,-0.00889,-0.003581,0.000555,-0.009848,0.005779,-0.005586,0.012167,0.011114,0.0091,-0.005382,-0.01001,0.189635
image_prop,-0.001942,0.089803,-0.058657,0.011341,0.00529,0.038824,-0.045855,0.029757,-0.071215,-0.12334,0.06203,0.051702,0.003778,0.060261,0.100871,-0.114694,-0.060598
is_education,0.08233,0.039983,0.072758,-0.076237,0.078262,0.02382,0.061367,0.022349,-0.099607,0.030615,-0.030987,-0.005092,0.06231,0.100954,0.021011,-0.037444,-0.067328


Again, the strongest correlations with the components come mostly from `num_stars` and `num_functions`.