# Imports

In [45]:
import pandas as pd
import prince
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt

# Loading the Data

## Markdown Cells

In [26]:
# Location of the markdown-group CSV, given relative to the notebook's working directory.
md_filepath = 'markdown_group.csv'
# Load the file into a DataFrame using pandas' default parsing options.
md_df = pd.read_csv(md_filepath)

In [27]:
# Preview the first five rows of the freshly loaded markdown-group table.
md_df.head()

Unnamed: 0.1,Unnamed: 0,nb_id,longer_beginning,longer_ending,has_author,has_equation,jupyter_prop,markdown_prop,num_contrib,image_prop,...,has_links,has_comments,md_frequency,has_title,num_commits,md_format,non_exec_prop,exec_inorder,exec_skips,has_error
0,3,594,True,False,False,False,0.507588,0.044444,3.0,0.0,...,True,False,0.068182,True,1.0,True,0.0,1.0,1.809524,False
1,6,1222,True,False,False,False,1.0,0.16129,1.0,0.0,...,True,True,0.333333,False,2.0,True,0.961538,1.0,0.0,False
2,7,1447,True,False,False,False,0.970851,0.375887,1.0,0.0,...,True,True,0.621429,True,1.0,True,0.988636,1.0,0.0,False
3,12,2705,True,False,False,False,1.0,0.461538,1.0,0.0,...,True,True,0.64,True,1.0,False,0.0,0.923077,1.615385,False
4,15,2861,True,False,False,True,1.0,0.461538,1.0,0.0,...,False,True,0.52,False,1.0,False,0.0,1.0,6.333333,False


In [28]:
# Clear the 'Unnamed: 0' column, which only holds indexes.
# md_df is overwritten by this cell, so a second execution would find the column
# already gone; errors = 'ignore' keeps the cell safe to re-run instead of raising KeyError.
md_df = md_df.drop(columns = ['Unnamed: 0'], errors = 'ignore')

In [29]:
# Preview the first five rows again to confirm the 'Unnamed: 0' column was removed.
md_df.head()

Unnamed: 0,nb_id,longer_beginning,longer_ending,has_author,has_equation,jupyter_prop,markdown_prop,num_contrib,image_prop,is_education,has_links,has_comments,md_frequency,has_title,num_commits,md_format,non_exec_prop,exec_inorder,exec_skips,has_error
0,594,True,False,False,False,0.507588,0.044444,3.0,0.0,True,True,False,0.068182,True,1.0,True,0.0,1.0,1.809524,False
1,1222,True,False,False,False,1.0,0.16129,1.0,0.0,True,True,True,0.333333,False,2.0,True,0.961538,1.0,0.0,False
2,1447,True,False,False,False,0.970851,0.375887,1.0,0.0,True,True,True,0.621429,True,1.0,True,0.988636,1.0,0.0,False
3,2705,True,False,False,False,1.0,0.461538,1.0,0.0,True,True,True,0.64,True,1.0,False,0.0,0.923077,1.615385,False
4,2861,True,False,False,True,1.0,0.461538,1.0,0.0,False,False,True,0.52,False,1.0,False,0.0,1.0,6.333333,False


In [55]:
# Extract the feature column titles used by the PCA and FAMD sections.
# 'nb_id' is an identifier column rather than a feature (it is dropped from md_df
# further down, before FAMD). Excluding it explicitly keeps this list correct when
# the notebook is run top to bottom, where this cell executes before that drop.
md_vars = [col for col in md_df.columns if col != 'nb_id']
md_vars

['longer_beginning',
 'longer_ending',
 'has_author',
 'has_equation',
 'jupyter_prop',
 'markdown_prop',
 'num_contrib',
 'image_prop',
 'is_education',
 'has_links',
 'has_comments',
 'md_frequency',
 'has_title',
 'num_commits',
 'md_format',
 'non_exec_prop',
 'exec_inorder',
 'exec_skips',
 'has_error']

## No Markdown Cells

In [30]:
# Location of the no-markdown-group CSV, given relative to the notebook's working directory.
no_md_filepath = 'no_markdown_group.csv'
# Load the file into a DataFrame using pandas' default parsing options.
no_md_df = pd.read_csv(no_md_filepath)

In [31]:
# Clear the 'Unnamed: 0' column, which only holds indexes.
# no_md_df is overwritten by this cell, so a second execution would find the column
# already gone; errors = 'ignore' keeps the cell safe to re-run instead of raising KeyError.
no_md_df = no_md_df.drop(columns = ['Unnamed: 0'], errors = 'ignore')

In [32]:
# Preview the first five rows of the no-markdown-group table (index column already removed).
no_md_df.head()

Unnamed: 0,nb_id,has_author,jupyter_prop,num_contrib,image_prop,is_education,has_comments,has_title,num_commits,non_exec_prop,exec_inorder,exec_skips,has_error
0,1589,False,0.71413,1.0,0.142857,False,False,False,1.0,0.0,1.0,1.0,False
1,1919,False,0.99784,1.0,0.1875,False,True,False,1.0,0.04878,0.789474,10.631579,True
2,2857,False,0.121957,5.0,0.0,False,True,False,11.0,0.0,0.909091,6.181818,False
3,4339,False,1.0,1.0,0.0,False,True,False,1.0,0.0,1.0,1.631579,False
4,4659,False,0.940063,3.0,0.0,False,True,False,3.0,0.125,1.0,1.0,True


In [56]:
# Extract the feature column titles used by the PCA and FAMD sections.
# 'nb_id' is an identifier column rather than a feature (it is dropped from no_md_df
# further down, before FAMD). Excluding it explicitly keeps this list correct when
# the notebook is run top to bottom, where this cell executes before that drop.
no_md_vars = [col for col in no_md_df.columns if col != 'nb_id']
no_md_vars

['has_author',
 'jupyter_prop',
 'num_contrib',
 'image_prop',
 'is_education',
 'has_comments',
 'has_title',
 'num_commits',
 'non_exec_prop',
 'exec_inorder',
 'exec_skips',
 'has_error']

# Principal Component Analysis (PCA)

## Adjust the Data

Since PCA is performed on quantitative variables, we change `True` and `False` in the data to `1` and `0`, respectively.

In [40]:
# Encode every True as 1 in both groups; results go to new frames so md_df / no_md_df stay untouched.
md_adjusted = md_df.replace(to_replace = True, value = 1)
no_md_adjusted = no_md_df.replace(to_replace = True, value = 1)

In [41]:
# Encode every False as 0 in both groups, completing the boolean-to-number conversion.
md_adjusted = md_adjusted.replace(to_replace = False, value = 0)
no_md_adjusted = no_md_adjusted.replace(to_replace = False, value = 0)

In [42]:
# Preview the markdown group after the True/False replacement to check the first rows are numeric.
md_adjusted.head()

Unnamed: 0,longer_beginning,longer_ending,has_author,has_equation,jupyter_prop,markdown_prop,num_contrib,image_prop,is_education,has_links,has_comments,md_frequency,has_title,num_commits,md_format,non_exec_prop,exec_inorder,exec_skips,has_error
0,1.0,0.0,0.0,0.0,0.507588,0.044444,3.0,0.0,1.0,1.0,0.0,0.068182,1.0,1.0,1.0,0.0,1.0,1.809524,0.0
1,1.0,0.0,0.0,0.0,1.0,0.16129,1.0,0.0,1.0,1.0,1.0,0.333333,0.0,2.0,1.0,0.961538,1.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.970851,0.375887,1.0,0.0,1.0,1.0,1.0,0.621429,1.0,1.0,1.0,0.988636,1.0,0.0,0.0
3,1.0,0.0,0.0,0.0,1.0,0.461538,1.0,0.0,1.0,1.0,1.0,0.64,1.0,1.0,0.0,0.0,0.923077,1.615385,0.0
4,1.0,0.0,0.0,1.0,1.0,0.461538,1.0,0.0,0.0,0.0,1.0,0.52,0.0,1.0,0.0,0.0,1.0,6.333333,0.0


In [43]:
# Preview the no-markdown group after the True/False replacement to check the first rows are numeric.
no_md_adjusted.head()

Unnamed: 0,has_author,jupyter_prop,num_contrib,image_prop,is_education,has_comments,has_title,num_commits,non_exec_prop,exec_inorder,exec_skips,has_error
0,0.0,0.71413,1.0,0.142857,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
1,0.0,0.99784,1.0,0.1875,0.0,1.0,0.0,1.0,0.04878,0.789474,10.631579,1.0
2,0.0,0.121957,5.0,0.0,0.0,1.0,0.0,11.0,0.0,0.909091,6.181818,0.0
3,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.631579,0.0
4,0.0,0.940063,3.0,0.0,0.0,1.0,0.0,3.0,0.125,1.0,1.0,1.0


## Standardizing the Data

In [57]:
# Standardize the markdown group: pick the feature columns, take the underlying
# array, and let StandardScaler rescale each column.
md_adjusted_stand = StandardScaler().fit_transform(md_adjusted[md_vars].to_numpy())

In [58]:
# Sanity-check the scaling: mean and standard deviation taken over the whole array.
print(md_adjusted_stand.mean(), md_adjusted_stand.std())

7.602639141268273e-17 1.0


In [59]:
# Wrap the standardized array in a DataFrame again, reusing the feature names as column labels.
md_adjusted_stand_df = pd.DataFrame(md_adjusted_stand, columns = md_vars)

# Preview the first rows of the standardized markdown group.
md_adjusted_stand_df.head()

Unnamed: 0,longer_beginning,longer_ending,has_author,has_equation,jupyter_prop,markdown_prop,num_contrib,image_prop,is_education,has_links,has_comments,md_frequency,has_title,num_commits,md_format,non_exec_prop,exec_inorder,exec_skips,has_error
0,0.915797,-0.590961,-0.221343,-0.490296,-1.293088,-1.94205,0.504838,-0.960204,0.856909,1.280391,-2.838192,-1.964232,0.741665,-0.399981,1.082388,-0.410421,0.690767,-0.337021,-0.393367
1,0.915797,-0.590961,-0.221343,-0.490296,0.570932,-1.28607,-0.201473,-0.960204,0.856909,1.280391,0.352337,-0.934795,-1.348317,-0.05639,1.082388,3.185141,0.690767,-0.487356,-0.393367
2,0.915797,-0.590961,-0.221343,-0.490296,0.460589,-0.081313,-0.201473,-0.960204,0.856909,1.280391,0.352337,0.18372,0.741665,-0.399981,1.082388,3.28647,0.690767,-0.487356,-0.393367
3,0.915797,-0.590961,-0.221343,-0.490296,0.570932,0.399542,-0.201473,-0.960204,0.856909,1.280391,0.352337,0.255823,0.741665,-0.399981,-0.923883,-0.410421,0.003778,-0.35315,-0.393367
4,0.915797,-0.590961,-0.221343,2.039586,0.570932,0.399542,-0.201473,-0.960204,-1.166986,-0.781011,0.352337,-0.210071,-1.348317,-0.399981,-0.923883,-0.410421,0.690767,0.038819,-0.393367


In [61]:
# Standardize the no-markdown group: pick the feature columns, take the underlying
# array, and let StandardScaler rescale each column.
no_md_adjusted_stand = StandardScaler().fit_transform(no_md_adjusted[no_md_vars].to_numpy())

In [62]:
# Sanity-check the scaling: mean and standard deviation taken over the whole array.
# Note: a constant column stays at 0 after scaling, which pulls the overall std below 1
# (one constant column out of 12 gives sqrt(11/12), roughly 0.957).
print(no_md_adjusted_stand.mean(), no_md_adjusted_stand.std())

-3.779947518034973e-17 0.9574271077563381


In [63]:
# Wrap the standardized array in a DataFrame again, reusing the feature names as column labels.
no_md_adjusted_stand_df = pd.DataFrame(no_md_adjusted_stand, columns = no_md_vars)

# Preview the first rows of the standardized no-markdown group.
no_md_adjusted_stand_df.head()

Unnamed: 0,has_author,jupyter_prop,num_contrib,image_prop,is_education,has_comments,has_title,num_commits,non_exec_prop,exec_inorder,exec_skips,has_error
0,-0.094526,-0.68694,-0.256338,-0.263196,-0.347507,-1.451826,0.0,-0.336957,-0.42506,0.52516,-0.371956,-0.435477
1,-0.094526,0.546025,-0.256338,-0.131354,-0.347507,0.688788,0.0,-0.336957,-0.23371,-0.681648,0.365723,2.296334
2,-0.094526,-3.260437,1.489081,-0.685091,-0.347507,0.688788,0.0,6.280101,-0.42506,0.004038,0.024918,-0.435477
3,-0.094526,0.55541,-0.256338,-0.685091,-0.347507,0.688788,0.0,-0.336957,-0.42506,0.52516,-0.323583,-0.435477
4,-0.094526,0.294933,0.616371,-0.685091,-0.347507,0.688788,0.0,0.986454,0.065274,0.52516,-0.371956,2.296334


## Markdown Cells

### Performing PCA

In [64]:
# Build a PCA that keeps one component per feature, then fit it on the standardized
# markdown-group matrix and project the rows onto the components in one step.
pca_md = PCA(len(md_vars))
pc_md = pca_md.fit_transform(md_adjusted_stand)

In [66]:
# Label the projected columns 'principal component 0', 'principal component 1', ... and store them as a DataFrame.
pca_md_df = pd.DataFrame(pc_md, columns = [f'principal component {i}' for i in range(len(md_vars))])

# Preview the first rows of the projected markdown group.
pca_md_df.head()

Unnamed: 0,principal component 0,principal component 1,principal component 2,principal component 3,principal component 4,principal component 5,principal component 6,principal component 7,principal component 8,principal component 9,principal component 10,principal component 11,principal component 12,principal component 13,principal component 14,principal component 15,principal component 16,principal component 17,principal component 18
0,0.325632,2.486135,-0.434603,-0.662086,-0.871493,2.313732,0.464941,-0.085904,1.41165,-2.071667,-1.24266,-0.325391,0.337758,0.112486,0.04997,0.648099,-1.139459,-0.547465,-1.359421
1,0.850372,1.964054,-0.791791,-2.924954,0.506385,-0.983145,-0.74338,1.794124,-0.508036,-1.007058,0.321607,-0.258157,0.998064,0.158485,-0.05322,0.187462,-0.580699,0.106422,-0.437267
2,1.989086,0.797023,-1.404567,-2.214956,1.127977,-0.188968,-0.073963,1.062631,-0.271357,-0.914284,1.193776,0.387692,1.237828,0.432216,-0.342647,-0.053309,-0.392262,0.449585,-0.385573
3,0.807969,-0.854003,-0.589295,-0.710216,0.724741,0.58588,0.295259,0.118766,-0.033199,-0.930153,-0.321548,0.055024,-1.164403,0.325895,0.229711,0.325803,-0.758573,1.032983,0.246042
4,-0.685577,-0.632618,-0.671905,-0.009599,-0.206052,-1.158051,-0.66403,0.57325,1.185572,-0.06996,0.821988,-1.176089,-1.239776,-1.165998,0.851267,0.976175,0.247722,-0.617327,0.810952


### Analyze the Results of PCA

In [68]:
# Print the explained variance ratio of each principal component of the markdown-group PCA.
print(pca_md.explained_variance_ratio_)

[0.16405735 0.09253741 0.08574525 0.07068464 0.06554028 0.05693957
 0.05393053 0.04959195 0.04624602 0.04467132 0.04339441 0.04109365
 0.03401479 0.03016097 0.0294863  0.02710162 0.02583591 0.02240953
 0.01655849]


In [72]:
# Tabulate the component loadings: one row per principal component, one column per feature.
md_pca_var = pd.DataFrame(data = pca_md.components_, columns = md_vars)
md_pca_var

Unnamed: 0,longer_beginning,longer_ending,has_author,has_equation,jupyter_prop,markdown_prop,num_contrib,image_prop,is_education,has_links,has_comments,md_frequency,has_title,num_commits,md_format,non_exec_prop,exec_inorder,exec_skips,has_error
0,0.285941,0.307101,0.084662,0.192641,-0.116514,0.414784,0.078877,-0.140132,0.363885,0.382156,0.036146,0.228698,0.186034,0.096104,0.381265,0.143621,0.110146,-0.094329,-0.072382
1,-0.288496,0.050112,0.072435,-0.029529,-0.335399,-0.262235,0.430237,-0.264251,0.070177,0.090323,-0.136037,-0.444716,-0.155943,0.273586,0.124651,0.288457,0.184996,-0.091546,0.045496
2,-0.005633,0.05723,0.196835,0.142991,-0.101358,-0.077816,0.001696,0.185093,0.132622,0.178538,0.164829,-0.162013,-0.118717,0.234797,0.148224,-0.199409,-0.521322,0.578203,0.216741
3,-0.002933,0.058328,0.024464,0.077071,-0.432527,0.225516,0.450558,0.340499,-0.190698,-0.232117,-0.141803,0.248416,0.108025,0.251062,-0.215664,-0.264335,-0.024007,-0.00205,-0.263521
4,0.347004,-0.238179,-0.084076,-0.355716,-0.111945,0.040386,0.200143,-0.345431,-0.194304,-0.047419,0.287275,0.246118,0.163192,0.228308,-0.147761,0.223769,-0.135978,0.069582,0.398266
5,-0.085344,0.106391,-0.087946,-0.320845,-0.066453,0.126454,0.024065,-0.329605,0.022425,0.085199,-0.637554,-0.051721,0.290083,-0.284571,-0.02757,-0.115401,-0.31159,0.216895,-0.029096
6,-0.024561,-0.086823,0.786485,-0.153616,0.046541,-0.048024,-0.152149,-0.022773,-0.185179,0.063425,0.046788,-0.136122,0.447683,0.107736,0.002924,-0.137838,0.137075,-0.093467,-0.062997
7,0.247751,-0.195482,0.226244,-0.068564,0.379966,0.041954,0.065023,-0.027564,-0.018023,-0.011351,-0.295672,0.080887,-0.363774,0.302278,-0.020847,0.331163,-0.242417,0.030638,-0.453285
8,0.18788,-0.458557,-0.056694,0.567682,0.149459,0.028491,0.094952,-0.011902,-0.01779,-0.061789,-0.391835,-0.125206,0.195191,0.10843,0.089609,-0.11645,0.025002,-0.12332,0.368804
9,-0.278342,0.477272,0.24408,0.102523,0.191244,0.136159,-0.096159,-0.046555,0.054506,-0.287274,-0.204121,0.25398,-0.134804,0.200284,-0.234026,0.16098,-0.021766,-0.075286,0.470473


Features with the largest absolute loadings on the first three components:

- principal component 0: `markdown_prop`
- principal component 1: `md_frequency`, `num_contrib`
- principal component 2: `exec_inorder`, `exec_skips`

## No Markdown Cells

### Performing PCA

In [70]:
# Build a PCA that keeps one component per feature, then fit it on the standardized
# no-markdown-group matrix and project the rows onto the components in one step.
pca_no_md = PCA(len(no_md_vars))
pc_no_md = pca_no_md.fit_transform(no_md_adjusted_stand)

In [71]:
# Label the projected columns 'principal component 0', 'principal component 1', ... and store them as a DataFrame.
pca_no_md_df = pd.DataFrame(pc_no_md, columns = [f'principal component {i}' for i in range(len(no_md_vars))])

# Preview the first rows of the projected no-markdown group.
pca_no_md_df.head()

Unnamed: 0,principal component 0,principal component 1,principal component 2,principal component 3,principal component 4,principal component 5,principal component 6,principal component 7,principal component 8,principal component 9,principal component 10,principal component 11
0,-0.893334,-0.524358,-0.55799,1.153832,-0.612463,0.477258,0.286244,0.188093,-0.550187,0.010195,-0.060608,-4.165598e-18
1,1.796207,-0.522384,1.105356,0.051893,-0.33725,-0.253991,-1.346783,-0.182888,0.32491,-0.092964,0.401822,5.119269e-17
2,-0.766702,6.184324,-0.229399,0.522036,0.28679,-0.26411,-1.5266,1.184071,-3.0719,0.078415,-1.116162,2.013429e-17
3,-0.299909,-0.53915,-0.087177,-0.537409,-0.241197,-0.28183,-0.20065,-0.757863,-0.052443,0.510133,-0.787259,-2.8403140000000005e-17
4,0.379913,0.570896,1.053284,-0.151507,-0.585976,-0.573586,-2.347405,0.228461,0.061454,0.475055,0.371588,-3.310026e-17


### Analyze the Results of PCA

In [73]:
# Print the explained variance ratio of each principal component of the no-markdown-group PCA.
print(pca_no_md.explained_variance_ratio_)

[1.44877473e-01 1.17219921e-01 1.10820199e-01 1.07470862e-01
 9.34570160e-02 8.32355020e-02 7.70352685e-02 7.19356098e-02
 6.97336151e-02 6.43576663e-02 5.98568668e-02 2.40966085e-34]


In [74]:
# Tabulate the component loadings: one row per principal component, one column per feature.
no_md_pca_var = pd.DataFrame(data = pca_no_md.components_, columns = no_md_vars)
no_md_pca_var

Unnamed: 0,has_author,jupyter_prop,num_contrib,image_prop,is_education,has_comments,has_title,num_commits,non_exec_prop,exec_inorder,exec_skips,has_error
0,-0.07795982,0.2187223,-0.2015507,0.2755454,-0.1304211,0.1932026,-0.0,0.04196626,-0.3345087,-0.5110295,0.5038921,0.3827301
1,0.1525162,-0.4062267,0.5524296,0.1227691,-0.01350744,0.2496445,-5.5511150000000004e-17,0.6202933,-0.03975941,-0.1000401,0.1506551,-0.09956688
2,0.4222961,-0.0956695,0.008768478,-0.5906011,0.01922221,0.1556169,5.5511150000000004e-17,-0.1060864,0.4497323,-0.2422024,0.1525859,0.3789232
3,-0.2812878,-0.4970473,0.1948967,-0.2688449,0.1295157,-0.5574266,-7.632783000000001e-17,-0.1969647,-0.3463893,-0.1576619,0.125616,0.1914233
4,-0.2237558,-0.08735949,-0.230465,0.009005992,0.8851078,0.2234213,1.110223e-16,0.07656581,0.11867,-0.0993912,0.1103568,-0.1173518
5,0.7705787,-0.2197396,-0.2848873,0.2602215,0.2145381,-0.1739854,2.775558e-17,-0.03750411,-0.3390709,0.0910601,-0.08007137,0.04087026
6,0.1572588,-0.04201735,0.01493735,0.08041495,-0.1011896,-0.1980967,-5.5511150000000004e-17,-0.2753931,0.2312492,-0.272875,0.4813113,-0.6941859
7,0.06399016,0.3934278,0.002759005,0.06981117,0.1364514,-0.6553052,5.5511150000000004e-17,0.5063562,0.2782431,-0.2033518,-0.05216798,0.1015156
8,0.165593,0.385746,0.6769236,0.1668611,0.3037905,0.04515743,-0.0,-0.4171738,-0.09679597,-0.1675285,-0.1602984,0.07885533
9,0.08931923,0.3445075,0.1230944,-0.3278368,0.1218606,-0.04360874,5.5511150000000004e-17,0.1173526,-0.2666361,0.5811662,0.5576695,-0.01843345


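These numbers are bad :( — no single component explains much of the variance: the largest explained-variance ratio is only about 16% for the markdown group and about 14% for the no-markdown group.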

# Factor Analysis for Mixed Data (FAMD)

In [33]:
# Drop the nb_id column from both groups so it is not passed to FAMD.
# Both frames are overwritten by this cell, so a second execution would find the column
# already gone; errors = 'ignore' keeps the cell safe to re-run instead of raising KeyError.
md_df = md_df.drop(columns = ['nb_id'], errors = 'ignore')
no_md_df = no_md_df.drop(columns = ['nb_id'], errors = 'ignore')

## Markdown Cells

### Performing FAMD

In [35]:
# Instantiate a FAMD object requesting one component per entry in md_vars, then fit it
# to the markdown group. The fit uses md_df (the un-encoded frame), not md_adjusted.
# NOTE(review): random_state is pinned to 42, presumably for reproducible fits — confirm
# against the installed prince version.
md_famd = prince.FAMD(n_components = len(md_vars),
                     n_iter = 10,
                     copy = True,
                     check_input = True,
                     engine = 'auto',
                     random_state = 42)
md_famd_fit = md_famd.fit(md_df)

In [37]:
# Compute the coordinates of every markdown-group row on the fitted FAMD components.
md_famd_df = md_famd_fit.row_coordinates(md_df)

# Preview the first rows of the FAMD coordinates.
md_famd_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,0.164608,0.783306,1.270432,1.745692,-0.766967,-1.016525,-0.468929,-0.190143,-0.052992,-0.079018,-0.226218,-0.364906,-0.50683,0.190981,-0.460599,0.407758,0.087183,0.343825,-0.041551
1,0.929644,0.950496,0.205273,2.232974,1.084687,0.53009,1.095137,-0.057975,-0.308907,-0.17327,-0.381614,-0.468251,-0.095395,-0.040621,-0.153221,0.033503,-0.091679,-0.048205,-0.024298
2,1.746732,0.59778,-0.297726,1.551034,1.263304,0.295653,1.231061,0.014399,-0.231543,-0.18207,0.047084,-0.191655,-0.251523,0.101156,-0.246126,0.011362,-0.020445,-0.08584,-0.027967
3,1.080098,-0.61684,-0.173218,0.633549,0.174869,-0.252463,-0.678797,0.302454,0.054771,0.005511,0.138099,-0.150368,-0.004087,-0.163362,-0.282679,0.213046,0.069216,-0.334597,-0.028344
4,0.577267,-0.244647,-0.295428,0.615868,0.069033,-0.228658,-0.526441,0.076103,0.560053,0.085306,0.175634,-0.41218,0.583973,0.095564,0.48483,0.235923,-0.137051,-0.058211,-0.017775


### Analyzing the Results of FAMD

In [78]:
# Show the explained inertia (share of variance) reported for each FAMD component of the markdown group.
md_famd_fit.explained_inertia_

[0.17772524635812395,
 0.14355001168479897,
 0.13029119555456128,
 0.11612587304922266,
 0.09046632300563355,
 0.07503281068692562,
 0.05971416346023756,
 0.05160943939416483,
 0.04527894428703184,
 0.03843553339342854,
 0.015091815429582758,
 0.011399314451769563,
 0.009085536875217896,
 0.008096080047352471,
 0.007172389658923194,
 0.006824402222182047,
 0.005884159118587794,
 0.005808309267694248,
 0.002408452054561979]

In [85]:
# Examine the correlation between each variable and each FAMD component for the markdown group.
md_famd_fit.column_correlations(md_df)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
exec_inorder,0.454924,0.444056,-0.42264,0.387653,-0.454772,0.217723,-0.410412,0.356803,-0.4375,0.436999,-0.399825,0.419575,0.247676,0.310979,0.417727,-0.413386,0.454702,-0.448383,0.332592
exec_skips,-0.975867,-0.932176,0.947281,-0.86313,0.995135,-0.529886,0.950824,-0.719924,0.99785,-0.959785,0.894771,-0.978186,-0.506328,-0.82827,-0.871335,0.846716,-0.984216,0.978178,-0.665659
has_author,-0.02714,-0.021712,0.065765,-0.073857,0.042496,0.078688,0.002435,-0.068617,0.041328,-0.061916,0.102569,-0.021956,0.046277,-0.025051,-0.079711,0.06029,-0.061453,0.040938,-0.077781
has_comments,-0.087134,-0.100201,0.071774,-0.065071,0.092178,0.011913,0.064639,-0.143396,0.078057,-0.131872,0.088371,-0.068716,-0.021332,0.018084,-0.127617,0.123248,-0.102283,0.096596,-0.121835
has_equation,0.008199,0.003563,0.022375,-0.046628,0.004535,0.034608,0.003546,-0.002204,0.011436,-0.003233,0.035946,0.000326,0.089681,0.044956,0.002653,-0.013725,-0.004482,-0.009125,-0.032984
has_error,-0.115399,-0.107323,0.111619,-0.087843,0.120199,-0.022081,0.084924,-0.121139,0.109988,-0.132006,0.121423,-0.104271,-0.002208,-0.05886,-0.110208,0.13463,-0.111854,0.13167,-0.122174
has_links,0.026079,0.019292,0.033193,-0.055268,0.019654,0.069714,-0.025148,-0.026859,0.0113,-0.01563,0.083131,0.002468,0.146586,-0.02631,-0.00087,-0.017597,-0.026783,-0.020304,-0.077846
has_title,0.058483,0.035773,-0.052338,0.0276,-0.039312,0.005443,-0.044668,0.02623,-0.043737,0.043061,-0.009292,0.056004,0.073376,0.087823,0.053532,-0.079958,0.043901,-0.058164,0.042064
image_prop,-0.164238,-0.172875,0.100503,-0.112629,0.084089,-0.030216,0.160951,-0.148939,0.115302,-0.113831,0.08404,-0.135146,-0.376321,-0.165719,-0.282116,0.126801,-0.083045,0.143166,-0.109114
is_education,0.047402,0.037046,0.006643,-0.034742,-0.004343,0.063753,-0.035007,0.007033,-0.009964,0.011317,0.036724,0.034725,0.208689,0.004172,-0.010005,-0.059025,0.00184,-0.03806,-0.036186


Mostly `exec_skips` and `exec_inorder`?

## No Markdown Cells

### Performing FAMD

In [90]:
# Instantiate a FAMD object requesting one component per entry in no_md_vars, then fit it
# to the no-markdown group. The fit uses no_md_df (the un-encoded frame), not no_md_adjusted.
# NOTE(review): random_state is pinned to 42, presumably for reproducible fits — confirm
# against the installed prince version.
no_md_famd = prince.FAMD(n_components = len(no_md_vars),
                     n_iter = 10,
                     copy = True,
                     check_input = True,
                     engine = 'auto',
                     random_state = 42)
no_md_famd_fit = no_md_famd.fit(no_md_df)

In [91]:
# Compute the coordinates of every no-markdown-group row on the fitted FAMD components.
no_md_famd_df = no_md_famd_fit.row_coordinates(no_md_df)

# Preview the first rows of the FAMD coordinates.
no_md_famd_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,-0.384848,-0.174134,-0.025656,-0.071469,-0.783579,-0.31397,0.066796,-0.008589,0.010241,-0.051499,0.001302,-3.153429e-17
1,1.464745,-0.339557,-0.523199,0.42541,-0.262094,0.276548,0.096525,-0.305167,0.794933,-0.071657,-0.019252,1.944057e-17
2,0.824295,-2.034832,4.818672,-0.562361,0.963186,-2.497767,0.92657,-1.164406,-0.071591,-0.224247,-0.048413,3.290462e-16
3,0.541519,-0.868991,-0.705387,-0.114001,-0.533037,0.17761,0.628889,-0.277912,-0.27079,-0.169231,-0.010555,6.352306000000001e-17
4,0.696381,-1.443751,0.284118,-0.140115,0.122949,0.138177,0.735506,-0.489016,0.959957,-0.071282,-0.032579,1.735457e-16


### Analyzing the Results of FAMD

In [92]:
# Show the explained inertia (share of variance) reported for each FAMD component of the no-markdown group.
no_md_famd_fit.explained_inertia_

[0.17476534204487856,
 0.15439702710667477,
 0.14203027829514048,
 0.12432596088010421,
 0.10488251185885655,
 0.08897343352711908,
 0.07974773906591694,
 0.07854034224324315,
 0.02827831843671608,
 0.022096977105165123,
 0.001962069436186329,
 5.244446005682904e-67]

In [93]:
# Examine the correlation between each variable and each FAMD component for the no-markdown group.
no_md_famd_fit.column_correlations(no_md_df)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
exec_inorder,-0.305886,-0.305443,-0.226028,-0.301211,-0.273961,0.124051,-0.275413,-0.280449,0.322225,-0.273652,-0.288085,0.286443
exec_skips,0.997394,0.991559,0.782926,0.995174,0.892568,-0.376471,0.99549,0.998482,-0.979113,0.950751,0.937746,-0.96902
has_author,-0.017685,-0.023175,0.005013,-0.014993,0.008689,0.005246,-0.013495,-0.017154,0.023412,-0.013781,0.10693,0.022201
has_comments,0.09776,0.082436,0.09756,0.076027,0.14199,-0.075909,0.095285,0.083044,-0.079394,0.08643,0.049123,-0.076808
has_error,0.148971,0.154654,0.067966,0.150893,0.105903,-0.100856,0.142672,0.140701,-0.142416,0.116109,0.155042,-0.157483
has_title,,,,,,,,,,,,
image_prop,0.091062,0.09015,0.066623,0.007668,0.102915,-0.086593,0.040327,0.068047,-0.009798,0.086819,0.125597,-0.051636
is_education,-0.032447,-0.031216,-0.041306,-0.030483,-0.034273,-0.026818,-0.033504,-0.03343,0.023649,-0.00128,-0.039363,0.021536
jupyter_prop,0.03761,0.052529,-0.140011,0.011644,0.036863,-0.088479,0.024428,0.017612,-0.057777,0.008369,0.092228,-0.054739
non_exec_prop,-0.134708,-0.148901,-0.106564,-0.098928,-0.022718,0.049238,-0.131993,-0.11214,0.15022,-0.11601,-0.254623,0.107525


Again, seems to mostly be `exec_skips` and `exec_inorder`