# Imports

In [50]:
import pandas as pd
import itertools
import math

# Loading the Data

## Markdown Cells

In [3]:
# location of the markdown-group binary csv, relative to the notebook
md_binary_filepath = "binary-data/markdown_group_binary.csv"
# keep the frame exactly as loaded; later cells work on copies of it
md_binary_original = pd.read_csv(filepath_or_buffer=md_binary_filepath)

In [4]:
# work on a deep copy so that md_binary_original stays exactly as loaded
md_binary_df = md_binary_original.copy(deep=True)

# preview the first five rows
md_binary_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,nb_id,longer_beginning,longer_ending,has_author,has_equation,jupyter_prop,output_cell_prop,markdown_prop,num_contrib,...,non_exec_prop,exec_inorder,exec_skips,has_error,has_export,num_functions,has_test,num_headers,has_param,num_stars
0,0,594,True,False,False,False,high,high,low,low,...,low,high,low,False,False,low,False,low,False,high
1,1,1222,True,False,False,False,high,low,low,low,...,high,high,low,False,False,low,False,low,False,low
2,2,1447,True,False,False,False,high,low,low,low,...,high,high,low,False,True,high,False,high,False,low
3,3,2705,True,False,False,False,high,low,low,low,...,low,high,low,False,False,low,False,low,False,low
4,4,2861,True,False,False,True,high,low,low,low,...,low,high,low,False,False,low,False,low,False,low


In [5]:
# Drop the two leading columns ('Unnamed: 0' and 'nb_id'); the columns that
# remain are the variables that get paired up below.
# The result is derived from md_binary_original instead of md_binary_df so
# that this cell can be re-run safely: dropping from md_binary_df a second
# time would raise a KeyError because the columns are already gone.
md_binary_df = md_binary_original.drop(columns=['Unnamed: 0', 'nb_id'])

## No Markdown Cells

In [6]:
# location of the no-markdown-group binary csv, relative to the notebook
no_md_binary_filepath = "binary-data/no_markdown_group_binary.csv"
# keep the frame exactly as loaded; later cells work on copies of it
no_md_binary_original = pd.read_csv(filepath_or_buffer=no_md_binary_filepath)

In [7]:
# work on a deep copy so that no_md_binary_original stays exactly as loaded
no_md_binary_df = no_md_binary_original.copy(deep=True)

# preview the first five rows
no_md_binary_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,nb_id,has_author,jupyter_prop,output_cell_prop,num_contrib,image_prop,is_education,has_comments,num_commits,non_exec_prop,exec_inorder,exec_skips,has_error,has_export,num_functions,has_test,has_param,num_stars
0,0,1589,False,high,high,low,low,False,False,low,low,high,low,False,True,low,False,False,low
1,1,1919,False,high,high,low,low,False,True,low,low,high,low,True,True,low,False,False,low
2,2,2857,False,low,low,high,low,False,True,high,low,high,low,False,True,low,False,False,low
3,3,4339,False,high,low,low,low,False,True,low,low,high,low,False,False,low,False,False,low
4,4,4659,False,high,low,low,low,False,True,high,low,high,low,True,False,low,False,False,low


In [8]:
# Drop the two leading columns ('Unnamed: 0' and 'nb_id'); the columns that
# remain are the variables that get paired up below.
# The result is derived from no_md_binary_original instead of no_md_binary_df
# so that this cell can be re-run safely: dropping from no_md_binary_df a
# second time would raise a KeyError because the columns are already gone.
no_md_binary_df = no_md_binary_original.drop(columns=['Unnamed: 0', 'nb_id'])

# Constructing Contingency Tables

## Markdown Cells

In [9]:
# Build every unordered pair of distinct variables (column names) that needs a
# contingency table. itertools.combinations yields each pair exactly once, in
# column order, so there is no need to build the full cross product and then
# filter out self-pairs and mirrored duplicates with repeated list scans.
# This relies on the column labels being unique.
md_pairs = list(itertools.combinations(md_binary_df.columns, 2))

# check on the pairs
md_pairs

[('longer_beginning', 'longer_ending'),
 ('longer_beginning', 'has_author'),
 ('longer_beginning', 'has_equation'),
 ('longer_beginning', 'jupyter_prop'),
 ('longer_beginning', 'output_cell_prop'),
 ('longer_beginning', 'markdown_prop'),
 ('longer_beginning', 'num_contrib'),
 ('longer_beginning', 'image_prop'),
 ('longer_beginning', 'is_education'),
 ('longer_beginning', 'has_links'),
 ('longer_beginning', 'has_comments'),
 ('longer_beginning', 'md_frequency'),
 ('longer_beginning', 'has_title'),
 ('longer_beginning', 'num_commits'),
 ('longer_beginning', 'md_format'),
 ('longer_beginning', 'non_exec_prop'),
 ('longer_beginning', 'exec_inorder'),
 ('longer_beginning', 'exec_skips'),
 ('longer_beginning', 'has_error'),
 ('longer_beginning', 'has_export'),
 ('longer_beginning', 'num_functions'),
 ('longer_beginning', 'has_test'),
 ('longer_beginning', 'num_headers'),
 ('longer_beginning', 'has_param'),
 ('longer_beginning', 'num_stars'),
 ('longer_ending', 'has_author'),
 ('longer_ending

In [10]:
# encode into a separate deep copy so that md_binary_df keeps its original labels
md_binary_enc = md_binary_df.copy(deep=True)

In [11]:
# map each label to its binary code (True / 'high' -> 1, False / 'low' -> 0),
# applying the four replacements in the same order as a single chained expression
md_binary_enc = (
    md_binary_enc
    .replace(True, 1)
    .replace(False, 0)
    .replace('high', 1)
    .replace('low', 0)
)

# preview the encoded rows
md_binary_enc.head(n=5)

Unnamed: 0,longer_beginning,longer_ending,has_author,has_equation,jupyter_prop,output_cell_prop,markdown_prop,num_contrib,image_prop,is_education,...,non_exec_prop,exec_inorder,exec_skips,has_error,has_export,num_functions,has_test,num_headers,has_param,num_stars
0,1.0,0.0,0.0,0.0,1,1,0,0,0,1.0,...,0,1,0,0.0,0.0,0,0.0,0,0.0,1
1,1.0,0.0,0.0,0.0,1,0,0,0,0,1.0,...,1,1,0,0.0,0.0,0,0.0,0,0.0,0
2,1.0,0.0,0.0,0.0,1,0,0,0,0,1.0,...,1,1,0,0.0,1.0,1,0.0,1,0.0,0
3,1.0,0.0,0.0,0.0,1,0,0,0,0,1.0,...,0,1,0,0.0,0.0,0,0.0,0,0.0,0
4,1.0,0.0,0.0,1.0,1,0,0,0,0,0.0,...,0,1,0,0.0,0.0,0,0.0,0,0.0,0


In [12]:
# print how often each value occurs, one column at a time
for column_name in md_binary_enc.columns:
    column_counts = md_binary_enc[column_name].value_counts()
    print(column_counts)

1.0    1246
0.0    1045
Name: longer_beginning, dtype: int64
0.0    1698
1.0     593
Name: longer_ending, dtype: int64
0.0    2184
1.0     107
Name: has_author, dtype: int64
0.0    1847
1.0     444
Name: has_equation, dtype: int64
1    2003
0     288
Name: jupyter_prop, dtype: int64
1    1381
0     910
Name: output_cell_prop, dtype: int64
0    1763
1     528
Name: markdown_prop, dtype: int64
0    2140
1     151
Name: num_contrib, dtype: int64
0    1721
1     570
Name: image_prop, dtype: int64
1.0    1321
0.0     970
Name: is_education, dtype: int64
0.0    1423
1.0     868
Name: has_links, dtype: int64
1.0    2038
0.0     253
Name: has_comments, dtype: int64
1    1434
0     857
Name: md_frequency, dtype: int64
1.0    1478
0.0     813
Name: has_title, dtype: int64
0    1981
1     310
Name: num_commits, dtype: int64
0.0    1236
1.0    1055
Name: md_format, dtype: int64
0    2081
1     210
Name: non_exec_prop, dtype: int64
1    2180
0     111
Name: exec_inorder, dtype: int64
0    1925
1   

In [13]:
# cast every value to int, column by column, so that no column is left
# holding floats
for column_name in md_binary_enc.columns:
    int_values = md_binary_enc[column_name].apply(int)
    md_binary_enc[column_name] = int_values

In [14]:
# preview the first five rows again after the integer cast
md_binary_enc.head(n=5)

Unnamed: 0,longer_beginning,longer_ending,has_author,has_equation,jupyter_prop,output_cell_prop,markdown_prop,num_contrib,image_prop,is_education,...,non_exec_prop,exec_inorder,exec_skips,has_error,has_export,num_functions,has_test,num_headers,has_param,num_stars
0,1,0,0,0,1,1,0,0,0,1,...,0,1,0,0,0,0,0,0,0,1
1,1,0,0,0,1,0,0,0,0,1,...,1,1,0,0,0,0,0,0,0,0
2,1,0,0,0,1,0,0,0,0,1,...,1,1,0,0,1,1,0,1,0,0
3,1,0,0,0,1,0,0,0,0,1,...,0,1,0,0,0,0,0,0,0,0
4,1,0,0,1,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [15]:
# one contingency table per variable pair, keyed by the (row, column) pair;
# margins=True appends the 'All' row and column that hold the totals
md_tables = {
    (row_var, col_var): pd.crosstab(
        md_binary_enc[row_var],
        md_binary_enc[col_var],
        margins=True,
    )
    for row_var, col_var in md_pairs
}

## No Markdown Cells

In [16]:
# Build every unordered pair of distinct variables (column names) that needs a
# contingency table. itertools.combinations yields each pair exactly once, in
# column order, so there is no need to build the full cross product and then
# filter out self-pairs and mirrored duplicates with repeated list scans.
# This relies on the column labels being unique.
no_md_pairs = list(itertools.combinations(no_md_binary_df.columns, 2))

# check on the pairs
no_md_pairs

[('has_author', 'jupyter_prop'),
 ('has_author', 'output_cell_prop'),
 ('has_author', 'num_contrib'),
 ('has_author', 'image_prop'),
 ('has_author', 'is_education'),
 ('has_author', 'has_comments'),
 ('has_author', 'num_commits'),
 ('has_author', 'non_exec_prop'),
 ('has_author', 'exec_inorder'),
 ('has_author', 'exec_skips'),
 ('has_author', 'has_error'),
 ('has_author', 'has_export'),
 ('has_author', 'num_functions'),
 ('has_author', 'has_test'),
 ('has_author', 'has_param'),
 ('has_author', 'num_stars'),
 ('jupyter_prop', 'output_cell_prop'),
 ('jupyter_prop', 'num_contrib'),
 ('jupyter_prop', 'image_prop'),
 ('jupyter_prop', 'is_education'),
 ('jupyter_prop', 'has_comments'),
 ('jupyter_prop', 'num_commits'),
 ('jupyter_prop', 'non_exec_prop'),
 ('jupyter_prop', 'exec_inorder'),
 ('jupyter_prop', 'exec_skips'),
 ('jupyter_prop', 'has_error'),
 ('jupyter_prop', 'has_export'),
 ('jupyter_prop', 'num_functions'),
 ('jupyter_prop', 'has_test'),
 ('jupyter_prop', 'has_param'),
 ('jupyte

In [17]:
# encode into a separate deep copy so that no_md_binary_df keeps its original labels
no_md_binary_enc = no_md_binary_df.copy(deep=True)

In [18]:
# map each label to its binary code (True / 'high' -> 1, False / 'low' -> 0),
# applying the four replacements in the same order as a single chained expression
no_md_binary_enc = (
    no_md_binary_enc
    .replace(True, 1)
    .replace(False, 0)
    .replace('high', 1)
    .replace('low', 0)
)

# preview the encoded rows
no_md_binary_enc.head(n=5)

Unnamed: 0,has_author,jupyter_prop,output_cell_prop,num_contrib,image_prop,is_education,has_comments,num_commits,non_exec_prop,exec_inorder,exec_skips,has_error,has_export,num_functions,has_test,has_param,num_stars
0,0.0,1,1,0,0,0.0,0.0,0,0,1,0,0.0,1.0,0,0.0,0.0,0
1,0.0,1,1,0,0,0.0,1.0,0,0,1,0,1.0,1.0,0,0.0,0.0,0
2,0.0,0,0,1,0,0.0,1.0,1,0,1,0,0.0,1.0,0,0.0,0.0,0
3,0.0,1,0,0,0,0.0,1.0,0,0,1,0,0.0,0.0,0,0.0,0.0,0
4,0.0,1,0,0,0,0.0,1.0,1,0,1,0,1.0,0.0,0,0.0,0.0,0


In [19]:
# print how often each value occurs, one column at a time
for column_name in no_md_binary_enc.columns:
    column_counts = no_md_binary_enc[column_name].value_counts()
    print(column_counts)

0.0    1342
1.0      12
Name: has_author, dtype: int64
1    1148
0     206
Name: jupyter_prop, dtype: int64
0    841
1    513
Name: output_cell_prop, dtype: int64
0    1236
1     118
Name: num_contrib, dtype: int64
0    1101
1     253
Name: image_prop, dtype: int64
0.0    1208
1.0     146
Name: is_education, dtype: int64
1.0    919
0.0    435
Name: has_comments, dtype: int64
0    1041
1     313
Name: num_commits, dtype: int64
0    1248
1     106
Name: non_exec_prop, dtype: int64
1    1204
0     150
Name: exec_inorder, dtype: int64
0    1178
1     176
Name: exec_skips, dtype: int64
0.0    1138
1.0     216
Name: has_error, dtype: int64
0.0    935
1.0    419
Name: has_export, dtype: int64
0    1153
1     201
Name: num_functions, dtype: int64
0.0    1353
1.0       1
Name: has_test, dtype: int64
0.0    1339
1.0      15
Name: has_param, dtype: int64
0    1299
1      55
Name: num_stars, dtype: int64


In [20]:
# cast every value to int, column by column, so that no column is left
# holding floats
for column_name in no_md_binary_enc.columns:
    int_values = no_md_binary_enc[column_name].apply(int)
    no_md_binary_enc[column_name] = int_values

In [21]:
# preview the first five rows again after the integer cast
no_md_binary_enc.head(n=5)

Unnamed: 0,has_author,jupyter_prop,output_cell_prop,num_contrib,image_prop,is_education,has_comments,num_commits,non_exec_prop,exec_inorder,exec_skips,has_error,has_export,num_functions,has_test,has_param,num_stars
0,0,1,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0
1,0,1,1,0,0,0,1,0,0,1,0,1,1,0,0,0,0
2,0,0,0,1,0,0,1,1,0,1,0,0,1,0,0,0,0
3,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0
4,0,1,0,0,0,0,1,1,0,1,0,1,0,0,0,0,0


In [22]:
# one contingency table per variable pair, keyed by the (row, column) pair;
# margins=True appends the 'All' row and column that hold the totals
no_md_tables = {
    (row_var, col_var): pd.crosstab(
        no_md_binary_enc[row_var],
        no_md_binary_enc[col_var],
        margins=True,
    )
    for row_var, col_var in no_md_pairs
}

# Calculating $\phi$ Coefficients

## Markdown Cells

In [29]:
# start from an empty square frame with one row and one column per variable;
# it is filled in with the coefficients by the cells that follow
md_coeffs = pd.DataFrame(
    index=md_binary_enc.columns,
    columns=md_binary_enc.columns,
)
md_coeffs

Unnamed: 0,longer_beginning,longer_ending,has_author,has_equation,jupyter_prop,output_cell_prop,markdown_prop,num_contrib,image_prop,is_education,...,non_exec_prop,exec_inorder,exec_skips,has_error,has_export,num_functions,has_test,num_headers,has_param,num_stars
longer_beginning,,,,,,,,,,,...,,,,,,,,,,
longer_ending,,,,,,,,,,,...,,,,,,,,,,
has_author,,,,,,,,,,,...,,,,,,,,,,
has_equation,,,,,,,,,,,...,,,,,,,,,,
jupyter_prop,,,,,,,,,,,...,,,,,,,,,,
output_cell_prop,,,,,,,,,,,...,,,,,,,,,,
markdown_prop,,,,,,,,,,,...,,,,,,,,,,
num_contrib,,,,,,,,,,,...,,,,,,,,,,
image_prop,,,,,,,,,,,...,,,,,,,,,,
is_education,,,,,,,,,,,...,,,,,,,,,,


In [31]:
# Every variable paired with itself gets a coefficient of 1, so populate the
# diagonal with 1.
# The write goes through a single .loc[row, column] call: the chained form
# md_coeffs[col][col] = 1 assigns into an intermediate Series, which pandas
# does not guarantee to propagate back to md_coeffs (and under Copy-on-Write
# it never does).
for col in md_coeffs.columns:
    md_coeffs.loc[col, col] = 1
md_coeffs

Unnamed: 0,longer_beginning,longer_ending,has_author,has_equation,jupyter_prop,output_cell_prop,markdown_prop,num_contrib,image_prop,is_education,...,non_exec_prop,exec_inorder,exec_skips,has_error,has_export,num_functions,has_test,num_headers,has_param,num_stars
longer_beginning,1.0,,,,,,,,,,...,,,,,,,,,,
longer_ending,,1.0,,,,,,,,,...,,,,,,,,,,
has_author,,,1.0,,,,,,,,...,,,,,,,,,,
has_equation,,,,1.0,,,,,,,...,,,,,,,,,,
jupyter_prop,,,,,1.0,,,,,,...,,,,,,,,,,
output_cell_prop,,,,,,1.0,,,,,...,,,,,,,,,,
markdown_prop,,,,,,,1.0,,,,...,,,,,,,,,,
num_contrib,,,,,,,,1.0,,,...,,,,,,,,,,
image_prop,,,,,,,,,1.0,,...,,,,,,,,,,
is_education,,,,,,,,,,1.0,...,,,,,,,,,,


In [49]:
# look at one contingency table to confirm its layout (0 / 1 / 'All' labels)
md_tables['longer_beginning', 'has_author']

has_author,0,1,All
longer_beginning,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1002,43,1045
1,1182,64,1246
All,2184,107,2291


In [51]:
# for each contingency table, calculate the coefficient and input to the table
for pair in md_pairs:

    # get the contingency table (rows: first variable, columns: second
    # variable; the 'All' margins hold the row, column and grand totals)
    table = md_tables[pair]

    # calculate the phi coefficient from the (1, 1) cell and the margins:
    #   phi = (n * n11 - n1. * n.1) / sqrt(n1. * n.1 * (n - n1.) * (n - n.1))
    # NOTE: this needs the label 1 to be present on both axes of the table,
    # and the denominator is zero when either variable takes only one value
    n = table.loc['All', 'All']
    one_all = table.loc[1, 'All']
    all_one = table.loc['All', 1]
    phi_coeff_num = (n * table.loc[1, 1]) - (one_all * all_one)
    phi_coeff_denom = math.sqrt(one_all * all_one * (n - one_all) * (n - all_one))
    phi_coeff = phi_coeff_num / phi_coeff_denom

    # input into both mirrored cells of the table; a single .loc[row, column]
    # write is used because the chained form md_coeffs[var1][var2] = ...
    # assigns into an intermediate Series and is not guaranteed to reach
    # md_coeffs (under Copy-on-Write it never does)
    var1, var2 = pair
    md_coeffs.loc[var2, var1] = phi_coeff
    md_coeffs.loc[var1, var2] = phi_coeff

In [56]:
# save the markdown-group coefficient matrix as a csv file
md_coeffs.to_csv(path_or_buf='correlation-results/markdown_group_phi_coefficients.csv')

## No Markdown Cells

In [57]:
# start from an empty square frame with one row and one column per variable;
# it is filled in with the coefficients by the cells that follow
no_md_coeffs = pd.DataFrame(
    index=no_md_binary_enc.columns,
    columns=no_md_binary_enc.columns,
)
no_md_coeffs

Unnamed: 0,has_author,jupyter_prop,output_cell_prop,num_contrib,image_prop,is_education,has_comments,num_commits,non_exec_prop,exec_inorder,exec_skips,has_error,has_export,num_functions,has_test,has_param,num_stars
has_author,,,,,,,,,,,,,,,,,
jupyter_prop,,,,,,,,,,,,,,,,,
output_cell_prop,,,,,,,,,,,,,,,,,
num_contrib,,,,,,,,,,,,,,,,,
image_prop,,,,,,,,,,,,,,,,,
is_education,,,,,,,,,,,,,,,,,
has_comments,,,,,,,,,,,,,,,,,
num_commits,,,,,,,,,,,,,,,,,
non_exec_prop,,,,,,,,,,,,,,,,,
exec_inorder,,,,,,,,,,,,,,,,,


In [58]:
# Every variable paired with itself gets a coefficient of 1, so populate the
# diagonal with 1.
# The write goes through a single .loc[row, column] call: the chained form
# no_md_coeffs[col][col] = 1 assigns into an intermediate Series, which pandas
# does not guarantee to propagate back to no_md_coeffs (and under
# Copy-on-Write it never does).
for col in no_md_coeffs.columns:
    no_md_coeffs.loc[col, col] = 1
no_md_coeffs

Unnamed: 0,has_author,jupyter_prop,output_cell_prop,num_contrib,image_prop,is_education,has_comments,num_commits,non_exec_prop,exec_inorder,exec_skips,has_error,has_export,num_functions,has_test,has_param,num_stars
has_author,1.0,,,,,,,,,,,,,,,,
jupyter_prop,,1.0,,,,,,,,,,,,,,,
output_cell_prop,,,1.0,,,,,,,,,,,,,,
num_contrib,,,,1.0,,,,,,,,,,,,,
image_prop,,,,,1.0,,,,,,,,,,,,
is_education,,,,,,1.0,,,,,,,,,,,
has_comments,,,,,,,1.0,,,,,,,,,,
num_commits,,,,,,,,1.0,,,,,,,,,
non_exec_prop,,,,,,,,,1.0,,,,,,,,
exec_inorder,,,,,,,,,,1.0,,,,,,,


In [59]:
# for each contingency table, calculate the coefficient and input to the table
for pair in no_md_pairs:

    # get the contingency table (rows: first variable, columns: second
    # variable; the 'All' margins hold the row, column and grand totals)
    table = no_md_tables[pair]

    # calculate the phi coefficient from the (1, 1) cell and the margins:
    #   phi = (n * n11 - n1. * n.1) / sqrt(n1. * n.1 * (n - n1.) * (n - n.1))
    # NOTE: this needs the label 1 to be present on both axes of the table,
    # and the denominator is zero when either variable takes only one value
    n = table.loc['All', 'All']
    one_all = table.loc[1, 'All']
    all_one = table.loc['All', 1]
    phi_coeff_num = (n * table.loc[1, 1]) - (one_all * all_one)
    phi_coeff_denom = math.sqrt(one_all * all_one * (n - one_all) * (n - all_one))
    phi_coeff = phi_coeff_num / phi_coeff_denom

    # input into both mirrored cells of the table; a single .loc[row, column]
    # write is used because the chained form no_md_coeffs[var1][var2] = ...
    # assigns into an intermediate Series and is not guaranteed to reach
    # no_md_coeffs (under Copy-on-Write it never does)
    var1, var2 = pair
    no_md_coeffs.loc[var2, var1] = phi_coeff
    no_md_coeffs.loc[var1, var2] = phi_coeff

In [61]:
# save the no-markdown-group coefficient matrix as a csv file
no_md_coeffs.to_csv(path_or_buf='correlation-results/no_markdown_group_phi_coefficients.csv')