# Imports

In [1]:
import pandas as pd

# Loading the Data

In [2]:
# relative path of the input table; it is resolved against the working directory
filepath = "all_data.csv"
original_df = pd.read_csv(filepath_or_buffer=filepath)

In [3]:
# preview the first five rows of the freshly loaded table
original_df.head(n=5)

Unnamed: 0,nb_id,error,longer_beginning,longer_ending,has_author,has_equation,jupyter_prop,output_cell_prop,markdown_prop,num_contrib,...,has_comments,md_frequency,has_title,num_commits,md_format,non_exec_prop,exec_inorder,exec_skips,has_error,speaking_language
0,294,nb_file,,,,,,,,,...,,,,,,,,,,
1,329,nb_file,,,,,,,,,...,,,,,,,,,,
2,580,nb_file,,,,,,,,,...,,,,,,,,,,
3,594,,True,False,False,False,0.507588,1.0,0.044444,3.0,...,False,0.068182,True,1.0,True,0.0,1.0,1.809524,False,English
4,921,,False,True,False,False,0.999968,0.0,0.295455,1.0,...,False,0.27907,True,1.0,True,1.0,,,False,Estonian


# Cleaning the Data

- get rid of rows with `nb_file`, `api`, and `no_code` errors, as these have no data associated with them
- filter to English and `python` notebooks

## Notebook Errors

In [4]:
# Rows carrying one of these error labels have no data associated with them.
# They are removed one label at a time so that each intermediate frame stays
# available for inspection; rows with a missing `error` never match a label.

# rows flagged with the nb_file error
nb_errors = original_df.loc[original_df['error'].eq('nb_file')].index
nb_filtered = original_df.drop(index=nb_errors)

# rows flagged with the api error
api_errors = nb_filtered.loc[nb_filtered['error'].eq('api')].index
api_filtered = nb_filtered.drop(index=api_errors)

# rows flagged with the no_code error
code_errors = api_filtered.loc[api_filtered['error'].eq('no_code')].index
code_filtered = api_filtered.drop(index=code_errors)

In [5]:
# `error` only served as a filtering criterion, so the column is dropped from here on
final_error_filtered = code_filtered.drop(columns="error")

# preview the error-filtered table
final_error_filtered.head(n=5)

Unnamed: 0,nb_id,longer_beginning,longer_ending,has_author,has_equation,jupyter_prop,output_cell_prop,markdown_prop,num_contrib,image_prop,...,has_comments,md_frequency,has_title,num_commits,md_format,non_exec_prop,exec_inorder,exec_skips,has_error,speaking_language
3,594,True,False,False,False,0.507588,1.0,0.044444,3.0,0.0,...,False,0.068182,True,1.0,True,0.0,1.0,1.809524,False,English
4,921,False,True,False,False,0.999968,0.0,0.295455,1.0,,...,False,0.27907,True,1.0,True,1.0,,,False,Estonian
6,1222,True,False,False,False,1.0,0.0,0.16129,1.0,,...,True,0.333333,False,2.0,True,0.961538,,,False,English
7,1447,True,False,False,False,0.970851,0.011364,0.375887,1.0,0.0,...,True,0.621429,True,1.0,True,0.988636,,,False,English
8,1589,,,False,,0.71413,0.538462,0.0,1.0,0.142857,...,False,,False,1.0,,0.0,1.0,1.0,False,


## English and Python Notebooks

In [6]:
# keep a notebook when its detected language is English, or when it has no markdown cells at all
is_english = final_error_filtered['speaking_language'].eq("English")
lacks_markdown = final_error_filtered['markdown_prop'].eq(0)
english_notebooks = final_error_filtered.loc[is_english | lacks_markdown]

# speaking_language was only needed for the filter above
english_notebooks = english_notebooks.drop(columns="speaking_language")

# preview the surviving rows
english_notebooks.head(n=5)

Unnamed: 0,nb_id,longer_beginning,longer_ending,has_author,has_equation,jupyter_prop,output_cell_prop,markdown_prop,num_contrib,image_prop,...,has_links,has_comments,md_frequency,has_title,num_commits,md_format,non_exec_prop,exec_inorder,exec_skips,has_error
3,594,True,False,False,False,0.507588,1.0,0.044444,3.0,0.0,...,True,False,0.068182,True,1.0,True,0.0,1.0,1.809524,False
6,1222,True,False,False,False,1.0,0.0,0.16129,1.0,,...,True,True,0.333333,False,2.0,True,0.961538,,,False
7,1447,True,False,False,False,0.970851,0.011364,0.375887,1.0,0.0,...,True,True,0.621429,True,1.0,True,0.988636,,,False
8,1589,,,False,,0.71413,0.538462,0.0,1.0,0.142857,...,,False,,False,1.0,,0.0,1.0,1.0,False
9,1624,,,False,False,0.990929,0.375,0.111111,2.0,0.666667,...,False,True,0.125,True,2.0,False,0.0,1.0,1.0,False


**Note: We determine the language of a notebook from its markdown cells. If a notebook has no markdown cells, its language cannot be determined; however, since the language then cannot skew the markdown analysis, we still include those notebooks in our filtered data.**

In [7]:
# rows without a recorded language cannot be matched, so they are removed first
has_language = english_notebooks['language'].notna()
lang_notebooks = english_notebooks.loc[has_language]

# keep rows whose language string contains "python" (the match is case-sensitive)
mentions_python = lang_notebooks['language'].str.contains("python")
python_notebooks = lang_notebooks.loc[mentions_python]

In [8]:
# `language` was only needed for the python filter, so the column is dropped
final_lang_filtered = python_notebooks.drop(columns="language")

# preview the language-filtered table
final_lang_filtered.head(n=5)

Unnamed: 0,nb_id,longer_beginning,longer_ending,has_author,has_equation,jupyter_prop,output_cell_prop,markdown_prop,num_contrib,image_prop,...,has_links,has_comments,md_frequency,has_title,num_commits,md_format,non_exec_prop,exec_inorder,exec_skips,has_error
3,594,True,False,False,False,0.507588,1.0,0.044444,3.0,0.0,...,True,False,0.068182,True,1.0,True,0.0,1.0,1.809524,False
6,1222,True,False,False,False,1.0,0.0,0.16129,1.0,,...,True,True,0.333333,False,2.0,True,0.961538,,,False
7,1447,True,False,False,False,0.970851,0.011364,0.375887,1.0,0.0,...,True,True,0.621429,True,1.0,True,0.988636,,,False
8,1589,,,False,,0.71413,0.538462,0.0,1.0,0.142857,...,,False,,False,1.0,,0.0,1.0,1.0,False
9,1624,,,False,False,0.990929,0.375,0.111111,2.0,0.666667,...,False,True,0.125,True,2.0,False,0.0,1.0,1.0,False


## Smaller API Errors

We filter out the notebooks that had API errors when trying to measure specific metrics (more specifically, those notebooks that have a `NaN` recorded for `jupyter_prop` or `has_author`)

In [9]:
# a row is removed when either jupyter_prop or has_author was not recorded
missing_core = final_lang_filtered[['jupyter_prop', 'has_author']].isna().any(axis=1)
nans = final_lang_filtered.loc[missing_core].index
nans_filtered = final_lang_filtered.drop(index=nans)

In [10]:
# preview the table after removing the rows with missing core metrics
nans_filtered.head(n=5)

Unnamed: 0,nb_id,longer_beginning,longer_ending,has_author,has_equation,jupyter_prop,output_cell_prop,markdown_prop,num_contrib,image_prop,...,has_links,has_comments,md_frequency,has_title,num_commits,md_format,non_exec_prop,exec_inorder,exec_skips,has_error
3,594,True,False,False,False,0.507588,1.0,0.044444,3.0,0.0,...,True,False,0.068182,True,1.0,True,0.0,1.0,1.809524,False
6,1222,True,False,False,False,1.0,0.0,0.16129,1.0,,...,True,True,0.333333,False,2.0,True,0.961538,,,False
7,1447,True,False,False,False,0.970851,0.011364,0.375887,1.0,0.0,...,True,True,0.621429,True,1.0,True,0.988636,,,False
8,1589,,,False,,0.71413,0.538462,0.0,1.0,0.142857,...,,False,,False,1.0,,0.0,1.0,1.0,False
9,1624,,,False,False,0.990929,0.375,0.111111,2.0,0.666667,...,False,True,0.125,True,2.0,False,0.0,1.0,1.0,False


In [11]:
# row count of the filtered table
print(nans_filtered.shape[0])

3924


In [12]:
# name the result of the cleaning stage
# (plain rebinding: both names refer to the same DataFrame object, no copy is made)
final_filtered = nans_filtered

# Adjusting the Data

- `output_cell_prop` is not as useful a metric as tracking execution order, so we cut it
- based on above, for `image_prop`, all `NaN` should be changed to 0 (no output cells $\Longrightarrow$ no images)
- for execution order metrics, all `NaN` should also be changed to 1 or 0 (1 for exec_inorder, since then there are no cells that were executed out-of-order, and 0 for exec_skips, since there are no skips in execution order)

## Cutting Output Cell Metrics

In [13]:
# output_cell_prop is not carried forward
no_output = final_filtered.drop(columns="output_cell_prop")
no_output.head(n=5)

Unnamed: 0,nb_id,longer_beginning,longer_ending,has_author,has_equation,jupyter_prop,markdown_prop,num_contrib,image_prop,is_education,has_links,has_comments,md_frequency,has_title,num_commits,md_format,non_exec_prop,exec_inorder,exec_skips,has_error
3,594,True,False,False,False,0.507588,0.044444,3.0,0.0,True,True,False,0.068182,True,1.0,True,0.0,1.0,1.809524,False
6,1222,True,False,False,False,1.0,0.16129,1.0,,True,True,True,0.333333,False,2.0,True,0.961538,,,False
7,1447,True,False,False,False,0.970851,0.375887,1.0,0.0,True,True,True,0.621429,True,1.0,True,0.988636,,,False
8,1589,,,False,,0.71413,0.0,1.0,0.142857,False,,False,,False,1.0,,0.0,1.0,1.0,False
9,1624,,,False,False,0.990929,0.111111,2.0,0.666667,False,False,True,0.125,True,2.0,False,0.0,1.0,1.0,False


In [14]:
# change NaN in image_prop to 0
# `assign` builds a new frame instead of writing into `no_output`, so the table
# displayed by the previous cell stays accurate and this cell gives the same
# result however often it is re-run.
adjusted_image_prop = no_output.assign(image_prop=no_output['image_prop'].fillna(0))

In [15]:
# preview the table after filling image_prop
adjusted_image_prop.head(n=5)

Unnamed: 0,nb_id,longer_beginning,longer_ending,has_author,has_equation,jupyter_prop,markdown_prop,num_contrib,image_prop,is_education,has_links,has_comments,md_frequency,has_title,num_commits,md_format,non_exec_prop,exec_inorder,exec_skips,has_error
3,594,True,False,False,False,0.507588,0.044444,3.0,0.0,True,True,False,0.068182,True,1.0,True,0.0,1.0,1.809524,False
6,1222,True,False,False,False,1.0,0.16129,1.0,0.0,True,True,True,0.333333,False,2.0,True,0.961538,,,False
7,1447,True,False,False,False,0.970851,0.375887,1.0,0.0,True,True,True,0.621429,True,1.0,True,0.988636,,,False
8,1589,,,False,,0.71413,0.0,1.0,0.142857,False,,False,,False,1.0,,0.0,1.0,1.0,False
9,1624,,,False,False,0.990929,0.111111,2.0,0.666667,False,False,True,0.125,True,2.0,False,0.0,1.0,1.0,False


## Adjusting Execution Order Metrics

In [16]:
# change NaN in exec_inorder to 1
# `assign` returns a new frame, so `adjusted_image_prop` is left untouched
# instead of being modified through a second name for the same object.
adjusted_exec_inorder = adjusted_image_prop.assign(
    exec_inorder=adjusted_image_prop['exec_inorder'].fillna(1)
)

# initial look at the new data
adjusted_exec_inorder.head()

Unnamed: 0,nb_id,longer_beginning,longer_ending,has_author,has_equation,jupyter_prop,markdown_prop,num_contrib,image_prop,is_education,has_links,has_comments,md_frequency,has_title,num_commits,md_format,non_exec_prop,exec_inorder,exec_skips,has_error
3,594,True,False,False,False,0.507588,0.044444,3.0,0.0,True,True,False,0.068182,True,1.0,True,0.0,1.0,1.809524,False
6,1222,True,False,False,False,1.0,0.16129,1.0,0.0,True,True,True,0.333333,False,2.0,True,0.961538,1.0,,False
7,1447,True,False,False,False,0.970851,0.375887,1.0,0.0,True,True,True,0.621429,True,1.0,True,0.988636,1.0,,False
8,1589,,,False,,0.71413,0.0,1.0,0.142857,False,,False,,False,1.0,,0.0,1.0,1.0,False
9,1624,,,False,False,0.990929,0.111111,2.0,0.666667,False,False,True,0.125,True,2.0,False,0.0,1.0,1.0,False


In [17]:
# change NaN in exec_skips to 0
# `assign` returns a new frame, so `adjusted_exec_inorder` is left untouched
# instead of being modified through a second name for the same object.
adjusted_exec_skips = adjusted_exec_inorder.assign(
    exec_skips=adjusted_exec_inorder['exec_skips'].fillna(0)
)

# initial look at the new data
adjusted_exec_skips.head()

Unnamed: 0,nb_id,longer_beginning,longer_ending,has_author,has_equation,jupyter_prop,markdown_prop,num_contrib,image_prop,is_education,has_links,has_comments,md_frequency,has_title,num_commits,md_format,non_exec_prop,exec_inorder,exec_skips,has_error
3,594,True,False,False,False,0.507588,0.044444,3.0,0.0,True,True,False,0.068182,True,1.0,True,0.0,1.0,1.809524,False
6,1222,True,False,False,False,1.0,0.16129,1.0,0.0,True,True,True,0.333333,False,2.0,True,0.961538,1.0,0.0,False
7,1447,True,False,False,False,0.970851,0.375887,1.0,0.0,True,True,True,0.621429,True,1.0,True,0.988636,1.0,0.0,False
8,1589,,,False,,0.71413,0.0,1.0,0.142857,False,,False,,False,1.0,,0.0,1.0,1.0,False
9,1624,,,False,False,0.990929,0.111111,2.0,0.666667,False,False,True,0.125,True,2.0,False,0.0,1.0,1.0,False


## Check for Remaining NaN

In [18]:
# take an independent copy: the manual `has_error` patch applied to
# `final_adjusted` further down must not leak back into `adjusted_exec_skips`
final_adjusted = adjusted_exec_skips.copy()

In [19]:
# report, per column, whether at least one value is missing
for col_name, contains_nan in final_adjusted.isna().any().items():
    print(" : ".join((col_name, str(contains_nan))))

nb_id : False
longer_beginning : True
longer_ending : True
has_author : False
has_equation : True
jupyter_prop : False
markdown_prop : False
num_contrib : False
image_prop : False
is_education : False
has_links : True
has_comments : False
md_frequency : True
has_title : False
num_commits : False
md_format : True
non_exec_prop : False
exec_inorder : False
exec_skips : False
has_error : True


For the features associated with markdown cells, it is fine to have `NaN` values; we will separate the data into groups accordingly in a later section.
- `longer_beginning`
- `longer_ending`
- `has_equation`
- `has_links`
- `md_frequency`
- `md_format`

There is a single notebook for which `has_error` came up as `NaN`; this is because some cells in the notebook did not record an `output_type`. Analyzing the notebook, it doesn't seem to have any errors, so we will assign this one a value of `False`.

In [20]:
# set has_error to False for notebook 672725 (its value was recorded as NaN)
patched_row = final_adjusted['nb_id'].eq(672725)
final_adjusted.loc[patched_row, 'has_error'] = False

In [21]:
# re-run the per-column missing-value report after the has_error patch
for col_name, contains_nan in final_adjusted.isna().any().items():
    print(" : ".join((col_name, str(contains_nan))))

nb_id : False
longer_beginning : True
longer_ending : True
has_author : False
has_equation : True
jupyter_prop : False
markdown_prop : False
num_contrib : False
image_prop : False
is_education : False
has_links : True
has_comments : False
md_frequency : True
has_title : False
num_commits : False
md_format : True
non_exec_prop : False
exec_inorder : False
exec_skips : False
has_error : False


# Grouping the Data

## Markdown Cells

In [22]:
# keep the notebooks whose markdown cell proportion is strictly positive
has_markdown = final_adjusted['markdown_prop'].gt(0)
with_markdown = final_adjusted.loc[has_markdown]

In [23]:
# preview the notebooks that contain markdown cells
with_markdown.head(n=5)

Unnamed: 0,nb_id,longer_beginning,longer_ending,has_author,has_equation,jupyter_prop,markdown_prop,num_contrib,image_prop,is_education,has_links,has_comments,md_frequency,has_title,num_commits,md_format,non_exec_prop,exec_inorder,exec_skips,has_error
3,594,True,False,False,False,0.507588,0.044444,3.0,0.0,True,True,False,0.068182,True,1.0,True,0.0,1.0,1.809524,False
6,1222,True,False,False,False,1.0,0.16129,1.0,0.0,True,True,True,0.333333,False,2.0,True,0.961538,1.0,0.0,False
7,1447,True,False,False,False,0.970851,0.375887,1.0,0.0,True,True,True,0.621429,True,1.0,True,0.988636,1.0,0.0,False
9,1624,,,False,False,0.990929,0.111111,2.0,0.666667,False,False,True,0.125,True,2.0,False,0.0,1.0,1.0,False
12,2705,True,False,False,False,1.0,0.461538,1.0,0.0,True,True,True,0.64,True,1.0,False,0.0,0.923077,1.615385,False


In [24]:
# report, per column, whether the markdown group still has missing values
for col_name, contains_nan in with_markdown.isna().any().items():
    print(" : ".join((col_name, str(contains_nan))))

nb_id : False
longer_beginning : True
longer_ending : True
has_author : False
has_equation : False
jupyter_prop : False
markdown_prop : False
num_contrib : False
image_prop : False
is_education : False
has_links : False
has_comments : False
md_frequency : False
has_title : False
num_commits : False
md_format : True
non_exec_prop : False
exec_inorder : False
exec_skips : False
has_error : False


Recall that the script doesn't compute the `longer_beginning` and `longer_ending` metrics on notebooks with fewer than 10 cells, so both are `NaN` for those notebooks and we can filter them out

In [25]:
# both positional metrics are left unmeasured on small notebooks (see the note
# above), so a row missing either one is dropped rather than checking only the first
no_small = with_markdown.dropna(subset = ['longer_beginning', 'longer_ending'])

However, there are still `NaN` values in `md_format`:

In [26]:
# pull out the rows whose md_format is still missing
md_format_missing = no_small['md_format'].isna()
format_nans = no_small.loc[md_format_missing]
format_nans.head(n=5)

Unnamed: 0,nb_id,longer_beginning,longer_ending,has_author,has_equation,jupyter_prop,markdown_prop,num_contrib,image_prop,is_education,has_links,has_comments,md_frequency,has_title,num_commits,md_format,non_exec_prop,exec_inorder,exec_skips,has_error
2964,551103,True,False,False,True,0.98943,0.538462,1.0,0.272727,True,False,True,0.92,True,2.0,,0.083333,0.9,4.5,False
2968,551325,True,False,False,True,0.98943,0.538462,1.0,0.272727,True,False,True,0.92,True,1.0,,0.083333,0.9,4.3,False
3355,627235,True,False,False,True,0.790195,0.518519,7.0,0.272727,True,False,True,0.884615,True,6.0,,0.0,1.0,1.0,False


Running these notebooks individually in `testing.py`, it is not clear why they were recorded as `NaN`, since all three come up as `True`. We will simply replace these values with the correct ones.

In [40]:
# change the rows so that 'md_format' is True instead of NaN
# Work on a copy: `no_small` keeps its NaNs, so the inspection cell above still
# shows these three rows when it is re-run.
md_format_fix_ids = [551103, 551325, 627235]
adjusted_md_format = no_small.copy()
adjusted_md_format.loc[adjusted_md_format['nb_id'].isin(md_format_fix_ids), 'md_format'] = True

In [41]:
# name the finished markdown group
# (plain rebinding: both names refer to the same DataFrame object, no copy is made)
final_markdown = adjusted_md_format

In [42]:
# report, per column, whether the finished markdown group has missing values
for col_name, contains_nan in final_markdown.isna().any().items():
    print(" : ".join((col_name, str(contains_nan))))

nb_id : False
longer_beginning : False
longer_ending : False
has_author : False
has_equation : False
jupyter_prop : False
markdown_prop : False
num_contrib : False
image_prop : False
is_education : False
has_links : False
has_comments : False
md_frequency : False
has_title : False
num_commits : False
md_format : False
non_exec_prop : False
exec_inorder : False
exec_skips : False
has_error : False


In [43]:
# row count of the markdown group
print(final_markdown.shape[0])

2291


## No Markdown Cells

In [44]:
# keep the notebooks whose markdown cell proportion is exactly zero
lacks_markdown = final_adjusted['markdown_prop'].eq(0)
no_markdown = final_adjusted.loc[lacks_markdown]

In [45]:
# preview the notebooks without markdown cells
no_markdown.head(n=5)

Unnamed: 0,nb_id,longer_beginning,longer_ending,has_author,has_equation,jupyter_prop,markdown_prop,num_contrib,image_prop,is_education,has_links,has_comments,md_frequency,has_title,num_commits,md_format,non_exec_prop,exec_inorder,exec_skips,has_error
8,1589,,,False,,0.71413,0.0,1.0,0.142857,False,,False,,False,1.0,,0.0,1.0,1.0,False
10,1919,,,False,,0.99784,0.0,1.0,0.1875,False,,True,,False,1.0,,0.04878,0.789474,10.631579,True
14,2857,,,False,,0.121957,0.0,5.0,0.0,False,,True,,False,11.0,,0.0,0.909091,6.181818,False
27,4339,,,False,,1.0,0.0,1.0,0.0,False,,True,,False,1.0,,0.0,1.0,1.631579,False
29,4659,,,False,,0.940063,0.0,3.0,0.0,False,,True,,False,3.0,,0.125,1.0,1.0,True


Since this group has no markdown cells, we can get rid of the features that have to do with markdown cells, which are listed above

In [46]:
# markdown-related feature columns, dropped for the group without markdown cells
markdown_only_columns = ['longer_beginning', 'longer_ending', 'has_equation', 'has_links',
                         'md_frequency', 'md_format']
adjusted_no_markdown = no_markdown.drop(columns=markdown_only_columns)

In [47]:
# preview the table after dropping the markdown-related columns
adjusted_no_markdown.head(n=5)

Unnamed: 0,nb_id,has_author,jupyter_prop,markdown_prop,num_contrib,image_prop,is_education,has_comments,has_title,num_commits,non_exec_prop,exec_inorder,exec_skips,has_error
8,1589,False,0.71413,0.0,1.0,0.142857,False,False,False,1.0,0.0,1.0,1.0,False
10,1919,False,0.99784,0.0,1.0,0.1875,False,True,False,1.0,0.04878,0.789474,10.631579,True
14,2857,False,0.121957,0.0,5.0,0.0,False,True,False,11.0,0.0,0.909091,6.181818,False
27,4339,False,1.0,0.0,1.0,0.0,False,True,False,1.0,0.0,1.0,1.631579,False
29,4659,False,0.940063,0.0,3.0,0.0,False,True,False,3.0,0.125,1.0,1.0,True


In [48]:
# name the finished no-markdown group
# (plain rebinding: both names refer to the same DataFrame object, no copy is made)
final_no_markdown = adjusted_no_markdown

In [49]:
# report, per column, whether the no-markdown group has missing values
for col_name, contains_nan in final_no_markdown.isna().any().items():
    print(" : ".join((col_name, str(contains_nan))))

nb_id : False
has_author : False
jupyter_prop : False
markdown_prop : False
num_contrib : False
image_prop : False
is_education : False
has_comments : False
has_title : False
num_commits : False
non_exec_prop : False
exec_inorder : False
exec_skips : False
has_error : False


In [50]:
# row count of the no-markdown group
print(final_no_markdown.shape[0])

1355


# Export Groups to CSV Files

In case we want to do EDA on each group separately later, we export each dataframe to its own `.csv` file

In [51]:
# write the markdown-cell group to disk (the row index is written as the first column)
markdown_output = "markdown_group.csv"
final_markdown.to_csv(path_or_buf=markdown_output)

In [52]:
# write the no-markdown-cell group to disk (the row index is written as the first column)
no_markdown_output = "no_markdown_group.csv"
final_no_markdown.to_csv(path_or_buf=no_markdown_output)

# Principal Component Analysis

# Clustering

# Scrap