# Imports

In [21]:
import pandas as pd
import seaborn as sns
import matplotlib as plt
import numpy as np

# Loading the Data

In [2]:
# Load the full notebook dataset from its pickle file.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
full_data_original = pd.read_pickle('full_all_data.pkl')

In [3]:
# Work on a deep copy so the frame loaded from disk stays untouched.
full_data_df = full_data_original.copy(deep=True)

In [4]:
# preview the first five rows of the working copy
full_data_df.head(n=5)

Unnamed: 0,nb_id,repo_id,longer_beginning,longer_ending,has_author,has_equation,jupyter_prop,output_cell_prop,markdown_prop,num_contrib,...,has_error,speaking_language,has_export,num_functions,has_test,num_headers,has_param,has_reqtext,num_stars,errors
0,0,165196403,False,False,False,False,0.995303,0.166667,0.538462,1.0,...,False,,False,0.0,False,9.0,False,False,1.0,
1,2,165196403,True,True,False,True,0.995303,0.965517,0.508475,1.0,...,False,,False,0.0,False,10.0,False,False,1.0,
2,3,165196403,False,False,False,False,0.995303,0.5,0.529412,1.0,...,False,,False,3.0,False,6.0,False,False,1.0,
3,5,165196403,False,True,False,True,0.995303,0.461538,0.518519,1.0,...,False,,False,7.0,False,10.0,False,False,1.0,
4,7,165196403,,,False,True,0.995303,0.666667,0.571429,1.0,...,False,Norwegian,False,2.0,False,8.0,False,False,1.0,


# Filtering, Cleaning, and Grouping the Data

## Cleaning (errors)

In [6]:
# count how often each error label occurs (rows without an error are ignored)
full_data_df.loc[:, 'errors'].value_counts(dropna=True)

no code                                                    5528
filtered out                                                533
nb file                                                     300
has_author                                                  197
api                                                          97
has_reqtext                                                  81
num_commits                                                  32
output_cell_prop,image_prop                                   7
image_prop                                                    4
longer_beginning,longer_ending                                2
has_export,num_functions,has_test,num_headers,has_param       1
md_format                                                     1
has_export,num_functions                                      1
num_commits,has_reqtext                                       1
Name: errors, dtype: int64

We can filter out the notebooks with `no code`, `filtered out`, `nb file`, and `api` errors.

In [8]:
def condition(error):
    """Return True when `error` is not one of the labels we filter out.

    Missing labels (NaN) are not in the list, so rows without an error are kept.
    """
    return error not in ['no code', 'filtered out', 'nb file', 'api']


# Keep only the rows that pass `condition`. The explicit `.copy()` makes
# error_filtered an independent frame, so the later `.loc` assignments on it
# do not raise SettingWithCopyWarning.
error_filtered = full_data_df[full_data_df['errors'].map(condition)].copy()

len(error_filtered)

136667

In [10]:
# remaining error labels after the first filtering pass
error_filtered.loc[:, 'errors'].value_counts(dropna=True)

has_author                                                 197
has_reqtext                                                 81
num_commits                                                 32
output_cell_prop,image_prop                                  7
image_prop                                                   4
longer_beginning,longer_ending                               2
has_export,num_functions,has_test,num_headers,has_param      1
md_format                                                    1
has_export,num_functions                                     1
num_commits,has_reqtext                                      1
Name: errors, dtype: int64

We can keep the notebooks with `has_author` errors, since only the author URLs must have been invalid. For these notebooks, we set `has_author` to `False`.

In [16]:
# ids of the notebooks whose error label is exactly 'has_author'
author_error_ids = error_filtered.loc[error_filtered['errors'] == 'has_author', 'nb_id']

# Set has_author to False for every row with one of those ids in a single
# vectorized assignment, instead of rescanning the whole frame once per id.
error_filtered.loc[error_filtered['nb_id'].isin(author_error_ids), 'has_author'] = False

In [20]:
# sanity check: has_author should contain no missing values any more
error_filtered['has_author'].isnull().sum()

0

We can now get rid of the `has_author` errors

In [22]:
# ids of the notebooks still labelled with the 'has_author' error
# (selected before the label is cleared below)
author_error_ids = error_filtered.loc[error_filtered['errors'] == 'has_author', 'nb_id']

# Clear the error label for every row with one of those ids in a single
# vectorized assignment, instead of rescanning the whole frame once per id.
error_filtered.loc[error_filtered['nb_id'].isin(author_error_ids), 'errors'] = np.nan

In [24]:
# error labels that are still present after clearing 'has_author'
error_filtered.loc[:, 'errors'].value_counts(dropna=True)

has_reqtext                                                81
num_commits                                                32
output_cell_prop,image_prop                                 7
image_prop                                                  4
longer_beginning,longer_ending                              2
has_export,num_functions,has_test,num_headers,has_param     1
md_format                                                   1
has_export,num_functions                                    1
num_commits,has_reqtext                                     1
Name: errors, dtype: int64

Let's tackle the `has_reqtext` errors next

In [29]:
# preview nb_id / has_reqtext for the rows labelled with a 'has_reqtext' error
error_filtered.loc[error_filtered['errors'].eq('has_reqtext'), ['nb_id', 'has_reqtext']].head()

Unnamed: 0,nb_id,has_reqtext
2095,2223,
11650,12256,
15039,15821,
17464,18541,
22263,23601,


Looking into these errors, it seems that the path was assigned incorrectly in `notebooks.csv`, so we redo the process of finding the notebook path and re-run `has_requirements` for these notebooks.

Next, the `num_commits` errors

In [31]:
# preview nb_id / num_commits for the rows labelled with a 'num_commits' error
error_filtered.loc[error_filtered['errors'].eq('num_commits'), ['nb_id', 'num_commits']].head()

Unnamed: 0,nb_id,num_commits
5005,5260,
5225,5495,
11420,12011,
11454,12047,
41942,44300,


Looking into these errors, it seems that the `%` characters in the notebook paths are not being escaped when the paths are put into the URL, so we can re-run the script on these notebooks, making sure to escape this character properly.

Each of the remaining errors has fewer than 10 notebooks associated with it, so we can handle those individually.

## Filtering

In [32]:
# start the filtering stage from an independent deep copy
filtered = error_filtered.copy(deep=True)

For now, let's filter out those that still have errors, as those will get resolved later

In [40]:
# keep only the rows that carry no error label
filtered = filtered.loc[filtered['errors'].isna()]
# the count below should therefore be empty
filtered.loc[:, 'errors'].value_counts()

Series([], Name: errors, dtype: int64)

In [41]:
# the errors column holds only missing values now, so remove it
filtered = filtered.drop(columns=['errors'])

Next, we must filter down to just English and `python` notebooks

In [44]:
# distribution of the notebooks' programming languages
filtered.loc[:, 'language'].value_counts(dropna=True)

python                                                    126025
python3                                                     6774
julia                                                        985
R                                                            922
python2                                                      181
                                                           ...  
markdown                                                       1
python37564bitsidconda1d45427122d043f58fabe1fddaa2b008         1
python37764bitaa037e6b9ce0482a97f7f2f266a78e54                 1
lisp                                                           1
mathematica                                                    1
Name: language, Length: 71, dtype: int64

In [47]:
# keep only notebooks that declare a language (removes NaN in 'language')
filtered = filtered[filtered['language'].notna()]

# keep the notebooks whose language name contains "python"
# NOTE(review): this is a substring match, so values such as 'micropython'
# are kept as well -- confirm that is intended
filtered = filtered[filtered['language'].str.contains("python", regex=False)]
len(filtered)

133071

In [48]:
# languages that remain after the python filter
filtered.loc[:, 'language'].value_counts(dropna=True)

python                                                              126025
python3                                                               6774
python2                                                                181
micropython                                                             65
python37164bitbasecondad2822a14379545b6ad8221fdb28a47f5                  5
python37664bite5e01d20666b4ea39b669a865bfedc91                           3
python37664bit63fb4f53efb44350a8fe54ef578d532a                           2
python37464bit2210ce78c9014828b7442ce7d2fdf2c3                           2
python37132bitsimplificandocondad4a025f5665f45f198698144d61138fc         2
python36964bit159e511b6454490fab42e7c387ca06d9                           2
python37464bitbaseconda688895a7bc4d4baeb81185c00bfb955a                  1
python3.5                                                                1
python2.7                                                                1
python36064bittfbaseconda

In [49]:
# language is no longer needed once the python filter has been applied
filtered = filtered.drop(columns=['language'])

In [50]:
# distribution of the detected speaking (natural) languages
filtered.loc[:, 'speaking_language'].value_counts(dropna=True)

English                    79938
Korean                      2257
Spanish                     1479
Russian                     1404
Portuguese                  1362
French                      1012
Welsh                        867
Japanese                     825
Catalan                      770
Norwegian                    765
Italian                      649
German                       550
Vietnamese                   521
Polish                       456
Romanian                     299
Estonian                     254
Indonesian                   231
Tagalog                      179
Danish                       148
Turkish                      145
Dutch                        119
Croatian                     117
Afrikaans                    106
Swedish                       65
Somali                        54
Bulgarian                     50
Czech                         45
Hungarian                     44
Finnish                       38
Modern Greek (1453-)          34
Thai      

In [53]:
# keep a notebook when it is English, or when it has no markdown cells at all
is_english = filtered['speaking_language'].eq('English')
has_no_markdown = filtered['markdown_prop'].eq(0)
filtered = filtered.loc[is_english | has_no_markdown]
len(filtered)

115821

In [52]:
# speaking languages that remain after the filter
filtered.loc[:, 'speaking_language'].value_counts(dropna=True)

English    79938
Name: speaking_language, dtype: int64

In [54]:
# speaking_language has served its purpose, so remove the column
filtered = filtered.drop(columns=['speaking_language'])

## Cleaning and Adjusting (nan values)

In [56]:
# start the NaN-adjustment stage from an independent deep copy
adjusted = filtered.copy(deep=True)

In [57]:
# report, for every column, whether it contains at least one missing value
for name, has_missing in adjusted.isnull().any().items():
    print(name + " : " + str(has_missing))

nb_id : False
repo_id : False
longer_beginning : True
longer_ending : True
has_author : False
has_equation : True
jupyter_prop : True
output_cell_prop : False
markdown_prop : False
num_contrib : False
image_prop : True
is_education : False
has_links : True
has_comments : False
md_frequency : True
has_title : False
num_commits : False
md_format : True
non_exec_prop : False
exec_inorder : True
exec_skips : True
has_error : False
has_export : False
num_functions : False
has_test : False
num_headers : False
has_param : False
has_reqtext : False
num_stars : False


For the features associated with markdown cells, it is fine to have `NaN` values; we will separate the data into groups accordingly in a later section.
- `longer_beginning`
- `longer_ending`
- `has_equation`
- `has_links`
- `md_frequency`
- `md_format`

Let us look at the `jupyter_prop` nan values

In [61]:
# number of rows whose jupyter_prop is missing
len(adjusted.loc[adjusted['jupyter_prop'].isna()])

23

In [62]:
# preview the rows whose jupyter_prop is missing
adjusted.loc[adjusted['jupyter_prop'].isna()].head()

Unnamed: 0,nb_id,repo_id,longer_beginning,longer_ending,has_author,has_equation,jupyter_prop,output_cell_prop,markdown_prop,num_contrib,...,exec_inorder,exec_skips,has_error,has_export,num_functions,has_test,num_headers,has_param,has_reqtext,num_stars
48689,51490,168097544,False,False,False,False,,0.0,0.166667,1.0,...,,,False,True,0.0,False,4.0,False,False,0.0
57507,60734,168518417,False,False,False,False,,0.636364,0.352941,7.0,...,1.0,1.380952,False,False,0.0,False,6.0,False,False,18.0
57508,60735,168518417,False,False,False,False,,0.6,0.347826,7.0,...,1.0,1.0,False,False,0.0,False,5.0,False,False,18.0
57510,60737,168518417,False,False,False,False,,0.888889,0.25,7.0,...,0.857143,12.714286,False,False,0.0,False,0.0,False,False,18.0
57511,60738,168518417,False,False,False,False,,0.529412,0.413793,7.0,...,1.0,1.333333,False,False,0.0,False,6.0,False,False,18.0


These notebooks belong to repositories for which `Jupyter Notebook` was left out of the language data, so we can filter them out.

In [64]:
# index labels of the rows whose jupyter_prop is missing
jupyter_nans = adjusted.loc[adjusted['jupyter_prop'].isna()].index
# drop those rows (returns a new frame; nothing is modified in place)
adjusted = adjusted.drop(index=jupyter_nans)
len(adjusted)

115798

In [66]:
# report again, per column, whether any missing value is left
for name, has_missing in adjusted.isnull().any().items():
    print(name + " : " + str(has_missing))

nb_id : False
repo_id : False
longer_beginning : True
longer_ending : True
has_author : False
has_equation : True
jupyter_prop : False
output_cell_prop : False
markdown_prop : False
num_contrib : False
image_prop : True
is_education : False
has_links : True
has_comments : False
md_frequency : True
has_title : False
num_commits : False
md_format : True
non_exec_prop : False
exec_inorder : True
exec_skips : True
has_error : False
has_export : False
num_functions : False
has_test : False
num_headers : False
has_param : False
has_reqtext : False
num_stars : False


Similar to how we did so on the last dataset, we can adjust values of the `image_prop`, `exec_inorder`, and `exec_skips` columns

In [67]:
# missing image_prop values are replaced with 0
image_prop_filled = adjusted['image_prop'].fillna(value=0)
adjusted['image_prop'] = image_prop_filled

In [68]:
# replacement for missing values: exec_inorder -> 1, exec_skips -> 0
for column_name, fill_value in {'exec_inorder': 1, 'exec_skips': 0}.items():
    adjusted[column_name] = adjusted[column_name].fillna(fill_value)

In [69]:
# final per-column report of whether any missing value is left
for name, has_missing in adjusted.isnull().any().items():
    print(name + " : " + str(has_missing))

nb_id : False
repo_id : False
longer_beginning : True
longer_ending : True
has_author : False
has_equation : True
jupyter_prop : False
output_cell_prop : False
markdown_prop : False
num_contrib : False
image_prop : False
is_education : False
has_links : True
has_comments : False
md_frequency : True
has_title : False
num_commits : False
md_format : True
non_exec_prop : False
exec_inorder : False
exec_skips : False
has_error : False
has_export : False
num_functions : False
has_test : False
num_headers : False
has_param : False
has_reqtext : False
num_stars : False


## Grouping

In [70]:
# split on markdown_prop: notebooks with markdown cells (> 0) versus
# notebooks without any (== 0); each group is an independent copy
md_group = adjusted.loc[adjusted['markdown_prop'].gt(0)].copy()
no_md_group = adjusted.loc[adjusted['markdown_prop'].eq(0)].copy()

In [71]:
# combined number of rows in the two groups
md_group.shape[0] + no_md_group.shape[0]

115798

We can export these cleaned and adjusted groups for quick retrieval later on.

In [73]:
# write each group to its own pickle file
for group, pickle_path in (
    (md_group, 'full_markdown_group.pkl'),
    (no_md_group, 'full_no_markdown_group.pkl'),
):
    group.to_pickle(pickle_path)