In [9]:
%matplotlib inline
import copy
import json
import os

import matplotlib
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

In [15]:
# Folder that the regenerated notebooks are written into.
GEN_NTBS_FOLDER = 'recovered_notebooks'
# Relative path of the CSV holding one row per notebook cell.
path_to_data = '../../dataset/sklearn_full_cells.csv'

In [16]:
# Only the first 500k rows of the CSV are read; notebooks whose cells fall
# past that cut-off will be missing or incomplete in `df`.
df = pd.read_csv(path_to_data, nrows=500_000)

In [17]:
# Preview the first rows to inspect the available columns.
df.head(n=15)

Unnamed: 0.1,Unnamed: 0,id,repository_id,notebook_id,index,cell_type,execution_count,lines,output_formats,source,python,processed,skip,v
0,0,23191644,130180,808459,9,code,13.0,2,text/html;text/plain,"HTML(data=""""""<iframe width=""854"" height=""480"" ...",True,4,0,808459
1,1,23191643,130180,808459,8,code,12.0,4,text/html;text/plain,from IPython.display import HTML\n\nHTML(data=...,True,4,0,808459
2,2,23191635,130180,808459,0,code,3.0,3,,"get_ipython().run_line_magic('pylab', 'inline'...",True,4,0,808459
3,3,23191646,130180,808459,11,code,3.0,29,text/plain;image/png,from sklearn.datasets import load_iris\nget_ip...,True,4,0,808459
4,4,23191649,130180,808459,14,code,,1,,\n,True,4,0,808459
5,5,23191648,130180,808459,13,markdown,,1,,"<img src=""http://scikit-learn.sourceforge.net/...",True,4,0,808459
6,6,23191637,130180,808459,2,markdown,,1,,<img src='imgs/svm.jpg'>\n,True,4,0,808459
7,7,23191636,130180,808459,1,markdown,,1,,"# SVM, support vector machine\n",True,4,0,808459
8,8,23191638,130180,808459,3,markdown,,19,,"Дискриминантная функция: $$f(\overline{x}, \ov...",True,4,0,808459
9,9,23191647,130180,808459,12,markdown,,2,,SVM можно использовать и для регрессии:\n<a hr...,True,4,0,808459


## Generate notebook as .txt file (raw)

In [18]:


# Marker lines written before each cell's source in the text rendering.
CODE_CELL_HEADER = '#%%'
MD_CELL_HEADER = '#%% md'
RAW_CELL_HEADER = '#%% raw'

def generate_notebook_txt(df, df_row_idx, up_to_cell_number=None, include_non_code_cells=True):
    """Render the notebook containing a given cell as marker-delimited text.

    The cell whose ``id`` equals ``df_row_idx`` is looked up, all rows of ``df``
    sharing its ``notebook_id`` are collected and sorted by the ``index``
    column, and each cell is written as a header line (``CODE_CELL_HEADER``,
    ``MD_CELL_HEADER`` or ``RAW_CELL_HEADER``) followed by its source.

    Parameters
    ----------
    df : pandas.DataFrame
        Cell table with at least the columns ``id``, ``notebook_id``,
        ``index``, ``cell_type`` and ``source``.
    df_row_idx
        Value of the ``id`` column identifying one cell of the wanted notebook.
    up_to_cell_number : optional
        Accepted for interface compatibility; currently ignored.
    include_non_code_cells : bool
        When False, only cells with ``cell_type == 'code'`` are rendered.

    Returns
    -------
    str
        The rendered text; every header and every source ends with ``'\\n'``.

    Raises
    ------
    IndexError
        If no row of ``df`` has ``id == df_row_idx``.
    RuntimeError
        If a cell has a type other than ``code``, ``markdown`` or ``raw``.
    """
    owner = df.loc[df['id'] == df_row_idx, 'notebook_id']
    if owner.empty:
        # Same exception type the bare ``.values[0]`` lookup produced, but with
        # a message that says what was actually missing.
        raise IndexError(f'No cell with id {df_row_idx!r} found in the dataframe')
    notebook_id = owner.iloc[0]

    notebook_cells = df[df['notebook_id'] == notebook_id]
    if not include_non_code_cells:
        notebook_cells = notebook_cells[notebook_cells['cell_type'] == 'code']
    notebook_cells = notebook_cells.sort_values(by=['index'])

    headers = {
        'code': CODE_CELL_HEADER,
        'markdown': MD_CELL_HEADER,
        'raw': RAW_CELL_HEADER,
    }

    lines = []
    for cell_type, source in zip(notebook_cells['cell_type'], notebook_cells['source']):
        header = headers.get(cell_type)
        if header is None:
            raise RuntimeError(f'Unknown cell type: {cell_type}')
        lines.append(header)
        # An empty source comes back from the CSV as NaN; render it as an
        # empty cell instead of failing on ``float + str``.
        lines.append('' if pd.isna(source) else str(source))

    return ''.join(line + '\n' for line in lines)
    

In [19]:
import os
def save_jupyter_notebook_txt(src, notebook_id, folder):
    """Write a text-rendered notebook to ``<folder>/<notebook_id>.txt``.

    Parameters
    ----------
    src : str
        Text to write.
    notebook_id
        Identifier used (via ``str``) as the file's base name.
    folder : str
        Destination directory; created if it does not exist yet.
    """
    if folder:
        # A fresh checkout has no output directory; create it on demand.
        os.makedirs(folder, exist_ok=True)

    fname = '.'.join((str(notebook_id), 'txt'))
    path = os.path.join(folder, fname)

    # Cell sources contain non-ASCII text, so do not depend on the platform's
    # default encoding.
    with open(path, 'w', encoding='utf-8') as out_f:
        out_f.write(src)

## Generate notebook as .ipynb file

In [20]:

class JPT_NB_GRAMMAR:
    """Key names and default values used when assembling a notebook dict."""

    # Notebook-level keys.
    METADATA_KEY = 'metadata'
    CELLS_KEY = 'cells'
    NBFORMAT_KEY = 'nbformat'
    NBFORMAT_MINOR_KEY = 'nbformat_minor'

    # Cell-level keys (``METADATA_KEY`` is reused for per-cell metadata).
    CELL_TYPE_KEY = 'cell_type'
    EXECUTION_COUNT_KEY = 'execution_count'
    OUTPUTS_KEY = 'outputs'
    SOURCE_KEY = 'source'

    class DEFAULTS:
        """Values filled in for fields that are not taken from the cell table."""

        # Per-cell defaults. These are mutable class-level objects, so every
        # reader that stores them without copying shares the same instance.
        DEFAULT_CELL_METADATA = {}
        DEFAULT_EXECUTION_COUNT = None
        DEFAULT_OUTPUTS = []

        # Notebook-level metadata describing a Python 3 kernel.
        DEFAULT_NB_METADATA = {
            'kernelspec': {
                'display_name': 'Python 3',
                'language': 'python',
                'name': 'python3',
            },
            'language_info': {
                'codemirror_mode': {'name': 'ipython', 'version': 3},
                'file_extension': '.py',
                'mimetype': 'text/x-python',
                'name': 'python',
                'nbconvert_exporter': 'python',
                'pygments_lexer': 'ipython3',
                'version': '3.7.8',
            },
        }

        # Notebook format version written into the generated file.
        DEFAULT_NBFORMAT = 4
        DEFAULT_NBFORMAT_MINOR = 1

In [25]:
def generate_notebook_ipynb(df, notebook_id, up_to_cell_number=None, include_non_code_cells=True):
    """Build a notebook dict (ready for ``json.dump``) from the cell table.

    All rows of ``df`` whose ``notebook_id`` matches are sorted by the
    ``index`` column and turned into cells. Notebook-level metadata and format
    numbers come from ``JPT_NB_GRAMMAR.DEFAULTS``.

    Parameters
    ----------
    df : pandas.DataFrame
        Cell table with at least the columns ``notebook_id``, ``index``,
        ``cell_type`` and ``source``.
    notebook_id
        Value of the ``notebook_id`` column selecting the notebook.
    up_to_cell_number : optional
        Accepted for interface compatibility; currently ignored.
    include_non_code_cells : bool
        When False, only cells with ``cell_type == 'code'`` are emitted.

    Returns
    -------
    dict
        Notebook structure with ``metadata``, ``nbformat``,
        ``nbformat_minor`` and ``cells`` entries. An unknown ``notebook_id``
        yields a notebook with an empty ``cells`` list.
    """
    grammar = JPT_NB_GRAMMAR
    defaults = grammar.DEFAULTS

    notebook_cells = df[df['notebook_id'] == notebook_id]
    if not include_non_code_cells:
        notebook_cells = notebook_cells[notebook_cells['cell_type'] == 'code']
    notebook_cells = notebook_cells.sort_values(by=['index'])

    result = {
        # Copy the defaults so editing one generated notebook cannot leak into
        # the class-level constants or into other generated notebooks.
        grammar.METADATA_KEY: copy.deepcopy(defaults.DEFAULT_NB_METADATA),
        grammar.NBFORMAT_KEY: defaults.DEFAULT_NBFORMAT,
        grammar.NBFORMAT_MINOR_KEY: defaults.DEFAULT_NBFORMAT_MINOR,
        grammar.CELLS_KEY: [],
    }

    for cell_type, source in zip(notebook_cells['cell_type'], notebook_cells['source']):
        cell = {
            grammar.CELL_TYPE_KEY: cell_type,
            grammar.METADATA_KEY: copy.deepcopy(defaults.DEFAULT_CELL_METADATA),
            # An empty source is read from the CSV as NaN, which json.dump
            # would emit as the invalid token ``NaN``; store '' instead.
            grammar.SOURCE_KEY: '' if pd.isna(source) else str(source),
        }
        if cell_type == 'code':
            # The nbformat v4 schema only allows these two fields on code
            # cells; markdown and raw cells must not carry them.
            cell[grammar.EXECUTION_COUNT_KEY] = defaults.DEFAULT_EXECUTION_COUNT
            cell[grammar.OUTPUTS_KEY] = copy.deepcopy(defaults.DEFAULT_OUTPUTS)

        result[grammar.CELLS_KEY].append(cell)

    return result

In [26]:
def save_jupyter_notebook_ipynb(raw_json, notebook_id, folder):
    """Serialise a notebook dict to ``<folder>/<notebook_id>.ipynb``.

    Parameters
    ----------
    raw_json : dict
        JSON-serialisable notebook structure.
    notebook_id
        Identifier used (via ``str``) as the file's base name.
    folder : str
        Destination directory; created if it does not exist yet.
    """
    if folder:
        # A fresh checkout has no output directory; create it on demand.
        os.makedirs(folder, exist_ok=True)

    fname = '.'.join((str(notebook_id), 'ipynb'))
    path = os.path.join(folder, fname)

    with open(path, 'w', encoding='utf-8') as out_f:
        json.dump(raw_json, out_f)

In [33]:
# Notebooks to regenerate from the cell table.
ntb_id_1 = 1675557
ntb_id_2 = 1347872

# Make sure the output folder exists so this cell also runs on a fresh checkout.
os.makedirs(GEN_NTBS_FOLDER, exist_ok=True)

for ntb_id in (ntb_id_1, ntb_id_2):
    save_jupyter_notebook_ipynb(generate_notebook_ipynb(df, ntb_id), ntb_id, GEN_NTBS_FOLDER)