In [224]:
import json
# A .ipynb file is plain JSON, so the standard json module can parse it.
with open('demo.ipynb', 'r', encoding='utf8') as f:
    notebook = json.load(f)
# Bare trailing expression: the notebook displays the top-level keys.
notebook.keys()

dict_keys(['cells', 'metadata', 'nbformat', 'nbformat_minor'])

其他的都不重要，主要关注cells。

先看前两个cell，可以发现，它们的`cell_type`分别是`markdown`和`code`。



In [225]:
# Report how many cells the loaded notebook contains.
cell_count = len(notebook['cells'])
print(f"cell num: {cell_count}")

cell num: 12


In [226]:
# Display the raw JSON of the notebook's first cell (index 0).
notebook['cells'][0]

{'cell_type': 'markdown',
 'id': 'b779a8c2-b2de-42ad-b5e9-066782aca5cb',
 'metadata': {},
 'source': ['# demo\n', '\n', '**导包**：导入3个包']}

In [227]:
# Display the raw JSON of the notebook's second cell (index 1).
notebook['cells'][1]

{'cell_type': 'code',
 'execution_count': 27,
 'id': '4346ecd8-b86e-49ce-9a27-410ef5fc6102',
 'metadata': {},
 'outputs': [],
 'source': ['import numpy as np\n',
  'import pandas as pd\n',
  'import matplotlib.pyplot as plt']}

markdown cell没有outputs。

code cell 要转换成 markdown 的话，需要在代码前后各加一行三个反引号（```），把代码包成围栏代码块（fenced code block）。

然后看看cell的outputs都有哪些类型

ps: pandas 的输出有 `text/html` 和 `text/plain`。一般不用html，只需要获取plain即可。

In [233]:
# Cell whose outputs carry text/plain only.
notebook['cells'][3]

# Cell whose outputs carry both text/html and text/plain (uncomment to inspect).
# notebook['cells'][5]

{'cell_type': 'code',
 'execution_count': 39,
 'id': '04e41d1c-6777-4e87-b95c-8f03f9900599',
 'metadata': {},
 'outputs': [{'name': 'stdout', 'output_type': 'stream', 'text': ['数组a为:\n']},
  {'data': {'text/plain': ['array([[5, 8, 9],\n',
     '       [5, 0, 0],\n',
     '       [1, 7, 6],\n',
     '       [9, 2, 4]])']},
   'execution_count': 39,
   'metadata': {},
   'output_type': 'execute_result'}],
 'source': ['np.random.seed(1)\n',
  'a = np.random.randint(0, 10, size=(4,3))\n',
  'print("数组a为:")\n',
  'a']}

对于图片格式的输出，`image/png` 的值是一个 base64 编码的字符串，需要先解码成二进制数据，再保存成图片文件。

同时转换成markdown的图片表示形式，还需要考虑图片路径与markdown导出路径的问题

`![](./images/img_name.png)`

In [239]:
# Keys of the first output of cell 9, an output that carries image/png data.
notebook['cells'][9]['outputs'][0].keys()

dict_keys(['data', 'metadata', 'output_type'])

In [243]:
# MIME-type keys present in the data of cell 9's first output.
notebook['cells'][9]['outputs'][0]['data'].keys()

dict_keys(['image/png', 'text/plain'])

In [242]:
# Peek at the first 100 characters of the image/png payload.
notebook['cells'][9]['outputs'][0]['data']['image/png'][:100]

'iVBORw0KGgoAAAANSUhEUgAAAXQAAAD4CAYAAAD8Zh1EAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywg'

最终形成以下代码

In [220]:
import base64
import json
import os
import re


class Notebook:
    """A Jupyter notebook (``.ipynb`` JSON file) that can be rendered to Markdown.

    Attributes:
        cells: ``Cell`` objects, one per entry of the notebook's ``cells`` list.
        notebook_path: Path the notebook was read from.
        notebook_dict: The parsed notebook JSON.
        markdown_name: Notebook file name without directory and final extension.
        markdown_dir: Directory the Markdown file is written to.
        markdown_path: ``<markdown_dir>/<markdown_name>.md``.
        img_save_dir: ``<markdown_dir>/images``, where decoded images are written.
        img_num: 1-based counter used to number the next saved image.
    """

    def __init__(self, notebook_path, markdown_dir="."):
        """Read ``notebook_path`` and prepare the output locations.

        Args:
            notebook_path: Path to the ``.ipynb`` file (read as UTF-8 JSON).
            markdown_dir: Directory that receives the ``.md`` file and the
                ``images`` sub-directory.
        """
        # Read the .ipynb file.
        with open(notebook_path, 'r', encoding='utf8') as f:
            notebook_dict = json.load(f)

        self.notebook_path = notebook_path
        self.notebook_dict = notebook_dict

        # splitext drops only the final extension, so "my.demo.ipynb" keeps
        # its full stem "my.demo" instead of being truncated at the first dot.
        self.markdown_name = os.path.splitext(os.path.basename(self.notebook_path))[0]
        self.markdown_dir = markdown_dir
        self.markdown_path = os.path.join(self.markdown_dir, self.markdown_name + ".md")

        self.img_save_dir = os.path.join(self.markdown_dir, 'images')
        self.img_num = 1

        # A notebook without a "cells" entry is treated as having no cells.
        self.cells = [Cell(self, cell_dict)
                      for cell_dict in notebook_dict.get('cells') or []]

    def save_img(self, image):
        """Decode a base64 PNG payload, write it to disk, return its Markdown link.

        Args:
            image: Base64-encoded image data, either a single string or a
                list of string fragments that are concatenated first.

        Returns:
            ``"![](./images/<markdown_name>_plot_<n>.png)\\n"`` for the file
            just written; ``<n>`` is ``img_num`` before it is incremented.
        """
        # Create the image directory if needed.
        os.makedirs(self.img_save_dir, exist_ok=True)

        # The payload may be stored as a list of string fragments.
        if isinstance(image, (list, tuple)):
            image = "".join(image)

        # Decode the base64 text into the raw image bytes.
        img_data = base64.b64decode(image)

        # e.g. img_name == "demoname_plot_1.png"
        img_name = f"{self.markdown_name}_plot_{self.img_num}.png"
        img_path = os.path.join(self.img_save_dir, img_name)

        with open(img_path, 'wb') as f:
            f.write(img_data)

        self.img_num += 1

        return f"![](./images/{img_name})\n"

    def text(self):
        """Render every cell and return the concatenated Markdown.

        Rendering a cell may call ``save_img``, so this writes image files.
        The image counter is reset first so that repeated calls reuse the same
        file names instead of producing ever-growing duplicates.
        """
        self.img_num = 1
        return "".join(cell.text() for cell in self.cells)

    def to_markdown(self, save=False):
        """Return the notebook as Markdown, optionally writing it to disk.

        Args:
            save: When true, write the Markdown to ``markdown_path`` (UTF-8),
                creating ``markdown_dir`` if it does not exist.

        Returns:
            The Markdown text.
        """
        text = self.text()
        if save:
            # An empty markdown_dir means the current directory; makedirs('')
            # would raise, so only create a directory that is actually named.
            if self.markdown_dir:
                os.makedirs(self.markdown_dir, exist_ok=True)

            with open(self.markdown_path, 'w', encoding='utf8') as f:
                f.write(text)
        return text

class _MissingCellField(AttributeError, KeyError):
    """Raised when a cell has no such field.

    It is an ``AttributeError`` so ``hasattr``/``getattr`` defaults work, and
    still a ``KeyError`` so code that caught the old exception keeps working.
    """


class Cell(dict):
    """One notebook cell: the raw cell JSON as a dict, readable as attributes.

    ``cell.source`` is shorthand for ``cell['source']``. Values assigned as
    attributes (such as ``notebook``) are stored on the instance, not in the
    dict, so they never leak into the cell's JSON content.
    """

    # Matches ANSI CSI escape sequences (colours, erase-line, ...).
    ANSI_ESCAPE_PATTERN = r'\x1b\[[0-?]*[ -/]*[@-~]'

    def __init__(self, notebook, cell_dict):
        """Copy ``cell_dict`` into this dict and remember the owning notebook.

        Args:
            notebook: Object providing ``save_img(image)``; used for PNG outputs.
            cell_dict: The cell's JSON mapping.
        """
        super().__init__()
        self.update(cell_dict)

        self.notebook = notebook

    def __getattr__(self, key):
        # Only called when normal attribute lookup fails; fall back to the
        # dict content and signal absence the way getattr/hasattr expect.
        try:
            return self[key]
        except KeyError:
            raise _MissingCellField(key) from None

    def __setattr__(self, key, value):
        self.__dict__[key] = value

    def markdown_text(self):
        """Return the cell source joined into one string plus a newline."""
        return "".join(self.source) + '\n'

    def code_text(self, text, code_type='python'):
        """Wrap ``text`` in a fenced code block.

        Args:
            text: Block content; expected to end with a newline already.
            code_type: Language tag placed after the opening fence.

        Returns:
            The fenced block, or ``""`` when ``text`` is empty.
        """
        if text:
            return f"```{code_type}\n{text}```\n"
        return ""

    def outputs_text(self):
        """Render the cell's outputs as Markdown.

        Stream text, tracebacks and ``text/plain`` results are collected into
        a single untagged fenced block; ``image/png`` outputs are saved via
        ``notebook.save_img`` and their image links are appended after it.
        An output that has a PNG is rendered as the image only.
        """
        text_parts = []
        imgs = []
        # Cells without an "outputs" entry simply render nothing.
        for output in self.get('outputs') or []:
            output_type = output.get('output_type')

            # Output written by print / to a stream.
            if output_type == 'stream':
                text_parts.append("".join(output['text']) + '\n')
                continue

            # Error output: join the traceback and strip terminal escapes.
            if output_type == "error":
                tmp = "\n".join(output['traceback'])
                tmp = re.sub(self.ANSI_ESCAPE_PATTERN, '', tmp)
                text_parts.append(tmp + '\n')
                continue

            # Unknown output types may have no data bundle.
            output_data = output.get('data') or {}

            # Image output: save it to the image folder, keep the link.
            image = output_data.get('image/png')
            if image:
                imgs.append(self.notebook.save_img(image))
                continue

            # Plain-text output.
            text_plain = output_data.get('text/plain')
            if text_plain:
                text_parts.append("".join(text_plain) + "\n")

        text = self.code_text("".join(text_parts), code_type='')
        return text + "".join(imgs)

    def text(self):
        """Render the cell as Markdown.

        Markdown cells are emitted as-is; code cells become a ``python``
        fenced block followed by their rendered outputs. Any other cell type
        (for example ``raw``) renders as an empty string.
        """
        cell_type = self.get('cell_type')
        if cell_type == 'markdown':
            return self.markdown_text()

        if cell_type == "code":
            c_text = "".join(self.source) + "\n"
            return self.code_text(c_text) + self.outputs_text()

        # Always return a string so callers can concatenate safely.
        return ""

In [221]:
# Convert demo.ipynb and write the Markdown (plus images) under f/g/ad/.
notebook = Notebook(notebook_path='demo.ipynb', markdown_dir='f/g/ad/')
res = notebook.to_markdown(save=True)

In [223]:
# print (rather than a bare expression) so newlines in the Markdown render as line breaks.
print(res)

# demo

**导包**：导入3个包
```python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
```
查看numpy生成的数组
```python
np.random.seed(1)
a = np.random.randint(0, 10, size=(4,3))
print("数组a为:")
a
```
```
数组a为:

array([[5, 8, 9],
       [5, 0, 0],
       [1, 7, 6],
       [9, 2, 4]])
```
查看pandas生成的DataFrame
```python
df = pd.DataFrame([['tom', 22, 89],
                   ['alice', 21, 92],
                   ['jack', 20, 78]]
                  , columns=["name","age","score"])
df
```
```
    name  age  score
0    tom   22     89
1  alice   21     92
2   jack   20     78
```
查看其中一个series
```python
df['name']
```
```
0      tom
1    alice
2     jack
Name: name, dtype: object
```
使用matplotlib绘图
```python
x = np.linspace(0, np.pi)
y = np.sin(x)
plt.plot(y)
plt.show()
```
![](./images/demo_plot_1.png)
如果代码报错
```python
c
```
```
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
~\Ap