# Chatpter 11 调试

## 处理异常

In [1]:
raise Exception('This is an error message')

Exception: This is an error message

In [4]:
# boxPrint.py
def boxPrint(symbol, width, height):
    if len(symbol) !=1:
        raise Exception('Symbol must be a sigle character string.')
    if width <=2:
        raise Exception('Width must be greater than 2.')
    if height <=2:
        raise Exception('Height must be greater than 2.')
    print(symbol*width)
    for i in range(height-2):
        print(symbol+(' '*(width-2))+symbol)
    print(symbol*width)

In [5]:
for sym, w, h in (('*',4,4), ('0',20,5), ('x',1,3), ('zz',3,3)):
    try:
        boxPrint(sym,w,h)
    except Exception as err:
        print(f'An exception happended: {err}')        

****
*  *
*  *
****
00000000000000000000
0                  0
0                  0
0                  0
00000000000000000000
An exception happended: Width must be greater than 2.
An exception happended: Symbol must be a sigle character string.


In [6]:
for sym, w, h in (('*',4,4), ('0',20,5), ('x',1,3), ('zz',3,3)):
    boxPrint(sym,w,h) 

****
*  *
*  *
****
00000000000000000000
0                  0
0                  0
0                  0
00000000000000000000


Exception: Width must be greater than 2.

+ ↑ 异常最好和try...except搭配以便更好地处理，而不是让程序崩溃
+ 针对的是用户的错误，抛出相应的异常，避免其看到程序的崩溃页面

## 获取回溯字符串

In [9]:
def spam():
    foo()
    
def foo():
    raise Exception('This is an error message.')

In [10]:
spam()

Exception: This is an error message.

In [11]:
import traceback

try:
    raise Exception('This is the error message')
except:
    with open('errorInfo.txt', 'w') as f:
        f.write(traceback.format_exc())
    print('The traceback info was written to errorInfo.txt')        

The traceback info was written to errorInfo.txt


+ ps： logging模块比↑更合适

## 断言assert

+ assert针对的是程序员的错误
+ 面对assert的问题，那么程序就应该崩溃，这样的“快速失败”有利于bug的解决

In [12]:
age = [22,55,87,96,11,36,45,12,75,68]
age.reverse()

In [13]:
age

[68, 75, 12, 45, 36, 11, 96, 87, 55, 22]

In [14]:
assert age[0] < age[-1]

AssertionError: 

In [15]:
assert age[0] < age[-1], 'some error'

AssertionError: some error

## 日志logging

### 使用

In [19]:
import logging
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
logging.debug('Start of program')

def factorial(n):
    logging.debug(f'Start of factorial({n})')
    total = 1
    for i in range(n+1):
        total *=i
        logging.debug(f'i is {i} and total is {total}')
    logging.debug('End of factorial')
    return total

print(factorial(5))
logging.debug('End of program')

2023-10-30 12:05:54,573 - DEBUG - Start of program
2023-10-30 12:05:54,574 - DEBUG - Start of factorial(5)
2023-10-30 12:05:54,575 - DEBUG - i is 0 and total is 0
2023-10-30 12:05:54,575 - DEBUG - i is 1 and total is 0
2023-10-30 12:05:54,575 - DEBUG - i is 2 and total is 0
2023-10-30 12:05:54,576 - DEBUG - i is 3 and total is 0
2023-10-30 12:05:54,576 - DEBUG - i is 4 and total is 0
2023-10-30 12:05:54,576 - DEBUG - i is 5 and total is 0
2023-10-30 12:05:54,577 - DEBUG - End of factorial
2023-10-30 12:05:54,577 - DEBUG - End of program


0


In [20]:
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
logging.debug('Start of program')

def factorial(n):
    logging.debug(f'Start of factorial({n})')
    total = 1
    for i in range(1,n+1):
        total *=i
        logging.debug(f'i is {i} and total is {total}')
    logging.debug('End of factorial')
    return total

print(factorial(5))
logging.debug('End of program')

2023-10-30 12:08:32,305 - DEBUG - Start of program
2023-10-30 12:08:32,306 - DEBUG - Start of factorial(5)
2023-10-30 12:08:32,306 - DEBUG - i is 1 and total is 1
2023-10-30 12:08:32,307 - DEBUG - i is 2 and total is 2
2023-10-30 12:08:32,307 - DEBUG - i is 3 and total is 6
2023-10-30 12:08:32,307 - DEBUG - i is 4 and total is 24
2023-10-30 12:08:32,308 - DEBUG - i is 5 and total is 120
2023-10-30 12:08:32,308 - DEBUG - End of factorial
2023-10-30 12:08:32,308 - DEBUG - End of program


120


### 日志级别

|级别|日志函数|
|---|---|
|DEBUG|logging.debug()|
|INFO|logging.info()|
|WARNING|..warning()|
|ERROR|..error()|
|CRITICAL|..critical()|


### 禁用

In [21]:
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logging.info('info info info')

2023-10-30 12:19:17,627 - INFO - info info info


In [22]:
logging.disable()

In [23]:
logging.warning('warning warning warning')

### 记录到文件

In [None]:
logging.basicConfig(level=logging.INFO, filename='some_prog_log.txt', format='%(asctime)s - %(levelname)s - %(message)s')

logging.warning('warning warning warning')

logging.info('info info info')

logging.disable()

In [34]:
import logging

logging.basicConfig(filename="test.log", filemode="w", format="%(asctime)s %(name)s:%(levelname)s:%(message)s", datefmt="%d-%M-%Y %H:%M:%S", level=logging.DEBUG)
logging.debug('This is a debug message')
logging.info('This is an info message')
logging.warning('This is a warning message')
logging.error('This is an error message')
logging.critical('This is a critical message')

+ PS: ↑ 在Jupyter里生成文件失败，在vscode里执行生成Log文件成功

# Chapter 12 从web抓取信息

## 用webbrowser打开网页

In [37]:
import webbrowser
webbrowser.open('https://baidu.com/')

True

In [None]:
#! python3
# mapIt.py - Lauches a map in the browser using an address from the
# cml or clipboard

import webbrowser, sys, pyperclip

# get address from cml
if len(sys.argv) > 1:
    loc = sys.argv[1:]
    loc = ' '.join(loc)
else:
    loc = pyperclip.paste()

webbrowser.open(f'https://map.baidu.com/search/{loc}/@-8222684.38,4961265.97,12.96z?querytype=s&da_src=shareurl&wd={loc}&c=360')

## 用request从web中下载文件

In [38]:
import requests
res = requests.get('https://automatetheboringstuff.com/files/rj.txt')

In [39]:
type(res)

requests.models.Response

In [40]:
len(res.text)

178978

In [42]:
print(res.text)

The Project Gutenberg EBook of Romeo and Juliet, by William Shakespeare

This eBook is for the use of anyone anywhere at no cost and with
almost no restrictions whatsoever.  You may copy it, give it away or
re-use it under the terms of the Project Gutenberg License included
with this eBook or online at www.gutenberg.org/license


Title: Romeo and Juliet

Author: William Shakespeare

Posting Date: May 25, 2012 [EBook #1112]
Release Date: November, 1997  [Etext #1112]

Language: English


*** START OF THIS PROJECT GUTENBERG EBOOK ROMEO AND JULIET ***













*Project Gutenberg is proud to cooperate with The World Library*
in the presentation of The Complete Works of William Shakespeare
for your reading for education and entertainment.  HOWEVER, THIS
IS NEITHER SHAREWARE NOR PUBLIC DOMAIN. . .AND UNDER THE LIBRARY
OF THE FUTURE CONDITIONS OF THIS PRESENTATION. . .NO CHARGES MAY
BE MADE FOR *ANY* ACCESS TO THIS MATERIAL.  YOU ARE ENCOURAGED!!
TO GIVE IT AWAY TO ANYONE YOU LIKE, BUT NO

In [44]:
res.status_code

200

### 检查错误

In [49]:
res = requests.get('https://automatetheboringstuff.com/page_does_not_exist')

res.raise_for_status()

HTTPError: 404 Client Error: Not Found for url: https://automatetheboringstuff.com/page_does_not_exist

In [47]:
res.status_code

404

+ raise_for_status()会在网页不存在时抛出错误使程序马上停止
+ 如果不想马上停止，用try...except将其包裹起来防止程序崩溃

In [51]:
res = requests.get('https://automatetheboringstuff.com/page_does_not_exist')
try:
    res.raise_for_status()
except Exception as err:
    print(f'There was a problem: {err}')

There was a problem: 404 Client Error: Not Found for url: https://automatetheboringstuff.com/page_does_not_exist


### 将下载的文件保存到硬盘

In [56]:
res = requests.get('https://automatetheboringstuff.com/files/rj.txt')
res.raise_for_status()
with open('../RomeoAndJuliet.txt', 'wb') as f:
    for chunk in res.iter_content(100000):
        f.write(chunk)

## HTML

+ ~~基础知识PASS~~

### 用bs4模块解析HTML

In [57]:
import bs4

In [59]:
res = requests.get('https://automatetheboringstuff.com/')
res.raise_for_status()
nostarch_soup = bs4.BeautifulSoup(res.text, 'html.parser')

In [61]:
type(nostarch_soup)

bs4.BeautifulSoup

In [62]:
example_file = open('../../example.html')
example_soup = bs4.BeautifulSoup(example_file, 'html.parser')
type(example_soup)

bs4.BeautifulSoup

#### 用select()方法寻找元素

+ css选择器

|eg.|result|
|--|--|
|soup.select('div')|all \<div\>|
|soup.select('#author')|all elements with id=author|
|('div span')|\<span\> in \<div\>|
|('div > span')|\<span\> right after \<div\> without other elements between the two|
|('input[name')|all input that have attr name with any value|
|(input[type='button'])|all input that have attr type with value button|

In [63]:
example_file = open('../../example.html')
example_soup = bs4.BeautifulSoup(example_file, 'html.parser')
elems = example_soup.select('#author')

In [64]:
elems

[<span id="author">Al Sweigart</span>]

In [65]:
type(elems)

bs4.element.ResultSet

In [66]:
len(elems)

1

In [67]:
type(elems[0])

bs4.element.Tag

In [68]:
str(elems[0])

'<span id="author">Al Sweigart</span>'

In [69]:
elems[0].getText()

'Al Sweigart'

In [70]:
elems[0].attrs

{'id': 'author'}

### 通过元素的属性获取数据

In [74]:
elems[0].get('id')

'author'

In [75]:
elems[0].get('does_not_exist') == None

True

### 项目：打开所有的搜索结果

In [None]:
#! python3
# searchpypi.py  - Opens several search results.

# import requests, sys, webbrowser, bs4
# print('Searching...')    # display text while downloading the search result page
# res = requests.get('https://pypi.org/search/?q='
# + ' '.join(sys.argv[1:]))
# res.raise_for_status()

# # TODO: Retrieve top search result links.
# soup = bs4.BeautifulSoup(res.text, 'html.parser')
# # TODO: Open a browser tab for each result.
# link_elems= soup.select('.package-snippet')
# numOpen = min(5, len(link_elems))
# for i in range(numOpen):
#     urlToOpen = 'https://pypi.org' + link_elems[i].get('href')
#     print('Opening', urlToOpen)
#     webbrowser.open(urlToOpen)

### 项目:下载所有的XKCD漫画TODO

## 用selenium控制浏览器

### 启动

In [99]:
from selenium import webdriver
from selenium.webdriver.common.by import By

In [85]:
browser = webdriver.Edge()

type(browser)

browser.get('https://inventwithpython.com/')

In [83]:
# browser = webdriver.Chrome()

# type(browser)

# browser.get('https://www.google.com')

### 在页面中寻找元素

webdriver方法，用于寻找元素

+ browser.find_element_by_class_name(name)
+ browser.find_elements_by_class_name(name)
+ browser.find_element_by_css_selector(selector)
+ browser.find_element_by_css_selector(selector)
+ browser.find_element_by_id(id)
+ browser.find_elements_by_id(id)
+ browser.find_element_by_link_text(text)
+ browser.find_elements_by_link_text(tex)
+ browser.find_element_by_partial_link_text(text)
+ browser.find_elements_by_partial_link_text(tex)
+ browser.find_element_by_name(name)
+ brower.find_elements_by_name(name)
+ browser.find_element_by_tag_name(name)  -- 大小写不敏感
+ browser.find_elements_by_tag_name(name) 

webelem的属性和方法
+ tag_name
+ get_attribute(name)
+ text
+ clear()
+ is_displayed()
+ is_enabled()
+ is_selected()
+ location

In [87]:
browser = webdriver.Edge()
browser.get('https://inventwithpython.com/')
try:
    elem = browser.find_element_by_class_name('cover-thumb')
    print(f'{elem.tag_name} element with that class name!')
except:
    print('was not able to find an element with that name')

was not able to find an element with that name


In [103]:
try:
    elem = browser.find_element(By.CLASS_NAME, value='cover-thumb')
    print(f'<{elem.tag_name}> element with that class name!')
except:
    print('was not able to find an element with that name')

<img> element with that class name!


### 点击网页

In [1]:
from selenium import webdriver

browser = webdriver.Chrome()
browser.get('https://inventwithpython.com')
linkElem = browser.find_element_by_link_text('Read Online for Free')
type(linkElem)

selenium.webdriver.remote.webelement.WebElement

In [2]:
linkElem.click()

### 填写与提交表格

In [4]:
browser.get('https://login.metafilter.com')
userElem = browser.find_element_by_id('user_name')

In [6]:
userElem = browser.find_element_by_id('user_name')
userElem.send_keys('your_real_username_here')

In [7]:
passwordElem = browser.find_element_by_id('user_pass')
passwordElem.send_keys('your_real_password_here')

In [8]:
passwordElem.submit()

### 发送特殊键

|Keys|含义|
|--|--|
|DOWN,UP,LEFT,RIGHT|键盘方向键|
|ENTER,RETURN|回车与换行|
|HONE,END,PAGE_DOWN/UP|home,end,pagedonw/up|
|ESCAPE,BACK_SPACE,DELETE|...|
|F1-F12|...|
|TAB|...|

In [3]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

browser = webdriver.Edge()
browser.get('https://www.sina.com.cn')

In [4]:
from selenium.webdriver.common.by import By

In [6]:
html_elem = browser.find_element(By.TAG_NAME,'html')

In [7]:
html_elem

<selenium.webdriver.remote.webelement.WebElement (session="5bda91c352606044dedd487ec0dbb600", element="F3B18DE10B16B7B53F73078CA994B8A5_element_163")>

In [9]:
html_elem.send_keys(Keys.END)

In [10]:
html_elem.send_keys(Keys.HOME)

+ 一般通过html_elem = browser.find_element(By.TAG_NAME,'html')向web发送按键

### 单击浏览器按钮

+ browser.back()
+ browser.forward()
+ browser.refresh()
+ browser.quit()

# Chapter 13 处理Excel表格

## openpyxl模块

基础名词：
+ 一个xlsx文档叫做工作簿(workbook)
+ 工作簿里可有多张表，表也可叫工作表(sheet)
+ 表中有行(row)有列(column)
+ 工作表的每一个单元格(cell)用来记录某个值(value)

### 读取Excel文档

In [13]:
import openpyxl

In [16]:
wb = openpyxl.load_workbook(r'D:\Code\automate_online-materials\example.xlsx')

In [17]:
type(wb)

openpyxl.workbook.workbook.Workbook

### 从工作簿中取得工作表

+ wb有属性sheetnames,通过[]可获得sheet,sheet有属性title, activate

In [18]:
wb.sheetnames

['Sheet1', 'Sheet2', 'Sheet3']

In [19]:
sheet = wb['Sheet3']

In [20]:
sheet

<Worksheet "Sheet3">

In [22]:
type(sheet)

openpyxl.worksheet.worksheet.Worksheet

In [23]:
sheet.title

'Sheet3'

In [24]:
another_sheet = wb.active

In [25]:
another_sheet

<Worksheet "Sheet1">

### 从表中取得单元格

+ sheet通过[]获得cell, cell有属性row,coulumn,coordinate,value

In [26]:
another_sheet['A1']

<Cell 'Sheet1'.A1>

In [27]:
another_sheet['A1'].value

datetime.datetime(2015, 4, 5, 13, 34, 2)

In [28]:
b= another_sheet['B1']

In [29]:
b.value

'Apples'

In [31]:
f'Row {b.row}, Column {b.column} is {b.value}'

'Row 1, Column 2 is Apples'

In [32]:
f'Cell {b.coordinate} is {b.value}'

'Cell B1 is Apples'

In [33]:
another_sheet['C1'].value

73

In [37]:
another_sheet.cell(row=1,column=2)

<Cell 'Sheet1'.B1>

In [39]:
another_sheet.cell(row=1,column=2).value

'Apples'

In [40]:
for i in range(1,8,2):
    print(i,another_sheet.cell(row=i,column=2).value)

1 Apples
3 Pears
5 Apples
7 Strawberries


In [41]:
sheet = wb.active

In [42]:
sheet

<Worksheet "Sheet1">

In [43]:
sheet.max_column

3

In [44]:
sheet.max_row

7

### 列字母和数字之间的转换

+ openpyxl.utils.column_index_from_string() -- 字母转数字
+ openpyxl.utils.get_column_letter()  -- 数字转字母

In [45]:
from openpyxl.utils import column_index_from_string, get_column_letter

In [46]:
column_index_from_string('A')

1

In [47]:
get_column_letter(256)

'IV'

In [48]:
get_column_letter(sheet.max_column)

'C'

### 从表中获取行和列

In [50]:
tuple(sheet['A1':'C3'])

((<Cell 'Sheet1'.A1>, <Cell 'Sheet1'.B1>, <Cell 'Sheet1'.C1>),
 (<Cell 'Sheet1'.A2>, <Cell 'Sheet1'.B2>, <Cell 'Sheet1'.C2>),
 (<Cell 'Sheet1'.A3>, <Cell 'Sheet1'.B3>, <Cell 'Sheet1'.C3>))

In [58]:
row_of_cell_objs = tuple(sheet['A1':'C3'])

In [60]:
for cell_objs in row_of_cell_objs:
    for cell_obj in cell_objs:
        print(cell_obj.value)
    print('---END OF ROW ---')

2015-04-05 13:34:02
Apples
73
---END OF ROW ---
2015-04-05 03:41:23
Cherries
85
---END OF ROW ---
2015-04-06 12:46:51
Pears
14
---END OF ROW ---


In [61]:
sheet.columns

<generator object Worksheet._cells_by_col at 0x00000193560D44C0>

In [62]:
list(sheet.columns)

[(<Cell 'Sheet1'.A1>,
  <Cell 'Sheet1'.A2>,
  <Cell 'Sheet1'.A3>,
  <Cell 'Sheet1'.A4>,
  <Cell 'Sheet1'.A5>,
  <Cell 'Sheet1'.A6>,
  <Cell 'Sheet1'.A7>),
 (<Cell 'Sheet1'.B1>,
  <Cell 'Sheet1'.B2>,
  <Cell 'Sheet1'.B3>,
  <Cell 'Sheet1'.B4>,
  <Cell 'Sheet1'.B5>,
  <Cell 'Sheet1'.B6>,
  <Cell 'Sheet1'.B7>),
 (<Cell 'Sheet1'.C1>,
  <Cell 'Sheet1'.C2>,
  <Cell 'Sheet1'.C3>,
  <Cell 'Sheet1'.C4>,
  <Cell 'Sheet1'.C5>,
  <Cell 'Sheet1'.C6>,
  <Cell 'Sheet1'.C7>)]

In [63]:
len(list(sheet.columns))

3

In [64]:
b_col = list(sheet.columns)[1]

In [65]:
b_col

(<Cell 'Sheet1'.B1>,
 <Cell 'Sheet1'.B2>,
 <Cell 'Sheet1'.B3>,
 <Cell 'Sheet1'.B4>,
 <Cell 'Sheet1'.B5>,
 <Cell 'Sheet1'.B6>,
 <Cell 'Sheet1'.B7>)

In [66]:
for cell_obj in b_col:
    print(cell_obj.value)

Apples
Cherries
Pears
Oranges
Apples
Bananas
Strawberries


## 项目：从电子表格中读取数据

In [70]:
#! python3
# readCensusExcel.py - Tabulates population and number of census tracts for
# each county.

import openpyxl, pprint


print('Opening workbook...')
wb = openpyxl.load_workbook(r'D:\Code\automate_online-materials\censuspopdata.xlsx')
sheet = wb['Population by Census Tract']
countyData = {}

# TODO: Fill in countyData with each county's population and tracts.
print('Reading rows...')
for row in range(2, sheet.max_row + 1):
    # Each row in the spreadsheet has data for one census tract.
    state  = sheet['B' + str(row)].value
    county = sheet['C' + str(row)].value
    pop    = sheet['D' + str(row)].value

    countyData.setdefault(state,{})
    countyData[state].setdefault(county,{'tracks':0, 'pop':0})
    countyData[state][county]['tracks'] +=1
    countyData[state][county]['pop'] += int(pop)

# TODO: Open a new text file and write the contents of countyData to it.
print('Wrting results...')
with open('census.py', 'w') as f:
    f.write('allData = '+ pprint.pformat(countyData))
print('Done.')

Opening workbook...
Reading rows...
Wrting results...
Done.


In [1]:
import census

In [2]:
census.allData

{'AK': {'Aleutians East': {'pop': 3141, 'tracks': 1},
  'Aleutians West': {'pop': 5561, 'tracks': 2},
  'Anchorage': {'pop': 291826, 'tracks': 55},
  'Bethel': {'pop': 17013, 'tracks': 3},
  'Bristol Bay': {'pop': 997, 'tracks': 1},
  'Denali': {'pop': 1826, 'tracks': 1},
  'Dillingham': {'pop': 4847, 'tracks': 2},
  'Fairbanks North Star': {'pop': 97581, 'tracks': 19},
  'Haines': {'pop': 2508, 'tracks': 1},
  'Hoonah-Angoon': {'pop': 2150, 'tracks': 2},
  'Juneau': {'pop': 31275, 'tracks': 6},
  'Kenai Peninsula': {'pop': 55400, 'tracks': 13},
  'Ketchikan Gateway': {'pop': 13477, 'tracks': 4},
  'Kodiak Island': {'pop': 13592, 'tracks': 5},
  'Lake and Peninsula': {'pop': 1631, 'tracks': 1},
  'Matanuska-Susitna': {'pop': 88995, 'tracks': 24},
  'Nome': {'pop': 9492, 'tracks': 2},
  'North Slope': {'pop': 9430, 'tracks': 3},
  'Northwest Arctic': {'pop': 7523, 'tracks': 2},
  'Petersburg': {'pop': 3815, 'tracks': 1},
  'Prince of Wales-Hyder': {'pop': 5559, 'tracks': 4},
  'Sitka': 

In [4]:
census.allData.keys()

dict_keys(['AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA', 'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME', 'MI', 'MN', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY'])

In [5]:
len(census.allData.keys())

51

In [6]:
census.allData['DC']

{'District of Columbia': {'pop': 601723, 'tracks': 179}}

## 写入Excel文档

### 创建并保存Excel文档

In [7]:
import openpyxl

In [9]:
wb = openpyxl.Workbook()

In [10]:
wb.sheetnames

['Sheet']

In [11]:
sheet = wb.active

In [12]:
sheet.title

'Sheet'

In [13]:
sheet.title = 'spam bacon egg sheet'

In [14]:
sheet.title

'spam bacon egg sheet'

In [15]:
wb.sheetnames

['spam bacon egg sheet']

In [23]:
example_wb  = openpyxl.load_workbook(r'D:\DESKTOP\example.xlsx')
example_sheet = example_wb.active

In [25]:
example_sheet

<Worksheet "Sheet1">

In [26]:
example_sheet.title = 'spam spam spam'

In [29]:
example_wb.save('./example_copy.xlsx')

### 创建和删除工作表

In [34]:
wb

<openpyxl.workbook.workbook.Workbook at 0x2202af68f10>

In [35]:
wb.sheetnames

['spam bacon egg sheet']

In [36]:
wb.create_sheet()

<Worksheet "Sheet">

In [37]:
wb.sheetnames

['spam bacon egg sheet', 'Sheet']

In [38]:
wb.create_sheet(index=0, title='bar bar bar')

<Worksheet "bar bar bar">

In [39]:
wb.sheetnames

['bar bar bar', 'spam bacon egg sheet', 'Sheet']

In [40]:
del wb['Sheet']

In [41]:
wb.sheetnames

['bar bar bar', 'spam bacon egg sheet']

In [42]:
del wb['bar bar bar']

In [43]:
wb.sheetnames

['spam bacon egg sheet']

+ 做完改动后记得保存

In [45]:
wb

<openpyxl.workbook.workbook.Workbook at 0x2202af68f10>

In [46]:
wb.create_sheet(title='what you got')

<Worksheet "what you got">

In [47]:
wb.sheetnames

['spam bacon egg sheet', 'what you got']

In [49]:
wb.save('./created_by_code.xlsx')

In [50]:
wb.create_sheet(title='tilted')

<Worksheet "tilted">

In [53]:
wb.sheetnames

['spam bacon egg sheet', 'what you got', 'tilted']

In [54]:
sheet

<Worksheet "spam bacon egg sheet">

In [55]:
sheet['A1'] = 'hello world'

In [56]:
sheet['A1'].value

'hello world'

In [57]:
wb.save('./created_by_code.xlsx')

## 项目：更新电子表格

In [58]:
wb = openpyxl.load_workbook(r'D:/DESKTOP/produceSales.xlsx')


In [59]:
wb

<openpyxl.workbook.workbook.Workbook at 0x2204ad87c50>

In [61]:
wb.sheetnames

['Sheet']

In [62]:
sheet = wb['Sheet']

In [68]:
sheet['A1'].value

'PRODUCE'

In [69]:
sheet.max_row

23758

In [75]:
names = list(sheet.rows)[0]

In [77]:
names

(<Cell 'Sheet'.A1>, <Cell 'Sheet'.B1>, <Cell 'Sheet'.C1>, <Cell 'Sheet'.D1>)

In [81]:
names

['PRODUCE', 'COST PER POUND', 'POUNDS SOLD', 'TOTAL']

In [82]:
transactions = list(sheet.rows)[1:]

In [83]:
len(transactions)

23757

In [84]:
transactions[:5]

[(<Cell 'Sheet'.A2>, <Cell 'Sheet'.B2>, <Cell 'Sheet'.C2>, <Cell 'Sheet'.D2>),
 (<Cell 'Sheet'.A3>, <Cell 'Sheet'.B3>, <Cell 'Sheet'.C3>, <Cell 'Sheet'.D3>),
 (<Cell 'Sheet'.A4>, <Cell 'Sheet'.B4>, <Cell 'Sheet'.C4>, <Cell 'Sheet'.D4>),
 (<Cell 'Sheet'.A5>, <Cell 'Sheet'.B5>, <Cell 'Sheet'.C5>, <Cell 'Sheet'.D5>),
 (<Cell 'Sheet'.A6>, <Cell 'Sheet'.B6>, <Cell 'Sheet'.C6>, <Cell 'Sheet'.D6>)]

In [85]:
for transaction in transactions:
    if transaction[0].value == 'Celery':
        transaction[1].value = 1.19
    elif transaction[0].value == 'Garlic':
        transaction[1].value = 3.07
    elif transaction[0].value == 'Lemon':
        transaction[1].value = 1.27
  

In [86]:
wb.save(r'D:\DESKTOP\new_product.xlsx')

+ 答案

In [87]:
#! python3
# updateProduce.py - Corrects costs in produce sales spreadsheet.

# import openpyxl

# wb = openpyxl.load_workbook('produceSales.xlsx')
# sheet = wb['Sheet']

# # The produce types and their updated prices
# PRICE_UPDATES = {'Garlic': 3.07,
#                  'Celery': 1.19,
#                  'Lemon': 1.27}

# # TODO: Loop through the rows and update the prices.
# # Loop through the rows and update the prices.
# for rowNum in range(2, sheet.max_row):    # skip the first row
#     produceName = sheet.cell(row=rowNum, column=1).value
#     if produceName in PRICE_UPDATES:
#           sheet.cell(row=rowNum, column=2).value = PRICE_UPDATES[produceName]
# wb.save('updatedProduceSales.xlsx')

## 设置单元格的字体风格

In [88]:
from openpyxl.styles import Font

In [90]:
wb= openpyxl.load_workbook('./created_by_code.xlsx')
sheet = wb.active
italic24font = Font(size=24, italic=True)
sheet['A1'].font = italic24font

In [91]:
wb.save('./created_by_code.xlsx')

### Font对象

Font的关键字参数
|keyword|type|desc|
|--|--|--|
|name|str|eg:'Calibri' or 'Times New Roman'|
|size|int|...|
|bold|bool|...|
|italic|bool|...|

使用方法：调用Font对象赋给一个变量，将变量赋Cell对象的font属性

In [92]:
bold_times_new_roman = Font(name='Times New Roman', bold=True)
sheet['B1'].font = bold_times_new_roman

In [94]:
wb.save('./created_by_code.xlsx')

In [95]:
sheet['A1']

<Cell 'spam bacon egg sheet'.A1>

In [96]:
sheet['C1'] = 'default sytle'

In [97]:
wb.save('./created_by_code.xlsx')

## 公式

In [98]:
sheet['B8'] = '=SUM(B2:B7)'

In [99]:
wb.save('./created_by_code.xlsx')

## 调整行和列

### 设置行高和列宽

In [101]:
wb= openpyxl.Workbook()

In [102]:
sheet = wb.active

In [107]:
sheet['A1'] = 'tall row'
sheet['B2'] = 'wide column'
sheet.row_dimensions[1].height = 70
sheet.column_dimensions['B'].width = 20

In [23]:
# 自动调整
# sheet.column_dimension[column_letter].auto_size = True

In [108]:
wb.save('./cell_h_w.xlsx')

###  合并和拆分单元格

In [109]:
sheet.merge_cells('C1:D5')

In [110]:
sheet['C1'] = 'this is a merged cell'

In [111]:
wb.save('./cell_h_w.xlsx')

In [117]:
sheet.unmerge_cells('C1:D5')

In [118]:
wb.save('./cell_h_w.xlsx')

### 冻结窗格

freeze_panes的设置
+ 指定哪一个cell，其左边列和上面行都会被冻结，cell所在的位置不会

|eg.|result|
|--|--|
|sheet.freeze_panes='A2'|第一行|
|B1|第一列|
|C1|前两列|
|C2|第一行，前两列|
|A1/None|无冻结|

In [1]:
import openpyxl

In [2]:
wb = openpyxl.load_workbook(r'D://DESKTOP/produceSales.xlsx')

In [3]:
wb.sheetnames

['Sheet']

In [4]:
sheet = wb.active

In [5]:
sheet

<Worksheet "Sheet">

In [8]:
rows = list(sheet.rows)

In [9]:
rows

[(<Cell 'Sheet'.A1>, <Cell 'Sheet'.B1>, <Cell 'Sheet'.C1>, <Cell 'Sheet'.D1>),
 (<Cell 'Sheet'.A2>, <Cell 'Sheet'.B2>, <Cell 'Sheet'.C2>, <Cell 'Sheet'.D2>),
 (<Cell 'Sheet'.A3>, <Cell 'Sheet'.B3>, <Cell 'Sheet'.C3>, <Cell 'Sheet'.D3>),
 (<Cell 'Sheet'.A4>, <Cell 'Sheet'.B4>, <Cell 'Sheet'.C4>, <Cell 'Sheet'.D4>),
 (<Cell 'Sheet'.A5>, <Cell 'Sheet'.B5>, <Cell 'Sheet'.C5>, <Cell 'Sheet'.D5>),
 (<Cell 'Sheet'.A6>, <Cell 'Sheet'.B6>, <Cell 'Sheet'.C6>, <Cell 'Sheet'.D6>),
 (<Cell 'Sheet'.A7>, <Cell 'Sheet'.B7>, <Cell 'Sheet'.C7>, <Cell 'Sheet'.D7>),
 (<Cell 'Sheet'.A8>, <Cell 'Sheet'.B8>, <Cell 'Sheet'.C8>, <Cell 'Sheet'.D8>),
 (<Cell 'Sheet'.A9>, <Cell 'Sheet'.B9>, <Cell 'Sheet'.C9>, <Cell 'Sheet'.D9>),
 (<Cell 'Sheet'.A10>,
  <Cell 'Sheet'.B10>,
  <Cell 'Sheet'.C10>,
  <Cell 'Sheet'.D10>),
 (<Cell 'Sheet'.A11>,
  <Cell 'Sheet'.B11>,
  <Cell 'Sheet'.C11>,
  <Cell 'Sheet'.D11>),
 (<Cell 'Sheet'.A12>,
  <Cell 'Sheet'.B12>,
  <Cell 'Sheet'.C12>,
  <Cell 'Sheet'.D12>),
 (<Cell 'Sheet'.A13>,


In [10]:
len(rows)

23758

In [14]:
sheet.freeze_panes = 'A2'

In [15]:
wb.save(r'D://DESKTOP/produceSales.xlsx')

In [16]:
sheet.freeze_panes = 'A1'

In [17]:
wb.save(r'D://DESKTOP/produceSales.xlsx')

## 图表

绘图流程
+ 数据的矩形选区，生成reference
+ 从reference生成series
+ 生成chart
+ 将series添加到chart
+ 将chart添加到sheet
+ 保存

In [18]:
wb = openpyxl.Workbook()

sheet = wb.active
for i in range(1,11):
    sheet['A'+str(i)] = i

In [19]:
ref_obj = openpyxl.chart.Reference(sheet, min_col=1,min_row=1,max_col=1,max_row=10)
series_obj = openpyxl.chart.Series(ref_obj, title='First series')

chart_obj = openpyxl.chart.BarChart()

In [20]:
chart_obj.title = 'My chart'
chart_obj.append(series_obj)

In [21]:
sheet.add_chart(chart_obj,'C5')

In [22]:
wb.save('./sample_chart.xlsx')

## 项目：乘法表...TODO

# Chapter 14 处理Google电子表格

# Chapter 15 处理PDF和Word文档

## PDF文档

### PyPDF2模块

### 从PDF中提取文本

In [1]:
import PyPDF2

In [9]:
with open(r'D:\Code\automate_online-materials\meetingminutes.pdf', 'rb') as f:
    pdf_reader = PyPDF2.PdfReader(f)

In [11]:
pdf_reader.numPages

DeprecationError: reader.numPages is deprecated and was removed in PyPDF2 3.0.0. Use len(reader.pages) instead.

In [12]:
len(pdf_reader.pages)

ValueError: seek of closed file

In [14]:
pdf_file = open(r'D:/Code/automate_online-materials/meetingminutes.pdf', 'rb')
pdf_reader = PyPDF2.PdfReader(pdf_file)

In [15]:
len(pdf_reader.pages)

19

In [16]:
page_obj = pdf_reader.getPage(0)

DeprecationError: reader.getPage(pageNumber) is deprecated and was removed in PyPDF2 3.0.0. Use reader.pages[page_number] instead.

In [17]:
page_obj = pdf_reader.pages[0]

In [18]:
page_obj

{'/Contents': [IndirectObject(961, 0, 2097348423632),
  IndirectObject(962, 0, 2097348423632),
  IndirectObject(963, 0, 2097348423632),
  IndirectObject(964, 0, 2097348423632),
  IndirectObject(965, 0, 2097348423632),
  IndirectObject(966, 0, 2097348423632),
  IndirectObject(967, 0, 2097348423632),
  IndirectObject(968, 0, 2097348423632)],
 '/CropBox': [0, 0, 612, 792],
 '/MediaBox': [0, 0, 612, 792],
 '/Parent': {'/Count': 9,
  '/Kids': [IndirectObject(959, 0, 2097348423632),
   IndirectObject(1, 0, 2097348423632),
   IndirectObject(11, 0, 2097348423632),
   IndirectObject(13, 0, 2097348423632),
   IndirectObject(15, 0, 2097348423632),
   IndirectObject(17, 0, 2097348423632),
   IndirectObject(19, 0, 2097348423632),
   IndirectObject(24, 0, 2097348423632),
   IndirectObject(26, 0, 2097348423632)],
  '/Parent': {'/Count': 19,
   '/Kids': [IndirectObject(953, 0, 2097348423632),
    IndirectObject(954, 0, 2097348423632),
    IndirectObject(955, 0, 2097348423632)],
   '/Type': '/Pages'},


In [19]:
page_obj.extract_text()

'OOFFFFIICCIIAALL  BBOOAARRDD  MMIINNUUTTEESS  \n \nMeeting of March 7 , 2014  \n \n \n \n  \n \n  \n \n   \nThe Board of Elementary and Secondary Education shall provide leadership and \ncreate policies for education that expand opportunities for children, empower \nfamilies and communities, and advance Louisiana in an increasingly \ncompetitive glob al market.  BOARD  \nof \nELEMENTARY  \nand  \nSECONDARY  \nEDUCATION  \n '

In [20]:
pdf_file.close()

### 解密PDF

+ reader.decrypt()只是解密reader，文件的状态依旧是加密的

In [21]:
pdf_file = open(r'D:/Code/automate_online-materials/encrypted.pdf', 'rb')

In [22]:
pdf_reader = PyPDF2.PdfReader(pdf_file)

In [23]:
pdf_reader.isEncrypted

DeprecationError: isEncrypted is deprecated and was removed in PyPDF2 3.0.0. Use is_encrypted instead.

In [24]:
pdf_reader.is_encrypted

True

In [25]:
pdf_reader.pages[0]

FileNotDecryptedError: File has not been decrypted

In [26]:
pdf_reader.decrypt('rosebud')

<PasswordType.OWNER_PASSWORD: 2>

In [27]:
pdf_reader.pages[0]

{'/CropBox': [0, 0, 612, 792],
 '/Parent': {'/Parent': {'/Type': '/Pages',
   '/Count': 19,
   '/Kids': [IndirectObject(4, 0, 2097370310352),
    IndirectObject(36, 0, 2097370310352),
    IndirectObject(47, 0, 2097370310352)]},
  '/Type': '/Pages',
  '/Count': 9,
  '/Kids': [IndirectObject(72, 0, 2097370310352),
   IndirectObject(3, 0, 2097370310352),
   IndirectObject(17, 0, 2097370310352),
   IndirectObject(20, 0, 2097370310352),
   IndirectObject(22, 0, 2097370310352),
   IndirectObject(24, 0, 2097370310352),
   IndirectObject(26, 0, 2097370310352),
   IndirectObject(31, 0, 2097370310352),
   IndirectObject(33, 0, 2097370310352)]},
 '/Type': '/Page',
 '/Contents': [IndirectObject(946, 0, 2097370310352),
  IndirectObject(947, 0, 2097370310352),
  IndirectObject(948, 0, 2097370310352),
  IndirectObject(949, 0, 2097370310352),
  IndirectObject(950, 0, 2097370310352),
  IndirectObject(951, 0, 2097370310352),
  IndirectObject(952, 0, 2097370310352),
  IndirectObject(953, 0, 2097370310352

### 创建PDF

能力仅限于从其他PDF文件中
+ 复制页面
+ 旋转页面
+ 重叠..
+ 加密文件

In [29]:
# 复制
pdf_file1 = open(r'D:/Code/automate_online-materials\meetingminutes.pdf', 'rb')
pdf_file2 = open(r'D:/Code/automate_online-materials/meetingminutes2.pdf', 'rb')
pdf_reader1 = PyPDF2.PdfReader(pdf_file1)
pdf_reader2 = PyPDF2.PdfReader(pdf_file2)
pdf_writer = PyPDF2.PdfWriter()

In [30]:
for i in range(len(pdf_reader1.pages)):
    page_obj = pdf_reader1.pages[i]
    pdf_writer.add_page(page_obj)

In [31]:
len(pdf_writer.pages)

19

In [32]:
for i in range(len(pdf_reader2.pages)):
    page_obj = pdf_reader2.pages[i]
    pdf_writer.add_page(page_obj)

In [33]:
len(pdf_writer.pages)

40

In [34]:
with open(r'../combinedminutes.pdf', 'wb') as f:
    pdf_writer.write(f)

In [35]:
pdf_file1.close()
pdf_file2.close()

ps: writer.add_page()只能在末尾加页面，不能中插.

In [36]:
# 旋转
pdf_file = open(r'D:/Code/automate_online-materials/meetingminutes.pdf', 'rb')

In [37]:
pdf_reader  = PyPDF2.PdfReader(pdf_file)

In [38]:
page = pdf_reader.pages[0]

In [40]:
page.rotate(angle=90)

{'/Contents': [IndirectObject(961, 0, 2097390622800),
  IndirectObject(962, 0, 2097390622800),
  IndirectObject(963, 0, 2097390622800),
  IndirectObject(964, 0, 2097390622800),
  IndirectObject(965, 0, 2097390622800),
  IndirectObject(966, 0, 2097390622800),
  IndirectObject(967, 0, 2097390622800),
  IndirectObject(968, 0, 2097390622800)],
 '/CropBox': [0, 0, 612, 792],
 '/MediaBox': [0, 0, 612, 792],
 '/Parent': {'/Count': 9,
  '/Kids': [IndirectObject(959, 0, 2097390622800),
   IndirectObject(1, 0, 2097390622800),
   IndirectObject(11, 0, 2097390622800),
   IndirectObject(13, 0, 2097390622800),
   IndirectObject(15, 0, 2097390622800),
   IndirectObject(17, 0, 2097390622800),
   IndirectObject(19, 0, 2097390622800),
   IndirectObject(24, 0, 2097390622800),
   IndirectObject(26, 0, 2097390622800)],
  '/Parent': {'/Count': 19,
   '/Kids': [IndirectObject(953, 0, 2097390622800),
    IndirectObject(954, 0, 2097390622800),
    IndirectObject(955, 0, 2097390622800)],
   '/Type': '/Pages'},


In [42]:
pdf_writer = PyPDF2.PdfWriter()
pdf_writer.add_page(page)
with open('../rotatepage.pdf', 'wb') as f:
    pdf_writer.write(f)

In [43]:
pdf_file.close()

In [46]:
# 叠加
minute_file = open(r'../meetingminutes.pdf', 'rb')
water_file = open(r'../watermark.pdf', 'rb') 
minute_reader = PyPDF2.PdfReader(minute_file)
water_reader = PyPDF2.PdfReader(water_file)
minute_page = minute_reader.pages[0]
minute_page.merge_page(water_reader.pages[0])


In [47]:
pdf_writer = PyPDF2.PdfWriter()
pdf_writer.add_page(minute_page)

for i in range(1, len(minute_reader.pages)):
    pdf_writer.add_page(minute_reader.pages[i])

with open(r'../watermarked_cover.pdf', 'wb') as f:
    pdf_writer.write(f)

In [48]:
minute_file.close()
water_file.close()

In [51]:
# 加密
minute_file = open(r'../meetingminutes.pdf', 'rb')
minute_reader = PyPDF2.PdfReader(minute_file)
pdf_writer = PyPDF2.PdfWriter()
for i in range(len(minute_reader.pages)):
    pdf_writer.add_page(minute_reader.pages[i])

In [52]:
pdf_writer.encrypt('swordfish')  # user_password, owner_password

In [53]:
with open(r'encryped.pdf', 'wb') as f:
    pdf_writer.write(f)

In [58]:
minute_file.close()

### 项目：从多个PDF中合并选择的页面TODO

In [7]:
#! python3
# combinePdfs.py - Combines all the PDFs in the current working directory into
# into a single PDF.

import PyPDF2, os
# Get all the PDF filenames.
pdfFiles = []
root = r'D:\Code\jupyter'
result_path = '../'
result_name = 'result.pdf'
for filename in os.listdir(root):
    if filename.endswith('.pdf'):
         pdfFiles.append(os.path.join(root,filename))
pdfFiles.sort(key = str.lower)

pdfWriter = PyPDF2.PdfWriter()

for file_path in pdfFiles:
    file = open(file_path, 'rb')
    file_reader = PyPDF2.PdfReader(file)
    for i in range(1,len(file_reader.pages)):
        pdfWriter.add_page(file_reader.pages[i])
    file.close()

with open(os.path.join(result_path,result_name), 'wb') as f:
    pdfWriter.write(f)

In [8]:
pdfFiles

['D:\\Code\\jupyter\\gan_review.pdf',
 'D:\\Code\\jupyter\\gan_review_ch.pdf',
 'D:\\Code\\jupyter\\result.pdf']

## word文档

### python-docx模块

基础名词
+ Document -- 一个.docx文件
+ Paragraphy -- 一个段落
+ run -- 一段相同格式的文本

### 读取word文档

In [9]:
import docx

In [10]:
doc = docx.Document(r'D:/Code/automate_online-materials/demo.docx')

In [11]:
len(doc.paragraphs)

7

In [12]:
doc.paragraphs[0].text

'Document Title'

In [13]:
doc.paragraphs[1].text

'A plain paragraph with some bold and some italic'

In [14]:
len(doc.paragraphs[1].runs)

5

In [16]:
doc.paragraphs[1].runs[0].text

'A plain paragraph with'

In [17]:
doc.paragraphs[1].runs[1].text

' some '

In [18]:
doc.paragraphs[6].text

''

In [19]:
doc.paragraphs[5].text

'first item in ordered list'

### 从docx文件中取得完整的文本

In [20]:
#! python3

import docx

def getText(filename):
    doc = docx.Document(filename)
    fullText = []
    for para in doc.paragraphs:
        fullText.append(para.text)
    return '\n'.join(fullText)

In [23]:
print(getText(r'D:/Code/automate_online-materials/demo.docx'))

Document Title
A plain paragraph with some bold and some italic
Heading, level 1
Intense quote
first item in unordered list
first item in ordered list



### 设置Paragraph和Run对象的格式

word文档的三种样式
+ paragraph style for Paragraphy
+ character style for Run

|||||
| ------------- | --------------- | ----------------- | ----------------- |
| 'Normal'      | 'Heading  5'    | 'List  Bullet'    | 'List  Paragraph' |
| 'Body Text'   | 'Heading 6'     | 'List Bullet 2'   | 'MacroText'       |
| 'Body Text 2' | 'Heading 7'     | 'List Bullet 3'   | 'No Spacing'      |
| 'Body Text 3' | 'Heading 8'     | 'List Continue'   | 'Quote'           |
| 'Caption'     | 'Heading 9'     | 'List Continue 2' | 'Subtitle'        |
| 'Heading 1'   | 'Intense Quote' | 'List Continue 3' | 'TOC Heading'     |
| 'Heading 2'   | 'List'          | 'List Number '    | 'Title'           |
| 'Heading 3'   | 'List 2'        | 'List Number 2'   |                   |
| 'Heading 4'   | 'List 3'        | 'List Number 3'   |                   |

如果对Run对象应用链接样式，需要在样式名称末尾加上'Char'

在word里设置

### 创建带有非默认样式的word文档

在word里设置

### Run属性

Run对象的text属性

+ bold
+ italic
+ underline
+ strike
+ double_strike
+ all_caps
+ small_caps
+ shadow
+ outline
+ rtl
+ imprint
+ emboss

In [2]:
import docx

In [4]:
doc = docx.Document(r'../../automate_online-materials/demo.docx')

In [5]:
doc.paragraphs[0].text

'Document Title'

In [6]:
doc.paragraphs[0].style

_ParagraphStyle('Title') id: 2528020166992

In [7]:
doc.paragraphs[0].style = 'Normal'

In [8]:
doc.paragraphs[1].text

'A plain paragraph with some bold and some italic'

In [11]:
for i in range(len(doc.paragraphs[1].runs)):
    print(doc.paragraphs[1].runs[i].text)

A plain paragraph with
 some 
bold
 and some 
italic


In [12]:
doc.paragraphs[1].runs[0].style = 'QuoteChar'

  return self._get_style_id_from_style(self[style_name], style_type)


In [None]:
doc.paragraphs[1].runs[0].style = 'Quate Char'

In [15]:
doc.paragraphs[1].runs[1].underline = True
doc.paragraphs[1].runs[3].underline = True

In [16]:
doc.save('../restyle.docx')

### 写入Word文档

In [17]:
doc = docx.Document()
doc.add_paragraph('hello world')

<docx.text.paragraph.Paragraph at 0x24c9a0b72d0>

In [18]:
doc.save('../hello.docx')

In [20]:
doc = docx.Document(r'../hello.docx')
para2 = doc.add_paragraph('this is paragraph 2')
para3 = doc.add_paragraph('thi is paragraphy 3')

In [21]:
para2.add_run('. this text is being added to the 2nd paragraph')

<docx.text.run.Run at 0x24c98e5ae50>

In [22]:
doc.save('../hello.docx')

In [24]:
doc = docx.Document(r'../hello.docx')
doc.add_paragraph('this is paragraph 4', style='Heading 1')

<docx.text.paragraph.Paragraph at 0x24c9a296b50>

In [25]:
para4 = doc.paragraphs[-1]

In [27]:
para4.add_run('. this is another run of para 4', style='Title Char')

<docx.text.run.Run at 0x24c98e5bd50>

In [28]:
doc.save('../hello.docx')

### 添加图像

In [29]:
doc

<docx.document.Document at 0x24c9a0f1290>

In [30]:
doc.add_picture(r'D:/DESKTOP/window_view_23.10.29.jpg')

<docx.shape.InlineShape at 0x24c9a1d4150>

In [31]:
doc.save(r'../hello.docx')

In [32]:
doc.add_picture(r'D:/DESKTOP/window_view_23.10.29.jpg', width=docx.shared.Inches(1), height=docx.shared.Cm(5))

<docx.shape.InlineShape at 0x24c99c25d10>

In [33]:
doc.save(r'../hello.docx')

## 从word文档中创建PDF

In [35]:
import win32com.client

In [36]:
import docx

In [43]:
word_file_name = r'D:/Code/jupyter/hello.docx'
# pdf_file_name = r'hello.pdf'    # 会存到主路径里
pdf_file_name = r'D:/Code/jupyter/hello.pdf' 

wd_format_pdf = 17
word_obj = win32com.client.Dispatch('Word.Application')
doc_obj = word_obj.Documents.Open(word_file_name)

In [44]:
doc_obj.SaveAs(pdf_file_name, FileFormat=wd_format_pdf)
doc_obj.Close()
word_obj.Quit()

## 项目：PDF偏执狂...TODO