In [4]:
import requests
import re
import pandas as pd

headers = {
    'Accept-Language': 'zh-CN,zh;q=0.9'
}

def parse_table(url):
    # url = 'https://zh.wikipedia.org/wiki/Wikipedia:%E7%BB%9F%E8%AE%A1'
    # url = 'https://zh.wikipedia.org/wiki/Help:%E7%9B%AE%E5%BD%95'
    
    # 发送请求获取 HTML 内容
    response = requests.get(url, headers=headers)
    
    # 使用 pandas.read_html 解析 HTML 内容并获取表格数据
    tables = pd.read_html(response.content)

    return tables
    
url = 'https://zh.wikipedia.org/wiki/Wikipedia:%E7%BB%9F%E8%AE%A1'
tables = parse_table(url)

In [5]:
def process_table(table):
    
    del_list = ['机器人', '搜索引擎', '访问者来源', '编辑者来源', '全部空间']

    
    # 表格需要多于2行, 但是不能多于500行
    if len(table) < 2  or len(table) > 200:
        return

    col_names = table.columns.tolist()
    # 表格不能小于2列
    if len(col_names) < 2 or len(col_names) > 10:
        return
        
    # 表格如果没有真实含义的header 则删除
    if col_names[:2] == [0, 1]:
        return
    
    # 如果有多层Header（多层Header可能来源于合并表格），那么转换成新的header
    if isinstance(col_names[0], tuple):  # 如果多层，会以truple的形式
        # 让里面的'_' 变成'-' 以免与连接符_冲突
        col_names = [tuple(word.replace('_', '-') for word in tpl) for tpl in col_names]
        
        if len(col_names[0]) > 3 :
            return
            
        elif len(col_names[0]) == 2:  # 合并table的header
            col_names = ['_'.join(item) if item[0] != item[1] else item[0]  for item in col_names]
            table.columns = col_names
            
        elif len(col_names[0]) == 3:  # 对于三行的，直接组合
            col_names = ['_'.join(item) for item in col_names]
            table.columns = col_names
        
    # 如果列名的长度总和大于100，则删除
    if len(''.join(col_names)) > 100:
        return

    # 如果在删除字段表里则删除
    if any(re.search(keyword, ''.join(col_names)) for keyword in del_list):
        return
        
    
    return table
    


In [7]:
header_data = tables[6].columns.tolist()
header_data

['日期', '英语', '德语', '西班牙语', '葡萄牙语', '意大利语', '俄语', '中文', '日语']

In [8]:
df = tables[5]
# 将空值替换为空字符串
df.fillna('', inplace=True)
df

Unnamed: 0_level_0,语言,条目总数,典范条目,典范条目,优良条目,优良条目
Unnamed: 0_level_1,语言,条目总数,数目,比例,数目,比例
0,英语,6563859,6129,0.0934%,36784,0.5604%
1,宿务语,6125879,35,0.0006%,105,0.0017%
2,德语,2714948,2752,0.1014%,4278,0.1576%
3,瑞典语,2552394,357,0.014%,423,0.0166%
4,法语,2444756,2055,0.0841%,3731,0.1526%
5,荷兰语,2097838,366,0.0174%,不设优良条目,不设优良条目
6,俄语,1844543,1660,0.09%,4245,0.2301%
7,西班牙语,1794859,1233,0.0687%,3348,0.1865%
8,意大利语,1766749,538,0.0305%,473,0.0268%
9,埃及阿拉伯文,1596652,42,0.0026%,不设优良条目,不设优良条目


In [9]:
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Table, TableStyle
from reportlab.lib import colors
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.lib.fonts import addMapping
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.pdfbase import pdfmetrics
import pandas as pd
import random

# 设置中文字体文件路径
simsum_font_path = 'SimSun.ttf'
simhei_font_path = 'SimHei.ttf'

# 注册中文字体
pdfmetrics.registerFont(TTFont('SunFont', simsum_font_path))
pdfmetrics.registerFont(TTFont('SimHei', simhei_font_path))

addMapping('SunFont', 0, 0, 'SunFont')  # 将字体映射为编码0
addMapping('SimHei', 1, 1, 'SimHei')  # 将字体映射为编码0

# 创建 PDF 文档
doc = SimpleDocTemplate("output.pdf", pagesize=letter)

col_names = df.columns.values.tolist()


# 针对header的差异，如果是多级就转置，但级就保持不变
if  isinstance(col_names[0], tuple):
    col_names =  [[row[i] for row in col_names] for i in range(len(col_names[0]))]    
else:
    col_names = [col_names]  # shape: [1, col]
  
print(len(col_names))
len_title = len(col_names)
header_column_nums = len(col_names[0])

data = col_names + df.values.tolist()

    
# 创建表格对象
table = Table(data)

# 设置表格样式  # 坐标是先列后行，如(0， -1) 指的是第0列的所有行 
base_style = [
    
    ('TEXTCOLOR', (0, 0), (-1, -1), colors.black),  # 设置标题行的文本颜色
    ('ALIGN', (0, 0), (-1, -1), 'CENTER'),  # 居中对齐
    ('VALIGN', (0, 0), (-1, -1), 'MIDDLE'), # 上下居中对齐
    
    ('FONTNAME', (0, 0), (-1, -1), 'SunFont'),  # 设置字体
    ('FONTNAME', (0, 0), (-1, len_title - 1), 'SimHei'),  # 黑体的设置长度
]

# 所有表格都划线    
line_style_1 = [   
    ('LINEBEFORE', (0, 0), (-1, -1), 0.5, colors.black),  # 设置左侧竖线（细线）
    ('LINEAFTER', (0, 0), (-1, -1), 0.5, colors.black),  # 设置右侧竖线（细线）
    ('LINEBELOW', (0, 0), (-1, -1), 0.5, colors.black),  # 设置所有横线 (细线） 
    ('LINEABOVE', (0, 0), (-1, -1), 0.5, colors.black),  
]

#  三线表
line_style_2 = [
   ('LINEABOVE', (0, 0), (-1, len_title - 1), 1.5, colors.black),  
   ('LINEBELOW', (0, 0), (-1, len_title - 1), 1.5, colors.black),
   ('LINEBELOW', (0, -1), (-1, -1), 1.5, colors.black),
]

style = base_style + line_style_2 if random.randint(0, 1) == 0 else \
        base_style + line_style_1

table_style = TableStyle(style)


for i in range(len_title):  # 多少级header    
    for j in range(header_column_nums):  # 多少列
        if j > 0 and col_names[i][j] == col_names[i][j-1]:
            print(col_names[i][j])
            table_style.add('SPAN', (j-1, i), (j, i))  # 合并列，保留行


for i in range(len_title):  # 多少级header
    for j in range(header_column_nums):  # 多少列
        if i>0 and col_names[i][j] == col_names[i-1][j]:
            print(col_names[i][j])
            table_style.add('SPAN', (j, i-1), (j, i))  # 合并行，保留列
    

table.setStyle(table_style)

# 构建 PDF 文档内容
content = [table]

# 将内容添加到 PDF 文档中并保存
doc.build(content)


2
典范条目
优良条目
语言
条目总数


In [10]:
def add_gaussian_blur(image):
    radius = np.random.randint(5, 13) * 0.1
    noisy_image = image.filter(ImageFilter.GaussianBlur(radius))   # [0.5, 1.2]
    
    return noisy_image
    

In [11]:
# 均分分布噪声

from PIL import Image, ImageOps, ImageFilter
import numpy as np

def add_uniform_noise(image):
    """添加均匀分布噪声"""
    
    img_array = np.array(image)

    
    noise = np.random.uniform(low=0, high=180, size=img_array.shape)
    noisy_img_array = img_array + noise
    noisy_img_array = np.clip(noisy_img_array, 0, 255).astype(np.uint8)
    
    return Image.fromarray(noisy_img_array)
    

In [12]:
def crop_roi_pad(image, pad_size=5):
   
    
    # 使用argwhere找到所有值为0的元素的索引
    matrix = np.array(image)
    zero_indices = np.argwhere(matrix == 0)
    
    # 计算边界坐标
    min_row, min_col = np.min(zero_indices, axis=0)
    max_row, max_col = np.max(zero_indices, axis=0) + 1  # +1是因为索引是包含的，而裁剪需要开区间
    
    # 确保边界在原矩阵范围内
    min_row, min_col = max(0, min_row), max(0, min_col)
    max_row, max_col = min(matrix.shape[0], max_row), min(matrix.shape[1], max_col)
    
    # 根据边界裁剪矩阵
    roi = matrix[min_row:max_row, min_col:max_col]
    
    padded_roi = np.pad(roi, pad_width=((pad_size, pad_size), 
                                        (pad_size, pad_size)), mode='constant', constant_values=255)
    
    
    return Image.fromarray(padded_roi)
    

In [13]:
from pdf2image import convert_from_path
from PIL import Image, ImageFilter
import random

def pdf_to_images(pdf_path):
    # 将PDF转换为图片
    images = convert_from_path(pdf_path)
    
    if len(images) > 1:  # 如果太长，对于多模态图像识别非常不利
        return 
        
    else:
        image = images[0].convert('L')  # 转为灰度图像
        image = crop_roi_pad(image, pad_size=np.random.randint(50, 200))

        noisy_funcs = [add_uniform_noise, add_gaussian_blur]
        noisy_func = random.choice(noisy_funcs)
        noisy_image = noisy_func(image)
        
        return noisy_image

noisy_image = pdf_to_images('output.pdf')
noisy_image.save(f"output.png", "PNG")
noisy_image

PDFInfoNotInstalledError: Unable to get page count. Is poppler installed and in PATH?

0

In [2]:
from pdf2image import convert_from_path
from PIL import Image, ImageFilter
import random

def pdf_to_images(pdf_path):
    # 将PDF转换为图片
    images = convert_from_path(pdf_path)
noisy_image = pdf_to_images('./output.pdf')

PDFInfoNotInstalledError: Unable to get page count. Is poppler installed and in PATH?