In [1]:
#!pip install pdfplumber
#!pip install googletrans==4.0.0-rc1

In [2]:
import pdfplumber
import pandas as pd
import re

In [3]:
# Path to the new PDF file
#pdf_path_new = "C:/Users/sbsys20/OneDrive - Cardiff University/Protection or Harm/Extract Data From PDF/GB_T_4754－2002_2-2.pdf"
pdf_path_new = "C:/Users/sbsys20/OneDrive - Cardiff University/Protection or Harm/Extract Data From PDF/GB_T_4754-2002.pdf"

#pdf_path_new = "C:/Users/sbsys20/OneDrive - Cardiff University/Protection or Harm/Extract Data From PDF/GB_T 4754－2002_18-18.pdf"

In [4]:
output_path= "C:/Users/sbsys20/OneDrive - Cardiff University/Protection or Harm/Extract Data From PDF"

In [5]:
# Initialize lists for the structured data
structured_data = []

# Define a threshold for spacing (manually set or dynamically calculated based on data)
SPACE_THRESHOLD = 1  # Adjust based on the spacing observed in your PDF

# Open the PDF and process character positions
with pdfplumber.open(pdf_path_new) as pdf:
    for page in pdf.pages:
        # Extract character-level data
        chars = page.chars
        rows = {}
        margin = 3  # Tolerance for row alignment

        # Group characters by their 'top' position (rows)
        for char in chars:
            row_key = round(char["top"], margin)  # Group characters by approximate row position
            if row_key not in rows:
                rows[row_key] = []
            rows[row_key].append(char)

        # Sort rows by 'top' value
        sorted_rows = sorted(rows.items(), key=lambda x: x[0])

        # Process each row
        for _, row_chars in sorted_rows:
            # Sort characters in a row by their x0 (horizontal position)
            sorted_chars = sorted(row_chars, key=lambda c: c["x0"])
            
            # Find gaps between characters
            row_text = []
            current_column = ""
            previous_x = None

            for char in sorted_chars:
                if previous_x is not None and (char["x0"] - previous_x > SPACE_THRESHOLD):
                    # Start a new column if gap exceeds threshold
                    row_text.append(current_column.strip())
                    current_column = ""
                current_column += char["text"]
                previous_x = char["x1"]  # Update to the end of the current character
            
            # Append the last column
            row_text.append(current_column.strip())
            
            #Join columns with a delimiter for clarity
            structured_data.append("*".join(row_text))



In [6]:
structured_data = structured_data[26:]


In [7]:
 #Define elements to exclude
exclude_elements = {"代码",  '类   别   名   称*说明', '门类*大类*中类*小类'}

# Regular expression for elements like '第 * 页 共 47 页'
exclude_regex = r'第\s*\d+\s*页\s*共\s*\d+\s*页'

# Clean up list: Remove leading `|` or `||` and filter unwanted rows
filtered_data_list = [
    row.lstrip('*') for row in structured_data  # Remove leading | or ||
    if row.lstrip('*') not in exclude_elements  # Exclude specific elements
    and not row.strip("*").strip() == ""  # Exclude empty rows
    and not re.match(exclude_regex, row)  # Exclude rows matching the regex
]


In [8]:
filtered_data_list

['A****农、林、牧、渔业*本门类包括*01-05*大类。',
 '01***农业*指对各种农作物的种植活动。',
 '011**谷物及其他作物的种植*',
 '0111 谷物的种植*指以收获籽实为主，供人类食用的农作物的种植，如稻谷、',
 '小麦、玉米等农作物的种植。',
 '0112 薯类的种植*',
 '0113 油料的种植*',
 '0114 豆类的种植*',
 '0115 棉花的种植*',
 '0116 麻类的种植*',
 '0117 糖料的种植*',
 '0118 烟草的种植*',
 '0119 其他作物的种植*',
 '012**蔬菜、园艺作物的种植*',
 '0121 蔬菜的种植*',
 '0122 花卉的种植*',
 '0123 其他园艺作物的种植*',
 '013**水果、坚果、饮料和香料作物的种植',
 '0131 水果、坚果的种植*',
 '0132 茶及其他饮料作物的种植*',
 '0133 香料作物的种植*',
 '014*0140 中药材的种植*指主要用于中药配制以及中成药加工的药材作物的种植。',
 '02***林业*',
 '021**林木的培育和种植*',
 '0211 育种和育苗*',
 '0212 造林*指在荒山、荒地、沙丘和退耕地等一切可以造林的土地上进',
 '行的林木和竹子的种植活动和恢复森林的活动。',
 '0213 林木的抚育和管理*指为促进林木生长发育，在林木生长的不同时期进行的促进',
 '林木生长发育的措施活动。',
 '022**木材和竹材的采运*指对林木和竹木的采伐，并将其运出山场至贮木场的生产活',
 '动。',
 '0221 木材的采运*',
 '0222 竹材的采运*',
 '023*0230 林产品的采集*指在天然森林和人工林地进行的各种林木产品和其他野生植',
 '物的采集等活动。',
 '03***畜牧业*指为了获得各种畜禽产品而从事的动物饲养活动。',
 '031*0310 牲畜的饲养*指对牛、羊、马、驴、骡、骆驼等主要牲畜的饲养。',
 '032*0320 猪的饲养*',
 '033*0330 家禽的饲养*',
 '034*0340 狩猎和捕捉动物*指对各种野生动物的捕捉以及与此相关的活动。',
 '039*0390 其他畜牧业*',
 '04***渔业*',
 '041**海洋渔业*

In [9]:


# Initialize a new list for cleaned data
merged_data_list = []

# Iterate through the list and merge rows when conditions are met
i = 0
while i < len(filtered_data_list):
    current_element = filtered_data_list[i].strip()  # Remove extra spaces
    
    # Debugging: Print the raw element
    print(f"Processing: {repr(current_element)}")
    
    # If the current element starts with a number or a capital Latin character, process it as a new "main" element
    if re.match(r'^[\dA-Z]', current_element):
        print(f"Matched current: {repr(current_element)}")
        
        # Check the next element
        while i + 1 < len(filtered_data_list):
            next_element = filtered_data_list[i + 1].strip()  # Remove extra spaces
            print(f"Next element: {repr(next_element)}")
            
            # Only process the next element if it starts with a Chinese character
            if re.match(r'^[\s]*[\u4e00-\u9fff]', next_element):
                # Check if the next element contains a separator *
                if "*" in next_element:
                    # Split into parts
                    parts = next_element.split("*", 1)  # Split at the first occurrence of *
                    # Insert the first part before the last separator in current_element
                    if "*" in current_element:
                        current_element = current_element.rsplit("*", 1)[0]  + parts[0] + "*" + current_element.rsplit("*", 1)[1]
                    else:
                        current_element += " " + parts[0]
                    # Append the second part to the end
                    if len(parts) > 1:
                        current_element += " " + parts[1]
                    print(f"Updated current with separator logic: {repr(current_element)}")
                    i += 1  # Move to the next element
                else:
                    # If no separator, append the entire next_element to the end
                    current_element += " " + next_element
                    print(f"Appended next line with Chinese characters: {repr(current_element)}")
                    i += 1  # Move to the next element
            else:
                break  # Stop appending when the next element does not match the criteria

        # Append the cleaned or merged element
        merged_data_list.append(current_element)
    
    # If the current element does not match the criteria, skip to the next
    i += 1

# Print the final merged list
#print("Final Merged Data List:")
#for row in merged_data_list:
#    print(row)


Processing: 'A****农、林、牧、渔业*本门类包括*01-05*大类。'
Matched current: 'A****农、林、牧、渔业*本门类包括*01-05*大类。'
Next element: '01***农业*指对各种农作物的种植活动。'
Processing: '01***农业*指对各种农作物的种植活动。'
Matched current: '01***农业*指对各种农作物的种植活动。'
Next element: '011**谷物及其他作物的种植*'
Processing: '011**谷物及其他作物的种植*'
Matched current: '011**谷物及其他作物的种植*'
Next element: '0111 谷物的种植*指以收获籽实为主，供人类食用的农作物的种植，如稻谷、'
Processing: '0111 谷物的种植*指以收获籽实为主，供人类食用的农作物的种植，如稻谷、'
Matched current: '0111 谷物的种植*指以收获籽实为主，供人类食用的农作物的种植，如稻谷、'
Next element: '小麦、玉米等农作物的种植。'
Appended next line with Chinese characters: '0111 谷物的种植*指以收获籽实为主，供人类食用的农作物的种植，如稻谷、 小麦、玉米等农作物的种植。'
Next element: '0112 薯类的种植*'
Processing: '0112 薯类的种植*'
Matched current: '0112 薯类的种植*'
Next element: '0113 油料的种植*'
Processing: '0113 油料的种植*'
Matched current: '0113 油料的种植*'
Next element: '0114 豆类的种植*'
Processing: '0114 豆类的种植*'
Matched current: '0114 豆类的种植*'
Next element: '0115 棉花的种植*'
Processing: '0115 棉花的种植*'
Matched current: '0115 棉花的种植*'
Next element: '0116 麻类的种植*'
Processing: '0116 麻类的种植*'
Matched

In [10]:
# Add `|` separator between numbers and characters when separated by a space
updated_data_list = [
    re.sub(r'(\d+)\s+([\u4e00-\u9fff])', r'\1*\2', row) for row in merged_data_list
]

# Add `|||` separator to the start of elements starting with exactly 4 digits
updated_data_list2 = [
    re.sub(r'^(?=\d{4})', '***', row) for row in updated_data_list
]

# Add `||` separator to the start of elements starting with exactly 4 digits
updated_data_list3 = [
    re.sub(r'^(?=\d{3})', '**', row) for row in updated_data_list2
]

# Add `||` separator to the start of elements starting with exactly 4 digits
updated_data_list4 = [
    re.sub(r'^(?=\d{2})', '*', row) for row in updated_data_list3
]


In [11]:
updated_data_list4

['A****农、林、牧、渔业*本门类包括*01-05*大类。',
 '*01***农业*指对各种农作物的种植活动。',
 '**011**谷物及其他作物的种植*',
 '***0111*谷物的种植*指以收获籽实为主，供人类食用的农作物的种植，如稻谷、 小麦、玉米等农作物的种植。',
 '***0112*薯类的种植*',
 '***0113*油料的种植*',
 '***0114*豆类的种植*',
 '***0115*棉花的种植*',
 '***0116*麻类的种植*',
 '***0117*糖料的种植*',
 '***0118*烟草的种植*',
 '***0119*其他作物的种植*',
 '**012**蔬菜、园艺作物的种植*',
 '***0121*蔬菜的种植*',
 '***0122*花卉的种植*',
 '***0123*其他园艺作物的种植*',
 '**013**水果、坚果、饮料和香料作物的种植',
 '***0131*水果、坚果的种植*',
 '***0132*茶及其他饮料作物的种植*',
 '***0133*香料作物的种植*',
 '**014*0140*中药材的种植*指主要用于中药配制以及中成药加工的药材作物的种植。',
 '*02***林业*',
 '**021**林木的培育和种植*',
 '***0211*育种和育苗*',
 '***0212*造林*指在荒山、荒地、沙丘和退耕地等一切可以造林的土地上进 行的林木和竹子的种植活动和恢复森林的活动。',
 '***0213*林木的抚育和管理*指为促进林木生长发育，在林木生长的不同时期进行的促进 林木生长发育的措施活动。',
 '**022**木材和竹材的采运*指对林木和竹木的采伐，并将其运出山场至贮木场的生产活 动。',
 '***0221*木材的采运*',
 '***0222*竹材的采运*',
 '**023*0230*林产品的采集*指在天然森林和人工林地进行的各种林木产品和其他野生植 物的采集等活动。',
 '*03***畜牧业*指为了获得各种畜禽产品而从事的动物饲养活动。',
 '**031*0310*牲畜的饲养*指对牛、羊、马、驴、骡、骆驼等主要牲畜的饲养。',
 '**032*0320*猪的饲养*',
 '**033*0330*家禽的饲养*',
 '**034*0340*狩猎和捕捉动物*指对各种

In [12]:
pattern = r'\*(\d+[A-Z])\*'

# Replace elements matching the pattern
updated_data_list4 = [
    re.sub(pattern, r' *1 ', element) for element in updated_data_list4
]

In [13]:
# Corrected pattern
pattern = r'\*(\d+)\*和\*(\d+)\*'

# Replace the matched pattern with a modified format
updated_data_list4 = [
    re.sub(pattern, r' \1 和 \2 ', element) for element in updated_data_list4
]

In [14]:
# Pattern to match `|` followed by an arbitrary number (including decimals) and `％`
pattern = r'\*(\d+(\.\d+)?％)'

# Replace occurrences of the pattern
updated_data_list4 = [
    re.sub(pattern, r'\1', element) for element in updated_data_list4
]

In [15]:
# Pattern to match `|` followed by an arbitrary number (including decimals) and `％`
pattern = r'\*(\d+(\.\d+))\*'

# Replace occurrences of the pattern
updated_data_list4 = [
    re.sub(pattern, r' \1 ', element) for element in updated_data_list4
]


In [16]:
# Pattern to match a Chinese character, followed by `|number|`, followed by another Chinese character
pattern = r'([\u4e00-\u9fff])\*(\d+(\.\d+)?)\*([\u4e00-\u9fff])'

# Replace occurrences of the pattern
updated_data_list4 = [
    re.sub(pattern, r'\1 \2 \4', element) for element in updated_data_list4
]


In [17]:
# Regex pattern for |number—number|
pattern = r'\*(\d+—\d+)\*'
print(updated_data_list4[0])
# Replace elements matching the pattern
updated_data_list4 = [
    re.sub(pattern, r' \1 ', element) for element in updated_data_list4
]

updated_data_list4[0] = re.sub(r'\*01-05\*', r' 01-05 ', updated_data_list4[0])

updated_data_list4[479] = re.sub(r'\*350kPa\*', r' 350kPa ', updated_data_list4[0])


A****农、林、牧、渔业*本门类包括*01-05*大类。


In [18]:
updated_data_list4[477]='||354||泵、阀门、压缩机及类似机械的制造 |指泵、真空设备、压缩机，液压和气压动力机械及类似机械 和阀门的制造。'

updated_data_list4[867]='||637||机械设备、五金交电及电子产品批发  |指通用机械、专用设备、交通运输设备、电气机械、五金交 电、家用电器、计算机设备、通讯设备、电子产品、仪器仪表及办公用机械的批发和进出口活动。'

updated_data_list4[912]='||656||汽车、摩托车、燃料及零配件专门 零售  |指专门经营汽车、摩托车、汽车部件、汽车零配件及燃料的零售活动。' 

updated_data_list4[922]='||658||五金、家具及室内装修材料专门零售|指五金用品、家具和装修材料零售店的销售活动，以及在家 具、家居装修、建材城（中心）及展销会上设摊位的销售活动。' 

In [19]:
# Split each row into columns based on `|` and store in a DataFrame
df = pd.DataFrame([row.split('*') for row in updated_data_list4])
df

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,A,,,,农、林、牧、渔业,本门类包括 01-05 大类。,,,
1,,01,,,农业,指对各种农作物的种植活动。,,,
2,,,011,,谷物及其他作物的种植,,,,
3,,,,0111,谷物的种植,指以收获籽实为主，供人类食用的农作物的种植，如稻谷、 小麦、玉米等农作物的种植。,,,
4,,,,0112,薯类的种植,,,,
...,...,...,...,...,...,...,...,...,...
1195,,,971,9710,社区自治组织,指城市、镇的居民通过选举产生的群众性自治组织的管理活 动。,,,
1196,,,972,9720,村民自治组织,指农村村民通过选举产生的群众性自治组织的管理活动。,,,
1197,T,,,,国际组织,,,,
1198,,98,,,国际组织,,,,


In [20]:
# Concatenate column 5 and column 6, separated by a space
df[5] = df[5].fillna('') + ' ' + df[6].fillna('') + ' ' + df[7].fillna('') + ' ' + df[8].fillna('')

# Strip any trailing spaces (in case column 6 was empty or None)

df = df.drop(columns=[6, 7, 8])
df

Unnamed: 0,0,1,2,3,4,5
0,A,,,,农、林、牧、渔业,本门类包括 01-05 大类。
1,,01,,,农业,指对各种农作物的种植活动。
2,,,011,,谷物及其他作物的种植,
3,,,,0111,谷物的种植,指以收获籽实为主，供人类食用的农作物的种植，如稻谷、 小麦、玉米等农作物的种植。
4,,,,0112,薯类的种植,
...,...,...,...,...,...,...
1195,,,971,9710,社区自治组织,指城市、镇的居民通过选举产生的群众性自治组织的管理活 动。
1196,,,972,9720,村民自治组织,指农村村民通过选举产生的群众性自治组织的管理活动。
1197,T,,,,国际组织,
1198,,98,,,国际组织,


In [21]:
filtered_df = df[df.iloc[:, 6:8].notnull().any(axis=1)]
filtered_df

Unnamed: 0,0,1,2,3,4,5


In [22]:
from googletrans import Translator

# Initialize translator
translator = Translator()

# Translate and store results
data_english = []
for element in structured_data:
    try:
        translated = translator.translate(element, src="zh-cn", dest="en")
        data_english.append(translated.text)
    except Exception as e:
        data_english.append(f"Error: {e}")

# Combine into a DataFrame for storage or saving
df_translations = pd.DataFrame({
    "Original": structured_data,
    "Translated": data_english
})

In [23]:
# Save to an Excel file
output_path = "C:/Users/sbsys20/OneDrive - Cardiff University/Protection or Harm/Extract Data From PDF/translationsFull.xlsx"
df_translations.to_excel(output_path, index=False)

# Display the DataFrame
print(df_translations)

                                      Original  \
0                A****农、林、牧、渔业*本门类包括*01-05*大类。   
1                       *01***农业*指对各种农作物的种植活动。   
2                           **011**谷物及其他作物的种植*   
3     ***0111 谷物的种植*指以收获籽实为主，供人类食用的农作物的种植，如稻谷、   
4                                小麦、玉米等农作物的种植。   
...                                        ...   
2045                                T****国际组织*   
2046                               *98***国际组织*   
2047   **980*9800 国际组织*指联合国和其他国际组织驻我国境内的机构的活动。   
2048                                             
2049                             第 47 页 共 47 页   

                                             Translated  
0     A **** Agriculture, forestry, pastoral, fisher...  
1     *01 *** Agriculture*refers to the planting act...  
2             ** 011 ** Grain and other crops planting*  
3     *** 0111 The planting of the grains*refers to ...  
4                   Planting crops such as wheat, corn.  
...                                                

In [25]:
df_translations

Unnamed: 0,Original,Translated
0,A****农、林、牧、渔业*本门类包括*01-05*大类。,"A **** Agriculture, forestry, pastoral, fisher..."
1,*01***农业*指对各种农作物的种植活动。,*01 *** Agriculture*refers to the planting act...
2,**011**谷物及其他作物的种植*,** 011 ** Grain and other crops planting*
3,***0111 谷物的种植*指以收获籽实为主，供人类食用的农作物的种植，如稻谷、,*** 0111 The planting of the grains*refers to ...
4,小麦、玉米等农作物的种植。,"Planting crops such as wheat, corn."
...,...,...
2045,T****国际组织*,T **** International Organization*
2046,*98***国际组织*,*98 *** International Organization*
2047,**980*9800 国际组织*指联合国和其他国际组织驻我国境内的机构的活动。,** 980*9800 International Organization*refers ...
2048,,"Error: the JSON object must be str, bytes or b..."
