# 提取投诉（complaints）数据


In [9]:
import pandas as pd
from io import StringIO

def extract_complaints_data(text, output_path='complaints.csv'):
    """Extract the complaint columns from CSV text and optionally save them.

    Args:
        text: CSV-formatted text. It must contain the columns
            ``complaint_id``, ``complaints`` and ``complaints_type``.
        output_path: Where the extracted columns are written as CSV.
            Defaults to ``'complaints.csv'`` (resolved against the current
            working directory). Pass ``None`` to skip writing a file.

    Returns:
        A DataFrame with the columns ``complaint_id``, ``complaint`` and
        ``complaints_type``.

    Raises:
        KeyError: If any of the required columns is missing from ``text``.
    """
    # Parse the CSV-formatted text directly with pandas.
    df = pd.read_csv(StringIO(text))

    # Keep only the columns we need.
    # Note: the source column is named `complaints_type`, not `complaint_type`.
    df_filtered = df[['complaint_id', 'complaints', 'complaints_type']]

    # Only `complaints` actually changes name (to the singular `complaint`);
    # the other columns keep their original names.
    df_filtered = df_filtered.rename(columns={'complaints': 'complaint'})

    # Persist the result unless the caller opted out.
    if output_path is not None:
        df_filtered.to_csv(output_path, index=False, encoding='utf-8')

    return df_filtered

# Usage example: read the raw CSV text first, then extract the complaint
# columns once the file handle has been closed.
with open('/Users/niwenyu/Desktop/OCR_PDF_EXTRACT/model_BERT/test_1.csv', 'r', encoding='utf-8') as file:
    text = file.read()

df = extract_complaints_data(text)
print(df.head())  # preview the first few rows

   complaint_id                                          complaint  \
0         24001  The advert shows a Kiwi crossing a pedestrian ...   
1         24002  The Complainant was concerned the alcohol adve...   
2         24003  The Complainant was concerned the alcohol adve...   
3         24004  The Complainant was concerned the alcohol adve...   
4         24005  The Complainant was concerned the alcohol adve...   

  complaints_type  
0         Unknown  
1         Unknown  
2         Unknown  
3         Unknown  
4         Unknown  


# 提取广告（adver）数据

In [5]:
import pandas as pd
from io import StringIO
import numpy as np

def extract_adver_data(text, output_path='advers.csv'):
    """Extract the advert columns from CSV text and optionally save them.

    Infinite values in ``adver_id`` are replaced with NaN. Rows whose
    ``adver_id`` is NaN are kept, so the column may come back as float.

    Args:
        text: CSV-formatted text. It must contain the columns ``adver_id``
            and ``adver``.
        output_path: Where the extracted columns are written as CSV.
            Defaults to ``'advers.csv'`` (resolved against the current
            working directory). Pass ``None`` to skip writing a file.

    Returns:
        A DataFrame with the columns ``adver_id`` and ``adver``.

    Raises:
        KeyError: If ``adver_id`` or ``adver`` is missing from ``text``.
    """
    # Parse the CSV-formatted text directly with pandas.
    df = pd.read_csv(StringIO(text))

    # Treat +/- infinity in the id column as missing values.
    df['adver_id'] = df['adver_id'].replace([np.inf, -np.inf], np.nan)

    # Keep only the columns we need. The explicit copy hands the caller an
    # independent frame (the original no-op rename did the same implicitly).
    df_filtered = df[['adver_id', 'adver']].copy()

    # Persist the result unless the caller opted out.
    if output_path is not None:
        df_filtered.to_csv(output_path, index=False, encoding='utf-8')

    return df_filtered

# Usage example: read the raw CSV text first, then extract the advert
# columns once the file handle has been closed.
with open('/Users/niwenyu/Desktop/OCR_PDF_EXTRACT/model_BERT/test_1.csv', 'r', encoding='utf-8') as file:
    text = file.read()

df = extract_adver_data(text)
print(df.head())  # preview the first few rows

   adver_id                                              adver
0       NaN                                                NaN
1   24001.0  The Asahi television advertisement for the Lon...
2   24002.0  The Lion television advertisement for Stella A...
3   24003.0  The Lion television advertisement for Mac’s be...
4   24004.0  The Lion television advertisement  for Steinla...


In [7]:
import pandas as pd
import numpy as np
from io import StringIO

# Load the previously extracted advert CSV.
df = pd.read_csv('/Users/niwenyu/Desktop/OCR_PDF_EXTRACT/model_BERT/advers.csv')

# Clean the adver_id column in one chained pass:
#   1. coerce blanks and non-numeric values to NaN,
#   2. drop the rows whose adver_id is NaN,
#   3. cast the remaining ids to integers.
df = (
    df.assign(adver_id=pd.to_numeric(df['adver_id'], errors='coerce'))
    .dropna(subset=['adver_id'])
    .astype({'adver_id': int})
)

# 4. Save the cleaned result.
df.to_csv('processed_data.csv', index=False)

# Inspect the outcome: dtype of the id column and the first few rows.
print("数据类型:", df['adver_id'].dtype)
print("\n前几行数据:")
print(df[['adver_id', 'adver']].head())

数据类型: int64

前几行数据:
   adver_id                                              adver
1     24001  The Asahi television advertisement for the Lon...
2     24002  The Lion television advertisement for Stella A...
3     24003  The Lion television advertisement for Mac’s be...
4     24004  The Lion television advertisement  for Steinla...
5     24005  The Yoobee College of Creative Innovation Inst...
