In [2]:
import re
import pandas as pd

def parse_asa_decision(file_path):
    """Parse one simple-layout ASA decision text file into a flat dict.

    Parameters:
    file_path (str or os.PathLike): Path of the decision text file. The file
        name must end in ``<digits>.txt``; those digits become ``decision_id``.

    Returns:
    dict: Keys ``decision_id``, ``complaint_number``, ``advertiser``,
        ``advertisement``, ``date_of_decision``, ``outcome``, ``complaint``,
        ``ruling`` and ``appeal_process``. A section that cannot be located
        is stored as an empty string.

    Raises:
    ValueError: If the file name does not end in ``<digits>.txt``.
    """
    # Validate the file name up front so a bad path fails with a clear
    # message instead of an AttributeError on a ``None`` match.
    path_text = str(file_path)
    id_match = re.search(r'(\d+)\.txt$', path_text)
    if id_match is None:
        raise ValueError(
            f"Cannot derive decision_id: {path_text!r} does not end in '<digits>.txt'"
        )

    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    def first_group(pattern):
        """Return the stripped first capture group of ``pattern``, or ''."""
        match = re.search(pattern, content, re.DOTALL)
        return match.group(1).strip() if match else ''

    def one_line(text):
        """Join the non-blank lines of ``text`` with single spaces."""
        return ' '.join(line.strip() for line in text.split('\n') if line.strip())

    # Header fields: the value runs from the label up to the next line that
    # starts with a word character.
    header_patterns = {
        'complaint_number': r'COMPLAINT NUMBER\s*(.*?)(?=\n\w)',
        'advertiser': r'ADVERTISER\s*(.*?)(?=\n\w)',
        'advertisement': r'ADVERTISEMENT\s*(.*?)(?=\n\w)',
        'date_of_decision': r'DATE OF DECISION\s*(.*?)(?=\n\w)',
        'outcome': r'OUTCOME\s*(.*?)(?=\n\w)',
    }

    data = {'decision_id': id_match.group(1)}
    for field, pattern in header_patterns.items():
        data[field] = first_group(pattern)

    # Multi-line sections are flattened to a single line each.
    data['complaint'] = one_line(first_group(r'Complaint:\s*(.*?)(?=\nRuling)'))
    data['ruling'] = one_line(
        first_group(r'Ruling\s*(.*?)(?=\nMore Information|Appeal Process)')
    )
    data['appeal_process'] = one_line(first_group(r'Appeal Process\s*(.*?)$'))

    return data

# Parse a single decision file and export it as a one-row CSV
file_path = '/Users/niwenyu/Desktop/OCR_PDF_EXTRACT/work/output/24/24006.txt'
output_path = '/Users/niwenyu/Desktop/OCR_PDF_EXTRACT/work/parsed_decision.csv'

# Column order of the exported CSV
columns = [
    'decision_id', 'complaint_number', 'advertiser', 'advertisement',
    'date_of_decision', 'outcome', 'complaint', 'ruling', 'appeal_process',
]

# Parse the file into a flat dict of fields
data = parse_asa_decision(file_path)

# One-row frame, restricted to and ordered by the columns above
df = pd.DataFrame([data]).loc[:, columns]

# Write the CSV without the index column and report where it went
df.to_csv(output_path, index=False)
print(f"文件已保存到: {output_path}")

文件已保存到: /Users/niwenyu/Desktop/OCR_PDF_EXTRACT/work/parsed_decision.csv


# 结构复杂的投诉（Complaints Board 决定格式）的解析

In [4]:
import re
import pandas as pd

def parse_asa_decision(file_path):
    """Parse one Complaints Board-layout ASA decision text file into a flat dict.

    Header fields and sections are located with regular expressions and
    cleaned via ``extract_field`` / ``clean_text``.

    Parameters:
    file_path (str or os.PathLike): Path of the decision text file. The file
        name must end in ``<digits>.txt``; those digits become ``decision_id``.

    Returns:
    dict: Keys ``decision_id``, ``complaint_number``, ``advertiser``,
        ``advertisement``, ``date_of_decision``, ``outcome``, ``summary``,
        ``complaint``, ``ruling`` and ``appeal_process``. A section that
        cannot be located is stored as an empty string.

    Raises:
    ValueError: If the file name does not end in ``<digits>.txt``.
    """
    # Validate the file name up front so a bad path fails with a clear
    # message instead of an AttributeError on a ``None`` match.
    path_text = str(file_path)
    id_match = re.search(r'(\d+)\.txt$', path_text)
    if id_match is None:
        raise ValueError(
            f"Cannot derive decision_id: {path_text!r} does not end in '<digits>.txt'"
        )

    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # Header fields
    data = {
        'decision_id': id_match.group(1),
        'complaint_number': extract_field(content, r'COMPLAINT NUMBER\s*(.*?)(?=\n\w)'),
        'advertiser': extract_field(content, r'ADVERTISER\s*(.*?)(?=\n\w)'),
        'advertisement': extract_field(content, r'ADVERTISEMENT\s*(.*?)(?=\n\w)'),
        'date_of_decision': extract_field(content, r'DATE OF (?:DECISION|MEETING)\s*(.*?)(?=\n\w)'),
        'outcome': extract_field(content, r'OUTCOME\s*(.*?)(?=\n\w)')
    }

    # Summary section
    data['summary'] = extract_field(
        content, r'Summary of the Complaints Board Decision\s*(.*?)(?=\n\n\w)'
    )

    # Complaint section. "Summary of the Complaint" is a prefix of the
    # "Summary of the Complaints Board Decision" heading, so that heading is
    # excluded explicitly; otherwise the search would anchor on it.
    complaint_match = re.search(
        r'Summary of the Complaint(?!s Board)\s*(.*?)(?=\n\nA copy of|Issues Raised)',
        content,
        re.DOTALL,
    )
    if complaint_match is None:
        # Fallback: a stand-alone COMPLAINT heading followed by an Appendix.
        # The word boundary and lookahead skip "COMPLAINTS ..." and the
        # "COMPLAINT NUMBER" header field.
        complaint_match = re.search(
            r'COMPLAINT\b(?!\s+NUMBER)\s*(.*?)(?=\n\nAppendix)', content, re.DOTALL
        )
    data['complaint'] = clean_text(complaint_match.group(1)) if complaint_match else ''

    # Ruling / discussion section
    data['ruling'] = extract_field(
        content, r'Complaints Board Discussion\s*(.*?)(?=\n\nOutcome|APPEAL INFORMATION)'
    )

    # Appeal information
    data['appeal_process'] = extract_field(
        content, r'APPEAL (?:PROCESS|INFORMATION)\s*(.*?)(?=\n\nAPPENDICES|\Z)'
    )

    return data

def extract_field(content, pattern):
    """Return the cleaned first capture group of ``pattern`` found in ``content``.

    The search runs with ``re.DOTALL``. An empty string is returned when the
    pattern does not match.
    """
    found = re.search(pattern, content, flags=re.DOTALL)
    if found is None:
        return ''
    return clean_text(found.group(1))

def clean_text(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Leading and trailing whitespace is removed. Line breaks count as
    whitespace, so the result is always a single line.
    """
    # One pass is enough: once every whitespace run (newlines included) has
    # become a single space, there are no blank lines left to remove.
    return re.sub(r'\s+', ' ', text.strip())

# Entry point
def main(input_file, output_file):
    """Parse one decision file, save it as a one-row CSV and return the frame.

    Parameters:
    input_file (str): Path of the decision text file to parse
    output_file (str): Path of the CSV file to write

    Returns:
    pandas.DataFrame: The one-row frame that was written
    """
    # Column order of the exported CSV
    column_order = [
        'decision_id', 'complaint_number', 'advertiser', 'advertisement',
        'date_of_decision', 'outcome', 'summary', 'complaint', 'ruling',
        'appeal_process',
    ]

    # Parse, then keep exactly the columns above in that order
    record = parse_asa_decision(input_file)
    frame = pd.DataFrame([record]).loc[:, column_order]

    # Write without the index column and report the destination
    frame.to_csv(output_file, index=False)
    print(f"文件已保存到: {output_file}")
    return frame

# Example usage: parse one decision file and keep the resulting frame in `df`
if __name__ == "__main__":
    input_file = '/Users/niwenyu/Desktop/OCR_PDF_EXTRACT/work/output/24/24005.txt'  # replace with the actual input file path
    output_file = 'parsed_decision_test.csv'
    df = main(input_file, output_file)

文件已保存到: parsed_decision_test.csv


# 针对24年的data处理

In [1]:
import pandas as pd
import csv

def excel_to_csv(excel_file, csv_file):
    """
    Convert an Excel file to a fully quoted UTF-8 CSV file.

    Every field is quoted (``csv.QUOTE_ALL``) so commas inside text cells
    cannot be mistaken for delimiters. Missing cells are written as empty
    strings rather than the literal text 'nan' / 'None' / 'NaT'.

    Parameters:
    excel_file (str): Path of the Excel file to read
    csv_file (str): Path of the CSV file to write
    """
    # Read the Excel file
    df = pd.read_excel(excel_file)

    with open(csv_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f, quoting=csv.QUOTE_ALL)

        # Header row
        writer.writerow(df.columns)

        # Data rows. itertuples keeps each column's own dtype, whereas
        # iterrows builds one Series per row and can turn integers into
        # floats (24005 -> '24005.0') when the row is all numeric.
        for row in df.itertuples(index=False, name=None):
            writer.writerow(['' if pd.isna(value) else str(value) for value in row])

# Example usage: convert the 2024 decisions workbook to output.csv in the working directory
if __name__ == "__main__":
    excel_to_csv("/Users/niwenyu/Desktop/OCR_PDF_EXTRACT/model_BERT/24_decision.xlsx", "output.csv")

# Feature engineering
## 第一列修改


In [7]:
import pandas as pd

def process_complaint_numbers(df):
    """
    Convert the first column from values such as '24005.txt' to the integer 24005.

    The first run of digits in each value is kept. Values are converted to
    text before the digits are extracted, so a first column that is already
    numeric is accepted and re-running the step is harmless.

    Parameters:
    df (pandas.DataFrame): Frame whose first column holds the identifiers

    Returns:
    pandas.DataFrame: A copy of ``df`` with an integer first column; ``df``
        itself is not modified

    Raises:
    ValueError: If any value in the first column contains no digits
    """
    # Work on a copy so the caller's frame is left untouched
    result = df.copy()
    first_col = result.columns[0]

    # expand=False returns a Series (the default returns a one-column frame)
    as_text = result.iloc[:, 0].astype(str)
    digits = as_text.str.extract(r'(\d+)', expand=False)

    # Fail with the offending values instead of an opaque NaN-to-int cast error
    no_digits = digits.isna()
    if no_digits.any():
        examples = as_text[no_digits].head(5).tolist()
        raise ValueError(
            f"{int(no_digits.sum())} value(s) in column {first_col!r} contain no digits, "
            f"e.g. {examples}"
        )

    # Replace the column wholesale so it ends up with an integer dtype
    result[first_col] = digits.astype(int)

    return result

def main(input_file, output_file):
    """
    Read a CSV file, run ``process_complaint_numbers`` on it and save the result.

    Parameters:
    input_file (str): Path of the CSV file to read
    output_file (str): Path of the CSV file to write
    """
    # Load, convert, then write back without the index column
    source = pd.read_csv(input_file)
    converted = process_complaint_numbers(source)
    converted.to_csv(output_file, index=False)

    print(f"数据处理完成！结果已保存到 {output_file}")

# Example usage
if __name__ == "__main__":
    input_file = "/Users/niwenyu/Desktop/OCR_PDF_EXTRACT/model_BERT/24_decision.csv"   # replace with your input file path
    output_file = "1.csv" # replace with the output path you want to save to
    main(input_file, output_file)

数据处理完成！结果已保存到 1.csv


In [12]:
import pandas as pd
import numpy as np
from datetime import datetime
from enum import Enum

# Enum definitions
class ComplaintType(Enum):
    """Labels for the ``complaints_type`` column; ``convert_to_enum`` falls back to UNKNOWN."""
    UNKNOWN = "Unknown"
    # Placeholder members; concrete categories can be added later
    TYPE_A = "Type A"
    TYPE_B = "Type B"
    TYPE_C = "Type C"

class AdvertType(Enum):
    """Labels for the ``adver_type`` column; ``convert_to_enum`` falls back to UNKNOWN."""
    UNKNOWN = "Unknown"
    # Placeholder members; concrete categories can be added later
    TYPE_1 = "Type 1"
    TYPE_2 = "Type 2"
    TYPE_3 = "Type 3"

def convert_to_enum(value, enum_class):
    """Map a raw cell value onto the value of a member of ``enum_class``.

    Matching is case-insensitive and tried in two steps: first against the
    member names (``"type_a"`` -> ``TYPE_A``), then against the member values
    (``"Type A"`` -> ``TYPE_A``).

    Parameters:
    value: Raw cell value; missing values (None / NaN) are accepted
    enum_class (Enum subclass): Enum with an ``UNKNOWN`` member

    Returns:
    The ``.value`` of the matched member, or ``enum_class.UNKNOWN.value``
    when ``value`` is missing or matches no member.
    """
    unknown = enum_class.UNKNOWN.value
    if pd.isna(value):
        return unknown

    text = str(value).strip()

    # 1) Match on the member name, e.g. "type_a" -> TYPE_A
    member = enum_class.__members__.get(text.upper())
    if member is not None:
        return member.value

    # 2) Match on the member value, e.g. "Type A" -> TYPE_A
    folded = text.casefold()
    for candidate in enum_class:
        if str(candidate.value).casefold() == folded:
            return candidate.value

    return unknown

def clean_and_convert_data(df):
    """
    Clean the decisions table and normalise its column types.

    Steps: replace commas in the free-text columns with '|', cast the id
    columns to nullable integers, force the header columns to strings, parse
    ``date_of_meeting``, map the type columns onto enum values and recompute
    the text-length columns.

    Parameters:
    df (pandas.DataFrame): Input frame. Must contain ``decision_id``,
        ``adver_id``, ``complaint_number``, ``advertiser``, ``advertisement``,
        ``date_of_meeting``, ``complaints`` and ``adver``. The columns
        ``complaints_type``, ``adver_type`` and ``len(adver)`` are optional.

    Returns:
    pandas.DataFrame: The cleaned copy; ``df`` itself is not modified.
    """
    # Work on a copy so the caller's frame is left untouched
    df_clean = df.copy()

    # Step 1: replace commas in the free-text columns
    def escape_commas(text):
        """Replace every comma in ``text`` with '|'; missing values pass through."""
        if pd.isna(text):
            return text
        return str(text).replace(',', '|')

    for col in ['complaints', 'adver']:
        if col in df_clean.columns:
            df_clean[col] = df_clean[col].apply(escape_commas)

    # Step 2: convert data types
    # Ids become nullable integers; unparseable values turn into <NA>
    df_clean['decision_id'] = pd.to_numeric(df_clean['decision_id'], errors='coerce').astype('Int64')
    df_clean['adver_id'] = pd.to_numeric(df_clean['adver_id'], errors='coerce').astype('Int64')

    # Header columns become strings. fillna('') comes first so that a missing
    # value is stored as '' rather than the literal text 'nan' / 'None'.
    for col in ['complaint_number', 'advertiser', 'advertisement']:
        df_clean[col] = df_clean[col].fillna('').astype(str)

    # Explicit formats are tried in order before falling back to pandas' own
    # format inference.
    date_formats = ('%Y-%m-%d', '%d-%b-%Y', '%Y-%m-%d %H:%M:%S',
                    '%d %B %Y', '%B %Y', '%d %b %Y')
    # Named exception types instead of a bare ``except`` so KeyboardInterrupt
    # and SystemExit are never swallowed.
    parse_errors = (ValueError, TypeError, OverflowError)

    def convert_date(date_str):
        """Parse ``date_str`` into a Timestamp; return None when it cannot be parsed."""
        if pd.isna(date_str):
            return None
        for fmt in date_formats:
            try:
                return pd.to_datetime(date_str, format=fmt)
            except parse_errors:
                continue
        try:
            return pd.to_datetime(date_str)
        except parse_errors:
            return None

    df_clean['date_of_meeting'] = df_clean['date_of_meeting'].apply(convert_date)

    # Map complaints_type onto ComplaintType values
    if 'complaints_type' in df_clean.columns:
        df_clean['complaints_type'] = df_clean['complaints_type'].apply(
            lambda x: convert_to_enum(x, ComplaintType)
        )

    # Map adver_type onto AdvertType values.
    # NOTE(review): the recorded run of this cell lists a column spelled
    # 'adver_tyoe', so this branch is skipped for that file; confirm whether
    # the column should be renamed upstream.
    if 'adver_type' in df_clean.columns:
        df_clean['adver_type'] = df_clean['adver_type'].apply(
            lambda x: convert_to_enum(x, AdvertType)
        )

    # Character counts of the two free-text columns (missing text counts as 0)
    df_clean['len_complaint'] = df_clean['complaints'].fillna('').str.len()
    df_clean['len_adver'] = df_clean['adver'].fillna('').str.len()

    # Drop the old 'len(adver)' column if present; len_adver replaces it
    if 'len(adver)' in df_clean.columns:
        df_clean = df_clean.drop('len(adver)', axis=1)

    return df_clean

# Driver: load test_1.csv, clean it, and write the result back to the same path.
# Any exception is caught and reported as a one-line message (no traceback).
if __name__ == "__main__":
    try:
        # Read the file
        print("正在读取文件...")
        df = pd.read_csv('/Users/niwenyu/Desktop/OCR_PDF_EXTRACT/model_BERT/test_1.csv')

        print("正在处理数据...")
        df_cleaned = clean_and_convert_data(df)

        # Save back to the original file (the input is overwritten in place)
        print("正在保存到原文件...")
        df_cleaned.to_csv('/Users/niwenyu/Desktop/OCR_PDF_EXTRACT/model_BERT/test_1.csv', index=False)

        # Print the dtypes after conversion
        print("\n数据类型转换后的信息:")
        print(df_cleaned.dtypes)

        # Print the unique values of the enum-typed columns, when present
        if 'complaints_type' in df_cleaned.columns:
            print("\ncomplaints_type唯一值:")
            print(df_cleaned['complaints_type'].unique())

        if 'adver_type' in df_cleaned.columns:
            print("\nadver_type唯一值:")
            print(df_cleaned['adver_type'].unique())

        print("\n处理完成！")

    except Exception as e:
        print(f"处理过程中出现错误: {str(e)}")

正在读取文件...
正在处理数据...
正在保存到原文件...

数据类型转换后的信息:
decision_id                  Int64
complaint_number            object
advertiser                  object
advertisement               object
date_of_meeting     datetime64[ns]
outcome                     object
complaint_id                 int64
complaints                  object
complaints_type             object
len_complaint                int64
adver_id                     Int64
adver                       object
adver_tyoe                 float64
ttarr(i)_adver              object
len_adver                    int64
dtype: object

complaints_type唯一值:
['Unknown']

处理完成！


In [11]:
import pandas as pd
import numpy as np

# Input and output file paths (adjust to your environment)
input_csv = '/Users/niwenyu/Desktop/OCR_PDF_EXTRACT/processed_24_1 copy.csv'
output_csv = 'test_1.csv'

# adver_id given to the first non-empty adver; every later change of adver
# text between consecutive non-empty rows gets the next id
FIRST_ADVER_ID = 24001

# 1. Load the data from the CSV file
df = pd.read_csv(input_csv)

# The 'adver' column is required
if 'adver' not in df.columns:
    raise ValueError("The input CSV file must contain an 'adver' column.")

# 2. Keep only the rows whose adver is neither missing nor empty
df_fill = df[~df['adver'].isna() & (df['adver'] != '')].copy()

# 3. Assign adver_id within df_fill.
#    A row starts a new run when its adver differs from the previous
#    non-empty row's adver (the first row always does); the cumulative count
#    of run starts is the id offset. Stored as float, matching a NaN-initialised
#    column.
adver_values = df_fill['adver'].to_numpy()
starts_new_run = np.ones(len(adver_values), dtype=bool)
starts_new_run[1:] = adver_values[1:] != adver_values[:-1]
df_fill['adver_id'] = (FIRST_ADVER_ID - 1 + np.cumsum(starts_new_run)).astype(float)

# 4. Map the ids from df_fill back onto df
if 'adver_id' not in df.columns:
    df['adver_id'] = np.nan

# Index alignment puts each id on its original row; rows with empty adver
# keep whatever adver_id they had (NaN when the column was just created)
df.loc[df_fill.index, 'adver_id'] = df_fill['adver_id']

# 5. Save the result to a new CSV file
df.to_csv(output_csv, index=False)

print("Processing complete! The updated DataFrame has been saved to:", output_csv)


Processing complete! The updated DataFrame has been saved to: test_1.csv
