# 1. 构建训练数据集


In [1]:
'''
Load the raw data.

The raw text file `Lexis-Nexis_LRA.csv` contains:
- 'Title': news headline
- 'Source': news source
- 'Time': news time
- 'Year': news year
- 'Month': news month
- 'Day': news day
- 'Text': news body
'''
import pandas as pd

text_df = pd.read_csv(filepath_or_buffer='./data/Lexis-Nexis_LRA.csv')
text_df.head()

Unnamed: 0,Title,Source,Time,Year,Month,Day,Text
0,War criminal walking around in plain sight,The Calgary Herald (Alberta),"March 19, 2012 Monday",2012,March,19,['The International Criminal Court has reached...
1,UN warns of growing attacks in Central Africa,IBNS,"March 31, 2012 Saturday 6:30 AM EST",2012,March,31,"[""Geneva, Mar 31 (IBNS) The United Nations ref..."
2,'Now Katine is famous': how a Guardian project...,The Guardian(London),"December 30, 2017 Saturday 9:00 AM GMT",2017,December,30,"[""Joyce Abiro was among the first to join the ..."
3,Review: COVER STORY: KONY 2012: What happened ...,The Observer (London),15-Jul-12,2012,July,15,"['For 25 years, Joseph Kony and his Lord\'s Re..."
4,Uganda 's war-scarred north set for peacetime ...,Agence France Presse -- English,"February 15, 2011 Tuesday 9:21 AM GMT",2011,February,15,['Displaced northern Ugandans have flocked hom...


In [2]:
# Add a sequential `ID` column as the first column of text_df.
# The guard keeps this cell re-runnable: DataFrame.insert raises ValueError
# if a column named 'ID' already exists.
if 'ID' not in text_df.columns:
    text_df.insert(0, 'ID', range(len(text_df)))
text_df.head()

Unnamed: 0,ID,Title,Source,Time,Year,Month,Day,Text
0,0,War criminal walking around in plain sight,The Calgary Herald (Alberta),"March 19, 2012 Monday",2012,March,19,['The International Criminal Court has reached...
1,1,UN warns of growing attacks in Central Africa,IBNS,"March 31, 2012 Saturday 6:30 AM EST",2012,March,31,"[""Geneva, Mar 31 (IBNS) The United Nations ref..."
2,2,'Now Katine is famous': how a Guardian project...,The Guardian(London),"December 30, 2017 Saturday 9:00 AM GMT",2017,December,30,"[""Joyce Abiro was among the first to join the ..."
3,3,Review: COVER STORY: KONY 2012: What happened ...,The Observer (London),15-Jul-12,2012,July,15,"['For 25 years, Joseph Kony and his Lord\'s Re..."
4,4,Uganda 's war-scarred north set for peacetime ...,Agence France Presse -- English,"February 15, 2011 Tuesday 9:21 AM GMT",2011,February,15,['Displaced northern Ugandans have flocked hom...


In [3]:
'''
Clean the raw frame:
- strip leading/trailing whitespace from `Text`
- drop the `Year`, `Month` and `Day` columns
- keep only rows whose `Text` passes the encoding / format check (`is_legal_text`)
- drop rows with a duplicated `Text`
'''
from tools import is_legal_text

# Built as a single non-mutating chain: every step returns a new frame, so
# text_df is left untouched and no in-place operation runs on a filtered slice.
cleaned_df = (
    text_df
    .assign(Text=text_df['Text'].str.strip())
    .drop(columns=['Year', 'Month', 'Day'])
    .loc[lambda frame: frame['Text'].apply(is_legal_text)]
    .drop_duplicates(subset='Text')
)
# cleaned_df.to_csv('./data/cleaned.csv', index=False)
cleaned_df.head()

Unnamed: 0,ID,Title,Source,Time,Text
0,0,War criminal walking around in plain sight,The Calgary Herald (Alberta),"March 19, 2012 Monday",['The International Criminal Court has reached...
1,1,UN warns of growing attacks in Central Africa,IBNS,"March 31, 2012 Saturday 6:30 AM EST","[""Geneva, Mar 31 (IBNS) The United Nations ref..."
3,3,Review: COVER STORY: KONY 2012: What happened ...,The Observer (London),15-Jul-12,"['For 25 years, Joseph Kony and his Lord\'s Re..."
4,4,Uganda 's war-scarred north set for peacetime ...,Agence France Presse -- English,"February 15, 2011 Tuesday 9:21 AM GMT",['Displaced northern Ugandans have flocked hom...
6,6,BRITISH SAFARI MAN ACCUSED OF MASS MURDER 'FRAMED,MAIL ON SUNDAY (London),"May 13, 2012 Sunday","[""LOCAL POACHERS'"", 'FROM TIMOTHY EVANS IN BAN..."


In [4]:
'''
Reload the processed data `cleaned.csv`, which contains:
- 'ID': ID from the original data
- 'Title': news headline
- 'Source': news source
- 'Time': news time
- 'Text': news body
'''
from ast import literal_eval

cleaned_df = pd.read_csv(filepath_or_buffer='./data/cleaned.csv')
# In the CSV, `Text` is the string form of a Python list; parse each cell
# back into a real list.
cleaned_df['Text'] = cleaned_df['Text'].map(literal_eval)
cleaned_df.head()

Unnamed: 0,ID,Title,Source,Time,Text
0,0,War criminal walking around in plain sight,The Calgary Herald (Alberta),"March 19, 2012 Monday",[The International Criminal Court has reached ...
1,1,UN warns of growing attacks in Central Africa,IBNS,"March 31, 2012 Saturday 6:30 AM EST","[Geneva, Mar 31 (IBNS) The United Nations refu..."
2,3,Review: COVER STORY: KONY 2012: What happened ...,The Observer (London),15-Jul-12,"[For 25 years, Joseph Kony and his Lord's Resi..."
3,4,Uganda 's war-scarred north set for peacetime ...,Agence France Presse -- English,"February 15, 2011 Tuesday 9:21 AM GMT",[Displaced northern Ugandans have flocked home...
4,6,BRITISH SAFARI MAN ACCUSED OF MASS MURDER 'FRAMED,MAIL ON SUNDAY (London),"May 13, 2012 Sunday","[LOCAL POACHERS', FROM TIMOTHY EVANS IN BANGUI..."


In [5]:
# Work on an independent copy so that columns added to train_df later
# do not modify cleaned_df.
train_df = cleaned_df.copy(deep=True)
train_df.head()

Unnamed: 0,ID,Title,Source,Time,Text
0,0,War criminal walking around in plain sight,The Calgary Herald (Alberta),"March 19, 2012 Monday",[The International Criminal Court has reached ...
1,1,UN warns of growing attacks in Central Africa,IBNS,"March 31, 2012 Saturday 6:30 AM EST","[Geneva, Mar 31 (IBNS) The United Nations refu..."
2,3,Review: COVER STORY: KONY 2012: What happened ...,The Observer (London),15-Jul-12,"[For 25 years, Joseph Kony and his Lord's Resi..."
3,4,Uganda 's war-scarred north set for peacetime ...,Agence France Presse -- English,"February 15, 2011 Tuesday 9:21 AM GMT",[Displaced northern Ugandans have flocked home...
4,6,BRITISH SAFARI MAN ACCUSED OF MASS MURDER 'FRAMED,MAIL ON SUNDAY (London),"May 13, 2012 Sunday","[LOCAL POACHERS', FROM TIMOTHY EVANS IN BANGUI..."


In [6]:
'''
Flag the text fragments that contain any of the following terms:
- "Uganda"
- "Sudan"
- "Central African Republic", "CAR"
- "Democratic Republic of the Congo", "DRC"
'''
from tools import abstract_text

PF_targets = [
    ("Uganda",),
    ("Sudan",),
    ("Central African Republic", "CAR"),
    ("Democratic Republic of the Congo", "DRC",),
]

# Flags for the first target group; judging by the displayed output,
# abstract_text yields one list of booleans per row of `Text`
# (presumably one flag per fragment; verify against tools.abstract_text).
PF_matched = train_df.Text.apply(lambda s: abstract_text(s, PF_targets[0]))
for PF_target in PF_targets[1:]:
    target_flags = train_df.Text.apply(lambda s: abstract_text(s, PF_target))
    # Element-wise OR between the flags accumulated so far and this group's flags.
    PF_matched = [
        [any(pair) for pair in zip(flags_so_far, flags_now)]
        for flags_so_far, flags_now in zip(PF_matched, target_flags)
    ]
train_df['PF_TARGET'] = list(PF_matched)
train_df.head()

Unnamed: 0,ID,Title,Source,Time,Text,PF_TARGET
0,0,War criminal walking around in plain sight,The Calgary Herald (Alberta),"March 19, 2012 Monday",[The International Criminal Court has reached ...,"[False, False, False, True, True, True, True, ..."
1,1,UN warns of growing attacks in Central Africa,IBNS,"March 31, 2012 Saturday 6:30 AM EST","[Geneva, Mar 31 (IBNS) The United Nations refu...","[True, True, True, True, True, True, True, Tru..."
2,3,Review: COVER STORY: KONY 2012: What happened ...,The Observer (London),15-Jul-12,"[For 25 years, Joseph Kony and his Lord's Resi...","[False, False, False, False]"
3,4,Uganda 's war-scarred north set for peacetime ...,Agence France Presse -- English,"February 15, 2011 Tuesday 9:21 AM GMT",[Displaced northern Ugandans have flocked home...,"[True, True, True, True, False, False, False, ..."
4,6,BRITISH SAFARI MAN ACCUSED OF MASS MURDER 'FRAMED,MAIL ON SUNDAY (London),"May 13, 2012 Sunday","[LOCAL POACHERS', FROM TIMOTHY EVANS IN BANGUI...","[False, False, True, True, True, False, False,..."


In [7]:
'''
Flag the text fragments that contain any of the following terms:
- "Lord's Resistance Army", "LRA"
'''
LRA_targets = [
    ("Lord's Resistance Army", "LRA",),
]

# Flags for the first target group; with the single group defined above
# the loop below does not execute.
LRA_matched = train_df.Text.apply(lambda s: abstract_text(s, LRA_targets[0]))
for LRA_target in LRA_targets[1:]:
    target_flags = train_df.Text.apply(lambda s: abstract_text(s, LRA_target))
    # Element-wise OR between the flags accumulated so far and this group's flags.
    LRA_matched = [
        [any(pair) for pair in zip(flags_so_far, flags_now)]
        for flags_so_far, flags_now in zip(LRA_matched, target_flags)
    ]
train_df['LRA_TARGET'] = list(LRA_matched)
train_df.head()

Unnamed: 0,ID,Title,Source,Time,Text,PF_TARGET,LRA_TARGET
0,0,War criminal walking around in plain sight,The Calgary Herald (Alberta),"March 19, 2012 Monday",[The International Criminal Court has reached ...,"[False, False, False, True, True, True, True, ...","[False, False, False, True, True, True, False,..."
1,1,UN warns of growing attacks in Central Africa,IBNS,"March 31, 2012 Saturday 6:30 AM EST","[Geneva, Mar 31 (IBNS) The United Nations refu...","[True, True, True, True, True, True, True, Tru...","[True, True, True, True, True, True, True, Tru..."
2,3,Review: COVER STORY: KONY 2012: What happened ...,The Observer (London),15-Jul-12,"[For 25 years, Joseph Kony and his Lord's Resi...","[False, False, False, False]","[True, True, False, False]"
3,4,Uganda 's war-scarred north set for peacetime ...,Agence France Presse -- English,"February 15, 2011 Tuesday 9:21 AM GMT",[Displaced northern Ugandans have flocked home...,"[True, True, True, True, False, False, False, ...","[False, False, True, True, True, True, True, T..."
4,6,BRITISH SAFARI MAN ACCUSED OF MASS MURDER 'FRAMED,MAIL ON SUNDAY (London),"May 13, 2012 Sunday","[LOCAL POACHERS', FROM TIMOTHY EVANS IN BANGUI...","[False, False, True, True, True, False, False,...","[False, False, False, False, False, False, Fal..."


In [None]:
# train_df.to_csv('./data/train.csv', index=False)

In [8]:
'''
Pick `size` (300) rows that have at least one PF flag and at least one LRA
flag, evenly spaced over the matching rows, and add empty ASK / ANSWER columns.
'''
import numpy as np

size = 300
# A row qualifies when any of its PF flags and any of its LRA flags is set.
target_indices = (
    train_df.PF_TARGET.apply(lambda flags: any(flags))
    & train_df.LRA_TARGET.apply(lambda flags: any(flags))
)
n_matched = int(target_indices.sum())
# Evenly spaced positions among the matching rows. The count comes from
# `size` (not a repeated literal) and is capped at the number of matches,
# so no row is selected twice when fewer than `size` rows qualify.
positions = np.linspace(0, n_matched, num=min(size, n_matched), endpoint=False, dtype=int)
target_df = (
    train_df.loc[target_indices, ['ID', 'Text']]
    .iloc[positions]
    # Empty object-dtype Series align to all-NaN columns, ready to be
    # filled with strings later.
    .assign(
        ASK=pd.Series(dtype=object),
        ANSWER=pd.Series(dtype=object),
    )
)
target_df.head()


Unnamed: 0,ID,Text,ASK,ANSWER
0,0,[The International Criminal Court has reached ...,,
32,41,[As Rwanda marks the 18th Anniversary of the 1...,,
68,83,[Addis Ababa April 24/2015 African Scholars sa...,,
103,129,"[OVERALL ASSESSMENT, The president, Yoweri Mus...",,
138,175,"[ New Delhi, Mar 17 -- UGANDA'S LONG-RULING P...",,


In [9]:
'''
Message template
'''
# Prompt template for the labelling request. The leading `{}` is a
# str.format placeholder; the commented-out labelling cells in this notebook
# fill it with an article's text. The prompt itself is written in Chinese and
# asks for a 0/1 annotation on six criteria: PF_score, PF_US, PF_neg,
# Threat_up, Threat_down and Citizen_impact.
ASK_TEMPLATE = """
{}

请根据以下六个基本标准，对上文进行0-1标注

基本标准：
(1)“PF_score”，报告提到伙伴部队的成功
(2)“PF_US”，报告提到美国与伙伴部队合作
(3)“PF_neg”，报告美国伙伴部队的负面情况
(4)“Threat_up”，报告提到上帝抵抗军的威胁增加
(5)“Threat_down”，上帝抵抗军的威胁是否减少
(6)“Citizen_impact”，公民是否受到上帝抵抗军暴力的影响
"""

In [None]:
# # 尝试标注
# _id, _text, *_ = target_df.loc[target_df.ANSWER.isna()].iloc[0]
# ask = ASK_TEMPLATE.format(_text)
# answer = ask_deepseek(ask)
# target_df.loc[target_df.index[0], ['ASK', 'ANSWER']] = [ask, answer]
# target_df.head()

In [None]:
# # 进行标注
# for _index, _id, _text, *_ in tqdm(list(target_df.loc[target_df.ANSWER.isna()].itertuples())):
#     ask = ASK_TEMPLATE.format(_text)
#     answer = ask_deepseek(ask)
#     target_df.loc[_index, ['ASK', 'ANSWER']] = [ask, answer]
# target_df.head()

In [None]:
# # 提取标注结果
# import re
# _target_dict = target_df.ANSWER.apply(re.compile("([PFThreatCitizen]+_[\w]+)[^\d]+?(\d)").findall).apply(lambda _: {key: value for key, value in _[:6]}).to_dict()
# len(_target_dict)

In [None]:
# _target_df = pd.DataFrame.from_dict(
#     _target_dict, orient='index'
# )
# _target_df.head()

In [None]:
# target_df[_target_df.columns] = _target_df
# target_df.head()

In [None]:
# target_df.to_csv('./data/target_df.csv', index=False)

In [None]:
# TARGET_MD = """ # ID-{}.
# - ASK:

# ```
# {}

# 请根据以下六个基本标准，对上文进行0-1标注

# 基本标准：
# (1)“PF_score”，报告提到伙伴部队的成功
# (2)“PF_US”，报告提到美国与伙伴部队合作
# (3)“PF_neg”，报告美国伙伴部队的负面情况
# (4)“Threat_up”，报告提到上帝抵抗军的威胁增加
# (5)“Threat_down”，上帝抵抗军的威胁是否减少
# (6)“Citizen_impact”，公民是否受到上帝抵抗军暴力的影响
# ```

# - ANSWER:

# ```
# {}
# ```
# ---
# """

In [None]:
# from itertools import chain
# with open('./data/target_df.md', 'w') as f:
#     f.write((TARGET_MD * len(target_df)).format(*chain(*zip(target_df.ID, target_df.Text, target_df.ANSWER))))

In [None]:
# train_target_df = train_df.merge(target_df[['ID', 'PF_score', 'PF_US', 'PF_neg', 'Threat_up', 'Threat_down', 'Citizen_impact']], on='ID', how='right')
# train_target_df.head()

In [None]:
# train_target_df.to_csv('./data/train_target_df.csv', index=False)

# 2. 模型 & 训练

In [None]:
# Run main.py in the background: `nohup` plus the trailing `&` detach it from
# this cell, and both stdout and stderr are redirected to ./data/log.log.
# NOTE(review): main.py is presumably the training entry point - confirm.
! nohup python main.py > ./data/log.log 2>&1 &

# 3. 获取评估结果

In [1]:
# Load a saved TextAssessor from the given .pth file onto the CPU and print
# its module structure.
# NOTE(review): TextAssessor.loads is project code; what exactly it restores
# is not visible from this notebook.
from model.text_assessment.text_assessor import TextAssessor
model = TextAssessor.loads(save_model='./data/A/text_assessor_B(350).pth', device='cpu')
print(model)

loaded model from ./data/A/text_assessor_B(350).pth

TextAssessor(
  (text_encoder): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
       

In [11]:
# Build the BERT tokenizer from the project's PRETRAINED_MODEL_NAME setting
# (the printed repr shows it resolving to './model/bert-base-uncased').
from model.text_assessment import PRETRAINED_MODEL_NAME
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
print(tokenizer)


BertTokenizer(name_or_path='./model/bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)


In [5]:
import pandas as pd
from ast import literal_eval

# Load the cleaned articles to run the model on.
data_df = pd.read_csv(filepath_or_buffer='./data/cleaned.csv')
# In the CSV, `Text` is the string form of a Python list; parse each cell
# back into a real list.
data_df['Text'] = data_df['Text'].map(literal_eval)
data_df.head()

Unnamed: 0,ID,Title,Source,Time,Text
0,0,War criminal walking around in plain sight,The Calgary Herald (Alberta),"March 19, 2012 Monday",[The International Criminal Court has reached ...
1,1,UN warns of growing attacks in Central Africa,IBNS,"March 31, 2012 Saturday 6:30 AM EST","[Geneva, Mar 31 (IBNS) The United Nations refu..."
2,3,Review: COVER STORY: KONY 2012: What happened ...,The Observer (London),15-Jul-12,"[For 25 years, Joseph Kony and his Lord's Resi..."
3,4,Uganda 's war-scarred north set for peacetime ...,Agence France Presse -- English,"February 15, 2011 Tuesday 9:21 AM GMT",[Displaced northern Ugandans have flocked home...
4,6,BRITISH SAFARI MAN ACCUSED OF MASS MURDER 'FRAMED,MAIL ON SUNDAY (London),"May 13, 2012 Sunday","[LOCAL POACHERS', FROM TIMOTHY EVANS IN BANGUI..."


In [29]:
from model.text_assessment.text_assessor import predict_one
from model.text_assessment import MAX_LEN
from tqdm.auto import tqdm

# Some rows turned out to have an empty list in `Text`. Those rows are not
# sent to the model; they get a record with every prediction set to None.
EMPTY_PRED_KEYS = [
    "pred.values.PF_score",
    "pred.values.PF_US",
    "pred.values.PF_neg",
    "pred.values.Threat_up",
    "pred.values.Threat_down",
    "pred.values.Citizen_impact",
]

# Map article ID -> prediction record.
items = {}
# Iterate over the two needed columns by name (independent of column order)
# and pass `total` so the progress bar can show a percentage.
for _id, _text in tqdm(zip(data_df['ID'], data_df['Text']), total=len(data_df)):
    if _text:
        pred = predict_one(data_text=_text, model=model, tokenizer=tokenizer, max_len=MAX_LEN, device='cpu')
    else:
        pred = dict.fromkeys(EMPTY_PRED_KEYS)
    items[_id] = pred
items_df = pd.DataFrame.from_dict(items, orient='index')
# Show the frame; `items` itself is a plain dict and has no .head().
items_df.head()

0it [00:00, ?it/s]

KeyboardInterrupt: 

In [12]:
# Inspect the column names and their order in data_df.
data_df.columns

Index(['ID', 'Title', 'Source', 'Time', 'Text'], dtype='object')