### REF https://pytorch.ac.cn/tutorials/intermediate/char_rnn_classification_tutorial.html

In [1]:
import torch
from torch import nn
import torch.nn.functional as F
device = 'cuda' if torch.cuda.is_available() else 'cpu'
torch.set_default_device(device=device)
torch.get_default_device()

device(type='cuda', index=0)

In [2]:
import string # REF https://docs.python.org/zh-cn/3.11/library/string.html
import unicodedata # REF https://docs.python.org/zh-cn/3.11/library/unicodedata.html

#### 1. 数据清理

将Unicode转换为纯ASCII以限制输入
将Unicode字符串规范化为ASCII，并只保留白名单（`allowed_characters`）中的少量字符

In [3]:
# 使用 “_” 表示非词汇表的字符与模型未处理字符
# Vocabulary: ASCII letters, digits, a little punctuation, and "_" as the
# placeholder for any character outside the vocabulary.
allowed_characters = "".join((string.ascii_letters, string.digits, ".,;", "_"))
# Width of a one-hot vector: one slot per vocabulary character.
n_letters = len(allowed_characters)

In [4]:
# 删除Python unicode字符串中的重音符号
# REF https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """Return `s` reduced to the characters listed in `allowed_characters`.

    The string is first NFD-normalized, so an accented letter becomes a base
    letter followed by combining marks. Combining marks (Unicode category
    'Mn') and every character that is not in `allowed_characters` are dropped;
    the surviving characters keep their original order.
    """
    kept = []
    for ch in unicodedata.normalize('NFD', s):
        if unicodedata.category(ch) == 'Mn':
            # Combining mark (accent) split off by the NFD normalization.
            continue
        if ch in allowed_characters:
            kept.append(ch)
    return ''.join(kept)

#### 2. 文本 to 张量

使用 \<1 $\times$ n_letters\> 的 one-hot 编码，表示单个字符。

使用一个 3D 张量 \<line_length $\times$ 1 $\times$ n_letters\> 表示一个单词（下方的 `lineToTensor` 未加入批量维度，实际返回 \<line_length $\times$ n_letters\> 的 2D 矩阵）

额外的维度 1 是因为PyTorch假定所有内容都是批量的，这里使用批量大小为 1

In [5]:
# 字符索引定位
def letterToIndex(letter, alphabet=None):
    """Return the vocabulary slot of a single character.

    Args:
        letter: the character to look up.
        alphabet: string of known characters; defaults to the notebook-level
            `allowed_characters`. It is expected to contain '_', which is the
            placeholder slot for unknown input.

    Returns:
        The position of `letter` in `alphabet`, or the position of '_' when
        `letter` is not exactly one character from the alphabet (this includes
        the empty string and multi-character strings). -1 is only possible
        when the alphabet itself lacks '_'.
    """
    if alphabet is None:
        alphabet = allowed_characters
    # `in` on a string is a substring test, so the length check is what keeps
    # an argument such as "ab" from being resolved to the slot of "a".
    if len(letter) == 1 and letter in alphabet:
        return alphabet.find(letter)
    return alphabet.find('_')

In [6]:
# 词的张量表示
def lineToTensor(line):
    """One-hot encode `line`, one row per character.

    Args:
        line: the string to encode.

    Returns:
        An integer tensor of shape (len(line), n_letters). Row i holds a single
        1 at column letterToIndex(line[i]). An empty `line` yields a tensor of
        shape (0, n_letters).
    """
    # The explicit integer dtype matters for the empty string: torch.tensor([])
    # would otherwise be a float tensor, which F.one_hot refuses to index with.
    indices = torch.tensor(
        [letterToIndex(letter=letter) for letter in line],
        dtype=torch.long,
    )
    return F.one_hot(indices, num_classes=n_letters)

#### 3. 构建数据集

使用 Dataset 与 DataLoader 存储数据集，实现 `__init__`, `__len__`, `__getitem__`

In [7]:
import re
from pathlib import Path
import pandas as pd
import numpy as np
import glob
import os
import time

from torch.utils.data import Dataset

In [8]:
# Relative path of the source CSV that the next cell loads with pandas.
data_path = Path('./data/Lexis-Nexis_LRA.csv')

In [9]:
# Read the whole CSV into a DataFrame; the raw frame is kept under this name
# and later cells derive filtered views from it.
df_data = pd.read_csv(filepath_or_buffer=data_path)
# Peek at the first rows to check the columns.
df_data.head()

Unnamed: 0,Title,Source,Time,Year,Month,Day,Text
0,War criminal walking around in plain sight,The Calgary Herald (Alberta),"March 19, 2012 Monday",2012,March,19,['The International Criminal Court has reached...
1,UN warns of growing attacks in Central Africa,IBNS,"March 31, 2012 Saturday 6:30 AM EST",2012,March,31,"[""Geneva, Mar 31 (IBNS) The United Nations ref..."
2,'Now Katine is famous': how a Guardian project...,The Guardian(London),"December 30, 2017 Saturday 9:00 AM GMT",2017,December,30,"[""Joyce Abiro was among the first to join the ..."
3,Review: COVER STORY: KONY 2012: What happened ...,The Observer (London),15-Jul-12,2012,July,15,"['For 25 years, Joseph Kony and his Lord\'s Re..."
4,Uganda 's war-scarred north set for peacetime ...,Agence France Presse -- English,"February 15, 2011 Tuesday 9:21 AM GMT",2011,February,15,['Displaced northern Ugandans have flocked hom...


In [10]:
# Number of rows in the Text column, before any de-duplication or filtering.
len(df_data.Text)

14664

In [11]:
# How many distinct texts are pure ASCII (contain no character outside \x00-\x7F)?
(
    df_data
    .drop_duplicates('Text')
    .Text
    .apply(lambda text: re.search(r'[^\x00-\x7F]', text) is None)
    .sum()
)

np.int64(10940)

In [12]:
# Preview the rows that survive both steps: de-duplicate on Text, then keep the
# pure-ASCII texts. The mask is computed from the de-duplicated frame itself
# (callable passed to .loc), so it lines up with that frame row for row instead
# of being built from the full frame and realigned by index.
df_data.drop_duplicates('Text').loc[
    lambda frame: frame.Text.apply(
        lambda text: re.search(r'[^\x00-\x7F]', text) is None
    )
]

Unnamed: 0,Title,Source,Time,Year,Month,Day,Text
0,War criminal walking around in plain sight,The Calgary Herald (Alberta),"March 19, 2012 Monday",2012,March,19,['The International Criminal Court has reached...
1,UN warns of growing attacks in Central Africa,IBNS,"March 31, 2012 Saturday 6:30 AM EST",2012,March,31,"[""Geneva, Mar 31 (IBNS) The United Nations ref..."
3,Review: COVER STORY: KONY 2012: What happened ...,The Observer (London),15-Jul-12,2012,July,15,"['For 25 years, Joseph Kony and his Lord\'s Re..."
4,Uganda 's war-scarred north set for peacetime ...,Agence France Presse -- English,"February 15, 2011 Tuesday 9:21 AM GMT",2011,February,15,['Displaced northern Ugandans have flocked hom...
6,BRITISH SAFARI MAN ACCUSED OF MASS MURDER 'FRAMED,MAIL ON SUNDAY (London),"May 13, 2012 Sunday",2012,May,13,"[""LOCAL POACHERS'"", 'FROM TIMOTHY EVANS IN BAN..."
...,...,...,...,...,...,...,...
14659,5 Reasons Why Central Africans Are Living At T...,The Huffington Post,"January 11, 2014 Saturday 12:43 AM EST",2014,January,11,"['Jan 10, 2014 (The Huffington Post:http://www..."
14660,AFRICA: WILDLIFE POACHING THOUGHT TO BANKROLL ...,IPS - Inter Press Service,"January 11, 2014 Saturday",2014,January,11,['Top diplomats and retired U.S. military offi...
14661,"As 'South Park' writer explores Africa, she go...",St. Louis Post-Dispatch (Missouri),"May 25, 2014 Sunday",2014,May,25,['Jane Bussmann\'s blistering tale of life in ...
14662,Statement by the spokesperson on the transfer ...,Premium Official News,"January 21, 2015 Wednesday",2015,January,21,[' The European Union has issued the following...


In [13]:
class sDataset(Dataset):
    """Dataset over the unique, pure-ASCII rows of a table with a `Text` column.

    Attributes:
        df: the input frame with rows whose `Text` repeats an earlier row removed.
        data: the rows of `df` whose `Text` passes `is_legal_string`.
        length: number of rows in `data`, as a built-in int.
    """

    def __init__(self, df_data):
        """Build the filtered view from `df_data` (a DataFrame with a `Text` column)."""
        self.df = df_data.drop_duplicates('Text')
        # astype(bool) keeps the mask boolean even when there are no rows to apply to.
        legal_mask = self.df.Text.apply(self.is_legal_string).astype(bool)
        self.data = self.df.loc[legal_mask]
        # Stored as a plain int rather than a NumPy scalar.
        self.length = int(legal_mask.sum())

    def is_legal_string(self, text):
        """Return True when `text` is a str made only of ASCII characters.

        Values that are not strings (for example the NaN/None of an empty cell)
        are reported as not legal instead of raising.
        """
        return isinstance(text, str) and text.isascii()

    def __len__(self):
        """Number of rows kept after de-duplication and the ASCII filter."""
        return self.length

    def __getitem__(self, index):
        """Positional lookup into the kept rows (delegates to `DataFrame.iloc`)."""
        return self.data.iloc[index]


In [14]:
# Wrap the raw frame; de-duplication and the ASCII filter happen in __init__.
sdata = sDataset(df_data=df_data)
# Show how many rows were kept, together with the first kept row.
len(sdata), sdata[0]

(10940,
 Title            War criminal walking around in plain sight
 Source                         The Calgary Herald (Alberta)
 Time                                  March 19, 2012 Monday
 Year                                                   2012
 Month                                                 March
 Day                                                      19
 Text      ['The International Criminal Court has reached...
 Name: 0, dtype: object)

In [None]:
# Prompt template for one annotation request. The single `{}` is the slot for
# the article text; the empty fenced block under RESPONSE is presumably where
# the model's reply is pasted afterwards (TODO confirm). The regex in
# markingTodict spells out this exact layout, so the two must be changed together.
ASK_TEMPLATE = """
- ASK: 
``` txt
{}

请根据以下六个基本标准，对上文进行0-1标注

基本标准：
(1)“PF_score”，报告提到伙伴部队的成功
(2)“PF_US”，报告提到美国与伙伴部队合作
(3)“PF_neg”，报告美国伙伴部队的负面情况
(4)“Threat_up”，报告提到上帝抵抗军的威胁增加
(5)“Threat_down”，上帝抵抗军的威胁是否减少
(6)“Citizen_impact”，公民是否受到上帝抵抗军暴力的影响
```

- RESPONSE:
```

```
---
"""

In [16]:
# Disabled one-off step: writes `size` copies of ASK_TEMPLATE, each filled with
# one article text, to the markdown file that the next cell reads. It opens the
# file in 'w' mode, so re-enabling it would overwrite whatever the file holds.
# size = 100
# with open('./data/MarkingByDeepSeek.md', 'w') as f:
#     f.write((ASK_TEMPLATE * size) .format(*sdata[:size].Text))

In [17]:
# Load the annotation markdown file as a single string for markingTodict.
# NOTE(review): no encoding is passed, so the platform default is used; the
# template written to this file contains Chinese text — confirm the default
# matches how the file was saved, or pass encoding= explicitly.
with open('./data/MarkingByDeepSeek.md', 'r') as f:
    Marking = f.read()

In [203]:
def markingTodict(marking):
    """Parse the annotation markdown into one record per ASK/RESPONSE block.

    Args:
        marking: text made of repeated blocks in the ASK_TEMPLATE layout, each
            with a non-empty reply inside its RESPONSE fence.

    Returns:
        A dict keyed by the 0-based position of each matched block. Every value
        is a dict with 'text' (the article text), 'response' (the raw reply)
        and one entry per label found in the reply; the label's value is the
        first digit that follows it on the same line, kept as a string. A label
        that does not occur in the reply is simply absent from the record.
    """
    labels = ('PF_score', 'PF_US', 'PF_neg', 'Threat_up', 'Threat_down', 'Citizen_impact')

    # Raw strings so that \s, \w, \( and \d reach the regex engine untouched
    # instead of going through Python's string-escape handling first.
    block_re = re.compile(
        r"\s- ASK:\s\n```\stxt\n([\w\W]+?)\n\n"
        r"请根据以下六个基本标准，对上文进行0-1标注\n\n"
        r"基本标准：\n"
        r"\(1\)“PF_score”，报告提到伙伴部队的成功\n"
        r"\(2\)“PF_US”，报告提到美国与伙伴部队合作\n"
        r"\(3\)“PF_neg”，报告美国伙伴部队的负面情况\n"
        r"\(4\)“Threat_up”，报告提到上帝抵抗军的威胁增加\n"
        r"\(5\)“Threat_down”，上帝抵抗军的威胁是否减少\n"
        r"\(6\)“Citizen_impact”，公民是否受到上帝抵抗军暴力的影响\n"
        r"```\n\n-\sRESPONSE:\n```\n([\w\W]+?)\n```\n"
    )
    # Match the six label names literally. A character class such as
    # [PFThreatCitizen]+ would also accept unrelated tokens like "Threat_level".
    label_re = re.compile(r"(" + "|".join(labels) + r").+?(\d)")

    items = {}
    for idx, (text, response) in enumerate(block_re.findall(marking)):
        scores = {}
        for label, digit in label_re.findall(response):
            # Keep the first value seen for a label; a later restatement of the
            # same label in the reply must not overwrite it.
            scores.setdefault(label, digit)
        items[idx] = {
            'text': text,
            'response': response,
            **scores,
        }
    return items


In [206]:
# One row per parsed block: the dict keys become the index, the record fields
# (text, response and the labels found) become the columns.
df_result = pd.DataFrame.from_dict(markingTodict(Marking), orient='index')
df_result

Unnamed: 0,text,response,PF_score,PF_US,PF_neg,Threat_up,Threat_down,Citizen_impact
0,['The International Criminal Court has reached...,根据提供的文本和六个基本标准，以下是0-1标注结果：\n\n(1) “PF_score”，报...,0,0,0,1,0,1
1,"[""Geneva, Mar 31 (IBNS) The United Nations ref...",根据提供的文本和六个基本标准，以下是 **0-1 标注结果**： \n\n| **标准**...,1,1,0,1,0,1
2,"['For 25 years, Joseph Kony and his Lord\'s Re...",根据提供的文本和六个基本标准，以下是 **0-1 标注结果**： \n\n| **标准**...,0,1,0,0,0,1
3,['Displaced northern Ugandans have flocked hom...,根据提供的文本和六个基本标准，以下是 **0-1 标注结果**： \n\n| **标准**...,0,0,0,0,1,1
4,"[""LOCAL POACHERS'"", 'FROM TIMOTHY EVANS IN BAN...","根据提供的文本和六个基本标准，以下是0-1标注结果：\n\n(1) ""PF_score""，报...",0,0,0,1,0,1
...,...,...,...,...,...,...,...,...
95,"[""Overview: There was no credible evidence to ...",根据提供的文本和六个基本标准，以下是0-1标注结果：\n\n(1) “PF_score”，报...,1,0,0,1,0,1
96,"[""The U.S. Department of State's Bureau Public...",根据提供的文本和六个基本标准，以下是0-1标注结果： \n\n**(1) “PF_scor...,1,1,0,1,0,1
97,"['Rep. Chris Smith, R-N.J. (4th CD), issued th...",### **标注结果（0-1）** \n\n#### **(1) “PF_score”，报...,1,1,0,0,1,1
98,"['Get to the bottom of the tragic air crash', ...",### **标注结果（0-1）** \n\n#### **(1) “PF_score”，报...,0,0,0,0,0,0


In [None]:
# Disabled one-off export: column positions 0 and 2-7 of the parsed frame shown
# above are the text plus the six labels, i.e. everything except the raw
# 'response' column at position 1.
# df_result.iloc[:, [0, 2, 3, 4, 5, 6, 7]].to_csv('./data/MarkingResult.csv', index=False)


In [209]:
# Reload the labelled table from the CSV file.
# NOTE(review): this rebinds `df_result`, replacing the frame parsed from the
# markdown in the earlier cell — confirm that shadowing the name is intended.
df_result = pd.read_csv('./data/MarkingResult.csv')
df_result

Unnamed: 0,text,PF_score,PF_US,PF_neg,Threat_up,Threat_down,Citizen_impact
0,['The International Criminal Court has reached...,0,0,0,1,0,1
1,"[""Geneva, Mar 31 (IBNS) The United Nations ref...",1,1,0,1,0,1
2,"['For 25 years, Joseph Kony and his Lord\'s Re...",0,1,0,0,0,1
3,['Displaced northern Ugandans have flocked hom...,0,0,0,0,1,1
4,"[""LOCAL POACHERS'"", 'FROM TIMOTHY EVANS IN BAN...",0,0,0,1,0,1
...,...,...,...,...,...,...,...
95,"[""Overview: There was no credible evidence to ...",1,0,0,1,0,1
96,"[""The U.S. Department of State's Bureau Public...",1,1,0,1,0,1
97,"['Rep. Chris Smith, R-N.J. (4th CD), issued th...",1,1,0,0,1,1
98,"['Get to the bottom of the tragic air crash', ...",0,0,0,0,0,0
