## Full fine-tuning EleutherAI/pythia-160m to predict

### Dataset

In [None]:
!pip install transformers



In [3]:
!pip install datasets



In [21]:
import os
currentPath = os.getcwd().replace('\\','/')    # get current path

print(currentPath)

/home/Arthur/aicup


In [22]:
from datasets import load_dataset, Features, Value

dataset = load_dataset("csv", data_files="PublicDataset_phase3/merged_20231120(2).tsv", delimiter='\t',
                       features = Features({
                              'fid': Value('string'), 'idx': Value('int64'),
                              'content': Value('string'), 'label': Value('string')}),
                              column_names=['fid', 'idx', 'content', 'label'], keep_default_na=False)

In [23]:
dataset

DatasetDict({
    train: Dataset({
        features: ['fid', 'idx', 'content', 'label'],
        num_rows: 138930
    })
})

In [4]:
dataset['train'][110]

{'fid': '102',
 'idx': 2408,
 'content': '-  320 mm benign leiomyoma, probably arising from the broad ligament.',
 'label': 'PHI:Null'}

In [5]:
dataset['train'][1]

{'fid': '10',
 'idx': 25,
 'content': '091016.NMT',
 'label': 'MEDICALRECORD:091016.NMT'}

In [9]:
dataset['train'][7]

{'fid': '10',
 'idx': 114,
 'content': 'D.O.B:  24/8/1993',
 'label': 'DATE: 24/8/1993=>1993-08-24'}

For demonstration purpose, we only use the randomly sampled 20000 instances.

In [24]:
import torch
# sub_datasets = torch.utils.data.random_split(dataset['train'], [20000, 65736])
sub_datasets2 = dataset['train']
#print(len(sub_datasets[0]))
#for i in range(4): print(sub_datasets[0][i])
print(len(sub_datasets2))
for i in range(4): print(sub_datasets2[i])

138930
{'fid': '10', 'idx': 1, 'content': 'Episode No:  09F016547J', 'label': 'IDNUM: 09F016547J'}
{'fid': '10', 'idx': 25, 'content': '091016.NMT', 'label': 'MEDICALRECORD: 091016.NMT'}
{'fid': '10', 'idx': 37, 'content': 'SIZAR, HOWARD', 'label': 'PATIENT: SIZAR, HOWARD'}
{'fid': '10', 'idx': 52, 'content': 'Lab No:  09F01654', 'label': 'IDNUM: 09F01654'}


### Data loader

In [25]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

plm = "EleutherAI/pythia-160m" #"EleutherAI/pythia-70m-deduped"

bos = '<|endoftext|>'
eos = '<|END|>'
pad = '<|pad|>'
sep ='\n\n####\n\n'

special_tokens_dict = {'eos_token': eos, 'bos_token': bos, 'pad_token': pad, 'sep_token': sep}

tokenizer = AutoTokenizer.from_pretrained(plm, revision="step3000")
tokenizer.padding_side = 'left'
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
print(f"{tokenizer.pad_token}: {tokenizer.pad_token_id}")

<|pad|>: 50278


In [4]:
!pip install islab-opendeid



In [26]:
from torch.utils.data import DataLoader
from islab.aicup import collate_batch_with_prompt_template

train_data = list(sub_datasets2)
train_dataloader = DataLoader(train_data, batch_size=3, shuffle=False, collate_fn=lambda batch: collate_batch_with_prompt_template(batch, tokenizer))
titer = iter(train_dataloader)
tks, labels, masks= next(titer)
print(tks.shape)
next(iter(titer))

torch.Size([3, 23])


(tensor([[50278, 50278, 50278, 50278, 50278, 50278, 50278, 50278, 50278, 50278,
          50278, 50278, 50278,     0, 10118,  1621,    27, 50276,  2693,    39,
            520, 29195, 50279,  1838, 20872,    27, 15630,    39,   520, 29195,
            209, 50277],
         [50278, 50278, 50278, 50278, 50278, 50278, 50278, 50278, 50278, 50278,
          50278, 50278, 50278, 50278, 50278, 50278, 50278, 50278, 50278, 50278,
              0, 14311,  4379, 50279,  1267,  1848,  2025,    27, 14311,  4379,
            209, 50277],
         [    0,   416,  1400, 42525, 50276,    53,  1719, 50276,  1235,  2759,
          50279,    36,  7400,    27,   416,  1400, 42525,    61,    79, 19247,
             27,   308,  1719,    61,    79,    59,  3123,    27,  2456,  2759,
            209, 50277]]),
 tensor([[ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
           -100,  -100,  -100,     0, 10118,  1621,    27, 50276,  2693,    39,
            520, 29195, 50279,  1838, 20872

In [27]:
results = tokenizer(["Lab No: 14H02780", "“STOCKDALE” 653 MONAGHAN RD"], padding=True)
print(results['input_ids'])
print()
print(results['input_ids'][0])
print(tokenizer.decode(results['input_ids'][0]))
print(results['input_ids'][1])
print(tokenizer.decode(results['input_ids'][1]))

[[50278, 50278, 50278, 50278, 50278, 50278, 21663, 1621, 27, 1638, 41, 16604, 1438], [1628, 1267, 9466, 37, 23502, 668, 721, 3357, 33995, 2696, 41, 1539, 28613]]

[50278, 50278, 50278, 50278, 50278, 50278, 21663, 1621, 27, 1638, 41, 16604, 1438]


2023-12-02 13:25:03.431993: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-12-02 13:25:03.433069: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-02 13:25:03.447111: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-02 13:25:03.447134: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-02 13:25:03.447147: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to regi

<|pad|><|pad|><|pad|><|pad|><|pad|><|pad|>Lab No: 14H02780
[1628, 1267, 9466, 37, 23502, 668, 721, 3357, 33995, 2696, 41, 1539, 28613]
“STOCKDALE” 653 MONAGHAN RD


In [28]:
results = tokenizer(
    [f"{bos} 9364819.RAN\\nMINTANIA, JEFFRY {sep} ID: 9364819.RAN\\nNAME: MINTANIA, JEFFRY {eos}",
     f"{bos} This is a sentence {sep} PHI: NULL {eos}"],
    padding=True
)
print(results['attention_mask'][0])
print(results['attention_mask'][1])
print(tokenizer.decode(results['input_ids'][0]))
print(tokenizer.decode(results['input_ids'][1]))

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
<|endoftext|> 9364819.RAN\nMINTANIA, JEFFRY 

####

 ID: 9364819.RAN\nNAME: MINTANIA, JEFFRY <|END|>
<|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|endoftext|> This is a sentence 

####

 PHI: NULL <|END|>


In [14]:
from islab.aicup import OpenDeidBatchSampler

BATCH_SIZE = 10
bucket_train_dataloader = DataLoader(train_data,
                                     batch_sampler=OpenDeidBatchSampler(train_data, BATCH_SIZE),
                                     collate_fn=lambda batch: collate_batch_with_prompt_template(batch, tokenizer),
                                     pin_memory=True)

# for idx, batch in enumerate(bucket_train_dataloader):
#     print(batch)
#     print(batch[0].shape)
#     print(batch[1].shape)
#     break

### Model

In [16]:
from transformers import AutoConfig
# the model config to which we add the special tokens
config = AutoConfig.from_pretrained(plm,
                                    bos_token_id=tokenizer.bos_token_id,
                                    eos_token_id=tokenizer.eos_token_id,
                                    pad_token_id=tokenizer.pad_token_id,
                                    sep_token_id=tokenizer.sep_token_id,
                                    output_hidden_states=False)

model = AutoModelForCausalLM.from_pretrained(plm, revision="step3000", config=config)
model

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 768)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-11): 12 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=768, out_features=2304, bias=True)
          (dense): Linear(in_features=768, out_features=768, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=768, out_features=3072, bias=True)
          (dense_4h_to_h): Linear(in_features=3072, out_features=768, bias=True)
          

In [17]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Use Schedualr CosineAnnealingLR

In [18]:
from transformers import get_linear_schedule_with_warmup
from torch.optim import AdamW

EPOCHS = 100  # CHANGE TO THE NUMBER OF EPOCHS YOU WANT
optimizer = AdamW(model.parameters(), lr=2e-5)  # YOU CAN ADJUST LEARNING RATE

# You may want to adjust the number of warmup steps based on your specific use case
total_steps = len(bucket_train_dataloader) * EPOCHS
warmup_steps = int(0.1 * total_steps)  # 10% of total steps for warmup

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps)

model.resize_token_embeddings(len(tokenizer))
model.to(device)

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50280, 768)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-11): 12 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=768, out_features=2304, bias=True)
          (dense): Linear(in_features=768, out_features=768, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=768, out_features=3072, bias=True)
          (dense_4h_to_h): Linear(in_features=3072, out_features=768, bias=True)
          

---

In [12]:
from transformers import get_linear_schedule_with_warmup
from torch.optim import AdamW

EPOCHS = 100 # CHANGE TO THE NUMBER OF EPOCHS YOU WANT
optimizer = AdamW(model.parameters(),lr=2e-5) # YOU CAN ADJUST LEARNING RATE

model.resize_token_embeddings(len(tokenizer))
model.to(device)

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50280, 768)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-11): 12 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=768, out_features=2304, bias=True)
          (dense): Linear(in_features=768, out_features=768, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=768, out_features=3072, bias=True)
          (dense_4h_to_h): Linear(in_features=3072, out_features=768, bias=True)
          

In [19]:
from tqdm import tqdm, trange

global_step = 0
total_loss = 0

model.train()
for epoch in trange(EPOCHS, desc="Epoch"):
    model.train()
    total_loss = 0

    # Training loop
    for step, (seqs, labels, masks) in enumerate(bucket_train_dataloader):
        seqs = seqs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)
        model.zero_grad()
        outputs = model(seqs, labels=labels, attention_mask=masks)
        logits = outputs.logits
        loss = outputs.loss
        loss = loss.mean()

        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()  # Update learning rate

    avg_train_loss = total_loss / len(bucket_train_dataloader)
    print("Average train loss: {}".format(avg_train_loss))


Epoch:   1%|          | 1/100 [05:35<9:14:23, 335.99s/it]

Average train loss: 2.5601930281843592


Epoch:   2%|▏         | 2/100 [11:11<9:08:36, 335.88s/it]

Average train loss: 1.4473368288339243


Epoch:   3%|▎         | 3/100 [16:48<9:03:22, 336.11s/it]

Average train loss: 1.2778326748282882


Epoch:   4%|▍         | 4/100 [22:24<8:57:53, 336.18s/it]

Average train loss: 1.173478786338528


Epoch:   5%|▌         | 5/100 [28:00<8:52:28, 336.30s/it]

Average train loss: 1.0916304101480105


Epoch:   6%|▌         | 6/100 [33:37<8:46:55, 336.33s/it]

Average train loss: 1.0201415771444595


Epoch:   7%|▋         | 7/100 [39:13<8:41:14, 336.28s/it]

Average train loss: 0.9572178069157553


Epoch:   8%|▊         | 8/100 [44:49<8:35:31, 336.21s/it]

Average train loss: 0.9007840338526745


Epoch:   9%|▉         | 9/100 [50:26<8:30:10, 336.38s/it]

Average train loss: 0.8523634562806075


Epoch:  10%|█         | 10/100 [56:02<8:24:31, 336.36s/it]

Average train loss: 0.8115654499344767


Epoch:  11%|█         | 11/100 [1:01:39<8:18:59, 336.40s/it]

Average train loss: 0.771610991274625


Epoch:  12%|█▏        | 12/100 [1:07:15<8:13:33, 336.51s/it]

Average train loss: 0.7310598299796437


Epoch:  13%|█▎        | 13/100 [1:12:52<8:07:50, 336.44s/it]

Average train loss: 0.700012907116612


Epoch:  14%|█▍        | 14/100 [1:18:28<8:02:08, 336.37s/it]

Average train loss: 0.6757130681490234


Epoch:  15%|█▌        | 15/100 [1:24:05<7:56:39, 336.47s/it]

Average train loss: 0.656273893564178


Epoch:  16%|█▌        | 16/100 [1:29:41<7:50:55, 336.38s/it]

Average train loss: 0.6402844757834856


Epoch:  17%|█▋        | 17/100 [1:35:17<7:45:09, 336.26s/it]

Average train loss: 0.6267141600986164


Epoch:  18%|█▊        | 18/100 [1:40:53<7:39:25, 336.17s/it]

Average train loss: 0.6159600615093829


Epoch:  19%|█▉        | 19/100 [1:46:29<7:33:40, 336.06s/it]

Average train loss: 0.6061150962038396


Epoch:  20%|██        | 20/100 [1:52:05<7:28:17, 336.22s/it]

Average train loss: 0.5983424324552742


Epoch:  21%|██        | 21/100 [1:57:41<7:22:37, 336.16s/it]

Average train loss: 0.5922606237731964


Epoch:  22%|██▏       | 22/100 [2:03:18<7:17:07, 336.25s/it]

Average train loss: 0.5859436112136466


Epoch:  23%|██▎       | 23/100 [2:08:54<7:11:31, 336.25s/it]

Average train loss: 0.581218396375094


Epoch:  24%|██▍       | 24/100 [2:14:30<7:05:58, 336.30s/it]

Average train loss: 0.5764841385942414


Epoch:  25%|██▌       | 25/100 [2:20:08<7:00:47, 336.63s/it]

Average train loss: 0.5724400024844204


Epoch:  26%|██▌       | 26/100 [2:25:46<6:55:41, 337.04s/it]

Average train loss: 0.5690928642820833


Epoch:  27%|██▋       | 27/100 [2:31:23<6:50:13, 337.17s/it]

Average train loss: 0.5656771854983029


Epoch:  28%|██▊       | 28/100 [2:37:00<6:44:37, 337.19s/it]

Average train loss: 0.5630730386612388


Epoch:  29%|██▉       | 29/100 [2:42:39<6:39:22, 337.51s/it]

Average train loss: 0.5607425888407301


Epoch:  30%|███       | 30/100 [2:48:17<6:34:08, 337.83s/it]

Average train loss: 0.5581429246672285


Epoch:  31%|███       | 31/100 [2:53:56<6:28:58, 338.24s/it]

Average train loss: 0.5562751325305728


Epoch:  32%|███▏      | 32/100 [2:59:35<6:23:34, 338.45s/it]

Average train loss: 0.5541048622644892


Epoch:  33%|███▎      | 33/100 [3:05:15<6:18:10, 338.67s/it]

Average train loss: 0.552552593377096


Epoch:  34%|███▍      | 34/100 [3:10:51<6:11:54, 338.10s/it]

Average train loss: 0.5509466727016932


Epoch:  35%|███▌      | 35/100 [3:16:27<6:05:33, 337.44s/it]

Average train loss: 0.5487107232149739


Epoch:  36%|███▌      | 36/100 [3:22:03<5:59:28, 337.01s/it]

Average train loss: 0.5475236859864656


Epoch:  37%|███▋      | 37/100 [3:27:40<5:53:40, 336.83s/it]

Average train loss: 0.545828739492071


Epoch:  38%|███▊      | 38/100 [3:33:16<5:47:54, 336.68s/it]

Average train loss: 0.5447940610133056


Epoch:  39%|███▉      | 39/100 [3:38:52<5:42:07, 336.51s/it]

Average train loss: 0.5431834370464157


Epoch:  40%|████      | 40/100 [3:44:28<5:36:24, 336.41s/it]

Average train loss: 0.5419275552685425


Epoch:  41%|████      | 41/100 [3:50:05<5:30:45, 336.37s/it]

Average train loss: 0.5405835720397334


Epoch:  42%|████▏     | 42/100 [3:55:41<5:25:18, 336.52s/it]

Average train loss: 0.5395310722886207


Epoch:  43%|████▎     | 43/100 [4:01:18<5:19:35, 336.40s/it]

Average train loss: 0.5388481446484128


Epoch:  44%|████▍     | 44/100 [4:06:53<5:13:49, 336.24s/it]

Average train loss: 0.5377035442596806


Epoch:  45%|████▌     | 45/100 [4:12:30<5:08:11, 336.21s/it]

Average train loss: 0.5369210922872713


Epoch:  46%|████▌     | 46/100 [4:18:06<5:02:39, 336.28s/it]

Average train loss: 0.5358950880197326


Epoch:  47%|████▋     | 47/100 [4:23:42<4:57:04, 336.32s/it]

Average train loss: 0.5352059559014268


Epoch:  48%|████▊     | 48/100 [4:29:18<4:51:23, 336.23s/it]

Average train loss: 0.5341924305750236


Epoch:  49%|████▉     | 49/100 [4:34:55<4:45:46, 336.22s/it]

Average train loss: 0.533172744634187


Epoch:  50%|█████     | 50/100 [4:40:31<4:40:11, 336.23s/it]

Average train loss: 0.5324954117042907


Epoch:  51%|█████     | 51/100 [4:46:07<4:34:33, 336.20s/it]

Average train loss: 0.5316611349320965


Epoch:  52%|█████▏    | 52/100 [4:51:43<4:28:59, 336.24s/it]

Average train loss: 0.5305069284314498


Epoch:  53%|█████▎    | 53/100 [4:57:20<4:23:25, 336.29s/it]

Average train loss: 0.5295653991164695


Epoch:  54%|█████▍    | 54/100 [5:02:56<4:17:47, 336.25s/it]

Average train loss: 0.5287456500164927


Epoch:  55%|█████▌    | 55/100 [5:08:32<4:12:09, 336.22s/it]

Average train loss: 0.5281905297490002


Epoch:  56%|█████▌    | 56/100 [5:14:09<4:06:37, 336.32s/it]

Average train loss: 0.5279876952940613


Epoch:  57%|█████▋    | 57/100 [5:19:45<4:01:01, 336.31s/it]

Average train loss: 0.5275664393067626


Epoch:  58%|█████▊    | 58/100 [5:25:21<3:55:23, 336.27s/it]

Average train loss: 0.5267361606669196


Epoch:  59%|█████▉    | 59/100 [5:30:57<3:49:43, 336.17s/it]

Average train loss: 0.5263412073372568


Epoch:  60%|██████    | 60/100 [5:36:33<3:44:10, 336.27s/it]

Average train loss: 0.5258057124675165


Epoch:  61%|██████    | 61/100 [5:42:10<3:38:36, 336.31s/it]

Average train loss: 0.5252148738297309


Epoch:  62%|██████▏   | 62/100 [5:47:46<3:33:02, 336.39s/it]

Average train loss: 0.5245514608357059


Epoch:  63%|██████▎   | 63/100 [5:53:23<3:27:25, 336.38s/it]

Average train loss: 0.5237261419963658


Epoch:  64%|██████▍   | 64/100 [5:58:59<3:21:47, 336.32s/it]

Average train loss: 0.5229606457445566


Epoch:  65%|██████▌   | 65/100 [6:04:35<3:16:11, 336.34s/it]

Average train loss: 0.5227027161102233


Epoch:  66%|██████▌   | 66/100 [6:10:12<3:10:38, 336.41s/it]

Average train loss: 0.5223028501059319


Epoch:  67%|██████▋   | 67/100 [6:15:49<3:05:02, 336.45s/it]

Average train loss: 0.5216541679619344


Epoch:  68%|██████▊   | 68/100 [6:21:25<2:59:27, 336.48s/it]

Average train loss: 0.5209023887921013


Epoch:  69%|██████▉   | 69/100 [6:27:01<2:53:49, 336.42s/it]

Average train loss: 0.5203210657523748


Epoch:  70%|███████   | 70/100 [6:32:37<2:48:07, 336.26s/it]

Average train loss: 0.5199536644423753


Epoch:  71%|███████   | 71/100 [6:38:14<2:42:33, 336.34s/it]

Average train loss: 0.5190599049415842


Epoch:  72%|███████▏  | 72/100 [6:43:50<2:36:57, 336.33s/it]

Average train loss: 0.518729240564836


Epoch:  73%|███████▎  | 73/100 [6:49:27<2:31:21, 336.37s/it]

Average train loss: 0.5179577427920586


Epoch:  74%|███████▍  | 74/100 [6:55:02<2:25:40, 336.18s/it]

Average train loss: 0.5175388302512718


Epoch:  75%|███████▌  | 75/100 [7:00:38<2:20:03, 336.15s/it]

Average train loss: 0.5168864992641264


Epoch:  76%|███████▌  | 76/100 [7:06:14<2:14:25, 336.05s/it]

Average train loss: 0.5164350028275341


Epoch:  77%|███████▋  | 77/100 [7:11:50<2:08:47, 335.98s/it]

Average train loss: 0.5159432510887261


Epoch:  78%|███████▊  | 78/100 [7:17:26<2:03:13, 336.05s/it]

Average train loss: 0.515351508126464


Epoch:  79%|███████▉  | 79/100 [7:23:02<1:57:35, 335.96s/it]

Average train loss: 0.5147330040636489


Epoch:  80%|████████  | 80/100 [7:28:38<1:51:58, 335.94s/it]

Average train loss: 0.5143622893507311


Epoch:  81%|████████  | 81/100 [7:34:13<1:46:19, 335.79s/it]

Average train loss: 0.5136500190537782


Epoch:  82%|████████▏ | 82/100 [7:39:49<1:40:46, 335.91s/it]

Average train loss: 0.5131237574333546


Epoch:  83%|████████▎ | 83/100 [7:45:25<1:35:10, 335.90s/it]

Average train loss: 0.5125026343257482


Epoch:  84%|████████▍ | 84/100 [7:51:01<1:29:33, 335.83s/it]

Average train loss: 0.5120550097987074


Epoch:  85%|████████▌ | 85/100 [7:56:37<1:23:58, 335.92s/it]

Average train loss: 0.511348391439368


Epoch:  86%|████████▌ | 86/100 [8:02:13<1:18:23, 335.93s/it]

Average train loss: 0.5109604348797979


Epoch:  87%|████████▋ | 87/100 [8:07:49<1:12:45, 335.84s/it]

Average train loss: 0.5103326064389622


Epoch:  88%|████████▊ | 88/100 [8:13:24<1:07:09, 335.79s/it]

Average train loss: 0.5097275880999788


Epoch:  89%|████████▉ | 89/100 [8:19:00<1:01:34, 335.84s/it]

Average train loss: 0.509140028299791


Epoch:  90%|█████████ | 90/100 [8:24:36<55:58, 335.80s/it]  

Average train loss: 0.5084720283624983


Epoch:  91%|█████████ | 91/100 [8:30:12<50:21, 335.75s/it]

Average train loss: 0.5079433069580692


Epoch:  92%|█████████▏| 92/100 [8:35:48<44:46, 335.83s/it]

Average train loss: 0.507269284287668


Epoch:  93%|█████████▎| 93/100 [8:41:23<39:10, 335.81s/it]

Average train loss: 0.5065837897864968


Epoch:  94%|█████████▍| 94/100 [8:46:59<33:35, 335.87s/it]

Average train loss: 0.5059154569906412


Epoch:  95%|█████████▌| 95/100 [8:52:35<27:58, 335.78s/it]

Average train loss: 0.505381978239001


Epoch:  96%|█████████▌| 96/100 [8:58:11<22:23, 335.89s/it]

Average train loss: 0.504607014822733


Epoch:  97%|█████████▋| 97/100 [9:03:47<16:47, 335.73s/it]

Average train loss: 0.5038631665693657


Epoch:  98%|█████████▊| 98/100 [9:09:23<11:11, 335.81s/it]

Average train loss: 0.5032372264254388


Epoch:  99%|█████████▉| 99/100 [9:14:58<05:35, 335.85s/it]

Average train loss: 0.502499226151981


Epoch: 100%|██████████| 100/100 [9:20:34<00:00, 336.35s/it]

Average train loss: 0.5017529559689524





In [13]:
from tqdm import tqdm,trange

global_step = 0
total_loss = 0

model.train()
for _ in trange(EPOCHS, desc="Epoch"):
    model.train()
    total_loss = 0

    # Training loop
    predictions , true_labels = [], []

    for step, (seqs, labels, masks) in enumerate(bucket_train_dataloader):
        seqs = seqs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)
        model.zero_grad()
        outputs = model(seqs, labels=labels, attention_mask=masks)
        logits = outputs.logits
        loss = outputs.loss
        loss = loss.mean()

        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    avg_train_loss = total_loss / len(bucket_train_dataloader)
    print("Average train loss: {}".format(avg_train_loss))

Epoch:   1%|▏         | 1/80 [04:36<6:03:30, 276.08s/it]

Average train loss: 1.442842539335673


Epoch:   2%|▎         | 2/80 [09:12<5:58:48, 276.00s/it]

Average train loss: 1.1517535237822736


Epoch:   4%|▍         | 3/80 [13:47<5:54:09, 275.97s/it]

Average train loss: 1.0169482673895678


Epoch:   5%|▌         | 4/80 [18:25<5:50:07, 276.41s/it]

Average train loss: 0.9128214155309896


Epoch:   6%|▋         | 5/80 [23:01<5:45:25, 276.34s/it]

Average train loss: 0.8273356141181993


Epoch:   8%|▊         | 6/80 [27:36<5:40:09, 275.80s/it]

Average train loss: 0.7596628005095692


Epoch:   9%|▉         | 7/80 [32:11<5:35:36, 275.85s/it]

Average train loss: 0.7116693098095077


Epoch:  10%|█         | 8/80 [36:46<5:30:31, 275.44s/it]

Average train loss: 0.6783327898962754


Epoch:  11%|█▏        | 9/80 [41:23<5:26:32, 275.94s/it]

Average train loss: 0.6546442962513381


Epoch:  12%|█▎        | 10/80 [45:59<5:21:53, 275.90s/it]

Average train loss: 0.6372504577500203


Epoch:  14%|█▍        | 11/80 [50:36<5:17:52, 276.42s/it]

Average train loss: 0.6231584953995278


Epoch:  15%|█▌        | 12/80 [55:12<5:12:49, 276.02s/it]

Average train loss: 0.6118689715451462


Epoch:  16%|█▋        | 13/80 [59:48<5:08:21, 276.14s/it]

Average train loss: 0.6029626823329057


Epoch:  18%|█▊        | 14/80 [1:04:24<5:03:51, 276.24s/it]

Average train loss: 0.5958739893932777


Epoch:  19%|█▉        | 15/80 [1:09:01<4:59:29, 276.45s/it]

Average train loss: 0.5904915366870539


Epoch:  20%|██        | 16/80 [1:13:38<4:54:56, 276.51s/it]

Average train loss: 0.5848746387857614


Epoch:  21%|██▏       | 17/80 [1:18:15<4:50:28, 276.64s/it]

Average train loss: 0.5811601310291612


Epoch:  22%|██▎       | 18/80 [1:22:50<4:45:20, 276.14s/it]

Average train loss: 0.5771183906600512


Epoch:  24%|██▍       | 19/80 [1:27:25<4:40:27, 275.86s/it]

Average train loss: 0.5738044111598788


Epoch:  25%|██▌       | 20/80 [1:32:02<4:36:05, 276.10s/it]

Average train loss: 0.5712252082750023


Epoch:  26%|██▋       | 21/80 [1:36:38<4:31:30, 276.10s/it]

Average train loss: 0.568978878365364


Epoch:  28%|██▊       | 22/80 [1:41:15<4:27:16, 276.50s/it]

Average train loss: 0.5666784457194148


Epoch:  29%|██▉       | 23/80 [1:45:50<4:22:02, 275.83s/it]

Average train loss: 0.5645325814412552


Epoch:  30%|███       | 24/80 [1:50:25<4:17:15, 275.64s/it]

Average train loss: 0.563038740850563


Epoch:  31%|███▏      | 25/80 [1:55:00<4:12:38, 275.61s/it]

Average train loss: 0.5608579501071204


Epoch:  32%|███▎      | 26/80 [1:59:36<4:08:04, 275.63s/it]

Average train loss: 0.5592844972351241


Epoch:  34%|███▍      | 27/80 [2:04:13<4:03:49, 276.03s/it]

Average train loss: 0.5581695574487258


Epoch:  35%|███▌      | 28/80 [2:08:48<3:59:03, 275.84s/it]

Average train loss: 0.55625007318752


Epoch:  36%|███▋      | 29/80 [2:13:24<3:54:27, 275.83s/it]

Average train loss: 0.5555531319279842


Epoch:  38%|███▊      | 30/80 [2:18:00<3:49:57, 275.95s/it]

Average train loss: 0.5540764739377618


Epoch:  39%|███▉      | 31/80 [2:22:35<3:44:57, 275.45s/it]

Average train loss: 0.5529049945636574


Epoch:  40%|████      | 32/80 [2:27:12<3:40:40, 275.85s/it]

Average train loss: 0.5522103007210326


Epoch:  41%|████▏     | 33/80 [2:31:49<3:36:26, 276.31s/it]

Average train loss: 0.5513242121071237


Epoch:  42%|████▎     | 34/80 [2:36:23<3:31:18, 275.63s/it]

Average train loss: 0.5503606746664336


Epoch:  44%|████▍     | 35/80 [2:40:59<3:26:44, 275.65s/it]

Average train loss: 0.5495187734229263


Epoch:  45%|████▌     | 36/80 [2:45:35<3:22:20, 275.93s/it]

Average train loss: 0.5490748877053212


Epoch:  46%|████▋     | 37/80 [2:50:13<3:18:07, 276.45s/it]

Average train loss: 0.5483137404891283


Epoch:  48%|████▊     | 38/80 [2:54:49<3:13:26, 276.35s/it]

Average train loss: 0.5472644096885751


Epoch:  49%|████▉     | 39/80 [2:59:26<3:08:55, 276.48s/it]

Average train loss: 0.546560507553768


Epoch:  50%|█████     | 40/80 [3:04:04<3:04:39, 277.00s/it]

Average train loss: 0.5457671216720404


Epoch:  51%|█████▏    | 41/80 [3:08:40<2:59:57, 276.85s/it]

Average train loss: 0.5452311390794776


Epoch:  52%|█████▎    | 42/80 [3:13:19<2:55:36, 277.28s/it]

Average train loss: 0.5446167657835024


Epoch:  54%|█████▍    | 43/80 [3:17:56<2:50:59, 277.27s/it]

Average train loss: 0.5441867661547765


Epoch:  55%|█████▌    | 44/80 [3:22:32<2:46:07, 276.88s/it]

Average train loss: 0.5434798066919638


Epoch:  56%|█████▋    | 45/80 [3:27:08<2:41:22, 276.64s/it]

Average train loss: 0.5427655400351984


Epoch:  57%|█████▊    | 46/80 [3:31:44<2:36:36, 276.36s/it]

Average train loss: 0.5424133726573669


Epoch:  59%|█████▉    | 47/80 [3:36:20<2:31:55, 276.24s/it]

Average train loss: 0.5419986181391037


Epoch:  60%|██████    | 48/80 [3:40:57<2:27:27, 276.47s/it]

Average train loss: 0.5413753943967294


Epoch:  61%|██████▏   | 49/80 [3:45:33<2:22:45, 276.32s/it]

Average train loss: 0.5408088721817507


Epoch:  62%|██████▎   | 50/80 [3:50:09<2:18:08, 276.29s/it]

Average train loss: 0.5405163120439219


Epoch:  64%|██████▍   | 51/80 [3:54:44<2:13:23, 276.00s/it]

Average train loss: 0.5402503996492327


Epoch:  65%|██████▌   | 52/80 [3:59:19<2:08:38, 275.67s/it]

Average train loss: 0.5395953111232691


Epoch:  66%|██████▋   | 53/80 [4:03:54<2:03:52, 275.29s/it]

Average train loss: 0.5395099081568439


Epoch:  68%|██████▊   | 54/80 [4:08:30<1:59:23, 275.52s/it]

Average train loss: 0.5388398020774893


Epoch:  69%|██████▉   | 55/80 [4:13:06<1:54:53, 275.74s/it]

Average train loss: 0.5385240430529646


Epoch:  70%|███████   | 56/80 [4:17:42<1:50:22, 275.92s/it]

Average train loss: 0.5379123234145525


Epoch:  71%|███████▏  | 57/80 [4:22:20<1:45:56, 276.37s/it]

Average train loss: 0.5378547423196848


Epoch:  72%|███████▎  | 58/80 [4:26:56<1:41:18, 276.27s/it]

Average train loss: 0.537496828408516


Epoch:  74%|███████▍  | 59/80 [4:31:32<1:36:44, 276.41s/it]

Average train loss: 0.5372486995871996


Epoch:  75%|███████▌  | 60/80 [4:36:09<1:32:06, 276.33s/it]

Average train loss: 0.5367629111670317


Epoch:  76%|███████▋  | 61/80 [4:40:44<1:27:25, 276.10s/it]

Average train loss: 0.5355501326486196


Epoch:  78%|███████▊  | 62/80 [4:45:21<1:22:52, 276.25s/it]

Average train loss: 0.5358519975628924


Epoch:  79%|███████▉  | 63/80 [4:49:57<1:18:16, 276.27s/it]

Average train loss: 0.5362295084695066


Epoch:  80%|████████  | 64/80 [4:54:33<1:13:39, 276.19s/it]

Average train loss: 0.5353636642095214


Epoch:  81%|████████▏ | 65/80 [4:59:10<1:09:06, 276.42s/it]

Average train loss: 0.5348666255950159


Epoch:  82%|████████▎ | 66/80 [5:03:45<1:04:24, 276.05s/it]

Average train loss: 0.5348157185093035


Epoch:  84%|████████▍ | 67/80 [5:08:20<59:42, 275.55s/it]  

Average train loss: 0.5344709430949537


Epoch:  85%|████████▌ | 68/80 [5:12:56<55:09, 275.76s/it]

Average train loss: 0.5342056549821546


Epoch:  86%|████████▋ | 69/80 [5:17:32<50:34, 275.85s/it]

Average train loss: 0.5341419651498693


Epoch:  88%|████████▊ | 70/80 [5:22:07<45:57, 275.73s/it]

Average train loss: 0.5335946372256974


Epoch:  89%|████████▉ | 71/80 [5:26:43<41:20, 275.57s/it]

Average train loss: 0.5336648525222984


Epoch:  90%|█████████ | 72/80 [5:31:18<36:44, 275.61s/it]

Average train loss: 0.5337227619187017


Epoch:  91%|█████████▏| 73/80 [5:35:53<32:08, 275.49s/it]

Average train loss: 0.5329698195508652


Epoch:  92%|█████████▎| 74/80 [5:40:30<27:34, 275.67s/it]

Average train loss: 0.5329976256239264


Epoch:  94%|█████████▍| 75/80 [5:45:05<22:57, 275.54s/it]

Average train loss: 0.5327764942997166


Epoch:  95%|█████████▌| 76/80 [5:49:41<18:23, 275.83s/it]

Average train loss: 0.5324056061692296


Epoch:  96%|█████████▋| 77/80 [5:54:15<13:45, 275.33s/it]

Average train loss: 0.5326473743606974


Epoch:  98%|█████████▊| 78/80 [5:58:53<09:12, 276.10s/it]

Average train loss: 0.5320236786312514


Epoch:  99%|█████████▉| 79/80 [6:03:30<04:36, 276.41s/it]

Average train loss: 0.5318413260266524


Epoch: 100%|██████████| 80/80 [6:08:07<00:00, 276.10s/it]

Average train loss: 0.5319153038941056





In [29]:
import torch
model = torch.load('./models/1201_final.pt')
model.eval() # start eval

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50280, 768)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-11): 12 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=768, out_features=2304, bias=True)
          (dense): Linear(in_features=768, out_features=768, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=768, out_features=3072, bias=True)
          (dense_4h_to_h): Linear(in_features=3072, out_features=768, bias=True)
          

### Read the final file to be predicted

In [30]:
from datasets import load_dataset, Features, Value
valid_data = load_dataset("csv", data_files="final/opendid_test.tsv", delimiter='\t',
                          features = Features({
                              'fid': Value('string'), 'idx': Value('int64'),
                              'content': Value('string'), 'label': Value('string')}),
                              column_names=['fid', 'idx', 'content', 'label'])
valid_list= list(valid_data['train'])
valid_list

[{'fid': '1097', 'idx': 1, 'content': '433475.RDC', 'label': None},
 {'fid': '1097', 'idx': 12, 'content': 'Timmins, ELDEN', 'label': None},
 {'fid': '1097', 'idx': 27, 'content': '43J47561,43J47561', 'label': None},
 {'fid': '1097',
  'idx': 46,
  'content': 'Last edited : 7/9/2063  Page: 2',
  'label': None},
 {'fid': '1097', 'idx': 78, 'content': 'CLINICAL:', 'label': None},
 {'fid': '1097',
  'idx': 88,
  'content': 'Metastatic cancer ?colorectal primary.',
  'label': None},
 {'fid': '1097', 'idx': 128, 'content': 'MACROSCOPIC:', 'label': None},
 {'fid': '1097',
  'idx': 141,
  'content': 'Specimen labelled "Omentum secondary", consists of a piece of omentum 120 x 100 x 30mm.',
  'label': None},
 {'fid': '1097',
  'idx': 230,
  'content': 'On sectioning there are multiple fibrotic white ill-defined nodules identified.',
  'label': None},
 {'fid': '1097',
  'idx': 312,
  'content': 'Blocks: 1 to 5 - representative sections from the nodules.',
  'label': None},
 {'fid': '1097',
  'id

In [23]:
for entry in valid_list:
    if entry['content'] is None or entry['content'] == "":
        print(entry)
        break
else:
    print(" 'content' row have no Null value。")


 'content' 列中没有空值。


In [31]:
from tqdm import tqdm
from islab.aicup import aicup_predict
import io
BATCH_SIZE = 32

with open("./final/1202_ver2.txt",'w',encoding='utf8') as f:
#with io.open("answer.txt",'w',encoding='utf8') as f:
    for i in tqdm(range(0, len(valid_list), BATCH_SIZE)):
        with torch.no_grad():
            seeds = valid_list[i:i+BATCH_SIZE]
            outputs = aicup_predict(model, tokenizer, input=seeds)
            for o in outputs:
                f.write(o)
                f.write('\n')

100%|██████████| 2469/2469 [03:36<00:00, 11.43it/s]


In [20]:
model_path = r"models"

In [24]:
import os

In [17]:
import torch
# torch.save(model.state_dict(), os.path.join(model_path , 'GPT_best.pt'))
torch.save(model, os.path.join(model_path , '1201_final.pt'))

NameError: name 'model' is not defined

## Data post processing

In [32]:
f = open("./final/1202_ver2.txt", "r", encoding='utf8')
w = open("./final/finalsub_revised1202_ver2.txt", "w", encoding='utf8')

for line in f.readlines():
    if "Null" not in line.strip() and "NULL" not in line.strip():
        w.write(line)

f.close()
w.close()


In [33]:
#predicted answer
answer_path = "/home/Arthur/aicup/final/finalsub_revised1202_ver2.txt"
#test data tsv
tsv_path = "/home/Arthur/aicup/final/opendid_test.tsv"
#output
# w = open("/home/Arthur/aicup/answer/answer2", "w")
submission = "/home/Arthur/aicup/final/answer3.txt"

In [34]:
sub = open(submission, 'w', encoding="utf-8")
tsv = open(tsv_path, 'r', encoding='utf-8')

In [35]:
def detect(private, line):
  A_index = line.find(private[0])
  index = [A_index]

  #find first phi
  if A_index >= 0:
    for i in range(1, len(private)):
      index.append(line.find(private[i][:-1], index[i-1]+1))

  #can't find first phi
  else:
    for i in range(1, len(private)):
      index.append(line.find(private[i][:-1], index[i-1]+1))
  #print(private, index)
  return index

In [36]:
#tsv = open(tsv_path, 'r', encoding='utf-8')
phi = ['PATIENT', 'DOCTOR', 'USERNAME', 'PROFESSION', 'ROOM', 'DEPARTMENT', 'HOSPITAL', 'ORGANIZATION', 'STREET', 'CITY', 'STATE', 'COUNTRY', 'ZIP', 'LOCATION-OTHER', 'AGE', 'DATE',
      'TIME', 'DURATION', 'SET', 'PHONE', 'FAX', 'EMAIL', 'URL', 'IPADDR', 'SSN', 'MEDICALRECORD', 'HEALTHPLAN', 'ACCOUNT', 'LICENSE', 'VECHICLE', 'DEVICE', 'BIOID', 'IDNUM', 'OTHER']
time = ['DATE', 'TIME', 'DURATION', 'SET']
with open(answer_path, 'r', encoding="utf-8") as answer:
  tsv_line = tsv.readlines()
  tsv_index = 0
  #print(tsv_line)
  for line in answer.readlines():
    words = line.split('\t')
    file_name = words[0]
    phi_name = words[1]
    start = words[2]
    end = words[3]
    data = '\t'.join(words[4:])
    #print(line)
    #drop phi not included in rules
    if phi_name not in phi:
      continue

    #multiple phi in one line
    if '\\n' in data:
      counts = data.split('\\n')
      phi_type, phi_data, normalized = [phi_name], [], []

      if phi_name in time:
        time_split = counts[0].split('\t')
        phi_data.append(time_split[0])
        normalized.append('\t'.join(time_split[1:]).strip())
      else:
        phi_data.append(counts[0])

      for i in range(1, len(counts)):
        temp_phi = counts[i].split(':')[0]
        if temp_phi not in phi:
          continue
        if temp_phi in time:
          time_split = counts[i].split('\t')
          temp = ':'.join(time_split[0].split(':')[1:])
          normalized.append('\t'.join(time_split[1:]).strip())
        else:
          temp = ':'.join(counts[i].split(':')[1:])
        phi_type.append(temp_phi)
        phi_data.append(temp.strip())

      time_count = 0
      count = 0
      #tsv and answer not in the smae file
      while file_name > tsv_line[tsv_index].split('\t')[0]:
        tsv_index += 1
      while int(tsv_line[tsv_index].split('\t')[1]) < int(start):
        tsv_index += 1
      tsv_index -= 1
      temp_index = tsv_index
      while count < 3 and file_name == tsv_line[tsv_index].split('\t')[0]:
        split_tsv = tsv_line[tsv_index].split('\t')
        line_start = int(split_tsv[1])
        data_tsv = '\t'.join(split_tsv[2:])
        return_index = detect(phi_data, data_tsv)


        #can't find phi in this line
        if len(set(return_index)) == 1 and return_index[0] == -1:
          if int(tsv_line[tsv_index].split('\t')[1]) > int(start) and file_name == tsv_line[tsv_index].split('\t')[0]:
            #print(file_name,int(tsv_line[tsv_index].split('\t')[1]) , int(start))
            count += 1
          tsv_index += 1

        #find in this line
        else:
          #calculate start and end index
          split_tsv = tsv_line[tsv_index].split('\t')
          data_tsv = '\t'.join(split_tsv[2:])
          line_start = int(split_tsv[1])

          #print(line_start,phi_data)
          for i in range(len(return_index)):
            if return_index[i] < 0:
              continue
            temp = phi_data[i]
            phi_name = phi_type[i]
            index = return_index[i]
            if '"' in data_tsv:
              index -= 1
            start = line_start + index
            if '\n' in temp:
              end = start + len(temp) -1
              temp = temp[:-1]
            else:
              end = start + len(temp)
            #write into file
            if phi_name in time:
              sub.write('\t'.join([file_name, phi_name, str(start), str(end), temp, normalized[time_count]])+'\n')
              #print('\t'.join([file_name, phi_name, str(start), str(end), temp, normalized[time_count]]))
              time_count += 1
            else:
              sub.write('\t'.join([file_name, phi_name, str(start), str(end), temp])+'\n')
              #print('\t'.join([file_name, phi_name, str(start), str(end), temp]))
          tsv_index += 1
          break
      tsv_index = temp_index

    #no problem line
    else:
      #write into file
      sub.write('\t'.join([file_name, phi_name, str(start), str(end), data]))
      #print('\t'.join([file_name, phi_name, str(start), str(end), data]))


In [37]:
sub.close()
tsv.close()

In [39]:
def check_countries_in_text(file_path):
    # List of common country names
    countries = [
    "USA", "Japan", "China", "India", "Germany", "France", "Brazil", "Australia", "Canada", "United Kingdom",
    "Italy", "Russia", "South Korea", "Mexico", "Indonesia", "Turkey", "Netherlands", "Saudi Arabia", "Switzerland",
    "Spain", "Argentina", "Belgium", "Sweden", "Norway", "Austria", "Poland", "Iran", "Thailand", "Denmark", "South Africa",
    "Singapore", "Malaysia", "Ireland", "Israel", "Egypt", "Greece", "Philippines", "Finland", "New Zealand", "Portugal",
    "Pakistan", "Vietnam", "Colombia", "Chile", "Czech Republic", "Romania", "Hungary", "Ukraine", "Peru", "Belarus",
    "Norway", "Bangladesh", "Nigeria", "Kenya", "Morocco", "Venezuela", "Cuba", "Ecuador", "Norway", "Qatar",
    "Lebanon", "Algeria", "Tunisia", "Malta", "Kuwait", "Bahrain", "Oman", "Jordan", "Cyprus", "Sri Lanka",
    "Nepal", "Bhutan", "Mongolia", "Laos", "Cambodia", "Fiji", "Vanuatu", "Samoa", "Tonga", "Solomon Islands",
    "Palau", "Micronesia", "Marshall Islands", "Tuvalu", "Kiribati", "Comoros", "Seychelles", "Maldives", "Antarctica"
]

# You can use this expanded list in your Python script.


# You can use this expanded list in the previous script for checking countries in a text file.

    # Read the content of the text file
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            text_content = file.read()

        # Check if any country name is present in the text
        found_countries = [country for country in countries if country in text_content]

        if found_countries:
            print("The following countries were found in the text:")
            for country in found_countries:
                print(country)
        else:
            print("No country names found in the text.")
    except FileNotFoundError:
        print(f"File not found: {file_path}")
    except Exception as e:
        print(f"An error occurred: {str(e)}")


# Example usage: Replace 'your_text_file.txt' with the path to your text file
check_countries_in_text('/home/Arthur/aicup/final/opendid_test.tsv')


The following countries were found in the text:
USA
Australia
Israel
Chile
Laos


In [61]:
f = open("./answer/answer7.txt", "r")
w = open("./answer/answer8.txt", "w")



for line in f.readlines():
    # To split columns on each line, use split() instead of split('\t')
    fields = line.split()
    
    # Check whether the number of fields is 4, if not, write it into new file
    if len(fields) != 4:
        w.write(line)

f.close()
w.close()

