In [10]:
import os, sys
import numpy as np

# utils setup
current_directory = os.getcwd()
root_directory = os.path.abspath(os.path.join(current_directory, os.pardir))
sys.path.append(root_directory)

# custom utils
from utils.io import Predictions
from utils.metrics import LMR_Metrics
from utils.io import LMR_BILOU_Scrapper, LMR_JSON_Scrapper
from utils.preprocessing import Preprocess
from utils.stratify import MultiLabelNERStratify

# hg utils
from transformers import BertTokenizer, BertForTokenClassification, BertConfig
from transformers import Trainer, TrainingArguments
from datasets import load_dataset, load_metric
import torch

In [12]:
tokenizer = BertTokenizer.from_pretrained("rsuwaileh/IDRISI-LMR-EN-random-typebased")
#config = BertConfig.from_pretrained("rsuwaileh/IDRISI-LMR-EN-random-typebased", num_labels=9) 
model = BertForTokenClassification.from_pretrained("rsuwaileh/IDRISI-LMR-EN-random-typebased", ignore_mismatched_sizes=True)

Some weights of the model checkpoint at rsuwaileh/IDRISI-LMR-EN-random-typebased were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
from simpletransformers.ner import NERModel, NERArgs
import pandas as pd

# Define model arguments
model_args = NERArgs()
model_args.overwrite_output_dir = True
model_args.reprocess_input_data = True
model_args.num_train_epochs = 3
model_args.train_batch_size = 16
model_args.eval_batch_size = 16
model_args.save_steps = -1
model_args.save_model_every_epoch = False
model_args.save_eval_checkpoints = False
model_args.evaluate_during_training = True
model_args.use_multiprocessing = False

# Load the pre-trained NER model
model = NERModel(
    "bert", 
    "rsuwaileh/IDRISI-LMR-EN-random-typebased",
    use_cuda=False,
    args=model_args,
    ignore_mismatched_sizes=True
)

Some weights of the model checkpoint at rsuwaileh/IDRISI-LMR-EN-random-typebased were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at rsuwaileh/IDRISI-LMR-EN-random-typebased and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([49, 1024]) in the checkpoint and torch.Size([9, 1024]) in the model instantiat

In [5]:
df_context = pd.read_csv('../data/provided/Test.csv')
df_context = Preprocess.remove_non_ascii(df_context, column_name='text')
df_context = Preprocess.remove_usertag(df_context, column_name='text')
df_context = Preprocess.reformat_hashtag(df_context, column_name='text')
df_context = Preprocess.remove_prefix(df_context, df_type="test", text_column='text')
df_context = Preprocess.reformat_useless_char(df_context, column_name='text')
df_context.head(10)

Unnamed: 0,tweet_id,text
0,ID_1001154804658286592,What is happening to the infrastructure in New...
1,ID_1001155505459486720,SOLDER MISSING IN FLOOD.. PRAY FOR EDDISON HER...
2,ID_1001155756371136512,Police searching for missing person after deva...
3,ID_1001159445194399744,Flash Flood Tears Through Maryland Town For Se...
4,ID_1001164907587538944,Ellicott City FLOODING Pictures: Maryland Gove...
5,ID_1001178904617476096,Our Harts gos out to a Fellow Soldier missing ...
6,ID_1001179909245587456,CRAZY VIDEO. Roaring flash floods struck a Mar...
7,ID_1001180876548591616,I liked a video BREAKING: Devastating floodi...
8,ID_1001182906130280448,Thank you to the first responders who are taki...
9,ID_1001185240256311296,Ellicott City floods: Maryland officials asses...


In [6]:
ids = df_context["tweet_id"].values
tweets = df_context["text"].values

# Make prediction
predictions, raw_outputs = model.predict(tweets)

# Save submission file
results = []
for sentence in predictions:
    result = " ".join([word for d in sentence for word, tag in d.items() if tag != 'O'])
    if result == "":
        result = " "
    results.append(result)

  0%|          | 0/2942 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/184 [00:00<?, ?it/s]

In [8]:
raw_outputs

[[{'What': [[0.03784256,
     0.19766283,
     -0.37185577,
     0.07712317,
     1.2361199,
     0.3053896,
     -0.73114747,
     -0.25138906,
     0.30232707]]},
  {'is': [[-0.020103313,
     0.32187343,
     -0.3307772,
     0.13694805,
     1.2305558,
     0.31087786,
     -0.8604961,
     -0.5738647,
     0.45922554]]},
  {'happening': [[0.027061777,
     0.2799042,
     -0.27152506,
     0.10237,
     1.2139256,
     0.342583,
     -0.9086,
     -0.46250215,
     0.41861114]]},
  {'to': [[-0.052371968,
     0.28595436,
     -0.33315676,
     0.20459457,
     1.2346951,
     0.24596669,
     -1.000634,
     -0.46841517,
     0.3240819]]},
  {'the': [[-0.15472904,
     0.30694345,
     -0.38831896,
     0.14016593,
     1.1011196,
     0.29686335,
     -0.96535,
     -0.49485192,
     0.42604482]]},
  {'infrastructure': [[0.042218402,
     0.025269171,
     -0.5109633,
     0.0517679,
     1.0303974,
     0.23733483,
     -0.921813,
     -0.49281716,
     0.45033342]]},
  {'in': [

In [9]:
Predictions.to_csv(ids, results)

Saved predictions to ../submissions/submission_15.csv


In [None]:
# Some Quick postprocessing
def remove_duplicate_words(file_path):
    df = pd.read_csv(file_path)
    def remove_duplicates(location):
        if pd.isna(location):
            return location
        words = location.split()
        unique_words = list(dict.fromkeys(words))
        loc = ' '.join(unique_words)
        return loc if loc != '' else ' '
    df['location'] = df['location'].apply(remove_duplicates)
    return df

# Usage
df_cleaned = remove_duplicate_words('../submissions/submission_15.csv')
df_cleaned.to_csv('../submissions/submission_15_post.csv', index=False)

In [14]:
location = "toto est toto toto un enfant"
words = location.split()
unique_words = list(dict.fromkeys(words))
loc = ' '.join(unique_words)
print(loc)

toto est un enfant
