<a href="https://colab.research.google.com/github/HAL22/Kaggle-Competitions/blob/main/Disaster_Tweets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Notebook for the Kaggle competition**: [disaster-tweets](https://www.kaggle.com/competitions/nlp-getting-started)


*   The data can be found on the Kaggle competition page
*   Different versions of this notebook are available on Kaggle


**Best score of notebook**: 0.83512

**Best rank**: 102/1167

In [None]:
%%capture
!pip install datasets transformers[sentencepiece]
!apt install git-lfs
!pip install evaluate
!pip install -U huggingface_hub
!pip install transformers --upgrade

In [None]:
%%capture
from datasets import load_dataset
from datasets import Dataset, DatasetDict, load_dataset
import evaluate
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from transformers import AutoConfig
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding

In [None]:
# Hub id of the base checkpoint; passed to both AutoTokenizer.from_pretrained
# and AutoModelForSequenceClassification.from_pretrained further down.
model = "distilbert-base-uncased"

# Training hyperparameters.
# These assignments must NOT end with a trailing comma: `x = 1,` binds the
# one-element tuple `(1,)`, not the scalar `1`.
output_dir = "disaster"
learning_rate = 2e-5
per_device_train_batch_size = 16
per_device_eval_batch_size = 16
# Kept in sync with the value passed to TrainingArguments in this notebook.
num_train_epochs = 4
weight_decay = 0.01
evaluation_strategy = "epoch"
save_strategy = "epoch"
load_best_model_at_end = True

In [None]:
# Read the competition CSVs from the Kaggle input directory.
pd_train = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
pd_test = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")

# Preview the first rows of the training frame as the cell output.
pd_train.head()

In [None]:
# Training rows whose `text` repeats an earlier row (the first occurrence of
# each text is not included in this view).
duplicates_train = pd_train.loc[pd_train.duplicated(subset=["text"])]
duplicates_train.head()

In [None]:
# Test rows whose `text` repeats an earlier row (the first occurrence of
# each text is not included in this view).
duplicates_test = pd_test.loc[pd_test.duplicated(subset=["text"])]
duplicates_test.head()

In [None]:
# Remove every training row whose `text` occurs more than once.
# keep=False flags all copies, so no representative of a repeated text survives.
repeated_text = pd_train.duplicated(subset=["text"], keep=False)
pd_train = pd_train[~repeated_text]
pd_train.head()

In [None]:
# Rename the `target` column to `label`.
pd_train = pd_train.rename(columns={"target": "label"})

In [None]:
# Preview the training frame after the `target` -> `label` rename.
pd_train.head()

In [None]:
# Keep only the model input (`text`) and its `label`, then hold out 20% of
# the rows. Despite its name, `df_test` is the held-out evaluation split of
# the training data, not the competition test set (that one is `pd_test`).
pd_train = pd_train[['text', 'label']].copy()
# A fixed random_state makes the split — and therefore the reported metrics —
# reproducible across kernel restarts.
df_train, df_test = train_test_split(pd_train, test_size=0.2, random_state=42)

In [None]:
# Wrap both pandas splits as `datasets.Dataset` objects and bundle them into
# a DatasetDict under the keys "train" and "test".
train = Dataset.from_pandas(df_train)
test = Dataset.from_pandas(df_test)

dataset = DatasetDict({"train": train, "test": test})

In [None]:
# Class-name <-> class-index mappings handed to the model when it is loaded.
label2id = {"NEGATIVE": 0, "POSITIVE": 1}
# The reverse map is derived so the two can never disagree.
id2label = {index: name for name, index in label2id.items()}

In [None]:
# Load the tokenizer for the base checkpoint named by `model`.
tokenizer = AutoTokenizer.from_pretrained(model)

In [None]:
def preprocess_function(examples):
    """Tokenize the ``text`` field of ``examples`` with truncation enabled.

    Args:
        examples: Mapping that exposes the raw strings under the key ``"text"``.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those strings.
    """
    raw_texts = examples["text"]
    return tokenizer(raw_texts, truncation=True)

In [None]:
# Apply `preprocess_function` to every split; batched=True hands it batches
# of examples rather than one example at a time.
tokenized_dataset = dataset.map(preprocess_function, batched=True)

In [None]:
# Load the base checkpoint for sequence classification with two output
# classes and attach the label maps defined earlier.
pretrained_model = AutoModelForSequenceClassification.from_pretrained(
    model,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

In [None]:
# Accuracy metric loaded through the `evaluate` library.
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: A pair ``(predictions, labels)``; ``predictions`` is reduced
            with an argmax along axis 1 to obtain predicted class ids.

    Returns:
        The result of ``accuracy.compute`` for the predicted ids and ``labels``.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [None]:
# Padding collator built from the tokenizer; handed to the Trainer below.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Fine-tuning configuration: evaluate and save a checkpoint once per epoch,
# then reload the best checkpoint when training finishes.
# The install cell upgrades transformers to the latest release, so the
# current keyword names are used instead of the deprecated ones.
training_args = TrainingArguments(
    output_dir="disaster",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    weight_decay=0.01,
    eval_strategy="epoch",  # formerly `evaluation_strategy` (deprecated since transformers 4.41)
    save_strategy="epoch",  # same cadence as evaluation, needed by load_best_model_at_end
    load_best_model_at_end=True,
    report_to="none",  # do not log to external experiment trackers
)

# `tokenized_dataset["test"]` is the held-out 20% of the training data and is
# used for per-epoch evaluation; it is not the competition test set.
trainer = Trainer(
    model=pretrained_model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    # Formerly `tokenizer=` (deprecated since transformers 4.46). Passing it
    # saves the tokenizer into each checkpoint, which the inference cell
    # relies on when it reloads the tokenizer from the checkpoint directory.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the Trainer configured above.
trainer.train()

In [None]:
# Preview the competition test frame; `.head()` keeps the cell output small
# and matches how the other frames in this notebook are inspected.
pd_test.head()

In [None]:
# Reload the fine-tuned model for inference.
# The step number in a checkpoint directory name depends on dataset size,
# batch size, epoch count and device count, so a hard-coded
# "checkpoint-1488" breaks as soon as any of those change. Prefer the
# checkpoint the Trainer recorded as best, and fall back to the original
# path only if none was recorded.
checkpoint_dir = trainer.state.best_model_checkpoint or "disaster/checkpoint-1488"

config = AutoConfig.from_pretrained(checkpoint_dir)
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
trained_model = AutoModelForSequenceClassification.from_pretrained(checkpoint_dir)

In [None]:
# Predict a class id for every tweet in the competition test set.
target = []

pd_test = pd_test[['text']].copy()
test_texts = pd_test.text.tolist()

# Tokenized inputs must live on the same device as the model's weights.
device = next(trained_model.parameters()).device
batch_size = 32

# Batches replace one forward pass per row. truncation=True mirrors the
# training-time preprocessing so an over-long input cannot exceed the model's
# maximum sequence length; padding=True lets a batch be stacked into a tensor.
for start in range(0, len(test_texts), batch_size):
    batch_texts = test_texts[start:start + batch_size]
    inps = tokenizer(
        batch_texts, truncation=True, padding=True, return_tensors="pt"
    ).to(device)
    with torch.no_grad():
        logits = trained_model(**inps).logits
    # argmax over the class dimension yields one predicted id per row.
    target.extend(logits.argmax(dim=-1).tolist())

In [None]:
# Build the submission frame: the `id` column re-read from the test CSV,
# paired with the predicted class ids collected in `target`.
pd_subm = (
    pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")[['id']]
    .copy()
    .assign(target=target)
)

In [None]:
# Write the submission file to the Kaggle working directory, without the
# DataFrame index column.
pd_subm.to_csv("/kaggle/working/submission.csv", index=False)