In [1]:
%%capture
!pip install evaluate

In [28]:
from datasets import Dataset, DatasetDict, concatenate_datasets
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate
from sklearn.model_selection import train_test_split
from huggingface_hub import notebook_login
from tqdm import tqdm

In [41]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [12]:
df = pd.read_csv("./data/train.csv", index_col='id')
df.rename(columns={'target': 'label'}, inplace=True)

In [13]:
columns_to_remove = ['keyword', 'location']
df.drop(columns=columns_to_remove, inplace=True)

In [14]:
df

Unnamed: 0_level_0,text,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Our Deeds are the Reason of this #earthquake M...,1
4,Forest fire near La Ronge Sask. Canada,1
5,All residents asked to 'shelter in place' are ...,1
6,"13,000 people receive #wildfires evacuation or...",1
7,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...
10869,Two giant cranes holding a bridge collapse int...,1
10870,@aria_ahrary @TheTawniest The out of control w...,1
10871,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
10872,Police investigating after an e-bike collided ...,1


In [15]:
train_df, eval_df = train_test_split(df, test_size=0.2, stratify=df.label, random_state=42)
train_df.shape, eval_df.shape

((6090, 2), (1523, 2))

In [21]:
ds_dict = {'train' : Dataset.from_pandas(train_df),
           'eval' : Dataset.from_pandas(eval_df)}
dataset = DatasetDict(ds_dict)

In [22]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'id'],
        num_rows: 6090
    })
    eval: Dataset({
        features: ['text', 'label', 'id'],
        num_rows: 1523
    })
})

In [23]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [24]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [25]:
tokenized_tweets = dataset.map(preprocess_function, batched=True)

  0%|          | 0/7 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

# Batch of Examples

In [26]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Load Evaluation Metric

In [27]:
accuracy = evaluate.load('accuracy')

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Function that computes accuracy given (prediction, label)

In [28]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [29]:
id2label = {0: "Normal", 1: "Disaster"}
label2id = {"Normal": 0, "Disaster": 1}

In [30]:
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [39]:
training_args = TrainingArguments(
                    output_dir = "disaster_tweet_distilbert",
                    learning_rate = 2e-5,
                    per_device_train_batch_size = 16,
                    per_device_eval_batch_size = 16,
                    num_train_epochs = 2,
                    weight_decay = 0.01,
                    evaluation_strategy = "epoch",
                    save_strategy = "epoch",
                    load_best_model_at_end = True,
                    push_to_hub=True,
                                )

In [42]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_tweets['train'],
    eval_dataset=tokenized_tweets['eval'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [43]:
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.409856,0.850295
2,No log,0.438578,0.840446


Checkpoint destination directory disaster_tweet_distilbert/checkpoint-191 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory disaster_tweet_distilbert/checkpoint-382 already exists and is non-empty.Saving will proceed but saved results may be invalid.


TrainOutput(global_step=382, training_loss=0.23959334608147906, metrics={'train_runtime': 68.0293, 'train_samples_per_second': 179.04, 'train_steps_per_second': 5.615, 'total_flos': 183083398662768.0, 'train_loss': 0.23959334608147906, 'epoch': 2.0})

In [44]:
trainer.push_to_hub()

CommitInfo(commit_url='https://huggingface.co/HilariusJeremy/disaster_tweet_distilbert/commit/da917dd016ab53cf945b9a2c9fb8ab8975fc558a', commit_message='End of training', commit_description='', oid='da917dd016ab53cf945b9a2c9fb8ab8975fc558a', pr_url=None, pr_revision=None, pr_num=None)

# Load The Fine-Tuned Model

In [11]:
from transformers import pipeline
classifier = pipeline(model="HilariusJeremy/disaster_tweet_distilbert")

# Inference and Submission

In [25]:
test_df = pd.read_csv("./data/test.csv")
test_df

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,,,Storm in RI worse than last hurricane. My city...
3260,10868,,,Green Line derailment in Chicago http://t.co/U...
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...


In [29]:
tqdm.pandas()
output_df = pd.DataFrame()
output_df['id'] = test_df['id']
output_df['target'] = test_df['text'].progress_apply(lambda x: int(classifier(x)[0]['score'] > 0.5))

100%|██████████| 3263/3263 [02:58<00:00, 18.24it/s]


In [31]:
csv_filename = '/kaggle/working/output.csv'
output_df.to_csv(csv_filename, index=False)