In [1]:
import numpy as np
import pandas as pd
import os
import torch
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, AdamW, AutoModel, AutoConfig, AutoTokenizer, AutoModelForSequenceClassification, get_scheduler

In [2]:
df = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
print(df.shape)
df.head()

(7613, 5)


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
texts = df['text'].values
labels = df['target'].values

train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.01)

In [4]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", model_max_length=50)
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True)
val_encodings = tokenizer(list(val_texts), truncation=True, padding=True)

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

In [5]:
class DisasterAnalysisDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item
    
train_dataset = DisasterAnalysisDataset(train_encodings, train_labels)
val_dataset = DisasterAnalysisDataset(val_encodings, val_labels)

In [6]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

torch.cuda.empty_cache()

model = AutoModelForSequenceClassification.from_pretrained('bert-base-cased', num_labels=2)
model = model.to(device=device)

#model.train()

Downloading:   0%|          | 0.00/416M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [7]:
train_dataloader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
optim = AdamW(model.parameters(),lr=5e-5)
lr_scheduler = get_scheduler("linear",optimizer=optim,num_warmup_steps=0,num_training_steps=8 * len(train_dataloader))



In [8]:
for epoch in range(5):
    for batch in tqdm(train_dataloader):
        optim.zero_grad()
        input_ids= batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        loss.backward()
        optim.step()
    print(f"Loss for epoch {epoch + 1} is {loss}")

#model.eval()

100%|██████████| 471/471 [00:48<00:00,  9.77it/s]


Loss for epoch 1 is 0.1970321089029312


100%|██████████| 471/471 [00:47<00:00,  9.95it/s]


Loss for epoch 2 is 0.39129799604415894


100%|██████████| 471/471 [00:47<00:00,  9.94it/s]


Loss for epoch 3 is 0.26869767904281616


100%|██████████| 471/471 [00:47<00:00,  9.94it/s]


Loss for epoch 4 is 0.04912209138274193


100%|██████████| 471/471 [00:47<00:00,  9.90it/s]

Loss for epoch 5 is 0.015449672937393188





In [9]:
save_directory = '/kaggle/working/'
tokenizer.save_pretrained(save_directory)
model.save_pretrained(save_directory)
model = model.to(device='cpu')

# load the model on CPU 
#model = AutoModel.from_pretrained(save_directory).to(device=device)

In [10]:
from transformers import pipeline
classifier =pipeline('text-classification', model=model, tokenizer=tokenizer)

In [11]:
test_df = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [12]:
# pred = classifier(test_df.text[0])[0]
# pred = pred['score']
# pred = [1 if pred>0.5 else 0]
# pred = pred[0]
# pred

In [13]:
preds = []
ids = []
for index, rows in tqdm(test_df.iterrows()):
    pred_dict = classifier(rows['text'])[0]
    score = pred_dict['score']
    pred = [1 if score>0.5 else 0]
    pred = pred[0]
    preds.append(pred)
    ids.append(rows['id'])

18it [00:01,  8.27it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (79 > 50). Running this sequence through the model will result in indexing errors
3263it [06:51,  7.93it/s]


In [14]:
submission = pd.DataFrame()
submission['id'] = ids
submission['target'] = preds
submission.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1


In [15]:
submission.to_csv('submission.csv',index=False)