Location Data Branch
I am changing information

In [None]:
# colab information
# from google.colab import drive

# drive.mount('/content/drive')

In [3]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from torch.utils.data import DataLoader, TensorDataset, random_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
import numpy as np


In [4]:
# train_file_path = '/content/drive/My Drive/Colab Notebooks/NLPDisaster/train.csv'
train_file_path = 'train.csv'
test_file_path = './test.csv'
# Read the CSV file into a DataFrame
df_train = pd.read_csv(train_file_path, delimiter=',')  # Use '\t' for tab-separated files
df_test = pd.read_csv(test_file_path, delimiter=',')

# Display the first few rows of the DataFrame
# df_train.head()
df_test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [5]:
# make sure the target values of 0 or 1 are ints
df_train['target'] = df_train['target'].astype(int)

# make blank locations have unknown token
df_train['location'] = df_train['location'].fillna('[UNK]')
# print the length of location
df_test['location'] = df_test['location'].fillna('[UNK]')

print(len(df_train['location']))
print(len(df_train['text']))
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

#Tokenize the text data
train_text_encodings = tokenizer(
    list(df_train['text'].values),
    add_special_tokens=True,
    max_length=128,
    padding=True,
    truncation=True,
    return_tensors='pt'
)

train_location_encodings = tokenizer(
    list(df_train['location'].values),
    add_special_tokens=True,
    max_length=32,
    padding=True,
    truncation=True,
    return_tensors='pt'
)

test_text_encodings = tokenizer(
    list(df_test['text'].values),
    add_special_tokens=True,
    max_length=128,
    padding=True,
    truncation=True,
    return_tensors='pt'
)

test_location_encodings = tokenizer(
    list(df_test['location'].values),
    add_special_tokens=True,
    max_length=32,
    padding=True,
    truncation=True,
    return_tensors='pt'
)


print("Text input_ids size:", train_text_encodings['input_ids'].size())
print("Text attention_mask size:", train_text_encodings['attention_mask'].size())
print("Location input_ids size:", train_location_encodings['input_ids'].size())
print("Location attention_mask size:", train_location_encodings['attention_mask'].size())
print("Targets size:", torch.tensor(df_train['target'].values).size())


7613
7613




Text input_ids size: torch.Size([7613, 84])
Text attention_mask size: torch.Size([7613, 84])
Location input_ids size: torch.Size([7613, 30])
Location attention_mask size: torch.Size([7613, 30])
Targets size: torch.Size([7613])


In [6]:
# input IDS are the token IDs for every token in the text
# attention mask tells the model the location of actual tokens and the pos of padding tokens (note useful)
# we have to add padding tokens to make all the inputs be the same size

train_dataset = TensorDataset(
    train_text_encodings['input_ids'],
    train_text_encodings['attention_mask'],
    train_location_encodings['input_ids'],
    train_location_encodings['attention_mask'],
    torch.tensor(df_train['target'].values)
)

test_dataset = TensorDataset(
    test_text_encodings['input_ids'],
    test_text_encodings['attention_mask'],
    test_location_encodings['input_ids'],
    test_location_encodings['attention_mask']
)

# this is a way for the data to be randomized during every epoch of training. Also makes easier to parallelize
train_data_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_data_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [11]:
import torch.nn as nn
from transformers import BertModel

class BertClassifier(nn.Module):
    def __init__(self, bert_model_name, num_labels):
        super(BertClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.classifier = nn.Linear(self.bert.config.hidden_size * 2, num_labels)  # *2 for concatenation

    def forward(self, input_ids, attention_mask, location_ids, location_attention_mask):
        text_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        location_outputs = self.bert(input_ids=location_ids, attention_mask=location_attention_mask)
        
        # Concatenate the pooled outputs
        pooled_output = torch.cat((text_outputs[1], location_outputs[1]), dim=1)
        logits = self.classifier(pooled_output)

        return logits


In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# two labels are 1 (disaster) or 0 (no disaster)
# model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# trying to init a custom bert model
model = BertClassifier(bert_model_name='bert-base-uncased', num_labels=2)
model = model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
total_steps = len(train_data_loader) * 3  # Assuming 3 epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)
loss_fn = torch.nn.CrossEntropyLoss().to(device)




In [9]:
def train_epoch(model, data_loader, optimizer, device, scheduler):
    model.train()
    losses = []
    correct_predictions = 0

    for batch in data_loader:
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        location_ids = batch[2].to(device)
        location_attention_mask = batch[3].to(device)
        targets = batch[4].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            location_ids=location_ids,
            location_attention_mask=location_attention_mask
        )
        _, preds = torch.max(outputs.logits, dim=1)
        loss = loss_fn(outputs.logits, targets)

        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())

        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    return correct_predictions.double() / len(data_loader.dataset), np.mean(losses)


In [10]:
EPOCHS = 3
for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    train_acc, train_loss = train_epoch(
        model,
        train_data_loader,
        optimizer,
        device,
        scheduler
    )

    print(f'Train loss {train_loss} accuracy {train_acc}')


Epoch 1/3
----------


TypeError: BertForSequenceClassification.forward() got an unexpected keyword argument 'location_ids'

In [10]:
#Save Model Weights
model_save_path = 'bert_disaster_tweets_WEIGHTS.pth'
torch.save(model.state_dict(), model_save_path)
print(f"Model saved to {model_save_path}")

Model saved to bert_disaster_tweets_WEIGHTS.pth


In [12]:
# Load the model on another machine
model_load_path = 'bert_disaster_tweets_WEIGHTS.pth'

# Ensure you have the same architecture and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Load the saved weights
model.load_state_dict(torch.load(model_load_path))
model = model.to(device)
model.eval()
print(f"Model loaded from {model_load_path}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded from bert_disaster_tweets_WEIGHTS.pth


In [13]:
def get_predictions(model, data_loader):
    model.eval()
    predictions = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            _, preds = torch.max(outputs.logits, dim=1)

            predictions.extend(preds)

    predictions = torch.stack(predictions).cpu()
    return predictions

In [14]:
# Get predictions for the test data
test_predictions = get_predictions(model, test_data_loader)

# Save the predictions to a CSV file
submission = pd.DataFrame({
    'id': df_test['id'],
    'target': test_predictions.numpy()
})

submission.to_csv('submission.csv', index=False)
print("Predictions saved to submission.csv")

Predictions saved to ubmission.csv
