In [1]:
import pandas as pd
data = pd.read_csv("Disaster.csv")
data.columns

Index(['Name', 'UserName', 'Timestamp', 'Verified', 'Tweets', 'Comments',
       'Retweets', 'Likes', 'Impressions', 'Tags', 'Tweet Link', 'Tweet ID',
       'Disaster'],
      dtype='object')

In [2]:
texts = data['Tweets'].tolist()
labels = data['Disaster'].tolist()

In [3]:
data.Disaster.value_counts()

Disaster
Drought       770
Wildfire      540
Earthquake    500
Floods        436
Hurricanes    178
Tornadoes     135
Name: count, dtype: int64

In [4]:
# Create a mapping dictionary for disaster types
disaster_mapping = {
    'Drought': 0,
    'Wildfire': 1,
    'Earthquake': 2,
    'Floods': 3,
    'Hurricanes': 4,
    'Tornadoes': 5
}

# Apply the mapping to the Disaster column
data['Disaster'] = data['Disaster'].map(disaster_mapping)

In [5]:
data.Disaster.value_counts()

Disaster
0    770
1    540
2    500
3    436
4    178
5    135
Name: count, dtype: int64

In [6]:
import torch
from sklearn.model_selection import train_test_split
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification

# Load the model and tokenizer
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=6)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Data Preparation: Select 1000 random samples from the dataset
data = data.sample(1000, random_state=42 )

# Split the data into train and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data['Tweets'], data['Disaster'], test_size=0.2, random_state=42)

In [8]:
# Tokenize the texts
train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True)
test_encodings = tokenizer(test_texts.tolist(), truncation=True, padding=True)

In [9]:
# Convert to torch tensors
class DisasterDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [10]:
train_dataset = DisasterDataset(train_encodings, train_labels.tolist())
test_dataset = DisasterDataset(test_encodings, test_labels.tolist())

# Create a DataLoader
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle=True)

In [11]:
# Define the optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training loop
model.train()
for epoch in range(3):  # Training for 3 epochs
    for batch in train_dataloader:
        optimizer.zero_grad()
        
        inputs = {key: val for key, val in batch.items() if key != 'labels'}
        labels = batch['labels']
        
        outputs = model(**inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        
        print(f"Epoch: {epoch}, Loss: {loss.item()}")



Epoch: 0, Loss: 1.7568838596343994
Epoch: 0, Loss: 1.7284153699874878
Epoch: 0, Loss: 1.6571383476257324
Epoch: 0, Loss: 1.6723576784133911
Epoch: 0, Loss: 1.6334164142608643
Epoch: 0, Loss: 1.5641052722930908
Epoch: 0, Loss: 1.5680629014968872
Epoch: 0, Loss: 1.4901506900787354
Epoch: 0, Loss: 1.516200065612793
Epoch: 0, Loss: 1.3928720951080322
Epoch: 0, Loss: 1.2260676622390747
Epoch: 0, Loss: 1.4737558364868164
Epoch: 0, Loss: 1.391014575958252
Epoch: 0, Loss: 1.257218360900879
Epoch: 0, Loss: 1.576425552368164
Epoch: 0, Loss: 1.099271297454834
Epoch: 0, Loss: 1.1797534227371216
Epoch: 0, Loss: 1.0894882678985596
Epoch: 0, Loss: 0.9825670123100281
Epoch: 0, Loss: 1.081568717956543
Epoch: 0, Loss: 0.7816423177719116
Epoch: 0, Loss: 0.8939167261123657
Epoch: 0, Loss: 0.9672996401786804
Epoch: 0, Loss: 0.9492265582084656
Epoch: 0, Loss: 0.611672580242157
Epoch: 0, Loss: 0.7284276485443115
Epoch: 0, Loss: 0.6656630039215088
Epoch: 0, Loss: 0.6229923963546753
Epoch: 0, Loss: 0.650982141

In [12]:
# Switch the model to evaluation mode
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [13]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Create DataLoader for the test set
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=16, shuffle=False)

# Initialize lists to store true labels and predictions
predictions, true_labels = [], []

# Evaluate the model
for batch in test_dataloader:
    inputs = {key: val for key, val in batch.items() if key != 'labels'}
    labels = batch['labels']
    
    with torch.no_grad():
        outputs = model(**inputs)
        
    logits = outputs.logits
    predictions.extend(torch.argmax(logits, dim=-1).cpu().numpy())
    true_labels.extend(labels.cpu().numpy())

In [14]:
# Calculate accuracy, precision, recall, and F1-score
accuracy = accuracy_score(true_labels, predictions)
precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predictions, average='weighted')

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")

Accuracy: 0.975
Precision: 0.976047619047619
Recall: 0.975
F1-Score: 0.9749450454511307


In [15]:
model_save = "disaster_model.pth"
torch.save(model.state_dict(), model_save)

In [16]:
tokenizer.save_pretrained("tokenizer/")

('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/vocab.txt',
 'tokenizer/added_tokens.json',
 'tokenizer/tokenizer.json')

In [17]:
# Load the model
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=6)
model.load_state_dict(torch.load(model_save))
model.eval()

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("tokenizer/")

# Example prediction
new_texts = ["The smoke from the wildfire is affecting air quality in nearby cities."]
new_encodings = tokenizer(new_texts, truncation=True, padding=True, return_tensors="pt")

with torch.no_grad():
    outputs = model(**new_encodings)
    predictions = torch.argmax(outputs.logits, dim=-1)
    print(predictions.item()) 


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load(model_save))


1


In [18]:
import numpy as np
unique, counts = np.unique(train_labels, return_counts=True)
print(dict(zip(unique, counts)))

{0: 263, 1: 164, 2: 144, 3: 138, 4: 49, 5: 42}


In [19]:
texts_to_predict = [
    "The prolonged drought is severely affecting agricultural output in the region.",
    "The earthquake caused extensive damage to buildings and infrastructure in the city.",
    "Wildfires are raging through the forest, threatening homes and wildlife.",
    "Heavy rains have caused severe flooding in the downtown area.",
    "The hurricane made landfall last night, causing widespread power outages.",
    "A series of tornadoes have torn through the region, causing widespread destruction.",
    "The hurricane's strong winds and heavy rains have led to significant damage.",
    "Emergency shelters have been set up to accommodate those displaced by the hurricane."
]

In [20]:
# Tokenize the texts
encodings = tokenizer(texts_to_predict, truncation=True, padding=True, return_tensors="pt")
model.eval()  # Switch to evaluation mode

with torch.no_grad():  # Disable gradient calculations
    outputs = model(**encodings)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)  # Get the index of the highest logit for each example

# Convert predictions to a list
predicted_labels = predictions.tolist()


In [21]:
# Reverse the disaster mapping
reverse_disaster_mapping = {v: k for k, v in disaster_mapping.items()}

# Convert numeric labels to disaster types
predicted_disasters = [reverse_disaster_mapping[label] for label in predicted_labels]

# Print predictions
for text, disaster in zip(texts_to_predict, predicted_disasters):
    print(f"Text: {text}")
    print(f"Predicted Disaster: {disaster}")
    print("-" * 50)

Text: The prolonged drought is severely affecting agricultural output in the region.
Predicted Disaster: Drought
--------------------------------------------------
Text: The earthquake caused extensive damage to buildings and infrastructure in the city.
Predicted Disaster: Earthquake
--------------------------------------------------
Text: Wildfires are raging through the forest, threatening homes and wildlife.
Predicted Disaster: Wildfire
--------------------------------------------------
Text: Heavy rains have caused severe flooding in the downtown area.
Predicted Disaster: Floods
--------------------------------------------------
Text: The hurricane made landfall last night, causing widespread power outages.
Predicted Disaster: Hurricanes
--------------------------------------------------
Text: A series of tornadoes have torn through the region, causing widespread destruction.
Predicted Disaster: Tornadoes
--------------------------------------------------
Text: The hurricane's stro