In [None]:

# Importing libraries
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification  # For tokenization and sequence classification
from sklearn.metrics import accuracy_score  # For evaluating model accuracy
import pandas as pd  # For data manipulation
import torch  # For PyTorch deep learning framework
import spacy  # For NLP tasks like lemmatization and entity recognition

In [None]:
# Selecting the compute device: use the GPU when CUDA is available, otherwise fall
# back to the CPU so the notebook still runs on machines without a GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Mounting Google Drive at /content/drive to access the dataset stored there.
# NOTE(review): google.colab is imported here, so this cell presumably only works
# inside a Colab runtime - confirm before running elsewhere.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Reading the CSV dataset from the mounted Drive folder, keeping only the two
# columns used below: 'text' (the model input) and 'target' (the class label)
df = pd.read_csv("/content/drive/MyDrive/ML_Project/tweets.csv", usecols=['text', 'target'])

In [None]:
# Loading spaCy's 'en_core_web_sm' English pipeline; `nlp` is used below both for
# lemmatizing the text column and for extracting named entities
nlp = spacy.load('en_core_web_sm')

In [None]:
# Normalizing the text column: every row is run through spaCy and rebuilt from the
# lemmas of its alphabetic, non-stop-word tokens, joined by single spaces
def _lemmatize_text(text):
    """Return the space-joined lemmas of the alphabetic, non-stop-word tokens in `text`."""
    kept_lemmas = []
    for token in nlp(text):
        if token.is_alpha and not token.is_stop:
            kept_lemmas.append(token.lemma_)
    return ' '.join(kept_lemmas)


df['text'] = df['text'].apply(_lemmatize_text)

In [None]:
# Initializing the DistilBERT tokenizer and the sequence-classification model, both
# from the 'distilbert-base-uncased' checkpoint.
# NOTE(review): no num_labels is passed, so the classification head uses the library
# default - presumably 2 classes, matching a binary 'target'; TODO confirm.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')

In [None]:
# Moving the model to `device` so its parameters sit on the same device as the
# input tensors that the cells below send there
model = model.to(device)

In [None]:
# Encoding every row of the text column in a single tokenizer call: sequences are
# truncated at 128 tokens, padded to a common length, and returned as PyTorch tensors
inputs = tokenizer(
    df['text'].tolist(),
    max_length=128,
    truncation=True,
    padding=True,
    return_tensors='pt',
)

In [None]:
# Converting the 'target' column to a tensor of labels, one entry per row of df
labels = torch.tensor(df['target'].tolist())

# Initializing AdamW over all model parameters with a learning rate of 1e-5
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

In [None]:
# Fine-tuning DistilBERT on the dataset, one sample per optimizer step.
# from_pretrained() hands the model back in evaluation mode (dropout disabled), so
# it has to be switched to training mode explicitly before fine-tuning.
model.train()
num_samples = len(inputs['input_ids'])  # Loop-invariant, computed once
for epoch in range(1):  # Running for one epoch
    for i in range(num_samples):
        # Slicing with i:i+1 (instead of indexing) keeps the batch dimension
        input_id = inputs['input_ids'][i:i + 1].to(device)
        attention_mask = inputs['attention_mask'][i:i + 1].to(device)
        label = labels[i:i + 1].to(device)

        # Zeroing gradients left over from the previous step
        optimizer.zero_grad()

        # Forward pass; passing `labels` makes the model return the loss as well
        outputs = model(
            input_ids=input_id,
            attention_mask=attention_mask,
            labels=label
        )
        loss = outputs.loss
        loss.backward()  # Backpropagation
        optimizer.step()  # Update model parameters

        # Logging progress every 100 samples
        if (i + 1) % 100 == 0:
            print(f'Processed {i+1} out of {num_samples}')

In [None]:
# Setting the model to evaluation mode (dropout disabled) before the inference
# cells below
model.eval()

In [None]:
# Performing inference on the dataset to make predictions.
# `inputs` was tokenized with padding into rectangular tensors, so samples can be
# scored in mini-batches instead of one forward pass per sample.
EVAL_BATCH_SIZE = 64  # Samples per forward pass; lower it if the device runs out of memory

model.eval()  # Idempotent; keeps this cell correct even when run straight after training
predictions = []
num_samples = len(inputs['input_ids'])
with torch.no_grad():  # Gradients are not needed for inference
    for start in range(0, num_samples, EVAL_BATCH_SIZE):
        end = start + EVAL_BATCH_SIZE
        batch_ids = inputs['input_ids'][start:end].to(device)
        batch_mask = inputs['attention_mask'][start:end].to(device)
        logits = model(input_ids=batch_ids, attention_mask=batch_mask).logits
        # The predicted class of each sample is the index of its largest logit
        predictions.extend(torch.argmax(logits, dim=-1).tolist())

In [None]:
# Calculating the accuracy of the predictions against the true labels.
# NOTE(review): `predictions` were produced from the same `inputs` the model was
# fine-tuned on, so this is accuracy on the training samples, not on held-out data.
accuracy = accuracy_score(labels.tolist(), predictions)
print(f'Accuracy: {accuracy}')

In [None]:
# Testing a single sentence
# NOTE(review): the training text was lemmatized and stripped of stop words, while
# this sentence is tokenized raw - confirm whether the same preprocessing should apply.
test_sentence = "There is a cyclone in Florida"
test_input = tokenizer(
    test_sentence,
    return_tensors='pt',
    truncation=True,
    padding=True,
    max_length=128  # Same cap as used when tokenizing the training data
)
test_input = {k: v.to(device) for k, v in test_input.items()}  # Moving data to the model's device
with torch.no_grad():  # Inference only; avoids building an autograd graph
    test_output = model(**test_input)
test_prediction = torch.argmax(test_output.logits, dim=-1)  # Index of the largest logit
print(f'Test sentence: \"{test_sentence}\" is {"a disaster" if test_prediction.item() else "not a disaster"}')


Test sentence: "There is a cyclone in Florida" is a disaster


In [None]:
# Saving the fine-tuned model.
# The pickle file is still written so anything that already loads mlmodel.pkl keeps
# working. A pickle of the whole module, however, is tied to the class definitions,
# library versions and tensor device in use when it was written.
import pickle
with open('/content/drive/MyDrive/ML_Project/mlmodel.pkl', 'wb') as f:
    pickle.dump(model, f)

# Also saving the model and tokenizer in the Hugging Face format, which can be
# reloaded later with from_pretrained() on this directory
model.save_pretrained('/content/drive/MyDrive/ML_Project/distilbert_disaster')
tokenizer.save_pretrained('/content/drive/MyDrive/ML_Project/distilbert_disaster')

In [None]:
# Running spaCy NER over a test sentence and collecting the text of every entity
# labeled GPE (geo-political entity) as a disaster location
test_sentence = "There is a cyclone in Florida"
doc = nlp(test_sentence)
locations = []
for ent in doc.ents:
    if ent.label_ == 'GPE':
        locations.append(ent.text)
print("Disaster Locations:", locations)

Disaster Locations: ['Florida']


In [None]:
# Testing another sentence
# NOTE(review): the training text was lemmatized and stripped of stop words, while
# this sentence is tokenized raw - confirm whether the same preprocessing should apply.
test_sentence = "My life is a cyclone"
test_input = tokenizer(
    test_sentence,
    return_tensors='pt',
    truncation=True,
    padding=True,
    max_length=128  # Same cap as used when tokenizing the training data
)
test_input = {k: v.to(device) for k, v in test_input.items()}  # Moving data to the model's device
with torch.no_grad():  # Inference only; avoids building an autograd graph
    test_output = model(**test_input)
test_prediction = torch.argmax(test_output.logits, dim=-1)  # Index of the largest logit
print(f'Test sentence: \"{test_sentence}\" is {"a disaster" if test_prediction.item() else "not a disaster"}')


Test sentence: "My life is a cyclone" is not a disaster
