In [1]:
import warnings
# Silence every warning for the rest of the session. Note that this also hides
# deprecation notices emitted by the libraries used further down.
warnings.filterwarnings("ignore")
import pandas as pd

In [2]:
# Read only the tweet text and its label column from the hosted CSV file.
DATA_URL = "https://raw.githubusercontent.com/laxmimerit/All-CSV-ML-Data-Files-Download/master/twitter_disaster_tweets.csv"
COLUMNS = ['text', 'target']

df = pd.read_csv(DATA_URL, usecols=COLUMNS)

df.head()

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
# Mean of the `target` column. If the labels are 0/1 (as the preview above
# suggests — confirm), this is the share of rows labelled 1.
df['target'].mean()

np.float64(0.4296597924602653)

# From Hugging Face

In [4]:
# Rename `target` to `label` — presumably the column name the Trainer-based
# training further down expects; verify against the transformers docs.
df = df.rename(columns={'target': 'label'})

In [5]:
from datasets import Dataset

# Wrap the pandas frame in a Hugging Face Dataset.
dataset = Dataset.from_pandas(df)

# Hold out 20% of the rows for evaluation. The seed is fixed so the split
# (and therefore every metric reported later) is the same on each re-run.
dataset = dataset.train_test_split(test_size=0.2, seed=42)

# Peek at one training example to confirm the columns.
dataset['train'][0]

{'text': '@steveycheese99 @MapMyRun where you being electrocuted all the way round? The map sure looks like it.',
 'label': 0}

# Tokenization

In [6]:
from transformers import AutoTokenizer
import torch

# Checkpoint name shared by the tokenizer here and the model loaded later.
model_ckpt = 'huawei-noah/TinyBERT_General_4L_312D'

# Use the GPU when torch can see one, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = AutoTokenizer.from_pretrained(model_ckpt, use_fast=True)

In [7]:
def tokenize(batch):
    """Encode the ``text`` entry of a batch with the notebook-level tokenizer.

    The tokenizer is called with padding enabled, truncation enabled and a
    maximum length of 100.

    Args:
        batch: Mapping with a ``'text'`` entry holding the strings to encode.

    Returns:
        Whatever the tokenizer call returns for that batch.
    """
    encode_options = {'padding': True, 'truncation': True, 'max_length': 100}
    return tokenizer(batch['text'], **encode_options)

# batched=True hands `tokenize` lists of examples rather than single rows.
# With batch_size=None each split is presumably processed as one batch, which
# would make the padding length uniform within a split — confirm against the
# datasets documentation.
dataset = dataset.map(tokenize, batched=True, batch_size=None)

Map: 100%|██████████| 6090/6090 [00:00<00:00, 21151.01 examples/s]
Map: 100%|██████████| 1523/1523 [00:00<00:00, 22847.15 examples/s]


# Building Model

In [8]:
import evaluate
import numpy as np

# Load the "accuracy" metric object; compute_metrics below calls its .compute().
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score a batch of model outputs with the notebook-level accuracy metric.

    Args:
        eval_pred: Pair ``(scores, labels)``; ``scores`` holds one row of
            per-class scores per example.

    Returns:
        The result of ``accuracy.compute`` for the predicted class ids.
    """
    logits, references = eval_pred
    # The predicted class is the column with the highest score in each row.
    predicted_ids = np.argmax(logits, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [9]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Class-name <-> class-index mappings, so predictions carry readable labels.
label2id = {'not disaster': 0, 'disaster': 1}
id2label = {index: name for name, index in label2id.items()}

# Load the checkpoint with a classification head sized to the label set.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=len(label2id),
    label2id=label2id,
    id2label=id2label,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Training configuration: 5 epochs, batches of 32, evaluation after each epoch.
args = TrainingArguments(
    output_dir='train_dir',
    overwrite_output_dir=True,
    num_train_epochs=5,
    learning_rate=1e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    eval_strategy='epoch'
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    compute_metrics=compute_metrics,
    # `tokenizer=` is deprecated on Trainer in recent transformers releases;
    # `processing_class=` is the replacement and takes the same tokenizer.
    processing_class=tokenizer
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.522988,0.77019
2,No log,0.474471,0.793828
3,0.527700,0.454917,0.807617
4,0.527700,0.444244,0.806303
5,0.527700,0.443287,0.80893


TrainOutput(global_step=955, training_loss=0.4820806213698462, metrics={'train_runtime': 411.8957, 'train_samples_per_second': 73.926, 'train_steps_per_second': 2.319, 'total_flos': 69927811858800.0, 'train_loss': 0.4820806213698462, 'epoch': 5.0})

In [11]:
# Evaluate the trained model on the eval_dataset passed to the Trainer
# (dataset['test']) and display the resulting metrics.
trainer.evaluate()

{'eval_loss': 0.4432869851589203,
 'eval_accuracy': 0.8089297439264609,
 'eval_runtime': 3.0632,
 'eval_samples_per_second': 497.185,
 'eval_steps_per_second': 15.67,
 'epoch': 5.0}

# Saving the Model

In [12]:
# Write the fine-tuned model to a local directory; the inference pipeline and
# the S3 upload further down both read from this path.
trainer.save_model('tinybert-disaster-tweet')

In [None]:
from transformers import pipeline
import torch

# A handful of hand-written sentences for a quick sanity check.
data = [
    'There is a fire in the building',
    'I am happy today',
    'I am sad today',
    'I am not feeling well',
    'There is a flood in the city, go to higher ground',
]

# Use the GPU when torch can see one, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Build the classifier from the locally saved model directory.
classifier = pipeline('text-classification', model='tinybert-disaster-tweet', device=device)

classifier(data)

Device set to use cpu


[{'label': 'disaster', 'score': 0.8440995216369629},
 {'label': 'not disaster', 'score': 0.8731791973114014},
 {'label': 'not disaster', 'score': 0.879679799079895},
 {'label': 'not disaster', 'score': 0.8797347545623779},
 {'label': 'disaster', 'score': 0.8601070642471313}]

# Uploading to S3

In [14]:
import os
import boto3

# No credentials are passed here, so boto3 has to find them in its own
# configuration (environment, shared config file, or an attached role).
s3 = boto3.client('s3')
# Destination bucket used by upload_directory below.
bucket_name = 'magnolima-mlops-2025'

def upload_directory(directory_path, s3_prefix, bucket=None, client=None):
    """Upload every file under ``directory_path`` to S3 beneath ``s3_prefix``.

    The directory tree is walked recursively and each file is uploaded under
    ``s3_prefix`` plus its path relative to ``directory_path``, using forward
    slashes in the object key.

    Args:
        directory_path: Local directory whose contents are uploaded.
        s3_prefix: Key prefix the relative file paths are appended to.
        bucket: Destination bucket; defaults to the notebook-level
            ``bucket_name``.
        client: Object exposing ``upload_file(filename, bucket, key)``;
            defaults to the notebook-level ``s3`` client.

    Raises:
        FileNotFoundError: If ``directory_path`` is not an existing directory.
    """
    # os.walk yields nothing for a missing path, which would otherwise make a
    # mistyped directory look like a successful (but empty) upload.
    if not os.path.isdir(directory_path):
        raise FileNotFoundError(f"Not a directory: {directory_path!r}")

    target_bucket = bucket_name if bucket is None else bucket
    s3_client = s3 if client is None else client

    for root, dirs, files in os.walk(directory_path):
        # Sort in place so the traversal (and upload) order is deterministic.
        dirs.sort()
        for file in sorted(files):
            file_path = os.path.join(root, file).replace("\\", "/")
            relpath = os.path.relpath(file_path, directory_path)
            # S3 keys always use "/" regardless of the local path separator.
            s3_key = os.path.join(s3_prefix, relpath).replace("\\", "/")

            s3_client.upload_file(file_path, target_bucket, s3_key)


# Push the saved model directory to the bucket under the ml-models/ prefix.
upload_directory('tinybert-disaster-tweet', 'ml-models/tinybert-disaster-tweet')