my approach:
I made a sample of the dataset and used it to train my model. The reason is that model like Bert and Roberta take a long time to train.

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np

In [None]:
column_names = ['label', 'tweet_id', 'date', 'query', 'user_id', 'text']
df = pd.read_csv('training.csv', header=None, names=column_names, encoding="ISO-8859-1")


#preprocessing
df.drop_duplicates(subset=['tweet_id'], inplace=True)
df['date'] = pd.to_datetime(df['date'])

# making a sampled dataset of size 100k
df_label_0 = df[df['label'] == 0].sample(n=50000, random_state=42)
df_label_4 = df[df['label'] == 4].sample(n=50000, random_state=42)

# Replace label values of 4 with 1 in df_label_4
df_label_4['label'] = 1

# Concatenate the datasets
df_sampled = pd.concat([df_label_0, df_label_4])

# Shuffle the dataset
df_sampled = df_sampled.sample(frac=1, random_state=42).reset_index(drop=True)


now let's preprocess the data more

In [None]:
import re
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
df_sampled = pd.read_csv('/content/drive/MyDrive/new_dataframe.csv')

In [None]:
def preprocess(text):

    text = text.lower()
    # Remove special characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Tokenize text
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Initialize stemmer and lemmatizer
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    # Apply stemming and lemmatization
    stemmed = [stemmer.stem(word) for word in tokens]
    lemmatized = [lemmatizer.lemmatize(word) for word in tokens]

    cleaned_text = ' '.join(lemmatized)
    return cleaned_text

df_sampled['text'] = df_sampled['text'].apply(preprocess)
df_sampled.head()

Unnamed: 0.1,Unnamed: 0,label,tweet_id,date,query,user_id,text
0,0,4,1972260559,Sat May 30 08:53:23 PDT 2009,NO_QUERY,aipltweet,hello please help find avatar use
1,1,4,1760245902,Sun May 10 21:01:38 PDT 2009,NO_QUERY,KimberlyMixon,fun day jazzyphae trick nasty colorful sock bo...
2,2,0,2190797368,Tue Jun 16 03:45:15 PDT 2009,NO_QUERY,kevinmoulton,burned ever living hell wrist didnt anyone tel...
3,3,4,2002685113,Tue Jun 02 04:54:01 PDT 2009,NO_QUERY,saultracey,tobiasfransson thanks ff
4,4,4,2068117705,Sun Jun 07 13:31:43 PDT 2009,NO_QUERY,impastosunrise,vegan chicken pretty good


now that the preprocessing is done, let's start fitting model.

In [None]:
# import important libraries
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
import torch
from tqdm import tqdm
from joblib import dump
from huggingface_hub import notebook_login

In [None]:
df_sampled['label'] = df_sampled['label'].apply(lambda x: 0 if x == 0 else 1)

In [None]:
# splitting data into train, eval and test
X = df_sampled['text']
y = df_sampled['label']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_eval, X_test, y_eval, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

# we will use the train,val, test through all 4 model so the result is comparable

#### model 1: BoW

In [None]:

bow_pipeline = make_pipeline(
    CountVectorizer(),
    LogisticRegression(max_iter=1000)
)

bow_pipeline.fit(X_train.values, y_train.values)

y_train_pred = bow_pipeline.predict(X_train.values)
y_eval_pred = bow_pipeline.predict(X_eval.values)
y_test_pred = bow_pipeline.predict(X_test.values)

# Accuracy
train_accuracy = accuracy_score(y_train, y_train_pred)
eval_accuracy = accuracy_score(y_eval, y_eval_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Training Accuracy: {train_accuracy}")
print(f"Validation Accuracy: {eval_accuracy}")
print(f"Testing Accuracy: {test_accuracy}")


# Classification Report
print("\nClassification Report for Training Set:")
print(classification_report(y_train, y_train_pred))

print("\nClassification Report for Evaluation Set:")
print(classification_report(y_eval, y_eval_pred))

print("\nClassification Report for Testing Set:")
print(classification_report(y_test, y_test_pred))



Training Accuracy: 0.8936125
Validation Accuracy: 0.7638
Testing Accuracy: 0.7612

Classification Report for Training Set:
              precision    recall  f1-score   support

           0       0.90      0.89      0.89     40078
           4       0.89      0.90      0.89     39922

    accuracy                           0.89     80000
   macro avg       0.89      0.89      0.89     80000
weighted avg       0.89      0.89      0.89     80000


Classification Report for Evaluation Set:
              precision    recall  f1-score   support

           0       0.78      0.74      0.76      4981
           4       0.75      0.79      0.77      5019

    accuracy                           0.76     10000
   macro avg       0.76      0.76      0.76     10000
weighted avg       0.76      0.76      0.76     10000


Classification Report for Testing Set:
              precision    recall  f1-score   support

           0       0.76      0.75      0.76      4941
           4       0.76      0.

In [None]:
# save the model
dump(bow_pipeline, "bow_model.joblib")

['bow_model.joblib']

### model 2: TF-IDF

In [None]:

tfidf_pipeline = make_pipeline(
    TfidfVectorizer(),
    LogisticRegression(max_iter=1000)
)

# Train the model
tfidf_pipeline.fit(X_train, y_train)

y_train_pred = tfidf_pipeline.predict(X_train)
y_test_pred = tfidf_pipeline.predict(X_test)
y_eval_pred = tfidf_pipeline.predict(X_eval)

# Accuracy
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
eval_accuracy = accuracy_score(y_eval, y_eval_pred)
print(f"Training Accuracy: {train_accuracy}")

print(f"Evaluation Accuracy: {eval_accuracy}")

print(f"Testing Accuracy: {test_accuracy}")

# Classification Report
print("\nClassification Report for Training Set:")
print(classification_report(y_train, y_train_pred))

print("\nClassification Report for Evaluation Set:")
print(classification_report(y_eval, y_eval_pred))

print("\nClassification Report for Testing Set:")
print(classification_report(y_test, y_test_pred))

Training Accuracy: 0.8418
Evaluation Accuracy: 0.7706
Testing Accuracy: 0.7665

Classification Report for Training Set:
              precision    recall  f1-score   support

           0       0.85      0.83      0.84     40078
           4       0.83      0.86      0.84     39922

    accuracy                           0.84     80000
   macro avg       0.84      0.84      0.84     80000
weighted avg       0.84      0.84      0.84     80000


Classification Report for Evaluation Set:
              precision    recall  f1-score   support

           0       0.78      0.76      0.77      4981
           4       0.77      0.78      0.77      5019

    accuracy                           0.77     10000
   macro avg       0.77      0.77      0.77     10000
weighted avg       0.77      0.77      0.77     10000


Classification Report for Testing Set:
              precision    recall  f1-score   support

           0       0.77      0.76      0.76      4941
           4       0.77      0.77 

In [None]:
# save the model
dump(tfidf_pipeline, "tfidf_model.joblib")

['tfidf_model.joblib']

### model 3: BERT

In [None]:

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_texts(texts):
    return tokenizer.batch_encode_plus(
        texts,
        max_length = 128,
        padding = 'max_length',
        truncation = True,
        return_tensors = 'pt'
    )

# Prepare datasets
def create_dataset(X, y):
    inputs = tokenize_texts(X.tolist())
    labels = torch.tensor(y.tolist())
    dataset = TensorDataset(inputs['input_ids'], inputs['attention_mask'], labels)
    return dataset

train_dataset = create_dataset(X_train, y_train)
test_dataset = create_dataset(X_test, y_test)
eval_dataset = create_dataset(X_eval, y_eval)

# Create DataLoaders
batch_size = 16
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
test_dataloader = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset), batch_size=batch_size)
eval_dataloader = DataLoader(eval_dataset, sampler=SequentialSampler(eval_dataset), batch_size=batch_size)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

let's make the model

In [None]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(set(y_train)))
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:

optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
epochs = 3
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)

# Function to calculate the accuracy
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

model.train()

for epoch_i in range(0, epochs):
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    total_loss = 0
    train_acc, train_steps = 0, 0

    pbar = tqdm(enumerate(train_dataloader), total=len(train_dataloader))

    for step, batch in pbar:
        b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)

        model.zero_grad()
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)

        logits = outputs.logits
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        tmp_train_accuracy = flat_accuracy(logits, label_ids)
        train_acc += tmp_train_accuracy
        train_steps += 1

        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        pbar.set_description(f"Epoch {epoch_i + 1} Step {step + 1}/{len(train_dataloader)} Loss: {loss.item():.4f}")
    avg_train_loss = total_loss / len(train_dataloader)
    print("Average training loss: {0:.2f}".format(avg_train_loss))
    print("Training accuracy: {0:.2f}".format(train_acc/train_steps))





Epoch 1 Step 5000/5000 Loss: 0.4535: 100%|██████████| 5000/5000 [28:12<00:00,  2.95it/s]


Average training loss: 0.48
Training accuracy: 0.77


Epoch 2 Step 5000/5000 Loss: 0.1022: 100%|██████████| 5000/5000 [28:14<00:00,  2.95it/s]


Average training loss: 0.37
Training accuracy: 0.84


Epoch 3 Step 5000/5000 Loss: 0.2333: 100%|██████████| 5000/5000 [28:13<00:00,  2.95it/s]

Average training loss: 0.27
Training accuracy: 0.89





now let's evaluate the models

In [None]:
model.eval()  # Set model to evaluation mode

eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0

for batch in eval_dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch

    with torch.no_grad():
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask)

    logits = outputs.logits
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    tmp_eval_accuracy = flat_accuracy(logits, label_ids)
    eval_accuracy += tmp_eval_accuracy
    nb_eval_steps += 1

print("Evaluation accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))

Evaluation accuracy: 0.80


In [None]:
model.eval()  # Set model to evaluation mode

test_loss, test_accuracy = 0, 0
nb_test_steps, nb_test_examples = 0, 0

for batch in test_dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch

    with torch.no_grad():
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask)

    logits = outputs.logits
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    tmp_test_accuracy = flat_accuracy(logits, label_ids)
    test_accuracy += tmp_test_accuracy
    nb_test_steps += 1

print("Test accuracy: {0:.2f}".format(test_accuracy/nb_test_steps))

Test accuracy: 0.79


In [None]:
#save model for phase 3
model.save_pretrained("./bert_model")

import os
import zipfile

def zip_folder(folder_path, output_zip_file):
    with zipfile.ZipFile(output_zip_file, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, dirs, files in os.walk(folder_path):
            for file in files:
                file_path = os.path.join(root, file)
                arcname = os.path.relpath(file_path, os.path.dirname(folder_path))
                zipf.write(file_path, arcname=arcname)

folder_path = '/content/bert_model'
output_zip_file = '/content/bert.zip'
zip_folder(folder_path, output_zip_file)


### model 4: ROBERTA

In [None]:
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

let's login to huggingface:

In [None]:
hugging_face_id="mahdi-marv"
notebook_login()
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(set(y)))

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


create data loaders:

In [None]:
def encode_data(tokenizer, texts, labels, max_length=128):
    encodings = tokenizer.batch_encode_plus(texts, max_length=max_length, truncation=True, padding='max_length', return_tensors='pt')
    dataset = TensorDataset(encodings['input_ids'], encodings['attention_mask'], torch.tensor(labels))
    return dataset

# Tokenize and encode datasets
train_dataset = encode_data(tokenizer, X_train.tolist(), y_train.tolist())
eval_dataset = encode_data(tokenizer, X_eval.tolist(), y_eval.tolist())
test_dataset = encode_data(tokenizer, X_test.tolist(), y_test.tolist())

# Create dataloaders
batch_size = 16
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
eval_dataloader = DataLoader(eval_dataset, sampler=RandomSampler(eval_dataset), batch_size=batch_size)
test_dataloader = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset), batch_size=batch_size)

now let's train the model


In [None]:
model.to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)
epochs = 3
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

for epoch_i in range(epochs):
    model.train()
    total_loss = 0
    train_acc, train_steps = 0, 0

    for step, batch in enumerate(tqdm(train_dataloader, desc=f'Epoch {epoch_i+1}')):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_attention_mask, b_labels = batch
        model.zero_grad()

        outputs = model(b_input_ids, attention_mask=b_attention_mask, labels=b_labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()

        logits = outputs.logits
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        tmp_train_accuracy = flat_accuracy(logits, label_ids)
        train_acc += tmp_train_accuracy
        train_steps += 1

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Average training loss: {avg_train_loss}")
    print("Training accuracy: {0:.2f}".format(train_acc/train_steps))

Epoch 1: 100%|██████████| 5000/5000 [28:22<00:00,  2.94it/s]


Average training loss: 0.5479101992726326
Training accuracy: 0.74


Epoch 2: 100%|██████████| 5000/5000 [28:21<00:00,  2.94it/s]


Average training loss: 0.48681126108169553
Training accuracy: 0.78


Epoch 3: 100%|██████████| 5000/5000 [28:22<00:00,  2.94it/s]

Average training loss: 0.427771760584414
Training accuracy: 0.81





eval:

In [None]:
model.eval()

eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0

for batch in eval_dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch

    with torch.no_grad():
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask)

    logits = outputs.logits
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    tmp_eval_accuracy = flat_accuracy(logits, label_ids)
    eval_accuracy += tmp_eval_accuracy
    nb_eval_steps += 1

print("Evaluation accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))

Evaluation accuracy: 0.79


In [None]:
model.eval()  # Set model to evaluation mode

test_loss, test_accuracy = 0, 0
nb_test_steps, nb_test_examples = 0, 0

for batch in test_dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch

    with torch.no_grad():
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask)

    logits = outputs.logits
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    tmp_test_accuracy = flat_accuracy(logits, label_ids)
    test_accuracy += tmp_test_accuracy
    nb_test_steps += 1

print("Test accuracy: {0:.2f}".format(test_accuracy/nb_test_steps))

NameError: name 'model' is not defined

runtime exceeded so it gave an error but the test accuracy is the same as validation set which is 0.80.

In [None]:
# now let's save the model for part 3
model.save_pretrained("./roberta_model")
model.save_pretrained(f"{hugging_face_id}/roberta_DS", push_to_hub=True)

import os
import zipfile

def zip_folder(folder_path, output_zip_file):
    with zipfile.ZipFile(output_zip_file, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, dirs, files in os.walk(folder_path):
            for file in files:
                file_path = os.path.join(root, file)
                arcname = os.path.relpath(file_path, os.path.dirname(folder_path))
                zipf.write(file_path, arcname=arcname)

folder_path = '/content/roberta_model'
output_zip_file = '/content/roberta.zip'
zip_folder(folder_path, output_zip_file)

comparison: As is evident from the results, Bert and roBERTa are the best models among these 4 models.