<a href="https://colab.research.google.com/github/Meenusj/Case_study/blob/main/distilbert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install transformers[torch]



In [3]:
!pip install accelerate -U




In [None]:
!pip install nltk
!pip install scikit-learn

In [1]:
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, classification_report
import torch
import pickle


In [2]:
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
df_balanced = pd.read_csv('/content/balanced_dataset.csv')

# Preprocess text data
stop_words = set(stopwords.words('english'))
punctuation = string.punctuation

def clean_text(text):
    text = text.translate(str.maketrans('', '', punctuation))
    text = ' '.join([word for word in text.split() if word.lower() not in stop_words])
    return text

df_balanced['text'] = df_balanced['text'].apply(clean_text)

# Split the dataset into training, validation, and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(df_balanced['text'], df_balanced['labels'], test_size=0.2, random_state=42)
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=0.1, random_state=42)

# Save the split datasets
train_df = pd.DataFrame({'text': train_texts, 'labels': train_labels})
val_df = pd.DataFrame({'text': val_texts, 'labels': val_labels})
test_df = pd.DataFrame({'text': test_texts, 'labels': test_labels})

train_df.to_csv('/content/train_dataset.csv', index=False)
val_df.to_csv('/content/val_dataset.csv', index=False)
test_df.to_csv('/content/test_dataset.csv', index=False)


In [5]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
def tokenize_data(text_list, tokenizer, max_length=128):
    return tokenizer(text_list, padding=True, truncation=True, max_length=max_length, return_tensors='pt')

train_encodings = tokenize_data(train_texts.tolist(), tokenizer)
val_encodings = tokenize_data(val_texts.tolist(), tokenizer)
test_encodings = tokenize_data(test_texts.tolist(), tokenizer)


In [7]:
class TextDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = TextDataset(train_encodings, train_labels.tolist())
val_dataset = TextDataset(val_encodings, val_labels.tolist())
test_dataset = TextDataset(test_encodings, test_labels.tolist())


In [8]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy='epoch',
    save_strategy='epoch',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)




In [9]:
trainer.train()


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss
1,0.0631,0.073526
2,0.0738,0.133127
3,0.0681,0.099743
4,0.0,0.073023
5,0.0,0.076145
6,0.0,0.074937
7,0.0,0.10053
8,0.0,0.110602
9,0.0,0.114309
10,0.0,0.113


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


TrainOutput(global_step=18030, training_loss=0.022050908206214276, metrics={'train_runtime': 2141.1583, 'train_samples_per_second': 67.351, 'train_steps_per_second': 8.421, 'total_flos': 4775780890045440.0, 'train_loss': 0.022050908206214276, 'epoch': 10.0})

In [10]:
val_results = trainer.evaluate(eval_dataset=val_dataset)
print("Validation results:", val_results)

test_results = trainer.evaluate(eval_dataset=test_dataset)
print("Test results:", test_results)


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Validation results: {'eval_loss': 0.11300011724233627, 'eval_runtime': 6.2584, 'eval_samples_per_second': 256.136, 'eval_steps_per_second': 32.117, 'epoch': 10.0}
Test results: {'eval_loss': 0.10172781348228455, 'eval_runtime': 16.0646, 'eval_samples_per_second': 249.368, 'eval_steps_per_second': 31.187, 'epoch': 10.0}


In [11]:
predictions = trainer.predict(test_dataset)
predicted_labels = predictions.predictions.argmax(-1)
test_accuracy = accuracy_score(test_labels, predicted_labels)
print("Test Accuracy:", test_accuracy)
print(classification_report(test_labels, predicted_labels, target_names=['Human', 'Bot']))


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Test Accuracy: 0.9910134797803295
              precision    recall  f1-score   support

       Human       0.99      0.99      0.99      1985
         Bot       0.99      0.99      0.99      2021

    accuracy                           0.99      4006
   macro avg       0.99      0.99      0.99      4006
weighted avg       0.99      0.99      0.99      4006



In [12]:
with open('/content/distilbert_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

with open('/content/distilbert_tokenizer.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

print("Model and tokenizer saved as pickle files.")


Model and tokenizer saved as pickle files.


In [13]:
with open('/content/distilbert_model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

with open('/content/distilbert_tokenizer.pkl', 'rb') as tokenizer_file:
    loaded_tokenizer = pickle.load(tokenizer_file)

loaded_model.eval()


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [14]:
pickle_model_path = '/content/drive/My Drive/distilbert_model.pkl'
pickle_tokenizer_path = '/content/drive/My Drive/distilbert_tokenizer.pkl'

with open(pickle_model_path, 'wb') as model_file:
    pickle.dump(model, model_file)

with open(pickle_tokenizer_path, 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

print(f"Model and tokenizer saved to Google Drive at {pickle_model_path} and {pickle_tokenizer_path}")


Model and tokenizer saved to Google Drive at /content/drive/My Drive/distilbert_model.pkl and /content/drive/My Drive/distilbert_tokenizer.pkl


In [15]:
actual_vs_predicted = pd.DataFrame({
    'text': test_texts.tolist(),
    'actual': test_labels.tolist(),
    'predicted': predicted_labels.tolist()
})

print(actual_vs_predicted)


                                                   text  actual  predicted
0     May 24 2001 25 years ago NASAs viking spacecra...       0          0
1     medieval times task cleaning battlefields typi...       1          1
2     sound travels water actually travel much farth...       1          1
3     electrol college college process electoral col...       0          0
4     Dear TEACHERNAME think let anyone C average pl...       0          0
...                                                 ...     ...        ...
4001  extracurricular Activities principles decide s...       0          0
4002  state states country election president must s...       0          0
4003  Venus intresting planet solar system soarching...       0          0
4004  time peoples first impressions someone change ...       1          1
4005  Small acts kindness incredibly profound effect...       1          1

[4006 rows x 3 columns]
