<a href="https://colab.research.google.com/github/Meenusj/Case_study/blob/main/roberta_mutation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers[torch]
!pip install accelerate -U
!pip install datasets

Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.32.1-py3-none-any.whl (314 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m314.1/314.1 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->transformers[torch])
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->transformers[torch])
  Using cached nvidia_cublas_cu

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import random
import nltk
from nltk import pos_tag, word_tokenize
from nltk.corpus import wordnet
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset, DatasetDict

# Fetch the NLTK resources used below by word_tokenize / pos_tag (plus wordnet).
# Newer NLTK releases look up 'punkt_tab' and 'averaged_perceptron_tagger_eng'
# instead of the legacy resources, so both generations are requested; an
# identifier the installed NLTK does not know is expected to be reported by
# nltk.download() without raising.
for _nltk_resource in (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'wordnet',
):
    nltk.download(_nltk_resource)

# Step 1: Read and Prepare Data
df = pd.read_csv('/content/balanced_dataset.csv')
X, y = df['text'].values, df['labels'].values

# Map the raw label values to integer class ids
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Hold out 30% of the rows, then cut that hold-out in half: validation / test
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, random_state=42, test_size=0.3
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, random_state=42, test_size=0.5
)

# Step 2: Define Mutation Functions

# Identify articles, adjectives, and adverbs
# Lower-case English articles; tokens are matched against this set via word.lower().
articles = {'a', 'an', 'the'}
# POS tags (as returned by nltk.pos_tag) that mark a token as an adjective (JJ*)
# or adverb (RB*) and therefore as a mutation target.
adjectives_adverbs_tags = {'JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS'}

# Character-Level Substitution
def substitute_random_char(word):
    """Overwrite one randomly chosen character of `word` with a random letter a-z.

    The position is drawn first, then the replacement letter, both from the
    global `random` generator. An empty `word` is returned unchanged. The
    replacement is always lower-case and may coincide with the original
    character.
    """
    if not word:
        return word
    idx = random.randint(0, len(word) - 1)
    replacement = chr(random.randint(ord('a'), ord('z')))
    return ''.join((word[:idx], replacement, word[idx + 1:]))

# Word-Level Substitution for articles, adjectives, adverbs
# Sorted rather than list(articles): set iteration order for strings can differ
# between interpreter runs (hash randomization), which would make the article
# picked for a given random draw vary from run to run.
random_articles = sorted(articles)
random_adjectives_adverbs = ["very", "quickly", "eager", "bravely", "beautiful", "happy"]

def substitute_random_word(word, tag):
    """Return a random replacement for an article, adjective or adverb.

    Args:
        word: the token text.
        tag: the POS tag assigned to the token.

    Returns:
        A random entry of `random_articles` when `word` (case-insensitively) is
        an article; otherwise a random entry of `random_adjectives_adverbs`
        when `tag` is in `adjectives_adverbs_tags`; otherwise `word` unchanged.
        The replacement does not preserve the original capitalization.
    """
    if word.lower() in articles:
        return random.choice(random_articles)
    if tag in adjectives_adverbs_tags:
        return random.choice(random_adjectives_adverbs)
    return word

# Apply Character and Word-Level Substitutions
def apply_mutations(data):
    """Return a mutated copy of every text in `data`.

    Each text is tokenized and POS-tagged. Tokens that are articles or carry an
    adjective/adverb tag are first replaced via substitute_random_word and then
    passed through substitute_random_char; every other token is kept as is.
    The tokens are re-joined with single spaces, so the original whitespace
    and punctuation spacing are not preserved.
    """
    def _mutate_token(token, tag):
        # Leave tokens alone unless they are a mutation target.
        if token.lower() not in articles and tag not in adjectives_adverbs_tags:
            return token
        return substitute_random_char(substitute_random_word(token, tag))

    return [
        ' '.join(
            _mutate_token(token, tag)
            for token, tag in pos_tag(word_tokenize(text))
        )
        for text in data
    ]

# Step 3: Apply Mutations
# Seed the global `random` generator so the sequence of random draws behind the
# mutations is repeatable across runs; the train/val/test split is already
# pinned with random_state=42.
random.seed(42)
X_train_mutated = apply_mutations(X_train)
X_val_mutated = apply_mutations(X_val)
X_test_mutated = apply_mutations(X_test)

# Step 4: Text Preprocessing (tokenization, sequence conversion)
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

def encode_data(texts, labels, max_len=100):
    """Tokenize every text into a fixed-length encoding.

    Args:
        texts: iterable of strings to encode.
        labels: stored unchanged under the 'labels' key of the result.
        max_len: length each sequence is padded / truncated to.

    Returns:
        dict with 'input_ids' and 'attention_mask' (one flattened tensor per
        text, in input order) and 'labels'.
    """
    encodings = [
        tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            truncation=True,  # anything beyond max_len tokens is cut off
            return_attention_mask=True,
            return_tensors='pt',
        )
        for text in texts
    ]
    return {
        'input_ids': [enc['input_ids'].flatten() for enc in encodings],
        'attention_mask': [enc['attention_mask'].flatten() for enc in encodings],
        'labels': labels,
    }

# Encode data
max_len = 100  # one sequence length shared by all three splits
train_encoded, val_encoded, test_encoded = (
    encode_data(split_texts, split_labels, max_len=max_len)
    for split_texts, split_labels in (
        (X_train_mutated, y_train),
        (X_val_mutated, y_val),
        (X_test_mutated, y_test),
    )
)

# Wrap each encoded split in a datasets.Dataset for the Trainer
train_dataset, val_dataset, test_dataset = (
    Dataset.from_dict(encoded)
    for encoded in (train_encoded, val_encoded, test_encoded)
)

# Step 5: Define the RoBERTa model and its Trainer
import inspect

# Sequence classifier with one output per encoded label class.
roberta_base_model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=len(label_encoder.classes_),
)

# The evaluation-schedule argument was renamed from `evaluation_strategy` to
# `eval_strategy` in newer transformers releases. Use whichever name the
# installed TrainingArguments accepts so this cell runs on either.
_eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

# Training configuration: log and evaluate every 100 steps and reload the best
# checkpoint once training finishes.
training_args = TrainingArguments(
    output_dir="./roberta_base_model",
    num_train_epochs=20,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    logging_dir="./logs",
    logging_steps=100,
    eval_steps=100,
    learning_rate=1e-4,
    load_best_model_at_end=True,
    **{_eval_strategy_arg: "steps"},
)

# Trainer reporting accuracy (argmax over the logits) on the validation split
trainer = Trainer(
    model=roberta_base_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=lambda p: {"accuracy": accuracy_score(p.label_ids, p.predictions.argmax(-1))},
)

# Train the model
trainer.train()

# Evaluate the model on the validation set
val_results = trainer.evaluate()
print("Validation Accuracy:", val_results['eval_accuracy'])
print("Validation Loss:", val_results['eval_loss'])

# Step 6: Test Evaluation
test_results = trainer.predict(test_dataset)
# Predicted class per test example, computed once and reused below.
test_predictions = test_results.predictions.argmax(-1)
test_accuracy = accuracy_score(test_results.label_ids, test_predictions)
test_loss = test_results.metrics['test_loss']
print(f'Test accuracy: {test_accuracy}')
print(f'Test loss: {test_loss}')

# Confusion Matrix
cm = confusion_matrix(test_results.label_ids, test_predictions)
print('Confusion Matrix:')
print(cm)

# Actual vs Predicted Outputs
results = pd.DataFrame({'Actual': test_results.label_ids, 'Predicted': test_predictions})
print('Actual vs Predicted:')
print(results)

# Save results to a pickle file.
# `pickle` is never imported in this notebook, so the former pickle.dump call
# raised NameError after the whole training run; DataFrame.to_pickle writes the
# same kind of file without needing that import.
results.to_pickle('model_results.pkl')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
100,0.3663,0.283036,0.911119
200,0.1706,0.16932,0.947403
300,0.1636,0.194267,0.951731
400,0.2169,0.105455,0.974035
500,0.1519,0.074944,0.977031
600,0.1229,0.258545,0.943742
700,0.1036,0.180783,0.948735
800,0.1099,0.093122,0.978362
900,0.0772,0.070817,0.982357
1000,0.0575,0.083401,0.983023


In [None]:
# Experiment 2: fine-tune RoBERTa on text augmented with FastText embeddings

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset
import fasttext.util
import nltk
from nltk import word_tokenize
from nltk.corpus import wordnet
# word_tokenize needs the Punkt sentence models. Newer NLTK releases load them
# from 'punkt_tab' rather than 'punkt', so both are requested.
nltk.download('punkt')
nltk.download('punkt_tab')

# Load FastText embeddings
fasttext.util.download_model('en', if_exists='ignore')  # download English model
ft = fasttext.load_model('cc.en.300.bin')  # load FastText model

# Step 1: Read and Prepare Data
df = pd.read_csv('/content/balanced_dataset.csv')
X, y = df['text'].values, df['labels'].values

# Turn the raw label values into integer class ids
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# 30% of the rows are held out, then divided evenly into validation and test
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, random_state=42, test_size=0.3
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, random_state=42, test_size=0.5
)

# Step 2: Text Preprocessing with FastText Embeddings
def preprocess_with_fasttext(texts):
    """Compute one averaged FastText vector per text.

    Args:
        texts: iterable of strings.

    Returns:
        list with one 1-D vector per text, in input order: the mean of the
        FastText word vectors of the text's tokens. A text that yields no
        tokens gets an all-zero vector of the model's dimension.
    """
    embeddings = []
    for text in texts:
        # Tokenize text
        tokens = word_tokenize(text)
        if tokens:
            # Average the per-token FastText vectors over the whole text
            text_embedding = np.mean(
                [ft.get_word_vector(token) for token in tokens], axis=0
            )
        else:
            # np.mean of an empty list is a scalar NaN (with a RuntimeWarning),
            # which cannot be iterated like the other vectors downstream.
            text_embedding = np.zeros(ft.get_dimension(), dtype=np.float32)
        embeddings.append(text_embedding)
    return embeddings

# Preprocess data with FastText embeddings (train, then validation, then test)
X_train_embeddings, X_val_embeddings, X_test_embeddings = (
    preprocess_with_fasttext(split_texts)
    for split_texts in (X_train, X_val, X_test)
)

# Step 3: Tokenization and Encoding with RoBERTa
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

def encode_data_with_embeddings(texts, embeddings, labels, max_len=100):
    """Tokenize each text with its embedding appended as space-separated numbers.

    Args:
        texts: iterable of strings.
        embeddings: iterable of per-text vectors, paired with `texts` by position.
        labels: stored unchanged under the 'labels' key of the result.
        max_len: length each sequence is padded / truncated to.

    Returns:
        dict with 'input_ids' and 'attention_mask' (one flattened tensor per
        text, in input order) and 'labels'.
    """
    def _encode(text, embedding):
        # Combine text with FastText embeddings, rendered as plain text.
        # NOTE(review): the stringified vector is appended before truncation to
        # max_len tokens, so whatever exceeds max_len is cut off - confirm this
        # is the intended way to feed the embedding to the model.
        embedding_text = ' '.join(map(str, embedding))
        text_with_embedding = f"{text} {embedding_text}"
        return tokenizer.encode_plus(
            text_with_embedding,
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

    encodings = [_encode(text, embedding) for text, embedding in zip(texts, embeddings)]
    return {
        'input_ids': [enc['input_ids'].flatten() for enc in encodings],
        'attention_mask': [enc['attention_mask'].flatten() for enc in encodings],
        'labels': labels,
    }

# Encode data with RoBERTa tokenizer
max_len = 100  # one sequence length shared by all three splits
train_encoded, val_encoded, test_encoded = (
    encode_data_with_embeddings(split_texts, split_embeddings, split_labels, max_len=max_len)
    for split_texts, split_embeddings, split_labels in (
        (X_train, X_train_embeddings, y_train),
        (X_val, X_val_embeddings, y_val),
        (X_test, X_test_embeddings, y_test),
    )
)

# Wrap each encoded split in a datasets.Dataset for the Trainer
train_dataset, val_dataset, test_dataset = (
    Dataset.from_dict(encoded)
    for encoded in (train_encoded, val_encoded, test_encoded)
)

# Step 4: Define the RoBERTa model and its Trainer
import inspect

# Sequence classifier with one output per encoded label class.
roberta_base_model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=len(label_encoder.classes_),
)

# The evaluation-schedule argument was renamed from `evaluation_strategy` to
# `eval_strategy` in newer transformers releases. Use whichever name the
# installed TrainingArguments accepts so this cell runs on either.
_eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

# Training configuration: log and evaluate every 100 steps and reload the best
# checkpoint once training finishes.
training_args = TrainingArguments(
    output_dir="./roberta_base_model",
    num_train_epochs=20,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    logging_dir="./logs",
    logging_steps=100,
    eval_steps=100,
    learning_rate=1e-4,
    load_best_model_at_end=True,
    **{_eval_strategy_arg: "steps"},
)

# Trainer reporting accuracy (argmax over the logits) on the validation split
trainer = Trainer(
    model=roberta_base_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=lambda p: {"accuracy": accuracy_score(p.label_ids, p.predictions.argmax(-1))},
)

# Train the model
trainer.train()

# Evaluate the model on the validation set
val_results = trainer.evaluate()
print("Validation Accuracy:", val_results['eval_accuracy'])
print("Validation Loss:", val_results['eval_loss'])

# Step 5: Test Evaluation
test_results = trainer.predict(test_dataset)
# Predicted class per test example, computed once and reused below.
test_predictions = test_results.predictions.argmax(-1)
test_accuracy = accuracy_score(test_results.label_ids, test_predictions)
test_loss = test_results.metrics['test_loss']
print(f'Test accuracy: {test_accuracy}')
print(f'Test loss: {test_loss}')

# Confusion Matrix
cm = confusion_matrix(test_results.label_ids, test_predictions)
print('Confusion Matrix:')
print(cm)

# Actual vs Predicted Outputs
results = pd.DataFrame({'Actual': test_results.label_ids, 'Predicted': test_predictions})
print('Actual vs Predicted:')
print(results)

# Save results to a pickle file.
# `pickle` is not imported by this cell, so the former pickle.dump call raised
# NameError on a fresh kernel; DataFrame.to_pickle needs no extra import.
# NOTE(review): this is the same path the mutation experiment writes to, so
# running both cells overwrites the earlier results - confirm that is intended.
results.to_pickle('model_results.pkl')
