In [1]:
!pip install accelerate -U
!pip install datasets
!pip install transformers
!pip install seqeval




In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

# Load the dataset
file_path = 'IMDB Dataset.csv'  # Update the path to your dataset
df = pd.read_csv(file_path)
df['label'] = df['sentiment'].map({'positive': 1, 'negative': 0})
train_df, test_df = train_test_split(df, test_size=0.25)

# Convert DataFrame to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Initialize tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

# Tokenize the input
def tokenize(batch):
    return tokenizer(batch['review'], padding=True, truncation=True, max_length=512)

train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)




(…)cased/resolve/main/tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

(…)bert-base-uncased/resolve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

(…)base-uncased/resolve/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

(…)rt-base-uncased/resolve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/37500 [00:00<?, ? examples/s]

Map:   0%|          | 0/12500 [00:00<?, ? examples/s]

In [3]:
from transformers import TrainerCallback, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score

class CustomTrainerCallback(TrainerCallback):
    def on_epoch_end(self, args, state, control, **kwargs):
        if 'eval_loss' in state.log_history[-1]:
            print(f"Epoch {state.epoch}: Evaluation Accuracy: {state.log_history[-1]['eval_accuracy']}")

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    return {'accuracy': accuracy_score(labels, preds)}

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10,  # Running the model for 10 epochs
    per_device_train_batch_size=8,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy='epoch',  # Evaluate after each epoch
    save_strategy='epoch',        # Save after each epoch
    load_best_model_at_end=True   # Load the best model at the end
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    callbacks=[CustomTrainerCallback()]
)

# Train the model
trainer.train()


# Save the model
model.save_pretrained('./distilbert_imdb_model')

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2434,0.221745,0.92272
2,0.0967,0.282688,0.93104
3,0.1043,0.372195,0.92584
4,0.0005,0.486651,0.92056
5,0.0541,0.505049,0.92352
6,0.0804,0.556048,0.92376
7,0.0562,0.743158,0.92072
8,0.0,0.673098,0.9248
9,0.0,0.759693,0.92392
10,0.0,0.722667,0.92704


In [None]:
import torch
from transformers import DistilBertModel, DistilBertTokenizer
import matplotlib.pyplot as plt
import seaborn as sns

# Load tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('./distilbert_imdb_model', output_attentions=True)

# Function to get attention weights
def get_attention_weights(text):
    inputs = tokenizer(text, return_tensors='pt')
    outputs = model(**inputs)
    attentions = outputs[-1]  # Get the attentions from the model output
    return attentions

def visualize_attention(text):
    attentions = get_attention_weights(text)

    # Choose which layer and head to visualize, or average them
    # Example: Averaging over all heads in the first layer
    attention = attentions[0].mean(dim=1)[0].detach().numpy()  # Averaging across heads

    # Create a heatmap
    plt.figure(figsize=(10, 10))
    sns.heatmap(attention, annot=False, cmap='viridis')
    plt.title("Attention Weights")
    plt.xlabel("Tokens in Sequence")
    plt.ylabel("Tokens in Sequence")
    plt.show()

text = "movie is great."
visualize_attention(text)



