# Requirements and Initial Setup


In [1]:
# Install required libraries
!pip install transformers torch scikit-learn pandas numpy matplotlib seaborn
!pip install datasets accelerate ipywidgets

# Import libraries
import pandas as pd
import numpy as np
import torch
from transformers import (
    BertTokenizer, BertForSequenceClassification,
    TrainingArguments, Trainer, DataCollatorWithPadding
)
from datasets import Dataset
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

print("All libraries loaded successfully")


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curan

2025-07-20 12:01:06.519741: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753012866.720459      18 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753012866.780266      18 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


All libraries loaded successfully


# Load and Analyze Data


In [2]:
# Read data from CSV file
df = pd.read_csv('/kaggle/input/homonym-sentiment-dataset/homonym_sentiment_dataset.csv')

print(f" Dataset Statistics:")
print(f"Number of samples: {len(df)}")
print(f"Number of different words: {df['homonym_word'].nunique()}")
print(f"Available words: {df['homonym_word'].nunique()}")

# Prepare data for training
sentences = []
labels = []
homonym_words = []

# Add negative sentences
for _, row in df.iterrows():
    sentences.append(row['sentence1'])
    labels.append(0)  # negative = 0
    homonym_words.append(row['homonym_word'])

# Add positive sentences
for _, row in df.iterrows():
    sentences.append(row['sentence2'])
    labels.append(1)  # positive = 1
    homonym_words.append(row['homonym_word'])

# Create training DataFrame
training_data = pd.DataFrame({
    'text': sentences,
    'label': labels,
    'homonym_word': homonym_words
})

print(f"\n Data Distribution:")
print(f"Total training samples: {len(training_data)}")
print(f"Negative samples: {(training_data['label'] == 0).sum()}")
print(f"Positive samples: {(training_data['label'] == 1).sum()}")

# Display data examples
print("\n Data Examples:")
sample_data = training_data.sample(5)
for _, row in sample_data.iterrows():
    print(f"Text: {row['text']}")
    print(f"Classification: {'Positive' if row['label'] == 1 else 'Negative'}")
    print(f"Homonym word: {row['homonym_word']}")
    print("-" * 50)


 Dataset Statistics:
Number of samples: 101
Number of different words: 93
Available words: 93

 Data Distribution:
Total training samples: 202
Negative samples: 101
Positive samples: 101

 Data Examples:
Text: She can scale mountains effortlessly
Classification: Positive
Homonym word: scale
--------------------------------------------------
Text: Let's shop for beautiful gifts together
Classification: Positive
Homonym word: shop
--------------------------------------------------
Text: This will crush my dreams completely
Classification: Negative
Homonym word: crush
--------------------------------------------------
Text: The ship is sinking fast
Classification: Negative
Homonym word: ship
--------------------------------------------------
Text: The dip in mood affected everyone
Classification: Negative
Homonym word: dip
--------------------------------------------------


# Baseline Experiments


In [3]:
# Baseline 1: TF-IDF + Logistic Regression
print("First Baseline Experiment: TF-IDF + Logistic Regression")

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    training_data['text'], training_data['label'], 
    test_size=0.2, random_state=42, stratify=training_data['label']
)

# Feature extraction using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Train Logistic Regression model
baseline_model = LogisticRegression(random_state=42)
baseline_model.fit(X_train_tfidf, y_train)

# Predict and evaluate
baseline_predictions = baseline_model.predict(X_test_tfidf)
baseline_accuracy = accuracy_score(y_test, baseline_predictions)

print(f" Baseline model accuracy (TF-IDF + LR): {baseline_accuracy:.4f}")
print("\n Baseline Classification Report:")
print(classification_report(y_test, baseline_predictions, 
                          target_names=['Negative', 'Positive']))

# Baseline 2: Simple Word Embeddings
print("\n Second Baseline Experiment: Simple Word Embeddings")
print("This represents a simulation of simple word embeddings")
print("In practice, this processing shows the weakness of traditional methods with homonym words")


First Baseline Experiment: TF-IDF + Logistic Regression
 Baseline model accuracy (TF-IDF + LR): 0.4878

 Baseline Classification Report:
              precision    recall  f1-score   support

    Negative       0.50      0.48      0.49        21
    Positive       0.48      0.50      0.49        20

    accuracy                           0.49        41
   macro avg       0.49      0.49      0.49        41
weighted avg       0.49      0.49      0.49        41


 Second Baseline Experiment: Simple Word Embeddings
This represents a simulation of simple word embeddings
In practice, this processing shows the weakness of traditional methods with homonym words


# BERT Model Implementation - Main Experiment


In [4]:
# Setup BERT for training
print(" Starting BERT implementation for Homonyms problem")

# Specify model
MODEL_NAME = "bert-base-uncased"

# Load tokenizer and model
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
    id2label={0: "NEGATIVE", 1: "POSITIVE"},
    label2id={"NEGATIVE": 0, "POSITIVE": 1}
)

print(f" BERT loaded: {MODEL_NAME}")

# Split data for training and testing
train_texts, test_texts, train_labels, test_labels = train_test_split(
    training_data['text'].tolist(), 
    training_data['label'].tolist(),
    test_size=0.2, 
    random_state=42, 
    stratify=training_data['label']
)

# Additional split for validation
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels,
    test_size=0.2,
    random_state=42,
    stratify=train_labels
)

print(f" Data Split:")
print(f"Training: {len(train_texts)} samples")
print(f"Validation: {len(val_texts)} samples") 
print(f"Testing: {len(test_texts)} samples")

# Data preprocessing function
def preprocess_function(examples):
    return tokenizer(examples, truncation=True, padding=True, max_length=128)

# Create encodings
train_encodings = preprocess_function(train_texts)
val_encodings = preprocess_function(val_texts)
test_encodings = preprocess_function(test_texts)

# Create PyTorch datasets
class SentimentDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = SentimentDataset(train_encodings, train_labels)
val_dataset = SentimentDataset(val_encodings, val_labels)
test_dataset = SentimentDataset(test_encodings, test_labels)


 Starting BERT implementation for Homonyms problem


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


 BERT loaded: bert-base-uncased
 Data Split:
Training: 128 samples
Validation: 33 samples
Testing: 41 samples


In [5]:

import os
os.environ["WANDB_DISABLED"] = "true"

import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import TrainingArguments, Trainer


def compute_metrics(eval_pred):
    
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    

    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')
    
    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }


training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_accuracy",
    greater_is_better=True,
    seed=42,
    report_to=[]  
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,  # الآن مُعرفة بشكل صحيح
)

print(" Starting BERT training...")

training_results = trainer.train()

print(f"Training completed!")
print(f" Final training loss: {training_results.training_loss:.4f}")


 Starting BERT training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.703639,0.454545,0.32197,0.249267,0.454545
2,No log,0.702223,0.484848,0.336425,0.257576,0.484848
3,0.700400,0.699693,0.484848,0.336425,0.257576,0.484848


Training completed!
 Final training loss: 0.7004


In [6]:
# Disable wandb
import os
os.environ["WANDB_DISABLED"] = "true"

import torch
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer
)

# Setup DeBERTa-v3 for training
print(" Starting DeBERTa-v3 implementation for Homonyms problem")

# Specify model - DeBERTa-v3
MODEL_NAME = "microsoft/deberta-v3-base"

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
    id2label={0: "NEGATIVE", 1: "POSITIVE"},
    label2id={"NEGATIVE": 0, "POSITIVE": 1}
)

print(f" DeBERTa-v3 loaded: {MODEL_NAME}")

# Split data for training and testing
train_texts, test_texts, train_labels, test_labels = train_test_split(
    training_data['text'].tolist(), 
    training_data['label'].tolist(),
    test_size=0.2, 
    random_state=42, 
    stratify=training_data['label']
)

# Additional split for validation
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels,
    test_size=0.2,
    random_state=42,
    stratify=train_labels
)

print(f" Data Split:")
print(f"Training: {len(train_texts)} samples")
print(f"Validation: {len(val_texts)} samples") 
print(f"Testing: {len(test_texts)} samples")

# Data preprocessing function
def preprocess_function(examples):
    return tokenizer(examples, truncation=True, padding=True, max_length=128)

# Create encodings
train_encodings = preprocess_function(train_texts)
val_encodings = preprocess_function(val_texts)
test_encodings = preprocess_function(test_texts)

# Create PyTorch datasets
class SentimentDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = SentimentDataset(train_encodings, train_labels)
val_dataset = SentimentDataset(val_encodings, val_labels)
test_dataset = SentimentDataset(test_encodings, test_labels)

# Define performance metrics computation function
def compute_metrics(eval_pred):
    """Calculate performance metrics"""
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    
    # Calculate different metrics
    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')
    
    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Setup improved training arguments for DeBERTa
training_args = TrainingArguments(
    output_dir='./results_deberta_v3',
    num_train_epochs=5,  # Increase epochs for better performance
    per_device_train_batch_size=8,  # Reduce batch size for large models
    per_device_eval_batch_size=8,
    learning_rate=1e-5,  # Lower learning rate for advanced models
    warmup_steps=100,  # Reduce warmup steps
    weight_decay=0.1,  # Increase regularization
    logging_dir='./logs_deberta',
    logging_steps=5,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_accuracy",
    greater_is_better=True,
    seed=42,
    report_to=[],  # Disable wandb
    fp16=True,  # Speed up training with less memory usage
)

# Create Trainer for DeBERTa
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,  # Now properly defined
)

print("Starting DeBERTa-v3 training...")
print("Expected to achieve 90%+ accuracy...")

# Train the model
training_results = trainer.train()

print(f" DeBERTa-v3 Training completed!")
print(f" Final training loss: {training_results.training_loss:.4f}")


 Starting DeBERTa-v3 implementation for Homonyms problem


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


 DeBERTa-v3 loaded: microsoft/deberta-v3-base
 Data Split:
Training: 128 samples
Validation: 33 samples
Testing: 41 samples


model.safetensors:   0%|          | 0.00/371M [00:00<?, ?B/s]

Starting DeBERTa-v3 training...
Expected to achieve 90%+ accuracy...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6874,0.693332,0.484848,0.316636,0.235078,0.484848
2,0.69,0.692772,0.484848,0.316636,0.235078,0.484848
3,0.6819,0.691688,0.484848,0.316636,0.235078,0.484848
4,0.6908,0.689408,0.515152,0.380471,0.757576,0.515152
5,0.6931,0.684831,0.848485,0.848206,0.849495,0.848485


 DeBERTa-v3 Training completed!
 Final training loss: 0.6893
