In [None]:
!pip uninstall accelerate transformers

In [None]:
# Install accelerate and transformers. No versions are pinned, so pip resolves
# whatever release is current when the cell runs.
!pip install accelerate transformers

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset path used later in this
# notebook lives under /content/drive/MyDrive.
drive.mount('/content/drive')

In [None]:
import torch

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Device being used: {device}")

In [None]:
import pandas as pd

# Location of the reviews CSV on the mounted Google Drive.
file_path = '/content/drive/MyDrive/Reviews_Dataset/Dataset_v5.csv'

# Read the CSV file into a DataFrame (default pandas parsing options).
df = pd.read_csv(file_path)

# Show the first rows as a quick sanity check of the loaded columns.
df.head()


In [None]:
import numpy as np

# Columns that must be present (non-null and non-empty) for a row to be kept.
# Each column is listed once; the duplicated 'ProcessedUnit' and 'Age' entries
# did not change the result, they only repeated the same check.
columns_to_check = [
    'ProcessedValence',
    'ProcessedUnit',
    'ProcessedType',
    'Gender',
    'Ethnicity',
    'Age',
    'Comment',
    'Hospital',
    'Employment Status',
    'Access to Transportation',
    'Income/Poverty Level',
]

# 1) Drop rows with a missing value in any required column.
df_cleaned = df.dropna(subset=columns_to_check)

# 2) Drop rows whose comment is empty or whitespace-only.
df_filtered = df_cleaned[df_cleaned['Comment'].str.strip().astype(bool)]

# 3) Drop rows where any required column holds an empty string.
#    `.copy()` makes df_final an independent frame, so later cells can drop or
#    add columns on it without pandas' SettingWithCopyWarning.
df_final = df_filtered[~(df_filtered[columns_to_check].eq("").any(axis=1))].copy()

In [None]:
# Raw / intermediate columns that are not used by the rest of the notebook.
columns_to_drop = ['CommentLength', 'Type', 'CleanedComment', 'ExperienceDate', 'ExperienceDateString', 'Valence', 'Unit', 'Code', 'Entities', 'ProcessedComment', 'CovidRelated', 'CovidPeriod', 'Day']

# Rebind to the new frame instead of dropping in place: df_final was produced
# by boolean filtering, and in-place mutation of such a slice triggers
# SettingWithCopyWarning.
df_final = df_final.drop(columns=columns_to_drop)

In [None]:
# Preview the remaining columns after the drop above.
df_final.head()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

# One fitted LabelEncoder per source column, keyed by the original column name,
# so every integer code can be mapped back with `inverse_transform` later.
label_encoders = {}

# Kept for backward compatibility: after the loop this is the encoder fitted on
# the last column that was actually encoded (same as the original single
# reused encoder).
label_encoder = LabelEncoder()

# Columns to encode. Each column appears once; repeating 'ProcessedUnit' and
# 'Age' only re-encoded the same data a second time.
columns_to_encode = [
    'ProcessedUnit',
    'ProcessedType',
    'Gender',
    'Ethnicity',
    'Age',
    'Hospital',
    'Employment Status',
    'Access to Transportation',
    'Income/Poverty Level',
    'ProcessedValence'
]

# Work on an independent frame so adding columns never writes into a slice of
# an earlier DataFrame (avoids SettingWithCopyWarning).
df_final = df_final.copy()

for column in tqdm(columns_to_encode, desc="Encoding Columns"):
    # Check if the column exists in the DataFrame to avoid errors
    if column in df_final.columns:
        label_encoder = LabelEncoder()
        # Output column name: 'Encoded' + source name with spaces -> underscores.
        df_final[f'Encoded{column.replace(" ", "_")}'] = label_encoder.fit_transform(df_final[column])
        label_encoders[column] = label_encoder
        print(f"Encoded {column} successfully.")
    else:
        print(f"{column} does not exist in the DataFrame.")

In [None]:
# The free-text comment plus every integer-encoded column used downstream.
columns_to_keep = [
    'Comment',
    'EncodedProcessedUnit',
    'EncodedProcessedType',
    'EncodedGender',
    'EncodedEthnicity',
    'EncodedAge',
    'EncodedHospital',
    'EncodedEmployment_Status',
    'EncodedAccess_to_Transportation',
    'EncodedIncome/Poverty_Level',
    'EncodedProcessedValence',
]

# Keep all rows, restrict to the columns listed above.
df_encoded = df_final.loc[:, columns_to_keep]

In [None]:
# Preview the model-ready frame (comment text + encoded columns).
df_encoded.head()

In [None]:
# Split data into training, validation, and test sets.
# First hold out 30% of the rows, then cut that hold-out in half, giving a
# 70% / 15% / 15% train / validation / test split. random_state is fixed so
# the split is reproducible.
train_data, temp_data = train_test_split(df_encoded, test_size=0.3, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

# Inspect the split
print(f"Train data: {len(train_data)}, Validation data: {len(val_data)}, Test data: {len(test_data)}")

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
import torch
import pandas as pd
import torch.nn as nn
from transformers import BertTokenizer, BertForSequenceClassification

# Number of classes the classification head must output (the valence labels
# 0..3 handled by the fairness loss and metrics later in the notebook).
NUM_LABELS = 4

# Load the pre-trained DistilBERT (SST-2 fine-tuned) checkpoint and tokenizer.
# The checkpoint ships a 2-class head; `num_labels=NUM_LABELS` together with
# `ignore_mismatched_sizes=True` makes `from_pretrained` skip those weights and
# build a freshly initialised NUM_LABELS-way classifier, so no manual
# replacement of `model.classifier` / `model.num_labels` is needed afterwards.
model_name = 'distilbert-base-uncased-finetuned-sst-2-english'
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=NUM_LABELS,
    ignore_mismatched_sizes=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
def tokenize_data(data, tokenizer, max_length=128):
    """Tokenize the 'Comment' column of ``data`` into fixed-length tensors.

    Args:
        data: DataFrame with a 'Comment' column of strings.
        tokenizer: Callable tokenizer (e.g. a Hugging Face tokenizer).
        max_length: Every sequence is padded or truncated to this many tokens.

    Returns:
        Whatever the tokenizer returns for the batch; PyTorch tensors are
        requested via ``return_tensors='pt'``.
    """
    # Call the tokenizer directly: `batch_encode_plus` is deprecated in favour
    # of `__call__`, which takes the same arguments for a list of texts.
    return tokenizer(
        data['Comment'].tolist(),
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )

In [None]:
# Tokenize the three splits in order: train, validation, test.
train_encodings, val_encodings, test_encodings = (
    tokenize_data(split, tokenizer)
    for split in (train_data, val_data, test_data)
)

In [None]:
class ReviewDataset(torch.utils.data.Dataset):
    """Map-style dataset yielding tokenized comments, labels and sensitive attributes.

    Args:
        encodings: Mapping of field name -> indexable batch (e.g. the tokenizer
            output with 'input_ids' / 'attention_mask').
        labels: Indexable sequence of class labels, one per example.
        sensitive_features: Mapping of output key -> pandas Series (indexed
            positionally with ``.iloc``) holding one value per example.
    """

    def __init__(self, encodings, labels, sensitive_features):
        self.encodings = encodings
        self.labels = labels
        self.sensitive_features = sensitive_features

    @staticmethod
    def _as_tensor(value):
        """Return an independent tensor copy of ``value``.

        ``torch.tensor(existing_tensor)`` emits a UserWarning on every call;
        ``clone().detach()`` is the equivalent that PyTorch recommends.
        """
        if torch.is_tensor(value):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx``.

        Keys: every key of ``encodings``, plus 'labels', plus every key of
        ``sensitive_features``.
        """
        item = {key: self._as_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._as_tensor(self.labels[idx])
        for key, column in self.sensitive_features.items():
            # Positional lookup: the Series keeps the original DataFrame index
            # after the train/val/test split, so label-based access would be wrong.
            item[key] = self._as_tensor(column.iloc[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the labels sequence."""
        return len(self.labels)

In [None]:
def _make_review_dataset(encodings, frame):
    """Wrap one split's encodings and encoded columns in a ReviewDataset."""
    sensitive = {
        'encoded_gender': frame['EncodedGender'],
        'encoded_ethnicity': frame['EncodedEthnicity'],
        'encoded_income': frame['EncodedIncome/Poverty_Level'],
    }
    return ReviewDataset(encodings, frame['EncodedProcessedValence'].values, sensitive)


# Same construction for each split; only the encodings / frame pair differs.
train_dataset = _make_review_dataset(train_encodings, train_data)
val_dataset = _make_review_dataset(val_encodings, val_data)
test_dataset = _make_review_dataset(test_encodings, test_data)

In [None]:
import torch
import torch.nn.functional as F

def compute_fairness_loss(inputs, logits, labels, genders):
    """Differentiable equalized-odds gap between gender groups 0 and 1.

    For every class, a true-positive rate (TPR) and false-positive rate (FPR)
    are computed per group, and the absolute group differences are averaged
    over classes. The rates are built from softmax probabilities rather than
    argmax predictions: argmax is not differentiable, so a penalty based on
    hard predictions contributes no gradient and cannot influence training.
    When the model is fully confident (probabilities of 0 or 1) the value
    equals the hard-prediction TPR/FPR gap.

    Args:
        inputs: Unused; kept so existing callers keep working.
        logits: Float tensor of shape (batch, num_labels).
        labels: Integer tensor of shape (batch,) with the true class ids.
        genders: Integer tensor of shape (batch,); only values 0 and 1 form
            groups, examples with other values are ignored.

    Returns:
        0-dim tensor: ``(mean |TPR_0 - TPR_1| + mean |FPR_0 - FPR_1|) / 2``.
        A group with no positives (or no negatives) for a class gets a rate
        of 0 for that class.
    """
    device = logits.device
    probs = torch.softmax(logits, dim=1)  # (batch, num_labels)
    num_labels = probs.shape[1]

    labels = labels.to(device)
    genders = genders.to(device)

    # is_label[i, c] == 1 where example i truly belongs to class c.
    class_ids = torch.arange(num_labels, device=device)
    is_label = (labels.unsqueeze(1) == class_ids).to(probs.dtype)
    not_label = 1.0 - is_label

    def _soft_rates(group_mask):
        """Per-class soft TPR and FPR restricted to the examples in group_mask."""
        member = group_mask.to(probs.dtype).unsqueeze(1)  # (batch, 1)
        positives = (is_label * member).sum(dim=0)
        negatives = (not_label * member).sum(dim=0)
        # Counts are whole numbers, so clamping to >= 1 only affects empty
        # denominators; the numerator is 0 there, giving a rate of 0.
        tpr = (probs * is_label * member).sum(dim=0) / positives.clamp(min=1.0)
        fpr = (probs * not_label * member).sum(dim=0) / negatives.clamp(min=1.0)
        return tpr, fpr

    tpr_group_0, fpr_group_0 = _soft_rates(genders == 0)
    tpr_group_1, fpr_group_1 = _soft_rates(genders == 1)

    # Average the absolute per-class gaps, then average TPR and FPR gaps.
    avg_tpr_diff = (tpr_group_0 - tpr_group_1).abs().mean()
    avg_fpr_diff = (fpr_group_0 - fpr_group_1).abs().mean()
    fairness_loss = (avg_tpr_diff + avg_fpr_diff) / 2

    return fairness_loss


In [None]:
def custom_collate(batch):
    """Stack a list of per-example tensor dicts into one dict of batched tensors.

    The key set is taken from the first example; every example must provide a
    tensor for each of those keys.
    """
    field_names = batch[0].keys()
    return {
        name: torch.stack([sample[name] for sample in batch])
        for name in field_names
    }

In [None]:
from transformers import Trainer, TrainingArguments
from torch.utils.data import DataLoader

class CustomTrainer(Trainer):
    """Trainer that adds a gender-fairness penalty to the cross-entropy loss.

    The dataloaders are overridden to use `custom_collate`, which keeps the
    sensitive-attribute tensors in each batch so `compute_loss` can read them.
    """

    def get_train_dataloader(self):
        """Return a shuffling DataLoader over the training dataset."""
        return DataLoader(
            self.train_dataset,
            batch_size=self.args.train_batch_size,
            collate_fn=custom_collate,
            shuffle=True,
            num_workers=self.args.dataloader_num_workers,
        )

    def get_eval_dataloader(self, eval_dataset=None):
        """Return a DataLoader over `eval_dataset`, or the trainer's own eval set when omitted."""
        # Compare against None explicitly: truth-testing a dataset calls its
        # __len__, so an explicitly passed empty dataset would silently be
        # replaced by self.eval_dataset.
        if eval_dataset is None:
            eval_dataset = self.eval_dataset
        return DataLoader(
            eval_dataset,
            batch_size=self.args.eval_batch_size,
            collate_fn=custom_collate,
            num_workers=self.args.dataloader_num_workers,
        )

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Cross-entropy plus a weighted fairness penalty computed on gender.

        Args:
            model: Model to run the forward pass with.
            inputs: Batch dict; 'labels' and the three 'encoded_*' keys are
                removed from it before the forward pass.
            return_outputs: When True, return ``(loss, outputs)``.
            num_items_in_batch: Accepted because recent `Trainer.training_step`
                versions pass it as a keyword; it is not used here.

        Returns:
            The total loss, or ``(total_loss, outputs)`` if `return_outputs`.
        """
        labels = inputs.pop("labels")
        genders = inputs.pop("encoded_gender")
        # Ethnicity and income are not used in the loss, but they must still be
        # removed so that model(**inputs) only receives the encoding fields.
        inputs.pop("encoded_ethnicity")
        inputs.pop("encoded_income")
        outputs = model(**inputs)
        logits = outputs.logits
        loss = torch.nn.CrossEntropyLoss()(logits, labels)
        fairness_loss = compute_fairness_loss(inputs, logits, labels, genders)
        fairness_weight = 0.5  # relative weight of the fairness term
        total_loss = loss + fairness_weight * fairness_loss
        return (total_loss, outputs) if return_outputs else total_loss


In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch

def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision / recall / F1 from evaluation output.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the
            argmax over the last axis of ``logits``.

    Returns:
        Dict with keys 'accuracy', 'f1', 'precision', 'recall'.
    """
    raw_scores, gold = eval_pred
    predicted = torch.tensor(raw_scores).argmax(dim=-1).numpy()

    # Tuple layout: (precision, recall, f1, support), support-weighted over classes.
    prf = precision_recall_fscore_support(gold, predicted, average='weighted')

    return dict(
        accuracy=accuracy_score(gold, predicted),
        f1=prf[2],
        precision=prf[0],
        recall=prf[1],
    )

In [None]:
from transformers import Trainer, TrainingArguments

# Training arguments: checkpoints go to ./results and logs to ./logs.
# Evaluation runs once per epoch for 10 epochs, with a per-device batch size
# of 8 for training and 16 for evaluation. The learning rate warms up over the
# first 500 steps, weight decay is 0.01, and metrics are logged every 10 steps.
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy="epoch",
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)


In [None]:
# Build the fairness-aware trainer: train on the training split, evaluate on
# the validation split, and report the metrics from compute_metrics.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

In [None]:
# Evaluate on the held-out test split. The trainer object is named `trainer`;
# `custom_trainer` was never defined and raised a NameError.
eval_result = trainer.evaluate(eval_dataset=test_dataset)
print(f"Evaluation Results: {eval_result}")

In [None]:
# Persist the fine-tuned model. The trainer was built without a tokenizer, so
# save the tokenizer explicitly to the same directory; this makes
# ./final_model loadable on its own for inference.
trainer.save_model("./final_model")
tokenizer.save_pretrained("./final_model")