In [None]:
import google.colab.drive as drive
import pandas as pd
import copy

In [None]:
# Attach Google Drive to the Colab filesystem so the dataset can be read below.
mount_point = '/content/drive'
drive.mount(mount_point)

Mounted at /content/drive


In [None]:
# Location of the labelled tweet CSV on the mounted Drive.
labeled_csv_path = '/content/drive/MyDrive/Text Mining/pii-detection-removal-from-educational-data/labeled_data.csv'
data = pd.read_csv(labeled_csv_path)

In [None]:
# Expose the raw columns under the generic names the training cells expect.
# Series.copy() is a deep copy by default, so the new columns do not alias the originals.
data['Text'] = data['tweet'].copy()
data['Label'] = data['class'].copy()

In [None]:
# Preview the 'Label' column (a copy of the 'class' column) to check its values and dtype.
data['Label']

0        2
1        1
2        1
3        1
4        1
        ..
24778    1
24779    2
24780    1
24781    1
24782    2
Name: Label, Length: 24783, dtype: int64

In [None]:
# Preview the 'Text' column (a copy of the 'tweet' column) to see what the raw tweets look like.
data['Text']

0        !!! RT @mayasolovely: As a woman you shouldn't...
1        !!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2        !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3        !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4        !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...
                               ...                        
24778    you's a muthaf***in lie &#8220;@LifeAsKing: @2...
24779    you've gone and broke the wrong heart baby, an...
24780    young buck wanna eat!!.. dat nigguh like I ain...
24781                youu got wild bitches tellin you lies
24782    ~~Ruffled | Ntac Eileen Dahlia - Beautiful col...
Name: Text, Length: 24783, dtype: object

In [None]:
from transformers import XLNetTokenizer, XLNetForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
import torch
import torch.nn.functional as F
import numpy as np
import string
import copy
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW



# Baseline run: fine-tune DistilBERT with the standard (unweighted) cross-entropy loss.

# One seed for both the train/validation split and PyTorch, so classifier-head
# initialisation and DataLoader shuffling repeat across reruns (GPU kernels may
# still introduce small nondeterminism).
SEED = 85
torch.manual_seed(SEED)

# Load the pre-trained DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# `data` is the DataFrame prepared in the earlier cells, with 'Text' and 'Label' columns
texts = data['Text'].tolist()  # list of text samples
labels = data['Label'].tolist()  # corresponding label for each text sample

# Encode labels as contiguous integer ids; `le` is kept to map predictions back later
le = LabelEncoder()
labels_encoded = le.fit_transform(labels)
num_labels = len(le.classes_)

# Split the data into train (80%) and validation (20%) sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels_encoded, test_size=0.2, random_state=SEED
)

# Tokenize and encode the text data; each split is padded to its own longest sequence
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)

# Convert the tokenized data to PyTorch tensors: (input_ids, attention_mask, label) per sample
train_dataset = torch.utils.data.TensorDataset(
    torch.tensor(train_encodings['input_ids']),
    torch.tensor(train_encodings['attention_mask']),
    torch.tensor(train_labels)
)
val_dataset = torch.utils.data.TensorDataset(
    torch.tensor(val_encodings['input_ids']),
    torch.tensor(val_encodings['attention_mask']),
    torch.tensor(val_labels)
)

# Load pre-trained DistilBERT with a freshly initialised classification head
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=num_labels)

# Define the optimizer (no learning-rate scheduler is used).
# torch.optim.AdamW replaces the deprecated transformers.AdamW; eps and weight_decay
# are pinned to the defaults of the transformers implementation it replaces.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)

# Define data loaders; only the training set is shuffled
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=8)

# Use the GPU when one is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = torch.device('cpu')
model.to(device)

# State shared with the training loop below
best_model_state_dict = None  # weights of the epoch with the best validation accuracy
best_val_accuracy = 0  # Track best validation accuracy
best_val_loss = np.inf  # Track best validation loss
early_stopping_patience = 3  # Number of epochs to wait for early stopping
early_stopping_counter = 0
# Per-epoch history
training_losses = []
validation_losses = []
training_accuracies = []
validation_accuracies = []

# Fine-tune for up to 10 epochs. Each epoch runs one training pass and one
# validation pass, records the metrics, snapshots the weights whenever validation
# accuracy improves, and stops once validation loss has failed to improve for
# `early_stopping_patience` consecutive epochs.
for epoch in range(10):  # Change the number of epochs as needed
    # ---- training pass ----
    model.train()
    train_losses, train_preds, train_true = [], [], []

    for batch in train_loader:
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        optimizer.zero_grad()
        # Passing labels makes the model return its own cross-entropy loss
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        train_losses.append(loss.item())
        preds = outputs.logits.argmax(dim=1)
        train_preds.extend(preds.cpu().numpy())
        train_true.extend(labels.cpu().numpy())

    avg_train_loss = np.mean(train_losses)
    train_accuracy = accuracy_score(train_true, train_preds)

    # ---- validation pass (no gradients needed) ----
    model.eval()
    val_losses, val_preds, val_true = [], [], []

    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            loss = F.cross_entropy(logits, labels)
            val_losses.append(loss.item())

            preds = logits.argmax(dim=1)
            val_preds.extend(preds.cpu().numpy())
            val_true.extend(labels.cpu().numpy())

    avg_val_loss = np.mean(val_losses)
    val_accuracy = accuracy_score(val_true, val_preds)

    print(f'Epoch {epoch + 1}: Training Accuracy: {train_accuracy:.4f}, Training Loss: {avg_train_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}, Validation Loss: {avg_val_loss:.4f}')

    # ---- per-epoch history ----
    training_losses.append(avg_train_loss)
    training_accuracies.append(train_accuracy)
    validation_losses.append(avg_val_loss)
    validation_accuracies.append(val_accuracy)

    # Early-stopping counter follows validation loss: reset on improvement, else count up
    early_stopping_counter = 0 if avg_val_loss < best_val_loss else early_stopping_counter + 1
    best_val_loss = min(best_val_loss, avg_val_loss)

    # The saved checkpoint follows validation accuracy
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        best_model_state_dict = copy.deepcopy(model.state_dict())
        print('New Best Model')

    if early_stopping_counter >= early_stopping_patience:
        print('Early stopping triggered.')
        break


In [None]:
# Restore the checkpoint with the best validation accuracy and report
# per-class metrics on the validation set.
model.load_state_dict(best_model_state_dict)
model.eval()
model.to(device)

val_preds, val_true = [], []
with torch.no_grad():
    for batch in val_loader:
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = logits.argmax(dim=1)

        val_preds.extend(preds.cpu().numpy())
        val_true.extend(labels.cpu().numpy())

# Map encoded class ids back to the original label values before reporting
val_preds = le.inverse_transform(val_preds)
val_true = le.inverse_transform(val_true)
# Print classification report
print(classification_report(val_true, val_preds))

              precision    recall  f1-score   support

           0       0.56      0.23      0.32       273
           1       0.94      0.97      0.95      3867
           2       0.87      0.90      0.88       817

    accuracy                           0.92      4957
   macro avg       0.79      0.70      0.72      4957
weighted avg       0.90      0.92      0.91      4957



**Training with class weights of [5.0, 1.0, 1.0]**

In [None]:
from transformers import XLNetTokenizer, XLNetForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
import torch
import torch.nn.functional as F
import numpy as np
import string
import copy
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW



# Weighted run: same pipeline as the baseline, but the training loss up-weights class 0.

# One seed for both the train/validation split and PyTorch, so classifier-head
# initialisation and DataLoader shuffling repeat across reruns (GPU kernels may
# still introduce small nondeterminism).
SEED = 85
torch.manual_seed(SEED)

# Load the pre-trained DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# `data` is the DataFrame prepared in the earlier cells, with 'Text' and 'Label' columns
texts = data['Text'].tolist()  # list of text samples
labels = data['Label'].tolist()  # corresponding label for each text sample

# Encode labels as contiguous integer ids; `le` is kept to map predictions back later
le = LabelEncoder()
labels_encoded = le.fit_transform(labels)
num_labels = len(le.classes_)

# Split the data into train (80%) and validation (20%) sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels_encoded, test_size=0.2, random_state=SEED
)

# Tokenize and encode the text data; each split is padded to its own longest sequence
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)

# Convert the tokenized data to PyTorch tensors: (input_ids, attention_mask, label) per sample
train_dataset = torch.utils.data.TensorDataset(
    torch.tensor(train_encodings['input_ids']),
    torch.tensor(train_encodings['attention_mask']),
    torch.tensor(train_labels)
)
val_dataset = torch.utils.data.TensorDataset(
    torch.tensor(val_encodings['input_ids']),
    torch.tensor(val_encodings['attention_mask']),
    torch.tensor(val_labels)
)

# Load pre-trained DistilBERT with a freshly initialised classification head
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=num_labels)

# Define the optimizer (no learning-rate scheduler is used).
# torch.optim.AdamW replaces the deprecated transformers.AdamW; eps and weight_decay
# are pinned to the defaults of the transformers implementation it replaces.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)

# Define data loaders; only the training set is shuffled
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=8)

# Use the GPU when one is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# One loss weight per encoded class id: class 0 counts five times as much as the others.
# Adjust weights as needed for your classes.
class_weights = torch.tensor([5.0, 1.0, 1.0]).to(device)
# device = torch.device('cpu')
model.to(device)

# State shared with the training loop below
best_model_state_dict = None  # weights of the epoch with the best validation accuracy
best_val_accuracy = 0  # Track best validation accuracy
best_val_loss = np.inf  # Track best validation loss
early_stopping_patience = 3  # Number of epochs to wait for early stopping
early_stopping_counter = 0
# Per-epoch history
training_losses = []
validation_losses = []
training_accuracies = []
validation_accuracies = []

# Fine-tune with the class-weighted loss for up to 10 epochs. Each epoch runs one
# training pass and one validation pass, records the metrics, snapshots the weights
# whenever validation accuracy improves, and stops once validation loss has failed
# to improve for `early_stopping_patience` consecutive epochs.
for epoch in range(10):  # Change the number of epochs as needed
    model.train()
    train_losses = []
    train_preds = []
    train_true = []

    for batch in train_loader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        optimizer.zero_grad()
        # Labels are not passed to the model: doing so would make it compute its own
        # unweighted loss, which is never used because the weighted loss below
        # is the one that is back-propagated.
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = F.cross_entropy(outputs.logits, labels, weight=class_weights)
        train_losses.append(loss.item())

        loss.backward()
        optimizer.step()

        preds = torch.argmax(outputs.logits, axis=1)
        train_preds.extend(preds.cpu().numpy())
        train_true.extend(labels.cpu().numpy())

    train_accuracy = accuracy_score(train_true, train_preds)
    avg_train_loss = np.mean(train_losses)

    # Validation
    model.eval()
    val_preds = []
    val_true = []
    val_losses = []

    for batch in val_loader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)

        logits = outputs.logits
        # NOTE(review): validation loss is left unweighted, so it is not on the same
        # scale as the weighted training loss printed next to it, and early stopping
        # monitors this unweighted value. Confirm this is intended.
        loss = F.cross_entropy(logits, labels)
        val_losses.append(loss.item())

        preds = torch.argmax(logits, axis=1)
        val_preds.extend(preds.cpu().numpy())
        val_true.extend(labels.cpu().numpy())

    val_accuracy = accuracy_score(val_true, val_preds)
    avg_val_loss = np.mean(val_losses)

    print(f'Epoch {epoch + 1}: Training Accuracy: {train_accuracy:.4f}, Training Loss: {avg_train_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}, Validation Loss: {avg_val_loss:.4f}')
    # Per-epoch history
    training_losses.append(avg_train_loss)
    validation_losses.append(avg_val_loss)
    training_accuracies.append(train_accuracy)
    validation_accuracies.append(val_accuracy)

    # Early-stopping counter follows validation loss: reset on improvement, else count up
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        early_stopping_counter = 0
    else:
        early_stopping_counter += 1

    # The saved checkpoint follows validation accuracy
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        best_model_state_dict = copy.deepcopy(model.state_dict())
        print('New Best Model')

    if early_stopping_counter >= early_stopping_patience:
        print('Early stopping triggered.')
        break


In [None]:
# Restore the best-validation-accuracy checkpoint of the class-weighted run
# and report per-class metrics on the validation set.
model.load_state_dict(best_model_state_dict)
model.eval()
model.to(device)

val_preds, val_true = [], []
with torch.no_grad():
    for batch in val_loader:
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = logits.argmax(dim=1)

        val_preds.extend(preds.cpu().numpy())
        val_true.extend(labels.cpu().numpy())

# Map encoded class ids back to the original label values before reporting
val_preds = le.inverse_transform(val_preds)
val_true = le.inverse_transform(val_true)
# Print classification report
print(classification_report(val_true, val_preds))

              precision    recall  f1-score   support

           0       0.44      0.54      0.49       273
           1       0.95      0.94      0.95      3867
           2       0.89      0.88      0.89       817

    accuracy                           0.91      4957
   macro avg       0.76      0.79      0.77      4957
weighted avg       0.92      0.91      0.91      4957

