# Feature Engineering

## Importing and setting up

In [18]:
# !pip install pytorch-transformers

Collecting pytorch-transformers
  Downloading pytorch_transformers-1.2.0-py3-none-any.whl (176 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m176.4/176.4 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting boto3 (from pytorch-transformers)
  Downloading boto3-1.33.5-py3-none-any.whl.metadata (6.7 kB)
Collecting sacremoses (from pytorch-transformers)
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Collecting botocore<1.34.0,>=1.33.5 (from boto3->pytorch-transformers)
  Downloading botocore-1.33.5-py3-none-any.whl.metadata (6.1 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3->pytorch-transformers)
  Downloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Collecting s3transfer<0.9.0,>=0.8.2 (from boto3->pytorch-transformers)
  Downloading s3transfer-0.8.2-py3-none-any.whl.metadata (1.8 kB)
Collecting urllib3<3,>=1.21.1 (from requests->pytorch-transformers)
  Using cached urllib3-2.0.7-py3-none-any.whl.metadata (6.6 kB)
Downloading

In [19]:
import pandas as pd
import numpy as np
import time
import torch
import jupyterlab

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score, confusion_matrix

import matplotlib.pyplot as plt

In [20]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [32]:
data = pd.read_csv('data/train.csv', nrows=500)
df_cleaned = data.dropna(subset=['comment_text'])
df_train = df_cleaned[['comment_text','target']]

# Add new column toxic, toxicity >= 0.5 then toxic = 1 otherwise toxic = 0
df_train = df_train.copy()
df_train['toxic'] = np.where(df_train['target'] >= 0.50, 1, 0)

df_train_small = df_train.copy()

In [33]:
# Using only 5% of datset
percentage = 5
df_train_small = df_train.sample(frac=percentage / 100, random_state=42)

## Create sentences

In [34]:
df_train_small.columns

Index(['comment_text', 'target', 'toxic'], dtype='object')

In [44]:
df_train_small['toxic'].unique()

array([0, 1])

In [35]:
# Addition of [SEP] and [CLS] tokens
sentences  = []
for sentence in df_train_small['comment_text']:
  sentence = sentence + "[SEP] [CLS]"
  sentences.append(sentence)

## Import all dependencies again

In [56]:
from pytorch_transformers import XLNetTokenizer,XLNetForSequenceClassification,XLNetConfig

In [37]:
from sklearn.model_selection import train_test_split
from pytorch_transformers import AdamW
import matplotlib.pyplot as plt
from keras.preprocessing.sequence import pad_sequences
import torch
from torch.utils.data import TensorDataset,DataLoader,RandomSampler,SequentialSampler

## Split the data

In [38]:
'''#split the data in train and test
X = df_train_small['comment_text']
y = df_train_small['toxic']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)'''

"#split the data in train and test\nX = df_train_small['comment_text']\ny = df_train_small['toxic']\n\nX_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)"

## Inputs
- XLNet tokenizer is used to convert our text into tokens that correspond to XLNet’s vocabulary.
- a sequence of integers identifying each input token to its index number in the XLNet tokenizer
- Use the XLNet tokenizer to convert the tokens to their index numbers in the XLNet vocabulary

In [39]:
tokenizer  = XLNetTokenizer.from_pretrained('xlnet-base-cased',do_lower_case=True, remove_space=True,)
tokenized_text = [tokenizer.tokenize(sent) for sent in sentences]

In [41]:
ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_text]

In [45]:
print(ids[0])
labels = df_train_small['toxic'].values
print(labels[0])

[17, 150, 115, 18, 8036, 28, 52, 1902, 9, 10849, 23, 3882, 3158, 4145, 11974, 23, 3158]
0


### We find the maximum length of our sentences so that we can pad the rest

In [46]:
max1 = len(ids[0])
for i in ids:
  if(len(i)>max1):
    max1=len(i)
print(max1)
MAX_LEN = max1

215


### Pad the sentences

In [61]:
input_ids2 = pad_sequences(ids,maxlen=180,dtype="long",truncating="post",padding="post")

xtrain,xtest,ytrain,ytest = train_test_split(input_ids2,labels,test_size=0.15)

print(len(input_ids2[0]))

180


In [62]:
Xtrain = torch.tensor(xtrain)
Ytrain = torch.tensor(ytrain)
Xtest = torch.tensor(xtest)
Ytest = torch.tensor(ytest)

In [63]:
batch_size = 2

In [64]:
train_data = TensorDataset(Xtrain,Ytrain)
test_data = TensorDataset(Xtest,Ytest)
loader = DataLoader(train_data,batch_size=batch_size)
test_loader = DataLoader(test_data,batch_size=batch_size)

In [65]:
model = XLNetForSequenceClassification.from_pretrained("xlnet-base-cased",num_labels=2)
model.cuda()

RuntimeError: Trying to create tensor with negative dimension -1: [-1, 768]

In [None]:
optimizer = AdamW(model.parameters(),lr=2e-5)# We pass model parameters

In [None]:

import torch.nn as nn
criterion = nn.CrossEntropyLoss()

In [None]:

import numpy as np
def flat_accuracy(preds,labels):  # A function to predict Accuracy
  correct=0
  for i in range(0,len(labels)):
    if(preds[i]==labels[i]):
      correct+=1
  return (correct/len(labels))*100

### Begin training

In [None]:
no_train = 0
epochs = 2
for epoch in range(epochs):
  model.train()
  loss1 = []
  steps = 0
  train_loss = []
  l = []
  for inputs,labels1 in loader :
    inputs.to(device)
    labels1.to(device)
    optimizer.zero_grad()
    outputs = model(inputs.to(device))
    loss = criterion(outputs[0],labels1.to(device)).to(device)
    logits = outputs[1]
    #ll=outp(loss)
    [train_loss.append(p.item()) for p in torch.argmax(outputs[0],axis=1).flatten() ] # our predicted 
    [l.append(z.item()) for z in labels1] # real labels
    loss.backward()
    optimizer.step()
    loss1.append(loss.item())
    no_train += inputs.size(0)
    steps += 1
  print("Current Loss is : {} Step is : {} number of Example : {} Accuracy : {}".format(loss.item(),epoch,no_train,flat_accuracy(train_loss,l)))

# ----------- 

### Function to evaluate model

In [40]:
# initialize dataframe that will include the results
results_df = pd.DataFrame()

# Function to evaluate the model
def evaluate_model(model, X_train,y_train,X_test,y_test, model_name="", parameters='', comments=''):
    start_time = time.time()
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    predict_probab = model.predict_proba(X_test)[:,1]
    duration = time.time() - start_time
    duration_format = f"{int(duration // 60)} minutes and {round(duration % 60, 2)} seconds"

    # Calculating all metrics

    accuracy = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)
    roc_auc = roc_auc_score(y_test, predict_probab)
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    conf_matrix = str(confusion_matrix(y_test, predictions))

    # Create a dictionary including the results
    results = {
        'Name': model_name if model_name else model.__class__.__name__,
        'Parameters': parameters,
        'F1-Score': f1,
        'AUC-ROC': roc_auc,
        'Precision': precision,
        'Recall': recall,
        'Accuracy': accuracy,
        'Confusion Matrix': conf_matrix,
        'Training Time': duration_format,
        'Comments': comments
    }

    return results

## XLNET Tokenizer

### Tokenization

In [13]:
from transformers import XLNetTokenizer
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Load XLNET tokenizer
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased',
                                           dolowercase=True,
                                           remove_space=True,
                                           bos_token='<s>',
                                           unk_token='<unk>',
                                           )

# https://huggingface.co/docs/transformers/model_doc/xlnet#transformers.XLNetTokenizer

In [17]:
# Tokenize the training and test data
X_train_tokens = tokenizer(X_train.tolist(), padding=True, truncation=True, return_tensors='tf', add_special_tokens=True) # tf for tensorflow
X_test_tokens = tokenizer(X_test.tolist(), padding=True, truncation=True, return_tensors='tf', add_special_tokens=True) # tf for tensorflow

In [16]:
X_train_tokens = X_train_tokens + ['<sep>', '<cls>']
X_test_tokens = X_test_tokens + ['<sep>', '<cls>']

TypeError: unsupported operand type(s) for +: 'BatchEncoding' and 'list'

### Load XLNET Model

In [None]:
from transformers import XLNetForSequenceClassification # Transformer wrapper for PyTorch
from transformers import TFXLNetForSequenceClassification # Transformer wrapper for TensorFlow

# Load XLNET model
model = TFXLNetForSequenceClassification.from_pretrained('xlnet-base-cased', num_labels=2)  # Assuming binary classification (toxic or not toxic)

### Training

In [None]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.data import Dataset

In [None]:
# Create DataLoader for training and testing
train_dataset = Dataset.from_tensor_slices((X_train_tokens['input_ids'].numpy(),
                                            X_train_tokens['attention_mask'].numpy(),
                                            y_train.values))
test_dataset = Dataset.from_tensor_slices((X_test_tokens['input_ids'].numpy(),
                                           X_test_tokens['attention_mask'].numpy(),
                                           y_test.values))

In [None]:
# Shuffle and batch the training data
train_dataset = train_dataset.shuffle(buffer_size=len(X_train_tokens), seed=42)
train_dataset = train_dataset.batch(16, drop_remainder=True)

# Batch the test data
test_dataset = test_dataset.batch(5, drop_remainder=False) # Set to False to get the last batch 
                                                           # if the total number of samples is not 
                                                           # divisible by the batch size

In [None]:
# Convert to TensorFlow tensors
train_dataset = train_dataset.map(lambda input_ids, attention_mask, labels: (tf.convert_to_tensor(input_ids),
                                                                            tf.convert_to_tensor(attention_mask),
                                                                            tf.convert_to_tensor(labels)))

test_dataset = test_dataset.map(lambda input_ids, attention_mask, labels: (tf.convert_to_tensor(input_ids),
                                                                          tf.convert_to_tensor(attention_mask)))

In [None]:
# Set up optimizer and loss function
optimizer = Adam(learning_rate=2e-2)
criterion = SparseCategoricalCrossentropy(from_logits=True)

In [None]:
# Train the model
for epoch in range(1):  # You can adjust the number of epochs
    for batch in train_dataset:
        input_ids, attention_mask, labels = batch

        with tf.GradientTape() as tape:
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, training=True)
            loss = criterion(labels, outputs.logits)

        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))

In [None]:
# Evaluate the model
model.evaluate(test_dataset)

In [None]:
'''from torch.utils.data import DataLoader, TensorDataset

# Create DataLoader for training and testing
train_dataset = TensorDataset(X_train_tokens['input_ids'], X_train_tokens['attention_mask'], torch.tensor(y_train.values))
test_dataset = TensorDataset(X_test_tokens['input_ids'], X_test_tokens['attention_mask'], torch.tensor(y_test.values))

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Set up optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
criterion = torch.nn.CrossEntropyLoss()

# Train the model
for epoch in range(3):  # You can adjust the number of epochs
    for batch in train_dataloader:
        optimizer.zero_grad()
        outputs = model(input_ids=batch[0], attention_mask=batch[1], labels=batch[2])
        loss = outputs.loss
        loss.backward()
        optimizer.step()

# Evaluate the model
model.eval()
with torch.no_grad():
    for batch in test_dataloader:
        outputs = model(input_ids=batch[0], attention_mask=batch[1])
        predicted_labels = torch.argmax(outputs.logits, dim=1)
        # Evaluate performance metrics as needed
'''

In [None]:
# Assuming your model is already trained and in the 'model' variable

# Set the model to evaluation mode
model.eval()

# Lists to store predictions and true labels
all_predictions = []
all_true_labels = []

# Evaluate the model
with torch.no_grad():
    for batch in test_dataloader:
        outputs = model(input_ids=batch[0], attention_mask=batch[1])
        predicted_probs = torch.softmax(outputs.logits, dim=1)[:, 1].cpu().numpy()
        predicted_labels = (predicted_probs >= 0.5).astype(int)
        all_predictions.extend(predicted_labels)
        all_true_labels.extend(batch[2].cpu().numpy())

# Calculate and print accuracy, F1 score, and AUC
accuracy = accuracy_score(all_true_labels, all_predictions)
f1 = f1_score(all_true_labels, all_predictions)
roc_auc = roc_auc_score(all_true_labels, predicted_probs)

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"AUC: {roc_auc:.4f}")

# Display classification report
print("\nClassification Report:")
print(classification_report(all_true_labels, all_predictions))

# Plot ROC curve
fpr, tpr, _ = roc_curve(all_true_labels, predicted_probs)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(8, 8))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'AUC = {roc_auc:.2f}')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()
