# Feature Engineering

## Importing and setting up

In [8]:
import pandas as pd
import numpy as np
import time

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score, confusion_matrix

import matplotlib.pyplot as plt

In [9]:
data = pd.read_csv('data/train.csv')
df_cleaned = data.dropna(subset=['comment_text'])
df_train = df_cleaned[['comment_text','target']]

In [10]:
# Add new column toxic, toxicity >= 0.5 then toxic = 1 otherwise toxic = 0
df_train = df_train.copy()
df_train['toxic'] = np.where(df_train['target'] >= 0.50, 1, 0)

In [15]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1804871 entries, 0 to 1804873
Data columns (total 3 columns):
 #   Column        Dtype  
---  ------        -----  
 0   comment_text  object 
 1   target        float64
 2   toxic         int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 55.1+ MB


## Train-test split

In [11]:
#split the data in train and test

X_train, X_test, y_train, y_test = train_test_split(df_train['comment_text'], df_train['toxic'], random_state=42)

## Function to record different models performance

In [12]:
# initialize dataframe that will include the results
results_df = pd.DataFrame()

In [13]:
def evaluate_model(model, X_train,y_train,X_test,y_test, model_name="", parameters='', comments=''):
    start_time = time.time()
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    predict_probab = model.predict_proba(X_test)[:,1]
    duration = time.time() - start_time
    duration_format = f"{int(duration // 60)} minutes and {round(duration % 60, 2)} seconds"

    # Calculating all metrics

    accuracy = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)
    roc_auc = roc_auc_score(y_test, predict_probab)
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    conf_matrix = str(confusion_matrix(y_test, predictions))

    # Create a dictionary including the results
    results = {
        'Name': model_name if model_name else model.__class__.__name__,
        'Parameters': parameters,
        'F1-Score': f1,
        'AUC-ROC': roc_auc,
        'Precision': precision,
        'Recall': recall,
        'Accuracy': accuracy,
        'Confusion Matrix': conf_matrix,
        'Training Time': duration_format,
        'Comments': comments
    }

    return results

## XLNET Tokenizer

### Tokenization

In [14]:
from transformers import XLNetTokenizer

# Load XLNET tokenizer
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
    
# Tokenize the training and test data
X_train_tokens = tokenizer(X_train.tolist(), padding=True, truncation=True, return_tensors='pt') # 'pt' for pytorch
X_test_tokens = tokenizer(X_test.tolist(), padding=True, truncation=True, return_tensors='pt')


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


KeyboardInterrupt: 

### Load XLNET Model

In [None]:
from transformers import XLNetForSequenceClassification

# Load XLNET model
model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', num_labels=2)  # Assuming binary classification (toxic or not toxic)


### Training

In [None]:
from torch.utils.data import DataLoader, TensorDataset

# Create DataLoader for training and testing
train_dataset = TensorDataset(X_train_tokens['input_ids'], X_train_tokens['attention_mask'], torch.tensor(y_train.values))
test_dataset = TensorDataset(X_test_tokens['input_ids'], X_test_tokens['attention_mask'], torch.tensor(y_test.values))

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Set up optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
criterion = torch.nn.CrossEntropyLoss()

# Train the model
for epoch in range(3):  # You can adjust the number of epochs
    for batch in train_dataloader:
        optimizer.zero_grad()
        outputs = model(input_ids=batch[0], attention_mask=batch[1], labels=batch[2])
        loss = outputs.loss
        loss.backward()
        optimizer.step()

# Evaluate the model
model.eval()
with torch.no_grad():
    for batch in test_dataloader:
        outputs = model(input_ids=batch[0], attention_mask=batch[1])
        predicted_labels = torch.argmax(outputs.logits, dim=1)
        # Evaluate performance metrics as needed


In [None]:
# Assuming your model is already trained and in the 'model' variable

# Set the model to evaluation mode
model.eval()

# Lists to store predictions and true labels
all_predictions = []
all_true_labels = []

# Evaluate the model
with torch.no_grad():
    for batch in test_dataloader:
        outputs = model(input_ids=batch[0], attention_mask=batch[1])
        predicted_probs = torch.softmax(outputs.logits, dim=1)[:, 1].cpu().numpy()
        predicted_labels = (predicted_probs >= 0.5).astype(int)
        all_predictions.extend(predicted_labels)
        all_true_labels.extend(batch[2].cpu().numpy())

# Calculate and print accuracy, F1 score, and AUC
accuracy = accuracy_score(all_true_labels, all_predictions)
f1 = f1_score(all_true_labels, all_predictions)
roc_auc = roc_auc_score(all_true_labels, predicted_probs)

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"AUC: {roc_auc:.4f}")

# Display classification report
print("\nClassification Report:")
print(classification_report(all_true_labels, all_predictions))

# Plot ROC curve
fpr, tpr, _ = roc_curve(all_true_labels, predicted_probs)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(8, 8))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'AUC = {roc_auc:.2f}')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()
