LSTM using GloVe

In [None]:
import pandas as pd
import numpy as np
import re
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, GlobalMaxPool1D, BatchNormalization, Dropout, Dense
from keras.layers import Bidirectional
from keras.models import Sequential
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import tensorflow as tf

# Read the training file first, then the companion test file.
train, test = (pd.read_csv(csv_name) for csv_name in ('train1.csv', 'test.csv'))

# Toxicity category columns expected in the training frame.
label_cols = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

# Derive the 'non_toxic' flag for one row from its toxicity labels
def compute_non_toxic(row):
    """Return 1 if the row's `label_cols` values sum to zero, otherwise 0.

    `row` is indexed with the module-level `label_cols` list, so it must
    expose every one of those labels (e.g. a DataFrame row).
    """
    no_toxic_label_set = row[label_cols].sum() == 0
    return 1 if no_toxic_label_set else 0

# Create the 'non_toxic' column: 1 when the toxic labels of a row sum to zero, else 0.
# This is the column-wise equivalent of train.apply(compute_non_toxic, axis=1);
# it yields the same flags without one Python-level function call per row.
train['non_toxic'] = (train[label_cols].sum(axis=1) == 0).astype(int)

# Define label columns including 'non_toxic'
label_cols_with_non_toxic = label_cols + ['non_toxic']

# Clean text function
def clean_text(text):
    """Normalise a raw comment for tokenisation.

    The text is lower-cased, every character that is neither an ASCII letter
    nor whitespace is removed, and runs of whitespace are collapsed to a
    single space with leading/trailing whitespace stripped.

    Args:
        text: The raw comment. Non-string values (for example the NaN float
            that stands in for a missing cell in a pandas column) are treated
            as an empty comment instead of raising AttributeError.

    Returns:
        The cleaned string; '' for empty or non-string input.
    """
    if not isinstance(text, str):
        return ''
    text = text.lower()  # Convert text to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove non-alphabetic characters
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces and strip whitespace
    return text

# Store a cleaned copy of every comment next to the raw 'comment_text' column
train['cleaned_text'] = train['comment_text'].map(clean_text)



In [None]:
# Prepare tokenizer: build the word -> integer index vocabulary from the cleaned comments
tokens = Tokenizer()
tokens.fit_on_texts(train['cleaned_text'])
# One row more than the vocabulary size: Keras word indices start at 1, so
# row 0 of the embedding matrix stays all-zero and lines up with the 0 padding id.
vocab_size = len(tokens.word_index) + 1

# Tokenize and pad sequences to a fixed length of max_len token ids
max_len = 300
tokenized_train = tokens.texts_to_sequences(train['cleaned_text'])
padded_train = pad_sequences(tokenized_train, maxlen=max_len, padding='post')

# Load pre-trained GloVe embeddings
embedding_dim = 50
embedding_matrix = np.zeros((vocab_size, embedding_dim))

# Each GloVe line is "<word> <v1> <v2> ..."; copy the vector into the row of
# every word that also occurs in the tokenizer vocabulary. Words missing from
# the GloVe file keep their all-zero row.
with open('glove.6B.50d.txt', encoding='utf-8') as f:
    for line in f:
        word, *vector = line.split()
        idx = tokens.word_index.get(word)  # one lookup instead of `in` followed by `[]`
        if idx is not None:
            embedding_matrix[idx] = np.array(vector, dtype=np.float32)[:embedding_dim]

# Build LSTM model with frozen GloVe embeddings
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=False))
model.add(LSTM(50, return_sequences=True))  # keep every timestep so the pooling layer can reduce over them
model.add(GlobalMaxPool1D())
model.add(BatchNormalization())
model.add(Dropout(0.2))
model.add(Dense(50, activation="relu"))
model.add(Dropout(0.2))
model.add(Dense(32, activation="relu"))
model.add(Dropout(0.2))
# One independent sigmoid unit per label (multi-label output)
model.add(Dense(len(label_cols_with_non_toxic), activation='sigmoid'))

# Compile the model.
# 'binary_accuracy' thresholds every label independently. The generic
# 'accuracy' string is resolved from the output shape and, for a multi-unit
# output like this one, becomes categorical (argmax) accuracy, which is not
# meaningful for multi-label targets.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['binary_accuracy'])

# Define training labels including 'non_toxic'
y_train = train[label_cols_with_non_toxic].values

# Fit the model; the last 30% of the rows are held out for validation
history = model.fit(padded_train, y_train, epochs=10, batch_size=128, validation_split=0.3)



In [None]:
# Evaluate the model on the held-out rows only.
# model.fit was called with validation_split=0.3, which makes Keras train on
# the first 70% of the rows and validate on the last 30% (the split is taken
# before shuffling). Scoring every row of padded_train would mostly measure
# rows the model was fitted on and overstate its quality.
val_fraction = 0.3  # must match the validation_split passed to model.fit
split_at = int(len(padded_train) * (1.0 - val_fraction))
y_true = train[label_cols_with_non_toxic].values[split_at:]

y_pred_probs = model.predict(padded_train[split_at:])
y_pred = (y_pred_probs > 0.5).astype(int)  # same 0.5 cut-off as rounding, but integer labels
y_pred_labels = y_pred  # name kept from the previous version of this cell

# Calculate evaluation metrics for each label including 'non_toxic'
evaluation_metrics = {}
for i, label in enumerate(label_cols_with_non_toxic):
    truth = y_true[:, i]
    predicted = y_pred[:, i]
    evaluation_metrics[label] = {
        'Accuracy': accuracy_score(truth, predicted),
        # zero_division=0: a label that is never predicted scores 0 instead of
        # emitting an undefined-metric warning.
        'Precision': precision_score(truth, predicted, zero_division=0),
        'Recall': recall_score(truth, predicted, zero_division=0),
        'F1-score': f1_score(truth, predicted, zero_division=0),
        # ROC AUC is threshold-free, so it is computed from the probabilities
        'ROC AUC': roc_auc_score(truth, y_pred_probs[:, i]),
    }

# Print evaluation metrics for each label including 'non_toxic'
for label, metrics in evaluation_metrics.items():
    print(f"Metrics for '{label}':")
    for metric_name, value in metrics.items():
        print(f"{metric_name}: {value}")
    print()

# Overall classification report including 'non_toxic'
classification_report_all = classification_report(
    y_true, y_pred, target_names=label_cols_with_non_toxic, zero_division=0
)
print("Classification Report (All Labels):")
print(classification_report_all)

# Average accuracy across all labels = mean of the per-label accuracies.
# accuracy_score on a 2-D multi-label matrix is the stricter subset accuracy
# (a row counts only if every label is right), so it is reported separately.
average_accuracy = np.mean([m['Accuracy'] for m in evaluation_metrics.values()])
subset_accuracy = accuracy_score(y_true, y_pred)
print(f"Average Accuracy: {average_accuracy}")
print(f"Subset (exact-match) Accuracy: {subset_accuracy}")


BiLSTM using GloVe

In [None]:
# Prepare tokenizer: build the word -> integer index vocabulary from the cleaned comments
tokens = Tokenizer()
tokens.fit_on_texts(train['cleaned_text'])
# One row more than the vocabulary size: Keras word indices start at 1, so
# row 0 of the embedding matrix stays all-zero and lines up with the 0 padding id.
vocab_size = len(tokens.word_index) + 1

# Tokenize and pad sequences to a fixed length of max_len token ids
max_len = 300
tokenized_train = tokens.texts_to_sequences(train['cleaned_text'])
padded_train = pad_sequences(tokenized_train, maxlen=max_len, padding='post')

# Load pre-trained GloVe embeddings
embedding_dim = 50
embedding_matrix = np.zeros((vocab_size, embedding_dim))

# Each GloVe line is "<word> <v1> <v2> ..."; copy the vector into the row of
# every word that also occurs in the tokenizer vocabulary. Words missing from
# the GloVe file keep their all-zero row.
with open('glove.6B.50d.txt', encoding='utf-8') as f:
    for line in f:
        word, *vector = line.split()
        idx = tokens.word_index.get(word)  # one lookup instead of `in` followed by `[]`
        if idx is not None:
            embedding_matrix[idx] = np.array(vector, dtype=np.float32)[:embedding_dim]

# Build BiLSTM model with frozen GloVe embeddings
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=False))
# The Bidirectional wrapper runs the LSTM over the sequence in both directions;
# every timestep is kept so the pooling layer can reduce over them.
model.add(Bidirectional(LSTM(50, return_sequences=True)))
model.add(GlobalMaxPool1D())
model.add(BatchNormalization())
model.add(Dropout(0.2))
model.add(Dense(50, activation="relu"))
model.add(Dropout(0.2))
model.add(Dense(32, activation="relu"))
model.add(Dropout(0.2))
# One independent sigmoid unit per label (multi-label output)
model.add(Dense(len(label_cols_with_non_toxic), activation='sigmoid'))

# Compile the model.
# 'binary_accuracy' thresholds every label independently. The generic
# 'accuracy' string is resolved from the output shape and, for a multi-unit
# output like this one, becomes categorical (argmax) accuracy, which is not
# meaningful for multi-label targets.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['binary_accuracy'])

# Define training labels including 'non_toxic'
y_train = train[label_cols_with_non_toxic].values

# Fit the model; the last 30% of the rows are held out for validation
history = model.fit(padded_train, y_train, epochs=10, batch_size=128, validation_split=0.3)



In [None]:
# Evaluate the model on the held-out rows only.
# model.fit was called with validation_split=0.3, which makes Keras train on
# the first 70% of the rows and validate on the last 30% (the split is taken
# before shuffling). Scoring every row of padded_train would mostly measure
# rows the model was fitted on and overstate its quality.
val_fraction = 0.3  # must match the validation_split passed to model.fit
split_at = int(len(padded_train) * (1.0 - val_fraction))
y_true = train[label_cols_with_non_toxic].values[split_at:]

y_pred_probs = model.predict(padded_train[split_at:])
y_pred = (y_pred_probs > 0.5).astype(int)  # same 0.5 cut-off as rounding, but integer labels
y_pred_labels = y_pred  # name kept from the previous version of this cell

# Calculate evaluation metrics for each label including 'non_toxic'
evaluation_metrics = {}
for i, label in enumerate(label_cols_with_non_toxic):
    truth = y_true[:, i]
    predicted = y_pred[:, i]
    evaluation_metrics[label] = {
        'Accuracy': accuracy_score(truth, predicted),
        # zero_division=0: a label that is never predicted scores 0 instead of
        # emitting an undefined-metric warning.
        'Precision': precision_score(truth, predicted, zero_division=0),
        'Recall': recall_score(truth, predicted, zero_division=0),
        'F1-score': f1_score(truth, predicted, zero_division=0),
        # ROC AUC is threshold-free, so it is computed from the probabilities
        'ROC AUC': roc_auc_score(truth, y_pred_probs[:, i]),
    }

# Print evaluation metrics for each label including 'non_toxic'
for label, metrics in evaluation_metrics.items():
    print(f"Metrics for '{label}':")
    for metric_name, value in metrics.items():
        print(f"{metric_name}: {value}")
    print()

# Overall classification report including 'non_toxic'
classification_report_all = classification_report(
    y_true, y_pred, target_names=label_cols_with_non_toxic, zero_division=0
)
print("Classification Report (All Labels):")
print(classification_report_all)

# Average accuracy across all labels = mean of the per-label accuracies.
# accuracy_score on a 2-D multi-label matrix is the stricter subset accuracy
# (a row counts only if every label is right), so it is reported separately.
average_accuracy = np.mean([m['Accuracy'] for m in evaluation_metrics.values()])
subset_accuracy = accuracy_score(y_true, y_pred)
print(f"Average Accuracy: {average_accuracy}")
print(f"Subset (exact-match) Accuracy: {subset_accuracy}")


TF-IDF - Gradient Boosting

In [None]:
import pandas as pd
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

# Load train dataset into its own frame (train_df) for the TF-IDF / gradient boosting cells
train_df = pd.read_csv('train1.csv')

# Text preprocessing function
def clean_text(text):
    """Normalise a raw comment before TF-IDF vectorisation.

    The text is lower-cased, every character that is not an ASCII letter,
    a digit or whitespace is removed, and runs of whitespace are collapsed to
    a single space with leading/trailing whitespace stripped.

    Args:
        text: The raw comment. Non-string values (for example the NaN float
            that stands in for a missing cell in a pandas column) are treated
            as an empty comment instead of raising AttributeError.

    Returns:
        The cleaned string; '' for empty or non-string input.
    """
    if not isinstance(text, str):
        return ''
    text = text.lower()  # Convert text to lowercase
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove non-alphanumeric characters
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces and strip whitespace
    return text

# Clean text data
train_df['cleaned_text'] = train_df['comment_text'].apply(clean_text)

# TF-IDF Vectorization (1000 most frequent terms, English stop words dropped)
tfidf_vectorizer = TfidfVectorizer(max_features=1000, stop_words=stopwords.words('english'))
X_train_tfidf = tfidf_vectorizer.fit_transform(train_df['cleaned_text'])
y_train = train_df['toxic']

# Convert TF-IDF matrix to dense array
X_train = X_train_tfidf.toarray()

# Split into train and validation sets BEFORE resampling.
# SMOTE creates synthetic rows by interpolating between existing minority
# rows. Resampling first and splitting afterwards would put synthetic
# neighbours of training rows into the validation set (leakage) and would
# score the model on an artificially balanced class distribution.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Apply SMOTE to balance the training portion only; the validation set keeps
# the real class distribution.
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)
X_train, y_train = X_train_balanced, y_train_balanced




In [None]:
# Fit a 100-tree gradient boosting model on the training split
gb_classifier = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb_classifier.fit(X_train, y_train)

# Predict labels for the validation split
y_pred = gb_classifier.predict(X_val)

# Score the validation predictions with each metric in turn
accuracy, precision, recall, f1 = (
    scorer(y_val, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

print("Gradient Boosting Classifier Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

# Per-class breakdown of the same predictions
validation_report = classification_report(y_val, y_pred)
print("Classification Report:")
print(validation_report)

LSTM - TF-IDF

In [None]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
import tensorflow as tf
from imblearn.over_sampling import SMOTE

# Read the training file first, then the companion test file
# (the training frame is expected to provide 'comment_text' plus the label columns below).
train, test = (pd.read_csv(csv_name) for csv_name in ('train1.csv', 'test.csv'))

# Toxicity category columns expected in the training frame.
label_cols = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

# Define a function to clean the text
def clean_text(text):
    """Normalise a raw comment before TF-IDF vectorisation.

    The text is lower-cased, every character that is neither an ASCII letter
    nor whitespace is removed, and runs of whitespace are collapsed to a
    single space with leading/trailing whitespace stripped.

    Args:
        text: The raw comment. Non-string values (for example the NaN float
            that stands in for a missing cell in a pandas column) are treated
            as an empty comment instead of raising AttributeError.

    Returns:
        The cleaned string; '' for empty or non-string input.
    """
    if not isinstance(text, str):
        return ''
    text = text.lower()  # Convert text to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove non-alphabetic characters
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces and strip whitespace
    return text

# Apply text cleaning to 'comment_text' column
train['cleaned_text'] = train['comment_text'].apply(clean_text)

# Create a 'non-toxic' label based on absence of any toxic labels
train['non_toxic'] = 1 - train[label_cols].max(axis=1)

# Combine all labels (including 'non-toxic') for multi-label classification
all_label_cols = label_cols + ['non_toxic']

# Prepare TF-IDF vectorizer (unigrams and bigrams, 200 features; text is
# already lower-cased by clean_text, hence lowercase=False)
tfidf_vectorizer = TfidfVectorizer(max_features=200, tokenizer=None, sublinear_tf=True,
                                    min_df=1, norm='l2', encoding='utf-8', lowercase=False,
                                    ngram_range=(1, 2), stop_words='english')

# Vectorize the cleaned text data
X_train_tfidf = tfidf_vectorizer.fit_transform(train['cleaned_text'])
y_train = train[all_label_cols].values

# Split the data into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X_train_tfidf, y_train, test_size=0.2, random_state=42)

# Training targets.
# An earlier version ran SMOTE once per label column, discarded the resampled
# features and kept only the first len(X_train) resampled labels. SMOTE
# appends its synthetic samples after the original ones, so that slice was
# exactly the original label column: seven expensive resampling runs with no
# effect. The labels are therefore used as they are. If rebalancing is needed,
# features and labels must be resampled together (or class/sample weights used).
y_train_resampled = y_train.copy()

# Convert sparse matrices to dense arrays for LSTM input
X_train_dense = X_train.toarray()
X_valid_dense = X_valid.toarray()

# Reshape to (samples, 1, n_features): every comment becomes a one-step
# sequence whose single timestep is its whole TF-IDF vector. max_len is
# therefore the number of TF-IDF features, not a sequence length.
max_len = X_train_dense.shape[1]
X_train_lstm = X_train_dense.reshape(X_train_dense.shape[0], 1, max_len)
X_valid_lstm = X_valid_dense.reshape(X_valid_dense.shape[0], 1, max_len)

# Build a multi-label classification model using LSTM in Keras
def build_lstm_model(input_shape):
    """Build and compile a two-layer LSTM multi-label classifier.

    Args:
        input_shape: Shape of one sample as (timesteps, features), passed to
            the first LSTM layer.

    Returns:
        A compiled Sequential model with one sigmoid output per entry of the
        module-level `all_label_cols`, trained with binary cross-entropy.
    """
    model = Sequential()
    # First LSTM returns the full sequence so a second LSTM can be stacked on it
    model.add(LSTM(64, input_shape=input_shape, return_sequences=True))
    model.add(Dropout(0.3))
    model.add(LSTM(32))
    model.add(Dense(len(all_label_cols), activation='sigmoid'))
    # 'binary_accuracy' scores each label independently. The generic
    # 'accuracy' string is resolved from the output shape and becomes
    # categorical (argmax) accuracy for a multi-unit output, which is not
    # meaningful for multi-label targets.
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['binary_accuracy'])
    return model

# Instantiate the LSTM for one-step TF-IDF sequences and fit it,
# monitoring the held-out validation split after every epoch
lstm_model = build_lstm_model(input_shape=(1, max_len))
lstm_model.fit(
    x=X_train_lstm,
    y=y_train_resampled,
    validation_data=(X_valid_lstm, y_valid),
    epochs=10,
    batch_size=32,
    verbose=1,
)



In [None]:
# Generate predictions for validation data and threshold them at 0.5
y_pred_proba = lstm_model.predict(X_valid_lstm)
y_pred_binary = (y_pred_proba > 0.5).astype(int)

# Generate classification report.
# zero_division=0: a label that is never predicted scores 0 instead of
# emitting an undefined-metric warning.
class_report = classification_report(y_valid, y_pred_binary, target_names=all_label_cols, zero_division=0)

# Print the classification report
print("Classification Report:")
print(class_report)

# Calculate accuracy for each label
accuracies = {
    label: accuracy_score(y_valid[:, i], y_pred_binary[:, i])
    for i, label in enumerate(all_label_cols)
}

# Print accuracy for each label
print("\nAccuracy for Each Label:")
for label, acc in accuracies.items():
    print(f"{label}: {acc}")

# Average accuracy across all labels (including 'non-toxic') = mean of the
# per-label accuracies. accuracy_score on the full 2-D label matrix is the
# stricter subset accuracy (a row counts only if every label is right), so it
# is reported under its own name.
average_accuracy = np.mean(list(accuracies.values()))
subset_accuracy = accuracy_score(y_valid, y_pred_binary)

# Print the average and the exact-match accuracy
print("\nAverage Accuracy (Overall):", average_accuracy)
print("Subset (exact-match) Accuracy:", subset_accuracy)


BiLSTM - TF-IDF

In [None]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from keras.models import Sequential
from keras.layers import Bidirectional, LSTM, Dense, Dropout
import tensorflow as tf
from imblearn.over_sampling import SMOTE

# Read the training file first, then the companion test file
# (the training frame is expected to provide 'comment_text' plus the label columns below).
train, test = (pd.read_csv(csv_name) for csv_name in ('train1.csv', 'test.csv'))

# Toxicity category columns expected in the training frame.
label_cols = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

# Define a function to clean the text
def clean_text(text):
    """Normalise a raw comment before TF-IDF vectorisation.

    The text is lower-cased, every character that is neither an ASCII letter
    nor whitespace is removed, and runs of whitespace are collapsed to a
    single space with leading/trailing whitespace stripped.

    Args:
        text: The raw comment. Non-string values (for example the NaN float
            that stands in for a missing cell in a pandas column) are treated
            as an empty comment instead of raising AttributeError.

    Returns:
        The cleaned string; '' for empty or non-string input.
    """
    if not isinstance(text, str):
        return ''
    text = text.lower()  # Convert text to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove non-alphabetic characters
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces and strip whitespace
    return text

# Apply text cleaning to 'comment_text' column
train['cleaned_text'] = train['comment_text'].apply(clean_text)

# Create a 'non-toxic' label based on absence of any toxic labels
train['non_toxic'] = 1 - train[label_cols].max(axis=1)

# Combine all labels (including 'non-toxic') for multi-label classification
all_label_cols = label_cols + ['non_toxic']

# Prepare TF-IDF vectorizer (unigrams and bigrams, 200 features; text is
# already lower-cased by clean_text, hence lowercase=False)
tfidf_vectorizer = TfidfVectorizer(max_features=200, tokenizer=None, sublinear_tf=True,
                                    min_df=1, norm='l2', encoding='utf-8', lowercase=False,
                                    ngram_range=(1, 2), stop_words='english')

# Vectorize the cleaned text data
X_train_tfidf = tfidf_vectorizer.fit_transform(train['cleaned_text'])
y_train = train[all_label_cols].values

# Split the data into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X_train_tfidf, y_train, test_size=0.2, random_state=42)

# Training targets.
# An earlier version ran SMOTE once per label column, discarded the resampled
# features and kept only the first len(X_train) resampled labels. SMOTE
# appends its synthetic samples after the original ones, so that slice was
# exactly the original label column: seven expensive resampling runs with no
# effect. The labels are therefore used as they are. If rebalancing is needed,
# features and labels must be resampled together (or class/sample weights used).
y_train_resampled = y_train.copy()

# Convert sparse matrices to dense arrays for BiLSTM input
X_train_dense = X_train.toarray()
X_valid_dense = X_valid.toarray()

# Reshape to (samples, 1, n_features): every comment becomes a one-step
# sequence whose single timestep is its whole TF-IDF vector. max_len is
# therefore the number of TF-IDF features, not a sequence length.
max_len = X_train_dense.shape[1]
X_train_bilstm = X_train_dense.reshape(X_train_dense.shape[0], 1, max_len)
X_valid_bilstm = X_valid_dense.reshape(X_valid_dense.shape[0], 1, max_len)

# Build a multi-label classification model using Bidirectional LSTM in Keras
def build_bilstm_model(input_shape):
    """Build and compile a two-layer bidirectional LSTM multi-label classifier.

    Args:
        input_shape: Shape of one sample as (timesteps, features), passed to
            the first Bidirectional layer.

    Returns:
        A compiled Sequential model with one sigmoid output per entry of the
        module-level `all_label_cols`, trained with binary cross-entropy.
    """
    model = Sequential()
    # First layer returns the full sequence so a second recurrent layer can be stacked on it
    model.add(Bidirectional(LSTM(64, return_sequences=True), input_shape=input_shape))
    model.add(Dropout(0.3))
    model.add(Bidirectional(LSTM(32)))
    model.add(Dense(len(all_label_cols), activation='sigmoid'))
    # 'binary_accuracy' scores each label independently. The generic
    # 'accuracy' string is resolved from the output shape and becomes
    # categorical (argmax) accuracy for a multi-unit output, which is not
    # meaningful for multi-label targets.
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['binary_accuracy'])
    return model

# Create and train the BiLSTM model, monitoring the validation split after every epoch
bilstm_model = build_bilstm_model(input_shape=(1, max_len))
bilstm_model.fit(X_train_bilstm, y_train_resampled, epochs=10, batch_size=32, validation_data=(X_valid_bilstm, y_valid), verbose=1)

# Generate predictions for validation data and threshold them at 0.5
y_pred_proba = bilstm_model.predict(X_valid_bilstm)
y_pred_binary = (y_pred_proba > 0.5).astype(int)

# Generate classification report.
# zero_division=0: a label that is never predicted scores 0 instead of
# emitting an undefined-metric warning.
class_report = classification_report(y_valid, y_pred_binary, target_names=all_label_cols, zero_division=0)

# Print the classification report
print("Classification Report:")
print(class_report)

# Calculate accuracy for each label
accuracies = {
    label: accuracy_score(y_valid[:, i], y_pred_binary[:, i])
    for i, label in enumerate(all_label_cols)
}

# Print accuracy for each label
print("\nAccuracy for Each Label:")
for label, acc in accuracies.items():
    print(f"{label}: {acc}")

# Average accuracy across all labels (including 'non-toxic') = mean of the
# per-label accuracies. accuracy_score on the full 2-D label matrix is the
# stricter subset accuracy (a row counts only if every label is right), so it
# is reported under its own name.
average_accuracy = np.mean(list(accuracies.values()))
subset_accuracy = accuracy_score(y_valid, y_pred_binary)

# Print the average and the exact-match accuracy
print("\nAverage Accuracy (Overall):", average_accuracy)
print("Subset (exact-match) Accuracy:", subset_accuracy)
