# Get data Ready!

## N_rows and percentage

In [26]:
# Decide how many lines you want to run and the % of it you want to use
# Total lines in the file: 360835
n_rows = 50
percentage_rows = 99

## Importing and setting up

In [27]:
import pandas as pd
import numpy as np
import pickle

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, roc_curve, auc, classification_report

from transformers import BertTokenizer, BertForSequenceClassification, BertModel, pipeline
from transformers import XLNetTokenizer, XLNetForSequenceClassification
import sentencepiece as spm
import torch
from keras.layers import Dropout
from keras.optimizers import Adam

In [28]:
# Load data
data = pd.read_csv('data/undersampled_data_60_40.csv', nrows=n_rows)
# copy data
df = data.copy()
# Using only # % of datset
df = df.sample(frac=percentage_rows / 100, random_state=42)
# before train_split:
df = df.dropna(subset=['stopwords_punct_lemma'])

# -------------

# LSTM

In [29]:
X = df['comment_text'].values
y = df['toxic'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
max_words = 10000
max_len = 200
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

In [7]:
# Pad sequences to a fixed length
X_train_padded = pad_sequences(X_train_seq, maxlen=max_len)
X_test_padded = pad_sequences(X_test_seq, maxlen=max_len)

In [8]:
# Build the LSTM model
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=128, input_length=max_len))
model.add(LSTM(units=64))
model.add(Dense(units=1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
model.fit(X_train_padded, y_train, epochs=4, batch_size=32, validation_data=(X_test_padded, y_test))

### inserting bert

In [30]:
# Initialize BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [31]:
def extract_bert_embeddings(sentences, tokenizer, model):
    """ Extracts BERT embeddings for a list of sentences. """
    model.eval()
    embeddings = []

    for sentence in sentences:
        inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
        # Extract the embeddings from the last hidden layer
        last_hidden_states = outputs.last_hidden_state
        # Pooling: Mean of the token embeddings
        sentence_embedding = torch.mean(last_hidden_states, dim=1).squeeze().numpy()
        embeddings.append(sentence_embedding)

    return np.array(embeddings)

In [32]:
# Assuming X_train and X_test are lists of sentences
X_train_embeddings = extract_bert_embeddings(X_train, tokenizer, bert_model)
X_test_embeddings = extract_bert_embeddings(X_test, tokenizer, bert_model)

In [33]:
# Parameters
input_dim = 768  # BERT-base embeddings have a dimension of 768
lstm_units = 64  # Number of units in LSTM layer, can be adjusted
num_classes = 2  # Adjust based on your classification task (binary or multi-class)

# Define LSTM model
model = Sequential()
model.add(LSTM(lstm_units, input_shape=(input_dim,)))
model.add(Dropout(0.2))
model.add(Dense(num_classes, activation='softmax'))  # Use 'sigmoid' for binary classification

# Compile the model
model.compile(loss='categorical_crossentropy',  # Use 'binary_crossentropy' for binary classification
              optimizer=Adam(learning_rate=0.001),
              metrics=['accuracy'])

ValueError: Input 0 of layer "lstm_1" is incompatible with the layer: expected ndim=3, found ndim=2. Full shape received: (None, 768)

In [None]:
# Fit the model with BERT embeddings
model.fit(X_train_embeddings, y_train, epochs=4, batch_size=32, validation_data=(X_test_embeddings, y_test))

# MODEL to pkl

In [3]:
import os
import pickle
from tensorflow import keras
from keras.models import load_model

import joblib

In [16]:
nlp = spacy.load('en_core_web_sm')
# Preprocess Function
def preprocess(text):
    doc = nlp(text)

    filtered_tokens = []

    for token in doc:
        if token.is_stop or token.is_punct:
           continue
        filtered_tokens.append(token.lemma_)

    return " ".join(filtered_tokens)

# Sample text
sample_text = "Life is not really beautiful. Apply the preprocessing function to the sample text and display the result."

# Apply the preprocessing function to the sample text
preprocessed_text = preprocess(sample_text)

# Display the result
print("Original Text:")
print(sample_text)
print("\nPreprocessed Text:")
print(preprocessed_text)

Original Text:
Life is not really beautiful. Apply the preprocessing function to the sample text and display the result.

Preprocessed Text:
life beautiful apply preprocessing function sample text display result


In [None]:
predictions = model.predict(sample_text)

In [None]:
predictions_proba = predictions[:, 1]


In [None]:
predictions_proba = model.predict_proba(sample_text)

In [13]:
from keras.models import model_from_json
from keras.utils import register_keras_serializable
from keras.models import load_model

# Register Sequential class for serialization
# register_keras_serializable("keras.engine.Sequential")

In [14]:
'''# Load the architecture from JSON
with open("model.json", "r") as json_file:
    loaded_model_json = json_file.read()

lstm_model = model_from_json(loaded_model_json)

# Load the weights
lstm_model.load_weights("model_weights.h5")'''

In [26]:
from keras.models import model_from_json
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np
import spacy
import json

# Load the spaCy model
nlp = spacy.load('en_core_web_sm')

# Load the architecture from JSON
with open("model.json", "r") as json_file:
    model_json = json_file.read()

# Load the model
loaded_model = model_from_json(model_json)

# Load the weights
loaded_model.load_weights("model_weights.h5")

# Find the 'Embedding' layer which contains the tokenizer configuration
embedding_layer = None
for layer in json.loads(model_json)["config"]["layers"]:
    if layer["class_name"] == "Embedding":
        embedding_layer = layer
        break

if embedding_layer is None:
    raise ValueError("Embedding layer not found in the model.")

# Extract the tokenizer configuration from the 'Embedding' layer
tokenizer_config = embedding_layer["config"]["tokenizer_config"]

# Create a new Tokenizer with the loaded configuration
tokenizer = Tokenizer()
tokenizer.__dict__.update(tokenizer_config)

# Sample text
sample_text = "Life is not really beautiful. Apply the preprocessing function to the sample text and display the result."

# Preprocess the text
def preprocess(text):
    doc = nlp(text)

    filtered_tokens = []

    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)

    return " ".join(filtered_tokens)

# Apply preprocessing
preprocessed_text = preprocess(sample_text)

# Convert text to sequences using the loaded tokenizer
sequences = tokenizer.texts_to_sequences([preprocessed_text])

# Pad sequences
padded_sequences = pad_sequences(sequences, maxlen=100, padding='post', truncating='post')  # Adjust maxlen accordingly

# Make prediction
predictions = loaded_model.predict(np.array(padded_sequences))

print("Predictions:", predictions)


KeyError: 'tokenizer_config'

# Preprocess

In [2]:
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
import string

# Load the English language model
nlp = spacy.load('en_core_web_sm')

# Preprocess Function
def preprocess(text):
    # Load English language model and create nlp object from it
    doc = nlp(text)

    # Filter out stopwords, punctuation, and apply lemmatization
    filtered_tokens = [token.lemma_ for token in doc if not (token.is_stop or token.is_punct)]

    return " ".join(filtered_tokens)

# Sample text
sample_text = "This is a sample text for preprocessing. It includes stopwords and punctuation!"

# Apply the preprocessing function to the sample text
preprocessed_text = preprocess(sample_text)

# Display the result
print("Original Text:")
print(sample_text)
print("\nPreprocessed Text:")
print(preprocessed_text)


Original Text:
This is a sample text for preprocessing. It includes stopwords and punctuation!

Preprocessed Text:
sample text preprocessing include stopword punctuation


In [21]:
nlp = spacy.load('en_core_web_sm')
# Preprocess Function
def preprocess(text):
    doc = nlp(text)

    filtered_tokens = []

    for token in doc:
        if token.is_stop or token.is_punct:
           continue
        filtered_tokens.append(token.lemma_)

    return " ".join(filtered_tokens)

# Sample text
sample_text = "Life is not really beautiful. Apply the preprocessing function to the sample text and display the result."

# Apply the preprocessing function to the sample text
preprocessed_text = preprocess(sample_text)

# Display the result
print("Original Text:")
print(sample_text)
print("\nPreprocessed Text:")
print(preprocessed_text)

Original Text:
Life is not really beautiful. Apply the preprocessing function to the sample text and display the result.

Preprocessed Text:
life beautiful apply preprocessing function sample text display result


In [18]:
import os
file_path = os.path.abspath('svm_stop_tfidf.pkl')
# Load the model
with open(file_path, 'rb') as f:
    model_ready = pickle.load(f)

In [None]:
text_list_vec = tfidf_vectorizer.transform([preprocessed_text])

In [None]:
model_ready.predict_proba(text_list_vec)