### Ensemble Learning: Stacking Method with CNN, RNN, and RandomForestClassifier

In [51]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from sklearn.model_selection import train_test_split
import pandas as pd

# Load and preprocess the dataset
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

max_len = 50  # Adjust as needed
embedding_dim = 50  # Adjust as needed

# Tokenize and pad sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data['text'])
X_train_cnn = pad_sequences(tokenizer.texts_to_sequences(train_data['text']), maxlen=max_len, padding='post')
X_test_cnn = pad_sequences(tokenizer.texts_to_sequences(test_data['text']), maxlen=max_len, padding='post')

# Labels
y_train_cnn = train_data['label']
y_test_cnn = test_data['label']

# CNN model
cnn_model = Sequential()
cnn_model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=embedding_dim, input_length=max_len))
cnn_model.add(Conv1D(128, 5, activation='relu'))
cnn_model.add(GlobalMaxPooling1D())
cnn_model.add(Dense(64, activation='relu'))
cnn_model.add(Dropout(0.5))
cnn_model.add(Dense(1, activation='sigmoid'))

cnn_model.compile(optimizer='adam', loss='mse', metrics=['accuracy'])

# Train the model
cnn_model.fit(X_train_cnn, y_train_cnn, epochs=5, batch_size=32, validation_split=0.2)

# Evaluate on test set
loss, accuracy = cnn_model.evaluate(X_test_cnn, y_test_cnn)
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')

# Make predictions
cnn_predictions = cnn_model.predict(X_test_cnn)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test Loss: 0.2691027820110321, Test Accuracy: 0.6204933524131775


In [44]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
import pandas as pd

# Load and preprocess the dataset
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

max_len = 50  # Adjust as needed
embedding_dim = 50  # Adjust as needed

# Tokenize and pad sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data['text'])
X_train_rnn = pad_sequences(tokenizer.texts_to_sequences(train_data['text']), maxlen=max_len, padding='post')
X_test_rnn = pad_sequences(tokenizer.texts_to_sequences(test_data['text']), maxlen=max_len, padding='post')

# Labels
y_train_rnn = train_data['label']
y_test_rnn = test_data['label']

# RNN model
rnn_model = Sequential()
rnn_model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=embedding_dim, input_length=max_len))
rnn_model.add(SimpleRNN(64, activation='relu'))
rnn_model.add(Dense(64, activation='relu'))
rnn_model.add(Dropout(0.5))
rnn_model.add(Dense(1, activation='sigmoid'))

# optimizer = Adam(learning_rate=0.001)
# rnn_model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
# optimizer = tf.keras.optimizers.Adagrad(learning_rate=0.01)
# optimizer = tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.9)
# loss = tf.keras.losses.LogCosh()
loss = tf.keras.losses.CategoricalCrossentropy()

# rnn_model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

rnn_model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])


# Train the model
rnn_model.fit(X_train_rnn, y_train_rnn, epochs=5, batch_size=32, validation_split=0.2)

# Evaluate on test set
loss, accuracy = rnn_model.evaluate(X_test_rnn, y_test_rnn)
print(f'RNN Test Loss: {loss}, Test Accuracy: {accuracy}')

# Make predictions
rnn_predictions = rnn_model.predict(X_test_rnn)




Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
RNN Test Loss: 0.6170258522033691, Test Accuracy: 0.6925995945930481


In [48]:
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

def evaluate_sentence(sentence, model, tokenizer, max_len):
    # Tokenize and pad the input sentence
    sequence = tokenizer.texts_to_sequences([sentence])
    padded_sequence = pad_sequences(sequence, maxlen=max_len, padding='post')

    # Print for debugging
    print(f'Sequence: {sequence}')
    # print(f'Padded Sequence: {padded_sequence}')

    # Make prediction using the trained model
    likelihood = model.predict(padded_sequence)[0][0]

    return likelihood

# Example usage:
sentence_to_evaluate = "He come no her more."
result = evaluate_sentence(sentence_to_evaluate, rnn_model, tokenizer, max_len)
if result<0.6:
    print("Grammatically \033[1m INCORRECT \033[0m with probability: ",1- result)
else:
    print("Grammatically \033[1m CORRECT \033[0m with probability: ",result)


Sequence: [[12, 216, 90, 38, 25]]
Grammatically [1m CORRECT [0m with probability:  0.6955893


In [18]:
0.7240997

0.7240997

In [56]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout, Conv1D, GlobalMaxPooling1D

# Make predictions on the test set
rnn_predictions = rnn_model.predict(X_test_rnn)
cnn_predictions = cnn_model.predict(X_test_cnn)

# Create a new dataset with predictions from both models as features
stacking_dataset = np.column_stack((rnn_predictions, cnn_predictions))

# Split the stacking dataset for training the meta-model
stacking_train, stacking_val, y_train_stacking, y_val_stacking = train_test_split(
    stacking_dataset, y_test_rnn, test_size=0.2, random_state=42
)

# Train a randomforest classification meta-model on the stacking dataset
meta_model = RandomForestClassifier()
meta_model.fit(stacking_train, y_train_stacking)

# Make predictions on the validation set
meta_predictions = meta_model.predict(stacking_val)

# Evaluate the stacking ensemble on the validation set
ensemble_accuracy = accuracy_score(y_val_stacking, meta_predictions)
print(f'Stacking Ensemble Accuracy: {ensemble_accuracy}')


Stacking Ensemble Accuracy: 0.5660377358490566


In [58]:
meta_predictions

array([1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
       1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1])

In [60]:

class Grader:
    def __init__(self, rnn_model, cnn_model, tokenizer):
        self.rnn_model = rnn_model
        self.cnn_model = cnn_model
        self.tokenizer = tokenizer
        self.meta_model = None  # Initialize meta_model to None

    def evaluate(self, text):
        # Preprocess the input text for RNN
        rnn_encoded_text = pad_sequences(self.tokenizer.texts_to_sequences([text]), maxlen=max_len, padding='post')
        rnn_prediction = self.rnn_model.predict(rnn_encoded_text)[0].squeeze()

        # Preprocess the input text for CNN
        cnn_encoded_text = self.tokenizer.texts_to_sequences([text])
        cnn_encoded_text = pad_sequences(cnn_encoded_text, maxlen=max_len, padding='post')
        cnn_prediction = self.cnn_model.predict(cnn_encoded_text)[0].squeeze()

        # Create a stacking dataset with predictions from both models as features
        stacking_input = np.array([[rnn_prediction, cnn_prediction]])

        if self.meta_model is not None:
            # Make a prediction with the logistic regression meta-model
            meta_prediction = self.meta_model.predict(stacking_input)
            return meta_prediction.item()
        else:
            print("Meta-model not trained. Call train_meta_model before evaluate.")
            return None

    def train_meta_model(self, X_train_rnn, X_train_cnn, y_train_cnn):
        # Make predictions on the training set
        rnn_predictions = self.rnn_model.predict(X_train_rnn)
        cnn_predictions = self.cnn_model.predict(X_train_cnn)

        # Create a stacking dataset with predictions from both models as features
        stacking_dataset = np.column_stack((rnn_predictions, cnn_predictions))

        # Train a logistic regression meta-model on the stacking dataset
        self.meta_model = RandomForestClassifier()
        self.meta_model.fit(stacking_dataset, y_train_cnn)


# Create an instance of the Grader class with the trained models and tokenizer
mygrader = Grader(rnn_model, cnn_model, tokenizer)

# Train the meta-model
mygrader.train_meta_model(X_train_rnn, X_train_cnn, y_train_cnn)

# Example usage
output = mygrader.evaluate("He don’t come here no more.")
print(output)


1


In [61]:
# Evaluate the performance on the test set
test_predictions = meta_model.predict(stacking_val)
# Compute precision and recall using test_predictions and test_labels
# You may need to threshold the predictions based on your specific needs

In [62]:
from sklearn.metrics import precision_score, recall_score

precision = precision_score(y_val_stacking, test_predictions)
recall = recall_score(y_val_stacking, test_predictions)

print(f"Precision: {precision}")
print(f"Recall: {recall}")

Precision: 0.631578947368421
Recall: 0.7272727272727273
