<a href="https://colab.research.google.com/github/Meenusj/Case_study/blob/main/cnn_with_bert_base.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
import torch
import tensorflow as tf
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report
import nltk

# Download NLTK resources if not already downloaded
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

# Load pre-trained BERT model and tokenizer (bert-base-cased)
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
bert_model = BertModel.from_pretrained('bert-base-cased')

# Example dataset loading and preprocessing
csv_file_path = 'train.csv'
df = pd.read_csv(csv_file_path, sep=';')

# Tokenize and encode text data using BERT tokenizer
max_seq_length = 128  # Reduced maximum sequence length
batch_size = 16  # Process data in smaller batches

# Prepare labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['class_type'])

# Function to tokenize and get BERT embeddings in batches
def get_bert_embeddings(texts, batch_size):
    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        tokenized_texts = tokenizer(batch_texts.tolist(), padding=True, truncation=True, max_length=max_seq_length, return_tensors='pt')
        input_ids = tokenized_texts['input_ids']
        attention_masks = tokenized_texts['attention_mask']

        with torch.no_grad():
            bert_outputs = bert_model(input_ids, attention_mask=attention_masks)[0]

        embeddings.append(bert_outputs[:, 0, :].numpy())

    return np.vstack(embeddings)

# Get BERT embeddings
texts = df['text'].values
X_embeddings = get_bert_embeddings(texts, batch_size)

# Split data into train, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X_embeddings, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Define input shape for the model
input_shape = X_train.shape[1:]

# Define model
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=input_shape),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(len(label_encoder.classes_), activation='softmax')
])

# Compile the model
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train,
                    epochs=10,
                    batch_size=32,
                    validation_data=(X_val, y_val),
                    callbacks=[tf.keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True)])

# Evaluate the model on test set
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test accuracy: {accuracy}')

# Confusion Matrix
y_pred = np.argmax(model.predict(X_test), axis=1)
cm = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(cm)

# Classification Report
report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)
print('Classification Report:')
print(report)

# Save the model
model.save('text_classification_model_with_bert_base_cased.h5')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy: 0.8236240744590759
Confusion Matrix:
[[ 292  128   23    4]
 [ 170 1288   96    5]
 [   9   55  538    3]
 [  16   29   10  441]]
Classification Report:
              precision    recall  f1-score   support

        gpt2       0.60      0.65      0.63       447
       human       0.86      0.83      0.84      1559
      others       0.81      0.89      0.85       605
         rnn       0.97      0.89      0.93       496

    accuracy                           0.82      3107
   macro avg       0.81      0.81      0.81      3107
weighted avg       0.83      0.82      0.83      3107



  saving_api.save_model(


In [3]:
# Predict labels for the test set
y_pred_prob = model.predict(X_test)
y_pred = np.argmax(y_pred_prob, axis=1)

# Map numeric labels back to original class names
y_test_labels = label_encoder.inverse_transform(y_test)
y_pred_labels = label_encoder.inverse_transform(y_pred)

# Print actual vs. predicted labels
print("Actual vs. Predicted Labels:")
for actual, predicted in zip(y_test_labels, y_pred_labels):
    print(f"Actual: {actual}, Predicted: {predicted}")


Actual vs. Predicted Labels:
Actual: others, Predicted: others
Actual: rnn, Predicted: rnn
Actual: others, Predicted: others
Actual: human, Predicted: human
Actual: human, Predicted: gpt2
Actual: gpt2, Predicted: gpt2
Actual: human, Predicted: human
Actual: human, Predicted: rnn
Actual: rnn, Predicted: human
Actual: rnn, Predicted: rnn
Actual: others, Predicted: others
Actual: human, Predicted: gpt2
Actual: others, Predicted: others
Actual: human, Predicted: human
Actual: others, Predicted: others
Actual: rnn, Predicted: others
Actual: others, Predicted: others
Actual: human, Predicted: gpt2
Actual: others, Predicted: others
Actual: rnn, Predicted: rnn
Actual: human, Predicted: human
Actual: gpt2, Predicted: others
Actual: others, Predicted: others
Actual: rnn, Predicted: human
Actual: human, Predicted: human
Actual: gpt2, Predicted: gpt2
Actual: human, Predicted: human
Actual: rnn, Predicted: rnn
Actual: others, Predicted: others
Actual: rnn, Predicted: rnn
Actual: others, Predicted: 

In [5]:
import pandas as pd
import numpy as np
import torch
import tensorflow as tf
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report
import nltk

# Download NLTK resources if not already downloaded
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

# Load pre-trained BERT model and tokenizer (bert-base-cased)
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
bert_model = BertModel.from_pretrained('bert-base-cased')

# Example dataset loading and preprocessing

df = pd.read_csv('/content/reddit_filtered_dataset.csv')

# Tokenize and encode text data using BERT tokenizer
max_seq_length = 128  # Reduced maximum sequence length
batch_size = 16  # Process data in smaller batches

# Prepare labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['Labels'])

# Function to tokenize and get BERT embeddings in batches
def get_bert_embeddings(texts, batch_size):
    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        tokenized_texts = tokenizer(batch_texts.tolist(), padding=True, truncation=True, max_length=max_seq_length, return_tensors='pt')
        input_ids = tokenized_texts['input_ids']
        attention_masks = tokenized_texts['attention_mask']

        with torch.no_grad():
            bert_outputs = bert_model(input_ids, attention_mask=attention_masks)[0]

        embeddings.append(bert_outputs[:, 0, :].numpy())

    return np.vstack(embeddings)

# Get BERT embeddings
texts = df['Data'].values
X_embeddings = get_bert_embeddings(texts, batch_size)

# Split data into train, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X_embeddings, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Define input shape for the model
input_shape = X_train.shape[1:]

# Define model
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=input_shape),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(len(label_encoder.classes_), activation='softmax')
])

# Compile the model
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train,
                    epochs=10,
                    batch_size=32,
                    validation_data=(X_val, y_val),
                    callbacks=[tf.keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True)])

# Evaluate the model on test set
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test accuracy: {accuracy}')

# Confusion Matrix
y_pred = np.argmax(model.predict(X_test), axis=1)
cm = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(cm)

# Save the model
model.save('text_classification_model_with_bert_base_cased.h5')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy: 0.9774820804595947
Confusion Matrix:
[[774   6]
 [ 16 181]]


  saving_api.save_model(


In [6]:
# Predict labels for the test set
y_pred_prob = model.predict(X_test)
y_pred = np.argmax(y_pred_prob, axis=1)

# Map numeric labels back to original class names
y_test_labels = label_encoder.inverse_transform(y_test)
y_pred_labels = label_encoder.inverse_transform(y_pred)

# Print actual vs. predicted labels
print("Actual vs. Predicted Labels:")
for actual, predicted in zip(y_test_labels, y_pred_labels):
    print(f"Actual: {actual}, Predicted: {predicted}")


Actual vs. Predicted Labels:
Actual: 1, Predicted: 1
Actual: 1, Predicted: 1
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 1, Predicted: 1
Actual: 1, Predicted: 1
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 1, Predicted: 1
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 1, Predicted: 1
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 1, Predicted: 1
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 1, Predicted: 1
Actual: 1, Predicted: 1
Actual: 0, 