In [None]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk


# Load the labelled corpus; later cells read its "text" and "sentiment" columns.
DATASET_PATH = "/kaggle/input/f1-stats/bert_dataset.json"
df = pd.read_json(DATASET_PATH)

In [None]:
# Fetch the NLTK resources used by the cleaning step: the English stop-word
# list and the Punkt models behind word_tokenize.
# Recent NLTK releases load 'punkt_tab' instead of 'punkt' in word_tokenize, so
# both are requested; on an older release the unknown id is only reported as an
# error by the downloader (it returns False rather than raising).
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)


In [None]:
# Preview the first five raw rows (five is DataFrame.head's default).
df.head()

In [None]:
# Keep only the model input and the target. The explicit copy makes `df` an
# independent frame, so the column assignments in later cells
# (df['text'] = ..., df["gru_cnn"] = ..., df['bert'] = ...) do not raise
# pandas' SettingWithCopyWarning.
df = df[["text", "sentiment"]].copy()

# NOTE(review): `sentiment` is passed straight to to_categorical further down,
# so it is presumably already integer-encoded; the mapping that used to be
# applied here is kept for reference — confirm it matches the dataset.
# label_mapping = {'positive': 2, 'neutral': 1, 'negative': 0}
# df["sentiment"] = df["sentiment"].map(label_mapping)

# English stop words as a set, for fast membership tests in clean_text_nltk.
stop_words = set(stopwords.words('english'))
df.head(2)

In [None]:
def clean_text_nltk(text):
    """Normalise one raw text before tokenisation for the models.

    Every character that is not an ASCII letter is replaced by a space, the
    result is lower-cased and split with NLTK's ``word_tokenize``, and tokens
    found in the module-level ``stop_words`` set are discarded.

    Returns:
        The remaining tokens joined by single spaces.
    """
    letters_only = re.sub(r'[^a-zA-Z]', ' ', text).lower()
    kept_tokens = (
        token
        for token in word_tokenize(letters_only)
        if token not in stop_words
    )
    return ' '.join(kept_tokens)

# Clean every document; the print is only a completion marker for the cell.
df['text'] = df['text'].map(clean_text_nltk)
print("done")


In [None]:
# Features are the cleaned texts; targets become one-hot rows over 3 classes.
sentiment_ids = df['sentiment']
X = df["text"]
y = to_categorical(sentiment_ids, num_classes=3)
print("done")


In [None]:
# 64% train / 16% validation / 20% test: hold out the test set first, then
# carve the validation set out of what remains.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Tokenize and pad text data.
# The Tokenizer only emits indices below `num_words`, so the cap has to match
# the `input_dim=30000` of the Embedding layers built later; a larger cap
# (previously 100000) can produce indices the embeddings cannot look up.
MAX_VOCAB_SIZE = 30000
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)  # vocabulary comes from the training split only

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_val)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad every split to one shared width (the longest training sequence) so all
# arrays have the same number of columns; longer validation/test/full texts
# are cut at the end. `or None` keeps Keras' default if the split is empty.
MAX_SEQ_LEN = max((len(seq) for seq in X_train_seq), default=0) or None
pad_options = dict(maxlen=MAX_SEQ_LEN, padding='post', truncating='post')

X_train_pad = pad_sequences(X_train_seq, **pad_options)
X_val_pad = pad_sequences(X_val_seq, **pad_options)
X_test_pad = pad_sequences(X_test_seq, **pad_options)

# Whole corpus in the same encoding, used to label every row with a model.
full_X1 = tokenizer.texts_to_sequences(X)
full_X = pad_sequences(full_X1, **pad_options)

In [None]:
# Row counts of the train, validation and test splits, one per line.
for padded_split in (X_train_pad, X_val_pad, X_test_pad):
    print(len(padded_split))

In [None]:
def test_model_acc(model):
    """Score `model` on the test split and return its predicted class indices.

    Reads the module-level ``X_test_pad`` / ``y_test``, prints the accuracy
    reported by ``model.evaluate`` with two decimals, then predicts the same
    inputs and returns the argmax over the class axis.
    """
    _, test_accuracy = model.evaluate(X_test_pad, y_test)
    print(f"Test Accuracy: {test_accuracy:.2f}")

    class_probabilities = model.predict(X_test_pad)
    # Highest-probability column per row is the predicted label index.
    return class_probabilities.argmax(axis=1)

import matplotlib.pyplot as plt
def plot_results(history, model_name: str):
    """Plot, save and show the accuracy and loss curves of one training run.

    Args:
        history: object whose ``history`` dict holds the 'accuracy',
            'val_accuracy', 'loss' and 'val_loss' series.
        model_name: prefix of the two PNG files written to the working
            directory (``<model_name>_acc.png`` and ``<model_name>_loss.png``).
    """
    # (train key, val key, train label, val label, title, file suffix)
    curve_specs = (
        ('accuracy', 'val_accuracy', 'Train Accuracy', 'Val Accuracy', "Model Accuracy", '_acc.png'),
        ('loss', 'val_loss', 'Train Loss', 'Val Loss', "Model Loss", '_loss.png'),
    )
    for train_key, val_key, train_label, val_label, title, suffix in curve_specs:
        plt.plot(history.history[train_key], label=train_label)
        plt.plot(history.history[val_key], label=val_label)
        plt.legend()
        plt.title(title)
        plt.savefig(f'{model_name}{suffix}')
        plt.show()
    
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

def plot_confusion_matrix(predicted_labels, model_name, class_names=None):
    """Plot, save and show the test-set confusion matrix for one model.

    Args:
        predicted_labels: predicted class indices for the rows of the
            module-level one-hot ``y_test`` (e.g. what ``test_model_acc``
            returns).
        model_name: prefix of the PNG written to the working directory.
        class_names: optional tick labels ordered by class index. When
            omitted, names come from a module-level ``label_mapping``
            (name -> index) if one exists; otherwise the class indices
            themselves are shown.
    """
    true_labels = y_test.argmax(axis=1)
    class_ids = list(range(y_test.shape[1]))

    if class_names is None:
        # `label_mapping` is optional in this notebook (its definition is
        # commented out), so look it up without raising NameError.
        mapping = globals().get("label_mapping")
        if mapping:
            index_to_name = {index: name for name, index in mapping.items()}
            # Order names by class index so they line up with the matrix rows.
            class_names = [index_to_name.get(class_id, class_id) for class_id in class_ids]
        else:
            class_names = class_ids

    # Passing `labels` pins the row/column order and keeps the matrix square
    # even when a class is missing from the predictions.
    cm = confusion_matrix(true_labels, predicted_labels, labels=class_ids)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    plt.savefig(f"{model_name}_confusion_matrix.png")
    plt.show()

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense, Dropout, Bidirectional, SpatialDropout1D, LayerNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2, l1

# Baseline recurrent classifier: trainable 128-d embeddings feeding two stacked
# bidirectional GRUs (64 then 32 units), each followed by layer normalisation
# and dropout, with an L2-regularised 3-way softmax on top.
gru_layers = [
    Embedding(input_dim=30000, output_dim=128),
    SpatialDropout1D(0.3),
    # First recurrent stage returns the whole sequence for the next GRU.
    Bidirectional(GRU(units=64, return_sequences=True, kernel_regularizer=l2(0.01))),
    LayerNormalization(),
    Dropout(0.3),
    # Second recurrent stage returns only its final output.
    Bidirectional(GRU(units=32, kernel_regularizer=l2(0.01))),
    LayerNormalization(),
    Dropout(0.3),
    Dense(units=3, activation='softmax', kernel_regularizer=l2(0.01)),
]
model_gru = Sequential(gru_layers)

# One-hot targets, hence categorical cross-entropy.
model_gru.compile(
    loss='categorical_crossentropy',
    optimizer=Adam(learning_rate=2e-4),
    metrics=['accuracy'],
)
model_gru.summary()

from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ReduceLROnPlateau

# Stop after 4 epochs without a val_loss improvement and restore the best weights.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=4,
    restore_best_weights=True,
)
# batch_logger = BatchMetricsLogger()

# Training and whole-corpus labelling are currently disabled for this model:
# history_gru = model_gru.fit(
#     X_train_pad, y_train,
#     validation_data=(X_val_pad, y_val),
#     epochs=80,  # Adjust as needed
#     batch_size=128,
#     callbacks=[early_stopping]  # Optional early stopping
# )
# predictions = model_gru.predict(full_X)
# predicted_labels = predictions.argmax(axis=1)
# df["gru"] = predicted_labels


In [None]:
from gensim.models import KeyedVectors
# Location of the Google News vectors on the Kaggle input mount.
word2vec_path = '/kaggle/input/ward2vecc/GoogleNews-vectors-negative300.bin'
word2vec = KeyedVectors.load_word2vec_format(
    word2vec_path,
    binary=True,  # the .bin file is read in binary word2vec format
)


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense, Dropout, Bidirectional, SpatialDropout1D, LayerNormalization, Conv1D, GlobalMaxPooling1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2, l1

# Convolutional classifier: trainable 128-d embeddings, three identical
# Conv1D(128, kernel 3) + Dropout(0.3) stages, global max pooling, then a
# 64-unit dense layer and a 3-way softmax.
model_cnn = Sequential()
model_cnn.add(Embedding(input_dim=30_000, output_dim=128))
model_cnn.add(SpatialDropout1D(0.5))
for _ in range(3):
    model_cnn.add(Conv1D(filters=128, kernel_size=3, activation='relu'))
    model_cnn.add(Dropout(0.3))
model_cnn.add(GlobalMaxPooling1D())
model_cnn.add(Dense(64, activation='relu'))
model_cnn.add(Dropout(0.5))
model_cnn.add(Dense(3, activation='softmax'))  # 3 classes: 0, 1, 2

model_cnn.compile(
    loss='categorical_crossentropy',
    optimizer=Adam(learning_rate=1e-4),
    metrics=['accuracy'],
)
model_cnn.summary()

from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ReduceLROnPlateau

# Stop after 4 epochs without a val_loss improvement and restore the best weights.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=4,
    restore_best_weights=True,
)
# batch_logger = BatchMetricsLogger()

# Training and whole-corpus labelling are currently disabled for this model:
# history_cnn = model_cnn.fit(
#     X_train_pad, y_train,
#     validation_data=(X_val_pad, y_val),
#     epochs=80,  # Adjust as needed
#     batch_size=128,
#     callbacks=[early_stopping]  # Optional early stopping
# )

# predictions = model_cnn.predict(full_X)
# predicted_labels = predictions.argmax(axis=1)
# df["cnn"] = predicted_labels


In [None]:
import numpy as np

# Shape of the pre-trained embedding table: one row per token index accepted by
# the Embedding layer, one column per word2vec dimension.
vocab_size = 30000
embedding_dim = 300

# Rows stay all-zero for indices whose word has no word2vec vector.
embedding_matrix = np.zeros((vocab_size, embedding_dim))

# word -> integer index as assigned by the fitted tokenizer.
word_index = tokenizer.word_index

for token, token_id in word_index.items():
    if token_id >= vocab_size:
        continue  # index falls outside the embedding table
    if token in word2vec:
        embedding_matrix[token_id] = word2vec[token]

In [None]:
# Embedding layer initialised from the word2vec matrix built above and left
# trainable so the vectors are fine-tuned (pass trainable=False to freeze them).
embedding_layer = Embedding(
    vocab_size,
    embedding_dim,
    weights=[embedding_matrix],
    trainable=True,
)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense, Dropout, Bidirectional, SpatialDropout1D, LayerNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2, l1

# Smaller-embedding variant of the stacked bidirectional GRU classifier.
# NOTE(review): the disabled code below stores this model's output as
# "gru_word2vec", yet the model uses a fresh 64-d Embedding rather than the
# word2vec-initialised `embedding_layer` — confirm which was intended.
model_gru_w = Sequential()
for gru_w_layer in (
    Embedding(input_dim=30_000, output_dim=64),
    SpatialDropout1D(0.1),
    Bidirectional(GRU(units=64, return_sequences=True, kernel_regularizer=l2(0.01))),
    LayerNormalization(),
    Dropout(0.25),
    Bidirectional(GRU(units=32, kernel_regularizer=l2(0.01))),
    LayerNormalization(),
    Dropout(0.25),
    Dense(units=3, activation='softmax', kernel_regularizer=l2(0.01)),
):
    model_gru_w.add(gru_w_layer)

# One-hot targets, hence categorical cross-entropy.
model_gru_w.compile(
    loss='categorical_crossentropy',
    optimizer=Adam(learning_rate=2e-4),
    metrics=['accuracy'],
)
model_gru_w.summary()

from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ReduceLROnPlateau

# Stop after 4 epochs without a val_loss improvement and restore the best weights.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=4,
    restore_best_weights=True,
)
# batch_logger = BatchMetricsLogger()

# Training and whole-corpus labelling are currently disabled for this model:
# history_gruw = model_gru_w.fit(
#     X_train_pad, y_train,
#     validation_data=(X_val_pad, y_val),
#     epochs=80,  # Adjust as needed
#     batch_size=128,
#     callbacks=[early_stopping]  # Optional early stopping
# )

# predictions = model_gru_w.predict(full_X)
# predicted_labels = predictions.argmax(axis=1)
# df["gru_word2vec"] = predicted_labels

# CNN

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout

# Convolutional classifier on top of the word2vec-initialised embedding layer:
# three identical Conv1D(128, kernel 5) + Dropout(0.3) stages, global max
# pooling, a 64-unit dense layer and a 3-way softmax.
model_cnnw = Sequential()
model_cnnw.add(embedding_layer)
model_cnnw.add(SpatialDropout1D(0.1))
for _ in range(3):
    model_cnnw.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
    model_cnnw.add(Dropout(0.3))
model_cnnw.add(GlobalMaxPooling1D())
model_cnnw.add(Dense(64, activation='relu'))
model_cnnw.add(Dropout(0.5))
model_cnnw.add(Dense(3, activation='softmax'))  # 3 classes: 0, 1, 2

model_cnnw.compile(
    loss='categorical_crossentropy',
    optimizer=Adam(learning_rate=1e-4),
    metrics=['accuracy'],
)
model_cnnw.summary()

from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ReduceLROnPlateau

# Stop after 4 epochs without a val_loss improvement and restore the best weights.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=4,
    restore_best_weights=True,
)
# batch_logger = BatchMetricsLogger()

# Training and whole-corpus labelling are currently disabled for this model:
# history_cnnw = model_cnnw.fit(
#     X_train_pad, y_train,
#     validation_data=(X_val_pad, y_val),
#     epochs=80,  # Adjust as needed
#     batch_size=128,
#     callbacks=[early_stopping]  # Optional early stopping
# )

# predictions = model_cnnw.predict(full_X)
# predicted_labels = predictions.argmax(axis=1)
# df["cnn_word2vec"] = predicted_labels

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (
    Embedding, SpatialDropout1D, Conv1D, MaxPooling1D,
    Bidirectional, GRU, LayerNormalization, Dropout, Dense
)
from tensorflow.keras.regularizers import l2

# Hybrid classifier: word2vec-initialised embeddings, one Conv1D + max-pooling
# stage to shorten the sequence, then two stacked bidirectional GRUs and a
# 3-way softmax.
model_gru_cnn = Sequential([
    # A dedicated Embedding instance, initialised from the same word2vec matrix.
    # Reusing the `embedding_layer` object that model_cnnw also contains would
    # make both models share (and update) one set of embedding weights.
    Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        trainable=True,
    ),

    SpatialDropout1D(0.1),

    # 'same' padding keeps the sequence length; pooling then halves it.
    Conv1D(filters=128, kernel_size=5, activation='relu', padding='same'),
    MaxPooling1D(pool_size=2),

    Bidirectional(GRU(units=64, return_sequences=True, kernel_regularizer=l2(0.01))),
    LayerNormalization(),
    Dropout(0.3),

    Bidirectional(GRU(units=32, kernel_regularizer=l2(0.01))),
    LayerNormalization(),
    Dropout(0.3),

    Dense(units=3, activation='softmax', kernel_regularizer=l2(0.01))
])

# One-hot targets, hence categorical cross-entropy.
model_gru_cnn.compile(optimizer=Adam(learning_rate=1e-4), loss='categorical_crossentropy', metrics=['accuracy'])

# Display the model summary
model_gru_cnn.summary()

from tensorflow.keras.utils import plot_model


from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ReduceLROnPlateau

# Stop after 4 epochs without a val_loss improvement and restore the best weights.
early_stopping = EarlyStopping(monitor='val_loss', patience=4, restore_best_weights=True)

history_gru_cnn = model_gru_cnn.fit(
    X_train_pad, y_train,
    validation_data=(X_val_pad, y_val),
    epochs=80,  # upper bound; early stopping normally ends training sooner
    batch_size=128,
    callbacks=[early_stopping]
)

# Label every row of the corpus (all splits) with this model's prediction.
predictions = model_gru_cnn.predict(full_X)
predicted_labels = predictions.argmax(axis=1)
df["gru_cnn"] = predicted_labels
model_gru_cnn.save("model_gru_cnn.h5")

In [None]:
# Write the trained GRU+CNN model to an HDF5 file in the working directory.
model_gru_cnn.save(filepath="model_gru_cnn.h5")

In [None]:
# df.to_json("final_dataset.json",orient="records", lines=True)

In [None]:
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, BertTokenizer, BertForSequenceClassification

import tensorflow as tf

# Split for the transformer model with the same proportions and seed as the
# Keras split: 20% test first, then 20% of the remainder for validation.
temp_texts, test_texts, temp_labels, test_labels = train_test_split(X, y, test_size=0.2, random_state=42)
train_texts, val_texts, train_labels, val_labels = train_test_split(temp_texts, temp_labels, test_size=0.2, random_state=42)

# Sizes of the train, validation and test splits, one per line.
for split_texts in (train_texts, val_texts, test_texts):
    print(len(split_texts))
from transformers import DistilBertTokenizer
from torch.utils.data import Dataset, DataLoader



In [None]:
from datasets import Dataset

# Up to this cell `tokenizer` is the Keras Tokenizer fitted for the GRU/CNN
# models, which cannot be called as tokenizer(texts, truncation=..., ...).
# It is rebound here to a Hugging Face tokenizer, which is also what the
# Trainer, save_pretrained and predict_sentiment cells below expect.
# NOTE(review): the checkpoint name is an assumption (the notebook imports the
# DistilBERT classes but never instantiates them) — confirm before running.
BERT_CHECKPOINT = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(BERT_CHECKPOINT)


def _build_hf_dataset(texts, one_hot_labels):
    """Wrap one split as a `datasets.Dataset` with "text" and "label" columns.

    The labels arrive as one-hot rows (they are slices of `y`) and are reduced
    to integer class ids, because compute_metrics compares them against argmax
    predictions. All three splits go through the same conversion.
    """
    return Dataset.from_pandas(pd.DataFrame({
        "text": texts.tolist(),
        "label": one_hot_labels.argmax(axis=1).tolist(),
    }))


def preprocess_function(examples):
    """Tokenize a batch of texts, truncating to at most 512 tokens."""
    return tokenizer(examples["text"], truncation=True, padding=True, max_length=512)


def _tokenize_split(dataset):
    """Tokenize a split, drop the raw text column and emit torch tensors."""
    dataset = dataset.map(preprocess_function, batched=True)
    dataset = dataset.remove_columns(["text"])
    dataset.set_format("torch")
    return dataset


train_dataset = _tokenize_split(_build_hf_dataset(train_texts, train_labels))
val_dataset = _tokenize_split(_build_hf_dataset(val_texts, val_labels))
test_dataset = _tokenize_split(_build_hf_dataset(test_texts, test_labels))

In [None]:
from transformers import TrainingArguments

# Fine-tuning configuration, grouped by concern and expanded into
# TrainingArguments unchanged.
training_config = dict(
    # Where checkpoints and logs are written.
    output_dir="./results6",
    overwrite_output_dir=True,
    logging_dir="./logs4",
    logging_steps=100,
    # Schedule: 8 epochs, batches of 32 accumulated over 2 steps.
    num_train_epochs=8,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=2,
    # Optimisation: linear schedule with 500 warm-up steps.
    learning_rate=2e-5,
    warmup_steps=500,
    lr_scheduler_type="linear",
    weight_decay=0.01,
    fp16=True,
    # Evaluate and checkpoint once per epoch, then reload the best checkpoint.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)
training_args = TrainingArguments(**training_config)

In [None]:
from transformers import Trainer
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        eval_pred: pair ``(logits, labels)``. ``labels`` may hold class ids or
            one-hot rows; one-hot rows are reduced to class ids first, since
            the scikit-learn metrics cannot compare a 2-D indicator matrix
            with 1-D predictions.

    Returns:
        dict with the keys "accuracy" and "f1".
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    labels = np.asarray(labels)
    if labels.ndim > 1:
        # One-hot (or per-class score) rows -> index of the largest entry.
        labels = labels.argmax(axis=-1)

    acc = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average="weighted")
    return {"accuracy": acc, "f1": f1}

In [None]:
import wandb
import os

# Take the Weights & Biases key from the environment rather than keeping a
# token in the notebook; with no key set, wandb falls back to its own lookup.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

# No earlier cell creates `model`, so it is instantiated here with one output
# per sentiment class.
# NOTE(review): the model class and checkpoint are assumptions based on the
# DistilBERT imports above — confirm they match the tokenizer in use.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=3,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.train()

In [None]:
# Store the model and its tokenizer side by side in the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained("./saved_model")

In [None]:
# Evaluate on the held-out test split and list every returned metric.
test_results = trainer.evaluate(eval_dataset=test_dataset)

print("\nTest Set Metrics:")
for metric_name in test_results:
    print(f"{metric_name}: {test_results[metric_name]}")

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt

test_predictions = trainer.predict(test_dataset)
test_logits = test_predictions.predictions
test_labels = test_predictions.label_ids
if test_labels.ndim > 1:
    # One-hot label rows -> class ids, so they are comparable with test_preds.
    test_labels = test_labels.argmax(axis=-1)
test_preds = np.argmax(test_logits, axis=-1)

# Pin the class order so the matrix stays square and its rows/columns line up
# with the tick labels below.
class_ids = list(range(test_logits.shape[-1]))
cm = confusion_matrix(test_labels, test_preds, labels=class_ids)
print("\nTest Set Confusion Matrix:\n", cm)

print("\nTest Set Classification Report:\n", classification_report(test_labels, test_preds))

# `label_mapping` (name -> index) is optional in this notebook: use its names,
# ordered by class index, when it exists; otherwise show the class indices.
mapping = globals().get("label_mapping")
if mapping:
    inverse_label_mapping = {v: k for k, v in mapping.items()}
    class_names = [inverse_label_mapping.get(class_id, class_id) for class_id in class_ids]
else:
    class_names = class_ids

sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.savefig("bert_confusion_matrix.png")

In [None]:
import matplotlib.pyplot as plt

log_history = trainer.state.log_history


def _logged_series(history, key):
    """Return ``(x_values, values, x_label)`` for the entries that logged `key`.

    The x values are the epoch stored with each entry, so the "Epoch" axis
    label is accurate even though training loss is logged every
    `logging_steps` steps while evaluation is logged once per epoch. If any
    entry has no epoch, the entry position is used and labelled as such.
    """
    entries = [entry for entry in history if key in entry]
    values = [entry[key] for entry in entries]
    if all("epoch" in entry for entry in entries):
        return [entry["epoch"] for entry in entries], values, "Epoch"
    return list(range(len(entries))), values, "Log entry"


def _plot_logged_metric(history, key, label, y_label, title, filename):
    """Plot one logged metric, save it to `filename`, show it, return its values."""
    x_values, values, x_label = _logged_series(history, key)
    plt.figure(figsize=(12, 5))
    plt.plot(x_values, values, label=label)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(title)
    plt.legend()
    plt.savefig(filename)
    plt.show()
    return values


# Extract loss and accuracy from the trainer log and plot each series.
train_loss = _plot_logged_metric(
    log_history, "loss", "Training Loss", "Loss", "Training Loss", "bert_train_loss.png")
eval_loss = _plot_logged_metric(
    log_history, "eval_loss", "Validation Loss", "Loss", "Validation Loss", "bert_val_loss.png")
eval_acc = _plot_logged_metric(
    log_history, "eval_accuracy", "Validation Accuracy", "Accuracy",
    "Validation Accuracy Over Epochs", "bert_val_acc.png")

In [None]:
import torch

# Use the fine-tuned weights held by the trainer for single-text inference.
model = trainer.model

# Fall back to the CPU when no GPU is visible instead of failing on "cuda".
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Evaluation mode switches dropout off, so predictions are deterministic.
model.eval()

def predict_sentiment(text):
    """Return the predicted class index for a single text.

    The text is encoded with the module-level ``tokenizer`` (truncated to at
    most 512 tokens), moved to ``device`` and passed through ``model`` with
    gradients disabled; the index of the highest softmax score is returned as
    a Python int.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    # Every input tensor must sit on the same device as the model.
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    with torch.no_grad():
        logits = model(**encoded).logits

    class_probabilities = torch.softmax(logits, dim=-1)
    return torch.argmax(class_probabilities, dim=-1).item()

# Label every (cleaned) text in the corpus with the fine-tuned model, one row at a time.
df['bert'] = df['text'].map(predict_sentiment)