In [32]:
import tensorflow as tf
import pandas as pd
import numpy as np
import random
import nltk
from nltk.corpus import wordnet
from transformers import BertTokenizer, TFBertForSequenceClassification
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.utils import class_weight, resample
from sklearn.metrics import classification_report
import re
from sklearn.model_selection import train_test_split

# Download WordNet
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [33]:
# Load train and evaluation datasets
train_df = pd.read_excel("/content/train.xlsx")
eval_df = pd.read_excel("/content/evaluation.xlsx")

train_df.head()

Unnamed: 0,text,reason,label
0,this is an amazing app for online classes!but,good app for conducting online classes,1
1,very practical and easy to use,app is user-friendly,1
2,this app is very good for video conferencing.,good for video conferencing,1
3,i can not download this zoom app,unable to download zoom app,1
4,i am not able to download this app,want to download the app,1


In [34]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2061 entries, 0 to 2060
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    2061 non-null   object
 1   reason  2061 non-null   object
 2   label   2061 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 48.4+ KB


In [35]:
eval_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9000 entries, 0 to 8999
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    9000 non-null   object
 1   reason  9000 non-null   object
 2   label   9000 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 211.1+ KB


In [36]:
# Check class distribution
print("Train Data Class Distribution:\n", train_df["label"].value_counts())
print("Evaluation Data Class Distribution:\n", eval_df["label"].value_counts())

Train Data Class Distribution:
 label
1    2061
Name: count, dtype: int64
Evaluation Data Class Distribution:
 label
0    5999
1    3001
Name: count, dtype: int64


In [37]:
# Text Cleaning Function
def clean_text(text):
    text = text.lower()
    text = re.sub(r"\d+", "", text) # Remove numbers
    text = re.sub(r"https?://\S+|www\.\S+", "", text)  # Remove URLs
    text = re.sub(r"<.*?>", "", text)  # Remove HTML tags
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)  # Remove special characters
    text = re.sub(r"\s+", " ", text).strip()  # Remove extra spaces
    return text


In [38]:
# Apply text cleaning
train_df["text"] = train_df["text"].apply(clean_text)
train_df["reason"] = train_df["reason"].apply(clean_text)
eval_df["text"] = eval_df["text"].apply(clean_text)
eval_df["reason"] = eval_df["reason"].apply(clean_text)

In [39]:
# Synonym Replacement for Data Augmentation
def synonym_replacement(text, n=2):
    words = text.split()
    new_words = words.copy()
    for _ in range(n):
        word_idx = random.randint(0, len(words) - 1)
        synonyms = wordnet.synsets(words[word_idx])
        if synonyms:
            synonym = synonyms[0].lemmas()[0].name()
            new_words[word_idx] = synonym.replace("_", " ")  # Replace underscores if any
    return " ".join(new_words)

In [40]:
# Generate negative examples for training data (if needed)
def generate_negative_pairs(df, num_negatives=None):
    if num_negatives is None:
        num_negatives = len(df)  # Match number of positive examples

    negative_samples = pd.DataFrame({
        "text": df["text"].sample(n=num_negatives, replace=True).values,
        "reason": df["reason"].sample(n=num_negatives, replace=True).values,
        "label": 0
    })
    # Remove accidental positive matches
    negative_samples = negative_samples[negative_samples["text"] != negative_samples["reason"]]
    return pd.concat([df, negative_samples]).sample(frac=1).reset_index(drop=True)

# Augment training data with negative examples
train_df = generate_negative_pairs(train_df)

In [41]:
# Augment training data
train_df["augmented_text"] = train_df["text"].apply(lambda x: synonym_replacement(x) if random.random() < 0.3 else x)
train_df["augmented_reason"] = train_df["reason"].apply(lambda x: synonym_replacement(x) if random.random() < 0.3 else x)

In [42]:
# Handle class imbalance in evaluation dataset (Optional: Downsample majority class)
def balance_evaluation_set(df):
    class_0 = df[df["label"] == 0]
    class_1 = df[df["label"] == 1]
    class_0_downsampled = resample(class_0, replace=False, n_samples=len(class_1), random_state=42)
    return pd.concat([class_0_downsampled, class_1]).sample(frac=1).reset_index(drop=True)

eval_df = balance_evaluation_set(eval_df)

In [43]:
train_df = pd.concat([train_df, eval_df.sample(n=1000, random_state=42)])

In [44]:
# # Further downsample evaluation dataset by 2000 rows while keeping balance
# def downsample_evaluation_set(df, num_samples=1000):
#     class_0 = df[df["label"] == 0].sample(n=len(df[df["label"] == 0]) - num_samples, random_state=42)
#     class_1 = df[df["label"] == 1].sample(n=len(df[df["label"] == 1]) - num_samples, random_state=42)
#     return pd.concat([class_0, class_1]).sample(frac=1).reset_index(drop=True)


# # Then, downsample it further
# eval_df = downsample_evaluation_set(eval_df)


In [45]:
# Check class distribution
print("Train Data Class Distribution:\n", train_df["label"].value_counts())
print("Evaluation Data Class Distribution:\n", eval_df["label"].value_counts())

Train Data Class Distribution:
 label
1    2569
0    2553
Name: count, dtype: int64
Evaluation Data Class Distribution:
 label
1    3001
0    3001
Name: count, dtype: int64


In [46]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5122 entries, 0 to 5001
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   text              5122 non-null   object
 1   reason            5122 non-null   object
 2   label             5122 non-null   int64 
 3   augmented_text    4122 non-null   object
 4   augmented_reason  4122 non-null   object
dtypes: int64(1), object(4)
memory usage: 240.1+ KB


In [47]:
# Ensure no NaN values in the augmented columns
train_df["augmented_text"] = train_df["augmented_text"].fillna(train_df["text"])
train_df["augmented_reason"] = train_df["augmented_reason"].fillna(train_df["reason"])

In [48]:
# Combine 'text' and 'reason'
train_df["combined_text"] = train_df["augmented_text"] + " [SEP] " + train_df["augmented_reason"]
eval_df["combined_text"] = eval_df["text"] + " [SEP] " + eval_df["reason"]

In [49]:
# Train-Validation Split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_df["combined_text"].values, train_df["label"].values, test_size=0.2, random_state=42
)

In [50]:
# Compute class weights
class_weights = class_weight.compute_class_weight(
    class_weight="balanced",
    classes=np.unique(train_df["label"]),
    y=train_df["label"]
)
class_weights_dict = {i: class_weights[i] for i in range(len(class_weights))}


In [51]:
# Tokenization
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize(texts, max_len=128):
    return tokenizer(
        list(texts),
        padding="max_length",
        truncation=True,
        max_length=max_len,
        return_tensors="tf"
    )

In [52]:
# Encode text data
train_encodings = tokenize(train_texts)
val_encodings = tokenize(val_texts)
eval_encodings = tokenize(eval_df["combined_text"].values)

In [53]:
# Convert to TensorFlow dataset
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), train_labels)).shuffle(1000).batch(16)
val_dataset = tf.data.Dataset.from_tensor_slices((dict(val_encodings), val_labels)).batch(16)
eval_dataset = tf.data.Dataset.from_tensor_slices((dict(eval_encodings), eval_df["label"].values)).batch(16)


In [54]:
# Load BERT Model
model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [55]:
# Compile Model
optimizer = tf.optimizers.Adam(learning_rate=1e-5)
model.compile(optimizer=optimizer, loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), metrics=["accuracy"])


In [56]:
# Train Model
history = model.fit(train_dataset, validation_data=val_dataset, epochs=7, class_weight=class_weights_dict)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


In [57]:
# # Save Model
# model.save_pretrained("bert_text_classifier")

In [58]:
# Function for Predictions with Threshold

def predict_with_threshold(model, tokenizer, texts, threshold=0.7):
    encodings = tokenize(texts)
    predictions = model.predict(dict(encodings)).logits
    probs = tf.nn.softmax(predictions, axis=1).numpy()
    return (probs[:, 1] > threshold).astype(int)


In [59]:
# Function for Predictions WITHOUT threshold
def predict_without_threshold(model, tokenizer, texts):
    encodings = tokenize(texts)
    predictions = model.predict(dict(encodings)).logits
    probs = tf.nn.softmax(predictions, axis=1).numpy()
    return np.argmax(probs, axis=1)  # Get the class with the highest probability


In [60]:
# Evaluate on Test Data WITHOUT threshold
eval_preds = predict_with_threshold(model, tokenizer, eval_df["combined_text"].values)
print(classification_report(eval_df["label"].values, eval_preds))


              precision    recall  f1-score   support

           0       0.87      0.68      0.76      3001
           1       0.74      0.90      0.81      3001

    accuracy                           0.79      6002
   macro avg       0.81      0.79      0.79      6002
weighted avg       0.81      0.79      0.79      6002



In [61]:
# Function for Manual Prediction
def manual_predict(text, reason, model, tokenizer, threshold=0.7):
    combined_text = text + " [SEP] " + reason
    encodings = tokenize([combined_text])
    predictions = model.predict(dict(encodings)).logits
    probs = tf.nn.softmax(predictions, axis=1).numpy()
    pred_label = 1 if probs[0][1] > threshold else 0  # Apply threshold

    print(f"Prediction: {pred_label} (1 = Valuable, 0 = Not Valuable)")
    return pred_label

# Example Test
sample_text = "This product is amazing and highly recommened!"
sample_reason = "The quality is waste , and the price is reasonable."

manual_predict(sample_text, sample_reason, model, tokenizer)


Prediction: 0 (1 = Valuable, 0 = Not Valuable)


0

In [62]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [63]:
save_directory = "/content/drive/MyDrive/model"

# Create the directory if it doesn't exist
import os
os.makedirs(save_directory, exist_ok=True)

# Save the model
model.save_pretrained(save_directory)

print(f"Model saved successfully at: {save_directory}")


Model saved successfully at: /content/drive/MyDrive/model


In [64]:
tokenizer.save_pretrained("/content/drive/MyDrive/model")

('/content/drive/MyDrive/model/tokenizer_config.json',
 '/content/drive/MyDrive/model/special_tokens_map.json',
 '/content/drive/MyDrive/model/vocab.txt',
 '/content/drive/MyDrive/model/added_tokens.json')