<a href="https://colab.research.google.com/github/MasaniselviGanesan/Sentimental-Analysis/blob/main/05_BERT_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ✅ STEP 1: Install Dependencies
!pip install transformers scikit-learn -q

# ✅ STEP 2: Imports
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification, create_optimizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import pickle
import sys

# ✅ STEP 3: Import custom text cleaner
sys.path.append('/content/drive/MyDrive/Sentimental-Analysis')
from shared_preprocessing import clean_text

# ✅ STEP 4: Load and Clean Dataset
# Built as one pipeline: read the CSV, keep only the sentiment and text
# columns, drop rows with a missing value in either, keep the three known
# sentiment classes, then add a `clean_text` column by running the project's
# clean_text() over every tweet.
df = (
    pd.read_csv("/content/drive/MyDrive/Sentimental-Analysis/data/Tweets.csv")[
        ['airline_sentiment', 'text']
    ]
    .dropna()
    .loc[lambda frame: frame['airline_sentiment'].isin(['positive', 'neutral', 'negative'])]
    .assign(clean_text=lambda frame: frame['text'].apply(clean_text))
)

# ✅ STEP 5: Encode Labels
# Fit the encoder on the sentiment strings, then map each row to its class id.
label_encoder = LabelEncoder().fit(df['airline_sentiment'])
df['label'] = label_encoder.transform(df['airline_sentiment'])

# Save label encoder
# Relative path: the pickle is written to the current working directory.
with open("label_encoder.pkl", "wb") as encoder_file:
    pickle.dump(label_encoder, encoder_file)

# ✅ STEP 6: Train-Test Split
# 80/20 split, stratified on the encoded label, with a fixed seed so the
# split is reproducible across runs.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['clean_text'],
    df['label'],
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)
# Convert both label Series to plain NumPy arrays for the tf.data pipeline.
train_labels, val_labels = (
    np.array(split_labels) for split_labels in (train_labels, val_labels)
)

# ✅ STEP 7: Load Tokenizer
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased'
)

# ✅ Optional: Save Tokenizer
# Pickled copy in the current working directory.
# NOTE(review): the tokenizer is also written with save_pretrained() at the
# end of this cell — confirm whether this pickle is still needed.
with open("tokenizer.pkl", "wb") as tokenizer_file:
    tokenizer_file.write(pickle.dumps(tokenizer))

# ✅ STEP 8: Tokenization
def tokenize(texts, tokenizer, max_len=128):
    """Encode a collection of strings with ``tokenizer``.

    Args:
        texts: Iterable of strings; it is materialised into a list before
            being handed to the tokenizer.
        tokenizer: Callable that accepts the list plus the keyword options
            ``truncation``, ``padding``, ``max_length`` and ``return_tensors``.
        max_len: Value forwarded as ``max_length``.

    Returns:
        Whatever ``tokenizer`` returns for the batch.
    """
    batch = list(texts)
    options = {
        "truncation": True,
        "padding": True,
        "max_length": max_len,
        "return_tensors": "tf",  # request TensorFlow tensors
    }
    return tokenizer(batch, **options)

def _model_inputs(encodings):
    """Pick the two entries of a tokenizer output that are fed to the model."""
    return {key: encodings[key] for key in ('input_ids', 'attention_mask')}


train_encodings = tokenize(train_texts, tokenizer)
val_encodings = tokenize(val_texts, tokenizer)

# ✅ STEP 9: Create Datasets (classic Keras format)
# Each element is a (features-dict, label) pair. Only the training set is
# shuffled (buffer of 1000 elements); both sets are served in batches of 8.
train_dataset = tf.data.Dataset.from_tensor_slices(
    (_model_inputs(train_encodings), train_labels)
)
train_dataset = train_dataset.shuffle(1000).batch(8)

val_dataset = tf.data.Dataset.from_tensor_slices(
    (_model_inputs(val_encodings), val_labels)
)
val_dataset = val_dataset.batch(8)

# ✅ STEP 10: Load Pretrained Model
# The classification head is sized to the number of classes the label
# encoder learned.
num_classes = len(label_encoder.classes_)
model = TFBertForSequenceClassification.from_pretrained(
    'bert-base-uncased', num_labels=num_classes
)

# ✅ STEP 11: Optimizer and Loss (no compute_loss)
# Single source of truth for the epoch count: it sizes the learning-rate
# schedule and is passed to model.fit below, so the two cannot drift apart.
num_epochs = 5
steps_per_epoch = len(train_dataset)  # number of batches in one pass
num_train_steps = steps_per_epoch * num_epochs  # length of the whole run

# create_optimizer returns a pair; only the optimizer (first item) is used.
optimizer, _ = create_optimizer(
    init_lr=5e-5,
    num_train_steps=num_train_steps,
    num_warmup_steps=0
)

# from_logits=True: the loss is given raw model outputs, not probabilities
# (the evaluation step reads predictions from `.logits`). Labels are integer
# class ids, hence the sparse variant.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

model.compile(
    optimizer=optimizer,
    loss=loss_fn,
    metrics=['accuracy']
)

# ✅ STEP 12: Train the Model
# `history` keeps the object returned by fit for later inspection.
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=num_epochs
)

# ✅ STEP 13: Evaluate
# Predict on the validation set, take the highest-scoring class per row and
# print a per-class precision/recall/F1 report using the original label names.
predictions = model.predict(val_dataset)
pred_logits = predictions.logits
pred_labels = np.argmax(pred_logits, axis=1)
report = classification_report(
    val_labels,
    pred_labels,
    target_names=label_encoder.classes_,
)
print(report)

# ✅ STEP 14: Save to Drive
# Model and tokenizer go to the same directory. The tokenizer call stays last
# as a bare expression so the notebook still displays the written file paths.
model.save_pretrained(
    "/content/drive/MyDrive/tf_bert_sentiment_model/"
)
tokenizer.save_pretrained("/content/drive/MyDrive/tf_bert_sentiment_model/")


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
              precision    recall  f1-score   support

    negative       0.90      0.90      0.90      1835
     neutral       0.66      0.70      0.68       620
    positive       0.78      0.74      0.76       473

    accuracy                           0.83      2928
   macro avg       0.78      0.78      0.78      2928
weighted avg       0.83      0.83      0.83      2928



('/content/drive/MyDrive/tf_bert_sentiment_model/tokenizer_config.json',
 '/content/drive/MyDrive/tf_bert_sentiment_model/special_tokens_map.json',
 '/content/drive/MyDrive/tf_bert_sentiment_model/vocab.txt',
 '/content/drive/MyDrive/tf_bert_sentiment_model/added_tokens.json')