In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import gc
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from transformers import BertTokenizer, TFBertModel, RobertaTokenizer, TFRobertaModel
from tqdm import tqdm

# Load dataset
df = pd.read_csv('/content/drive/MyDrive/Sentimental-Analysis/data/Tweets.csv')
df = df[['text', 'airline_sentiment']]
df = df.dropna()

# Label encoding
df['label'] = df['airline_sentiment'].map({'negative': 0, 'neutral': 1, 'positive': 2})

# Clean text (basic)
def clean_text(text):
    text = tf.strings.regex_replace(text, '[^a-zA-Z ]', '').numpy().decode('utf-8')
    return text.lower()

df['clean_text'] = df['text'].map(lambda x: clean_text(tf.convert_to_tensor(x)))

# Tokenizer for LSTM
tokenizer_lstm = Tokenizer(num_words=10000, oov_token='<OOV>')
tokenizer_lstm.fit_on_texts(df['clean_text'])
max_len = 50

def get_lstm_embeddings(texts):
    sequences = tokenizer_lstm.texts_to_sequences(texts)
    padded = pad_sequences(sequences, maxlen=max_len)
    return padded

# Load BERT & RoBERTa
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = TFBertModel.from_pretrained("bert-base-uncased")

roberta_tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
roberta_model = TFRobertaModel.from_pretrained("roberta-base")

# Batching to avoid OOM
def get_transformer_embeddings(texts, tokenizer, model, batch_size=64):
    embeddings = []
    for i in tqdm(range(0, len(texts), batch_size)):
        batch = texts[i:i+batch_size]
        tokens = tokenizer(batch, padding=True, truncation=True, return_tensors="tf")
        output = model(tokens)[0]  # (batch_size, seq_len, hidden_size)
        cls_embeddings = tf.reduce_mean(output, axis=1).numpy()
        embeddings.extend(cls_embeddings)
        tf.keras.backend.clear_session(); gc.collect()
    return np.array(embeddings)

# LSTM model
def create_lstm_model():
    model = Sequential([
        Embedding(input_dim=10000, output_dim=64, input_length=max_len),
        LSTM(64),
        Dense(3, activation='softmax')
    ])
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# 5-Fold Cross-Validation
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
acc_scores = []

for fold, (train_idx, test_idx) in enumerate(kf.split(df['clean_text'], df['label'])):
    print(f"\n--- Fold {fold+1} ---")

    X_train_text = df['clean_text'].iloc[train_idx].tolist()
    X_test_text = df['clean_text'].iloc[test_idx].tolist()
    y_train = df['label'].iloc[train_idx].values
    y_test = df['label'].iloc[test_idx].values

    # LSTM Embedding & Model
    X_train_lstm = get_lstm_embeddings(X_train_text)
    X_test_lstm = get_lstm_embeddings(X_test_text)
    lstm_model = create_lstm_model()
    lstm_model.fit(X_train_lstm, y_train, epochs=2, batch_size=64, verbose=0)
    lstm_preds = np.argmax(lstm_model.predict(X_test_lstm), axis=1)

    # BERT embeddings
    X_train_bert = get_transformer_embeddings(X_train_text, bert_tokenizer, bert_model)
    X_test_bert = get_transformer_embeddings(X_test_text, bert_tokenizer, bert_model)

    # RoBERTa embeddings
    X_train_roberta = get_transformer_embeddings(X_train_text, roberta_tokenizer, roberta_model)
    X_test_roberta = get_transformer_embeddings(X_test_text, roberta_tokenizer, roberta_model)

    # XGBoost on combined features
    X_train_combined = np.concatenate([X_train_lstm, X_train_bert, X_train_roberta], axis=1)
    X_test_combined = np.concatenate([X_test_lstm, X_test_bert, X_test_roberta], axis=1)

    xgb = XGBClassifier(n_estimators=100, max_depth=4, use_label_encoder=False, eval_metric='mlogloss')
    xgb.fit(X_train_combined, y_train)
    xgb_preds = xgb.predict(X_test_combined)

    acc = accuracy_score(y_test, xgb_preds)
    acc_scores.append(acc)
    print(f"Fold {fold+1} Accuracy: {acc:.4f}")

print(f"\n✅ Average Accuracy over 5 folds: {np.mean(acc_scores):.4f}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were 

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaModel: ['lm_head.dense.bias', 'roberta.embeddings.position_ids', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing TFRobertaModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaModel were not initialized from the PyTorch model and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infe


--- Fold 1 ---




[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step


  0%|          | 0/183 [00:00<?, ?it/s]TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.
100%|██████████| 183/183 [02:53<00:00,  1.05it/s]
100%|██████████| 46/46 [00:43<00:00,  1.07it/s]
100%|██████████| 183/183 [02:55<00:00,  1.04it/s]
100%|██████████| 46/46 [00:43<00:00,  1.05it/s]
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Fold 1 Accuracy: 0.8255

--- Fold 2 ---




[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step


100%|██████████| 183/183 [02:50<00:00,  1.07it/s]
100%|██████████| 46/46 [00:43<00:00,  1.06it/s]
100%|██████████| 183/183 [02:52<00:00,  1.06it/s]
100%|██████████| 46/46 [00:43<00:00,  1.05it/s]
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Fold 2 Accuracy: 0.8142

--- Fold 3 ---




[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step


100%|██████████| 183/183 [02:52<00:00,  1.06it/s]
100%|██████████| 46/46 [00:42<00:00,  1.08it/s]
100%|██████████| 183/183 [02:49<00:00,  1.08it/s]
100%|██████████| 46/46 [00:43<00:00,  1.07it/s]
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Fold 3 Accuracy: 0.8084

--- Fold 4 ---




[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step


100%|██████████| 183/183 [02:51<00:00,  1.06it/s]
100%|██████████| 46/46 [00:43<00:00,  1.05it/s]
100%|██████████| 183/183 [02:51<00:00,  1.06it/s]
100%|██████████| 46/46 [00:42<00:00,  1.08it/s]
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Fold 4 Accuracy: 0.8156

--- Fold 5 ---




[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step


100%|██████████| 183/183 [02:48<00:00,  1.09it/s]
100%|██████████| 46/46 [00:42<00:00,  1.09it/s]
100%|██████████| 183/183 [02:47<00:00,  1.09it/s]
100%|██████████| 46/46 [00:42<00:00,  1.09it/s]
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Fold 5 Accuracy: 0.8245

✅ Average Accuracy over 5 folds: 0.8176


In [None]:
{
 "cells": [
  {
   "cell_type": "code",
   "source": "print('Hello')",
   "outputs": [],
   "metadata": {}
  }
 ],
 "metadata": {
  "kernelspec": {...},
  "widgets": { "state": null }  <-- This causes the GitHub error if it's missing or broken
 },
 "nbformat": 4,
 "nbformat_minor": 5
}


SyntaxError: unterminated string literal (detected at line 12) (ipython-input-1-3418391715.py, line 12)

In [None]:
from google.colab import drive
drive.mount('/content/drive')  # if you haven't mounted yet



Mounted at /content/drive
