In [None]:
!pip install transformers torch

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
import pandas as pd

from google.colab import files
uploaded = files.upload()  # Upload the file manually

df = pd.read_excel("Cleaned_ML.xlsx")

# Select relevant columns
df_selected = df[['Cleaned_Post_Text', 'Sentiment_Encoded']]

# Train-test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df_selected['Cleaned_Post_Text'], df_selected['Sentiment_Encoded'],
    test_size=0.2, random_state=42, stratify=df_selected['Sentiment_Encoded']
)

Saving Cleaned_ML.xlsx to Cleaned_ML (1).xlsx


In [None]:
import torch
import numpy as np
from transformers import DistilBertTokenizer, DistilBertModel
from tqdm import tqdm  # For progress tracking

# Load DistilBERT tokenizer and model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertModel.from_pretrained("distilbert-base-uncased").to(device)

# Function for batch processing
def get_batch_embeddings(texts, batch_size=32):
    embeddings = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Processing Batches"):
        batch_texts = texts[i:i + batch_size]  # Get batch
        tokens = tokenizer(batch_texts.tolist(), padding=True, truncation=True, max_length=128, return_tensors="pt").to(device)

        with torch.no_grad():
            output = model(**tokens)

        batch_embeddings = output.last_hidden_state[:, 0, :].cpu().numpy()  # Move to CPU
        embeddings.append(batch_embeddings)

    return np.vstack(embeddings)  # Stack all batches

# Convert training and test text to embeddings
X_train_embeddings = get_batch_embeddings(X_train, batch_size=32)
X_test_embeddings = get_batch_embeddings(X_test, batch_size=32)

# Save the embeddings
np.save("X_train_embeddings.npy", X_train_embeddings)
np.save("X_test_embeddings.npy", X_test_embeddings)
np.save("y_train.npy", y_train.to_numpy())
np.save("y_test.npy", y_test.to_numpy())

print("Embeddings saved successfully!")

Processing Batches: 100%|██████████| 250/250 [00:43<00:00,  5.72it/s]
Processing Batches: 100%|██████████| 63/63 [00:10<00:00,  6.10it/s]

Embeddings saved successfully!





In [None]:
from google.colab import files

files.download("X_train_embeddings.npy")
files.download("X_test_embeddings.npy")
files.download("y_train.npy")
files.download("y_test.npy")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Import necessary libraries
import numpy as np
from sklearn.decomposition import TruncatedSVD
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder  # Import LabelEncoder

# Load precomputed embeddings (update paths accordingly)
X_train = np.load('/content/X_train_embeddings.npy')
X_test = np.load('/content/X_test_embeddings.npy')
y_train = np.load('/content/y_train.npy')
y_test = np.load('/content/y_test.npy')

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the training labels
y_train = label_encoder.fit_transform(y_train)

# Transform the testing labels
y_test = label_encoder.transform(y_test)

# Reduce dimensionality using Truncated SVD (128 components)
svd = TruncatedSVD(n_components=128, random_state=42)
X_train_svd = svd.fit_transform(X_train)
X_test_svd = svd.transform(X_test)

# Train XGBoost model for multi-class classification
xgb_model = XGBClassifier(objective="multi:softmax", num_class=len(np.unique(y_train)), eval_metric="mlogloss", random_state=42)
xgb_model.fit(X_train_svd, y_train)

# Predict and evaluate
y_pred_xgb = xgb_model.predict(X_test_svd)
accuracy = accuracy_score(y_test, y_pred_xgb)

print(f"XGBoost Multi-Class Accuracy: {accuracy:.4f}")

XGBoost Multi-Class Accuracy: 0.8090


In [None]:
# Import necessary libraries
import numpy as np
from sklearn.decomposition import TruncatedSVD
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Load precomputed embeddings (update paths accordingly)
X_train = np.load('/content/X_train_embeddings.npy')
X_test = np.load('/content/X_test_embeddings.npy')
y_train = np.load('/content/y_train.npy')
y_test = np.load('/content/y_test.npy')

# Convert labels: -1 → 0, 1 → 1 (if needed)
if np.min(y_train) < 0:
    y_train = np.where(y_train == -1, 0, 1)
    y_test = np.where(y_test == -1, 0, 1)

# Reduce dimensionality using Truncated SVD (128 components)
svd = TruncatedSVD(n_components=128, random_state=42)
X_train_svd = svd.fit_transform(X_train)
X_test_svd = svd.transform(X_test)

# Train XGBoost model for binary classification
xgb_model = XGBClassifier(objective="binary:logistic", eval_metric="logloss", random_state=42)
xgb_model.fit(X_train_svd, y_train)

# Predict and evaluate
y_pred_xgb = xgb_model.predict(X_test_svd)
accuracy = accuracy_score(y_test, y_pred_xgb)

print(f"XGBoost Binary Classification Accuracy: {accuracy:.4f}")

XGBoost Binary Classification Accuracy: 0.7990


In [None]:
!pip install transformers



In [None]:
import torch
import numpy as np
from transformers import DistilBertTokenizer, DistilBertModel

# Load tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertModel.from_pretrained("distilbert-base-uncased")

# Function to get improved sentence embedding using mean pooling
def get_embedding(text):
    tokens = tokenizer(text, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
    with torch.no_grad():
        output = model(**tokens)
    return output.last_hidden_state.mean(dim=1).squeeze().numpy()  # Mean pooling instead of CLS

# Test sentence
test_sentence = "It is not good."

# Convert test sentence to embedding
test_embedding = get_embedding(test_sentence)

# Reduce dimensionality using the trained SVD model
test_embedding_svd = svd.transform(test_embedding.reshape(1, -1))

# Predict using the trained XGBoost model
prediction = xgb_model.predict(test_embedding_svd)[0]

# Map prediction to label
label_map = {0: "Negative", 1: "Positive"}
print(f"Predicted Sentiment: {label_map[prediction]}")

RuntimeError: Failed to import transformers.models.distilbert.modeling_distilbert because of the following error (look up to see its traceback):
partially initialized module 'torch._dynamo' has no attribute 'external_utils' (most likely due to a circular import)

In [None]:
!pip uninstall catboost -y
!pip install catboost

Found existing installation: catboost 1.2.7
Uninstalling catboost-1.2.7:
  Successfully uninstalled catboost-1.2.7
Collecting catboost
  Using cached catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Using cached catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl (98.7 MB)
Installing collected packages: catboost
Successfully installed catboost-1.2.7


In [None]:
# Import necessary libraries
import numpy as np
from sklearn.decomposition import TruncatedSVD
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score

# Load precomputed embeddings (update paths accordingly)
X_train = np.load('/content/X_train_embeddings.npy')
X_test = np.load('/content/X_test_embeddings.npy')
y_train = np.load('/content/y_train.npy')
y_test = np.load('/content/y_test.npy')

# Convert labels: -1 → 0, 1 → 1 (if needed)
if np.min(y_train) < 0:
    y_train = np.where(y_train == -1, 0, 1)
    y_test = np.where(y_test == -1, 0, 1)

# Reduce dimensionality using Truncated SVD (128 components)
svd = TruncatedSVD(n_components=128, random_state=42)
X_train_svd = svd.fit_transform(X_train)
X_test_svd = svd.transform(X_test)

# Train CatBoost model for binary classification
cat_model = CatBoostClassifier(
    iterations=500,
    depth=8,
    learning_rate=0.05,
    loss_function='Logloss',
    verbose=100,
    random_seed=42
)

cat_model.fit(X_train_svd, y_train, eval_set=(X_test_svd, y_test), early_stopping_rounds=50)

# Predict and evaluate
y_pred_cat = cat_model.predict(X_test_svd)
accuracy = accuracy_score(y_test, y_pred_cat)

print(f"CatBoost Binary Classification Accuracy: {accuracy:.4f}")

0:	learn: 0.6783979	test: 0.6793028	best: 0.6793028 (0)	total: 498ms	remaining: 4m 8s
100:	learn: 0.3660165	test: 0.4787722	best: 0.4787722 (100)	total: 22.7s	remaining: 1m 29s
200:	learn: 0.2598998	test: 0.4396538	best: 0.4396538 (200)	total: 40.4s	remaining: 1m
300:	learn: 0.1763450	test: 0.4165073	best: 0.4165073 (300)	total: 51.9s	remaining: 34.3s
400:	learn: 0.1221301	test: 0.4044632	best: 0.4044632 (400)	total: 1m 3s	remaining: 15.7s
499:	learn: 0.0869372	test: 0.3980708	best: 0.3976218 (495)	total: 1m 14s	remaining: 0us

bestTest = 0.397621815
bestIteration = 495

Shrink model to first 496 iterations.
CatBoost Binary Classification Accuracy: 0.8225


In [None]:
# Save CatBoost model
cat_model.save_model("/content/catboost_model.cbm")

print("CatBoost model saved successfully!")

CatBoost model saved successfully!


In [None]:
# Import necessary libraries
import numpy as np
import lightgbm as lgb
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import accuracy_score

# Load precomputed embeddings (update paths accordingly)
X_train = np.load('/content/X_train_embeddings.npy')
X_test = np.load('/content/X_test_embeddings.npy')
y_train = np.load('/content/y_train.npy')
y_test = np.load('/content/y_test.npy')

# Convert labels: -1 → 0, 1 → 1 (if needed)
if np.min(y_train) < 0:
    y_train = np.where(y_train == -1, 0, 1)
    y_test = np.where(y_test == -1, 0, 1)

# Reduce dimensionality using Truncated SVD (128 components)
svd = TruncatedSVD(n_components=128, random_state=42)
X_train_svd = svd.fit_transform(X_train)
X_test_svd = svd.transform(X_test)

# Prepare LightGBM dataset
train_data = lgb.Dataset(X_train_svd, label=y_train)
test_data = lgb.Dataset(X_test_svd, label=y_test, reference=train_data)

# Define LightGBM parameters
params = {
    'objective': 'binary',
    'metric': ['binary_logloss', 'accuracy'],
    'boosting_type': 'gbdt',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'max_depth': -1,
    'random_state': 42
}

# Train LightGBM model
lgb_model = lgb.train(params, train_data, valid_sets=[test_data], num_boost_round=500,
                      callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=True),
                                 lgb.log_evaluation(period=100)])

# Predict and evaluate
y_pred_lgb = lgb_model.predict(X_test_svd)
y_pred_lgb = (y_pred_lgb > 0.5).astype(int)  # Convert probabilities to binary labels

accuracy = accuracy_score(y_test, y_pred_lgb)
print(f"LightGBM Binary Classification Accuracy: {accuracy:.4f}")

[LightGBM] [Info] Number of positive: 3991, number of negative: 4009
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012125 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 32640
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 128
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.498875 -> initscore=-0.004500
[LightGBM] [Info] Start training from score -0.004500
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.453934
[200]	valid_0's binary_logloss: 0.413898
[300]	valid_0's binary_logloss: 0.400698
[400]	valid_0's binary_logloss: 0.394848
[500]	valid_0's binary_logloss: 0.39471
Did not meet early stopping. Best iteration is:
[480]	valid_0's binary_logloss: 0.393316
LightGBM Binary Classification Accuracy: 0.8255


In [None]:
# Save LightGBM model
lgb_model.save_model("/content/lightgbm_model.txt")

print("LightGBM model saved successfully!")

LightGBM model saved successfully!


In [None]:
import torch
import numpy as np
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.decomposition import TruncatedSVD

# Load trained CatBoost and LightGBM models
import catboost
import lightgbm as lgb

cat_model = catboost.CatBoostClassifier()
cat_model.load_model("/content/catboost_model.cbm")  # Update path if needed

lgb_model = lgb.Booster(model_file="/content/lightgbm_model.txt")  # Update path

# Load DistilBERT tokenizer & model
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
bert_model = DistilBertModel.from_pretrained("distilbert-base-uncased")

# Load Truncated SVD
svd = TruncatedSVD(n_components=128, random_state=42)
svd.fit(np.load('/content/X_train_embeddings.npy'))  # Fit using training data

# Function to get DistilBERT embeddings
def get_embedding(text):
    tokens = tokenizer(text, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
    with torch.no_grad():
        output = bert_model(**tokens)
    return output.last_hidden_state.mean(dim=1).squeeze().numpy()  # CLS token representation

# Function to predict sentiment
def predict_sentiment(text):
    embedding = get_embedding(text)  # Get Transformer embeddings
    embedding_svd = svd.transform([embedding])  # Apply Truncated SVD

    # Predict using CatBoost
    cat_pred = cat_model.predict(embedding_svd)[0]

    # Predict using LightGBM
    lgb_pred_prob = lgb_model.predict(embedding_svd)[0]
    lgb_pred = 1 if lgb_pred_prob > 0.5 else 0  # Convert probability to binary output

    # Print results
    print(f"CatBoost Prediction: {'Positive' if cat_pred == 1 else 'Negative'}")
    print(f"LightGBM Prediction: {'Positive' if lgb_pred == 1 else 'Negative'}")

# Example usage
sentence = "The movie was not good at all."
predict_sentiment(sentence)

CatBoost Prediction: Positive
LightGBM Prediction: Negative


In [None]:
import torch
from transformers import DistilBertTokenizer, DistilBertModel
import numpy as np
from sklearn.decomposition import TruncatedSVD
import catboost
import lightgbm as lgb
import re

# Load trained models
cat_model = catboost.CatBoostClassifier()
cat_model.load_model("/content/catboost_model.cbm")  # Update path

lgb_model = lgb.Booster(model_file="/content/lightgbm_model.txt")  # Update path

# Load DistilBERT tokenizer & model
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
bert_model = DistilBertModel.from_pretrained("distilbert-base-uncased")

# Load Truncated SVD trained on embeddings
svd = TruncatedSVD(n_components=128, random_state=42)
svd.fit(np.load('/content/X_train_embeddings.npy'))  # Fit using training data

# Function to handle negations in text
def preprocess_text(text):
    negation_words = ["not", "no", "never", "n't"]
    words = text.split()
    for i in range(len(words) - 1):
        if words[i] in negation_words:
            words[i] = words[i] + "_" + words[i+1]  # Merge negation words
            words[i+1] = ""
    return " ".join([w for w in words if w]).strip()

# Function to get DistilBERT embeddings
def get_embedding(text):
    text = preprocess_text(text)  # Apply preprocessing
    tokens = tokenizer(text, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
    with torch.no_grad():
        output = bert_model(**tokens)
    return output.last_hidden_state[:, 0, :].squeeze().numpy()  # CLS token representation

# Function to predict sentiment
def predict_sentiment(text):
    embedding = get_embedding(text)  # Get Transformer embeddings
    embedding_svd = svd.transform([embedding])  # Apply Truncated SVD

    # Predict using CatBoost
    cat_pred = cat_model.predict(embedding_svd)[0]

    # Predict using LightGBM
    lgb_pred_prob = lgb_model.predict(embedding_svd)[0]
    lgb_pred = 1 if lgb_pred_prob > 0.5 else 0  # Convert probability to binary output

    # Print results
    print(f"Processed Sentence: {preprocess_text(text)}")
    print(f"CatBoost Prediction: {'Positive' if cat_pred == 1 else 'Negative'}")
    print(f"LightGBM Prediction: {'Positive' if lgb_pred == 1 else 'Negative'}")

# Example usage
sentence = "I am not happy"
predict_sentiment(sentence)

Processed Sentence: I am not_happy
CatBoost Prediction: Negative
LightGBM Prediction: Negative


In [None]:
import torch
from transformers import DistilBertTokenizer, DistilBertModel
import numpy as np
from sklearn.decomposition import TruncatedSVD
import catboost
import lightgbm as lgb

# Load trained models
cat_model = catboost.CatBoostClassifier()
cat_model.load_model("/content/catboost_model.cbm")  # Update path

lgb_model = lgb.Booster(model_file="/content/lightgbm_model.txt")  # Update path

# Load DistilBERT tokenizer & model
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
bert_model = DistilBertModel.from_pretrained("distilbert-base-uncased")

# Load Truncated SVD trained on embeddings
svd = TruncatedSVD(n_components=128, random_state=42)
svd.fit(np.load('/content/X_train_embeddings.npy'))  # Fit using training data

# Manually defined strong negative words
negative_words = {"sad", "unhappy", "depressed", "angry", "frustrated", "horrible", "terrible"}

# Function to preprocess text
def preprocess_text(text):
    negation_words = ["not", "no", "never", "n't"]
    words = text.split()
    for i in range(len(words) - 1):
        if words[i] in negation_words:
            words[i] = words[i] + "_" + words[i+1]  # Merge negation words
            words[i+1] = ""
    return " ".join([w for w in words if w]).strip()

# Function to get DistilBERT embeddings
def get_embedding(text):
    text = preprocess_text(text)  # Apply preprocessing
    tokens = tokenizer(text, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
    with torch.no_grad():
        output = bert_model(**tokens)
    return output.last_hidden_state[:, 0, :].squeeze().numpy()  # CLS token representation

# Function to predict sentiment
def predict_sentiment(text):
    processed_text = preprocess_text(text)
    embedding = get_embedding(processed_text)  # Get Transformer embeddings
    embedding_svd = svd.transform([embedding])  # Apply Truncated SVD

    # Predict using CatBoost
    cat_pred = cat_model.predict(embedding_svd)[0]

    # Predict using LightGBM
    lgb_pred_prob = lgb_model.predict(embedding_svd)[0]
    lgb_pred = 1 if lgb_pred_prob > 0.5 else 0  # Convert probability to binary output

    # Manual override for strong negative words
    for word in processed_text.split():
        if word in negative_words:
            cat_pred = 0  # Force negative if strong negative word is found
            lgb_pred = 0

    # Print results
    print(f"Processed Sentence: {processed_text}")
    print(f"CatBoost Prediction: {'Positive' if cat_pred == 1 else 'Negative'}")
    print(f"LightGBM Prediction: {'Positive' if lgb_pred == 1 else 'Negative'}")

# Example usage
sentence = "I am sad."
predict_sentiment(sentence)

Processed Sentence: I am sad.
CatBoost Prediction: Positive
LightGBM Prediction: Positive


In [None]:
import torch
from transformers import DistilBertTokenizer, DistilBertModel
import numpy as np
import catboost
import lightgbm as lgb

# Load trained models
cat_model = catboost.CatBoostClassifier()
cat_model.load_model("/content/catboost_model.cbm")  # Update path

lgb_model = lgb.Booster(model_file="/content/lightgbm_model1.txt")  # Update path

# Load DistilBERT tokenizer & model
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
bert_model = DistilBertModel.from_pretrained("distilbert-base-uncased")

# Function to preprocess text
def preprocess_text(text):
    negation_words = ["not", "no", "never", "n't"]
    words = text.split()
    for i in range(len(words) - 1):
        if words[i] in negation_words:
            words[i] = words[i] + "_" + words[i+1]  # Merge negation words
            words[i+1] = ""
    return " ".join([w for w in words if w]).strip()

# Function to get DistilBERT embeddings (768-dimension)
def get_embedding(text):
    text = preprocess_text(text)  # Apply preprocessing
    tokens = tokenizer(text, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
    with torch.no_grad():
        output = bert_model(**tokens)
    return output.last_hidden_state[:, 0, :].squeeze().numpy()  # CLS token representation

# Function to predict sentiment
def predict_sentiment(text):
    processed_text = preprocess_text(text)
    embedding = get_embedding(processed_text).reshape(1, -1)  # Keep original 768 dimensions

    # Predict using CatBoost
    cat_pred = cat_model.predict(embedding)[0]

    # Predict using LightGBM
    lgb_pred_prob = lgb_model.predict(embedding)[0]
    lgb_pred = 1 if lgb_pred_prob > 0.5 else 0  # Convert probability to binary output

    # Manual override for strong negative words
    for word in processed_text.split():
        if word in negative_words:
            cat_pred = 0  # Force negative if strong negative word is found
            lgb_pred = 0

    # Print results
    print(f"Processed Sentence: {processed_text}")
    print(f"CatBoost Prediction: {'Positive' if cat_pred == 1 else 'Negative'}")
    print(f"LightGBM Prediction: {'Positive' if lgb_pred == 1 else 'Negative'}")

# Example usage
sentence = "I am frustated"
predict_sentiment(sentence)

Processed Sentence: I am frustated
CatBoost Prediction: Positive
LightGBM Prediction: Positive


In [None]:
# Import necessary libraries
import numpy as np
import lightgbm as lgb
from sklearn.metrics import accuracy_score

# Load precomputed full-dimension embeddings (update paths accordingly)
X_train = np.load('/content/X_train_embeddings.npy')  # Shape: (num_samples, 768)
X_test = np.load('/content/X_test_embeddings.npy')    # Shape: (num_samples, 768)
y_train = np.load('/content/y_train.npy')
y_test = np.load('/content/y_test.npy')

# Convert labels: -1 → 0, 1 → 1 (if needed)
if np.min(y_train) < 0:
    y_train = np.where(y_train == -1, 0, 1)
    y_test = np.where(y_test == -1, 0, 1)

# Prepare LightGBM dataset (No Dimensionality Reduction)
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# Define LightGBM parameters
params = {
    'objective': 'binary',
    'metric': ['binary_logloss', 'accuracy'],
    'boosting_type': 'gbdt',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'max_depth': -1,
    'random_state': 42
}

# Train LightGBM model
lgb_model = lgb.train(params, train_data, valid_sets=[test_data], num_boost_round=500,
                      callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=True),
                                 lgb.log_evaluation(period=100)])

# Predict and evaluate
y_pred_lgb = lgb_model.predict(X_test)
y_pred_lgb = (y_pred_lgb > 0.5).astype(int)  # Convert probabilities to binary labels

accuracy = accuracy_score(y_test, y_pred_lgb)
print(f"LightGBM Binary Classification Accuracy (No SVD): {accuracy:.4f}")

[LightGBM] [Info] Number of positive: 3991, number of negative: 4009
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.300953 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 195840
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 768
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.498875 -> initscore=-0.004500
[LightGBM] [Info] Start training from score -0.004500
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.425672
[200]	valid_0's binary_logloss: 0.394816
[300]	valid_0's binary_logloss: 0.386163
[400]	valid_0's binary_logloss: 0.385804
Early stopping, best iteration is:
[366]	valid_0's binary_logloss: 0.383917
LightGBM Binary Classification Accuracy (No SVD): 0.8210


In [None]:
# Save the trained LightGBM model
model_path = "/content/lightgbm_model1.txt"
lgb_model.save_model(model_path)

print(f"Model saved at: {model_path}")

Model saved at: /content/lightgbm_model1.txt


In [None]:
import torch
from transformers import AutoTokenizer, AutoModel
import numpy as np
import catboost
import lightgbm as lgb

# Load trained models
cat_model = catboost.CatBoostClassifier()
cat_model.load_model("/content/catboost_model.cbm")  # Update path

lgb_model = lgb.Booster(model_file="/content/lightgbm_model1.txt")  # Update path

# Load DeBERTa tokenizer & model
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base")
bert_model = AutoModel.from_pretrained("microsoft/deberta-v3-base")

# Function to preprocess text
def preprocess_text(text):
    negation_words = ["not", "no", "never", "n't"]
    words = text.split()
    for i in range(len(words) - 1):
        if words[i] in negation_words:
            words[i] = words[i] + "_" + words[i+1]  # Merge negation words
            words[i+1] = ""
    return " ".join([w for w in words if w]).strip()

# Function to get DeBERTa embeddings (768-dimension)
def get_embedding(text):
    text = preprocess_text(text)  # Apply preprocessing
    tokens = tokenizer(text, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
    with torch.no_grad():
        output = bert_model(**tokens)
    return output.last_hidden_state[:, 0, :].squeeze().numpy()  # CLS token representation

# Function to predict sentiment
def predict_sentiment(text):
    processed_text = preprocess_text(text)
    embedding = get_embedding(processed_text).reshape(1, -1)  # Keep original 768 dimensions

    # Predict using CatBoost
    cat_pred = cat_model.predict(embedding)[0]

    # Predict using LightGBM
    lgb_pred_prob = lgb_model.predict(embedding)[0]
    lgb_pred = 1 if lgb_pred_prob > 0.5 else 0  # Convert probability to binary output

    # Manual override for strong negative words
    negative_words = ["frustrated", "angry", "disappointed", "upset", "hate", "bad", "sad"]
    for word in processed_text.split():
        if word in negative_words:
            cat_pred = 0  # Force negative if strong negative word is found
            lgb_pred = 0

    # Print results
    print(f"Processed Sentence: {processed_text}")
    print(f"CatBoost Prediction: {'Positive' if cat_pred == 1 else 'Negative'}")
    print(f"LightGBM Prediction: {'Positive' if lgb_pred == 1 else 'Negative'}")

# Example usage
sentence = "I am not happy"
predict_sentiment(sentence)

Processed Sentence: I am not_happy
CatBoost Prediction: Negative
LightGBM Prediction: Positive
