<a href="https://colab.research.google.com/github/ManuBansalS/manuS/blob/main/ResearchPaper/jupyter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Core libraries
import os, random, re, string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# NLP
import nltk, spacy
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_recall_fscore_support
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# Transformers
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

# Visualization
import seaborn as sns

# Reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

In [2]:
import os
import pandas as pd
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Optional: Still set environment variables for Kaggle if needed elsewhere
os.environ['KAGGLE_USERNAME'] = "manubansalg"
os.environ['KAGGLE_KEY'] = "aa9b0c66c740f641bd7d2a35cdd58660"

# Dataset reference (as in Code 2)
dataset_name = "thoughtvector/customer-support-on-twitter"

# Specify the internal file name—typically the CSV inside the dataset.
# The Kaggle dataset likely contains a CSV (often named something like 'customer_support_on_twitter.csv')
file_path = "twcs/twcs.csv"

# Load dataset directly into pandas DataFrame
df = kagglehub.load_dataset(
    KaggleDatasetAdapter.PANDAS,
    dataset_name,
    file_path
)

# Select only desired columns and drop missing values
df = df[['text', 'author_id']]  # Adjust as needed—e.g., if 'label' exists instead of 'author_id'
df.dropna(inplace=True)

print(df.head())


  df = kagglehub.load_dataset(


                                                text   author_id
0  @115712 I understand. I would like to assist y...  sprintcare
1      @sprintcare and how do you propose we do that      115712
2  @sprintcare I have sent several private messag...      115712
3  @115712 Please send us a Private Message so th...  sprintcare
4                                 @sprintcare I did.      115712


In [3]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
nltk.download('vader_lexicon')

sia = SentimentIntensityAnalyzer()

def fast_sentiment(text):
    score = sia.polarity_scores(str(text))['compound']
    if score >= 0.05:
        return "positive"
    elif score <= -0.05:
        return "negative"
    else:
        return "neutral"

df['sentiment'] = df['text'].apply(fast_sentiment)


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [7]:
import nltk
import re
import string
nltk.download('stopwords')
from nltk.corpus import stopwords

STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    text = re.sub(r"http\S+", "", str(text))  # remove URLs
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = " ".join([word for word in text.split() if word not in STOPWORDS])
    return text

df['clean_text'] = df['text'].astype(str).apply(clean_text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
from sklearn.model_selection import train_test_split

# First split into train (80%) and temp (20%)
X_train, X_temp, y_train, y_temp = train_test_split(
    df['clean_text'],
    df['sentiment'],
    test_size=0.2,
    random_state=42,
    stratify=df['sentiment']
)

# Then split the temp (20%) into validation (10%) and test (10%)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,   # half of 20% → 10%
    random_state=42,
    stratify=y_temp
)


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.metrics import classification_report

# TF-IDF
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf   = tfidf.transform(X_val)
X_test_tfidf  = tfidf.transform(X_test)

# Logistic Regression
logreg = LogisticRegression(max_iter=1000, random_state=SEED)

# 5-fold CV on train set with multiple metrics
scoring = {'f1_macro': 'f1_macro', 'accuracy': 'accuracy'}
cv_results = cross_validate(logreg, X_train_tfidf, y_train, cv=5, scoring=scoring)

print("LogReg CV F1 Scores:", cv_results['test_f1_macro'], "Mean:", cv_results['test_f1_macro'].mean())
print("LogReg CV Accuracy Scores:", cv_results['test_accuracy'], "Mean:", cv_results['test_accuracy'].mean())

# Train & evaluate on test set
logreg.fit(X_train_tfidf, y_train)
y_pred = logreg.predict(X_test_tfidf)
print("LogReg Test Report:\n", classification_report(y_test, y_pred))


LogReg CV F1 Scores: [0.87592994 0.87639213 0.87625138 0.875895   0.87599375] Mean: 0.8760924418280286
LogReg CV Accuracy Scores: [0.89154538 0.89208329 0.89185657 0.89149425 0.89164738] Mean: 0.8917253743842342
LogReg Test Report:
               precision    recall  f1-score   support

    negative       0.85      0.77      0.81     66002
     neutral       0.86      0.92      0.89     72447
    positive       0.92      0.93      0.93    142729

    accuracy                           0.89    281178
   macro avg       0.88      0.87      0.88    281178
weighted avg       0.89      0.89      0.89    281178



In [10]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_validate
from sklearn.metrics import classification_report

# Linear SVM
svm = LinearSVC(random_state=SEED)

# 5-fold CV with multiple metrics
scoring = {'f1_macro': 'f1_macro', 'accuracy': 'accuracy'}
cv_results = cross_validate(svm, X_train_tfidf, y_train, cv=5, scoring=scoring)

print("SVM CV F1 Scores:", cv_results['test_f1_macro'], "Mean:", cv_results['test_f1_macro'].mean())
print("SVM CV Accuracy Scores:", cv_results['test_accuracy'], "Mean:", cv_results['test_accuracy'].mean())

# Train & evaluate on test set
svm.fit(X_train_tfidf, y_train)
y_pred = svm.predict(X_test_tfidf)
print("SVM Test Report:\n", classification_report(y_test, y_pred))


SVM CV F1 Scores: [0.87703725 0.87731334 0.87690997 0.87667114 0.87711301] Mean: 0.8770089421607092
SVM CV Accuracy Scores: [0.89276125 0.89305465 0.89261676 0.89244339 0.89281213] Mean: 0.8927376358404082
SVM Test Report:
               precision    recall  f1-score   support

    negative       0.86      0.77      0.81     66002
     neutral       0.86      0.92      0.89     72447
    positive       0.92      0.94      0.93    142729

    accuracy                           0.89    281178
   macro avg       0.88      0.87      0.88    281178
weighted avg       0.89      0.89      0.89    281178



In [11]:
import torch
print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0))


True
Tesla T4


In [None]:
# Install PEFT + Accelerate + Transformers
!pip install -q transformers datasets peft accelerate

In [None]:
import torch
import pandas as pd
import numpy as np
from datasets import Dataset
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, classification_report

from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from peft import get_peft_model, LoraConfig, TaskType

# ✅ Device check
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# =========================
# 1. Prepare Dataset
# =========================
# Encode string labels -> integers
encoder = LabelEncoder()
y_train_enc = encoder.fit_transform(y_train)
y_val_enc   = encoder.transform(y_val)
y_test_enc  = encoder.transform(y_test)

train_dataset = Dataset.from_pandas(pd.DataFrame({"text": X_train, "label": y_train_enc}))
val_dataset   = Dataset.from_pandas(pd.DataFrame({"text": X_val, "label": y_val_enc}))
test_dataset  = Dataset.from_pandas(pd.DataFrame({"text": X_test, "label": y_test_enc}))

# =========================
# 2. Load DistilBERT + Tokenizer
# =========================
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
base_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

# =========================
# 3. Apply LoRA (PEFT)
# =========================
config = LoraConfig(
    task_type=TaskType.SEQ_CLS,   # sequence classification
    r=16,                         # rank
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none"
)

model = get_peft_model(base_model, config)
model.to(device)

# =========================
# 4. Tokenization
# =========================
def tokenize(batch):
    return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=128)

train_dataset = train_dataset.map(tokenize, batched=True)
val_dataset   = val_dataset.map(tokenize, batched=True)
test_dataset  = test_dataset.map(tokenize, batched=True)

train_dataset = train_dataset.with_format("torch", columns=["input_ids", "attention_mask", "label"])
val_dataset   = val_dataset.with_format("torch", columns=["input_ids", "attention_mask", "label"])
test_dataset  = test_dataset.with_format("torch", columns=["input_ids", "attention_mask", "label"])

# =========================
# 5. Training Arguments
# =========================
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=32,  # fast on GPU
    per_device_eval_batch_size=32,
    num_train_epochs=3,              # 3 epochs enough
    weight_decay=0.01,
    fp16=True,                       # mixed precision for speed
    logging_dir="./logs",
    logging_steps=100,
    save_total_limit=1,
    load_best_model_at_end=True
)

# =========================
# 6. Metrics
# =========================
def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "macro_f1": f1_score(labels, preds, average="macro")
    }

# =========================
# 7. Trainer
# =========================
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# =========================
# 8. Train
# =========================
trainer.train()

# =========================
# 9. Evaluation
# =========================
print("\nValidation Results:")
val_results = trainer.evaluate(val_dataset)
print(val_results)

print("\nTest Results:")
test_results = trainer.evaluate(test_dataset)
print(test_results)

# Predictions + classification report
predictions = trainer.predict(test_dataset)
y_pred = np.argmax(predictions.predictions, axis=1)
y_true = predictions.label_ids

print("\nLoRA-DistilBERT Accuracy:", accuracy_score(y_true, y_pred))
print("LoRA-DistilBERT Macro-F1:", f1_score(y_true, y_pred, average='macro'))
print("LoRA-DistilBERT Classification Report:\n", classification_report(y_true, y_pred, target_names=encoder.classes_))
