<a href="https://colab.research.google.com/github/ManuBansalS/manuS/blob/main/ResearchPaper/jupyter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Core libraries
import os, random, re, string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# NLP
import nltk, spacy
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_recall_fscore_support
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# Transformers
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

# Visualization
import seaborn as sns

# Reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

In [2]:
import os
import pandas as pd
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Optional: Still set environment variables for Kaggle if needed elsewhere
os.environ['KAGGLE_USERNAME'] = "manubansalg"
os.environ['KAGGLE_KEY'] = "aa9b0c66c740f641bd7d2a35cdd58660"

# Dataset reference (as in Code 2)
dataset_name = "thoughtvector/customer-support-on-twitter"

# Specify the internal file name—typically the CSV inside the dataset.
# The Kaggle dataset likely contains a CSV (often named something like 'customer_support_on_twitter.csv')
file_path = "twcs/twcs.csv"

# Load dataset directly into pandas DataFrame
df = kagglehub.load_dataset(
    KaggleDatasetAdapter.PANDAS,
    dataset_name,
    file_path
)

# Select only desired columns and drop missing values
df = df[['text', 'author_id']]  # Adjust as needed—e.g., if 'label' exists instead of 'author_id'
df.dropna(inplace=True)

print(df.head())


  df = kagglehub.load_dataset(


                                                text   author_id
0  @115712 I understand. I would like to assist y...  sprintcare
1      @sprintcare and how do you propose we do that      115712
2  @sprintcare I have sent several private messag...      115712
3  @115712 Please send us a Private Message so th...  sprintcare
4                                 @sprintcare I did.      115712


In [2]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
nltk.download('vader_lexicon')

sia = SentimentIntensityAnalyzer()

def fast_sentiment(text):
    score = sia.polarity_scores(str(text))['compound']
    if score >= 0.05:
        return "positive"
    elif score <= -0.05:
        return "negative"
    else:
        return "neutral"

df['sentiment'] = df['text'].apply(fast_sentiment)


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


In [6]:
import nltk
import re
import string
nltk.download('stopwords')
from nltk.corpus import stopwords

STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    text = re.sub(r"http\S+", "", str(text))  # remove URLs
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = " ".join([word for word in text.split() if word not in STOPWORDS])
    return text

df['clean_text'] = df['text'].astype(str).apply(clean_text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
from sklearn.model_selection import train_test_split

# First split into train (80%) and temp (20%)
X_train, X_temp, y_train, y_temp = train_test_split(
    df['clean_text'],
    df['sentiment'],
    test_size=0.2,
    random_state=42,
    stratify=df['sentiment']
)

# Then split the temp (20%) into validation (10%) and test (10%)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,   # half of 20% → 10%
    random_state=42,
    stratify=y_temp
)


In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
# TF-IDF
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf   = tfidf.transform(X_val)
X_test_tfidf  = tfidf.transform(X_test)

# Logistic Regression
logreg = LogisticRegression(max_iter=1000, random_state=SEED)

# 5-fold CV on train set
cv_scores = cross_val_score(logreg, X_train_tfidf, y_train, cv=5, scoring='f1_macro')
print("LogReg CV F1 Scores:", cv_scores, "Mean:", cv_scores.mean())

# Train & evaluate
logreg.fit(X_train_tfidf, y_train)
y_pred = logreg.predict(X_test_tfidf)
print("LogReg Test Report:\n", classification_report(y_test, y_pred))

LogReg CV F1 Scores: [0.87613812 0.87638048 0.87603756 0.87597888 0.87598855] Mean: 0.8761047205830564
LogReg Test Report:
               precision    recall  f1-score   support

    negative       0.85      0.77      0.81     66002
     neutral       0.86      0.92      0.89     72447
    positive       0.92      0.93      0.93    142729

    accuracy                           0.89    281178
   macro avg       0.88      0.87      0.88    281178
weighted avg       0.89      0.89      0.89    281178



In [21]:
# Linear SVM
svm = LinearSVC(random_state=SEED)

# 5-fold CV
cv_scores = cross_val_score(svm, X_train_tfidf, y_train, cv=5, scoring='f1_macro')
print("SVM CV F1 Scores:", cv_scores, "Mean:", cv_scores.mean())

# Train & evaluate
svm.fit(X_train_tfidf, y_train)
y_pred = svm.predict(X_test_tfidf)
print("SVM Test Report:\n", classification_report(y_test, y_pred))


SVM CV F1 Scores: [0.87702563 0.87730269 0.87690973 0.87671309 0.87713654] Mean: 0.8770175350380214
SVM Test Report:
               precision    recall  f1-score   support

    negative       0.86      0.77      0.81     66002
     neutral       0.86      0.92      0.89     72447
    positive       0.92      0.94      0.93    142729

    accuracy                           0.89    281178
   macro avg       0.88      0.87      0.88    281178
weighted avg       0.89      0.89      0.89    281178



In [None]:
from transformers import pipeline
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.preprocessing import LabelEncoder
from joblib import Parallel, delayed
import numpy as np

# ✅ Encode labels
encoder = LabelEncoder()
y_train_enc = encoder.fit_transform(y_train)
y_val_enc   = encoder.transform(y_val)
y_test_enc  = encoder.transform(y_test)

# ✅ Load lightweight pretrained DistilBERT (sentiment fine-tuned)
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
bert_classifier = pipeline("sentiment-analysis", model=model_name, tokenizer=model_name, device=-1)  # -1 = CPU

# Function to process one batch of texts
def process_batch(batch_texts):
    preds = bert_classifier(batch_texts, truncation=True)
    labels = []
    for p in preds:
        if p["label"].lower() in ["positive", "label_2"]:
            labels.append("positive")
        elif p["label"].lower() in ["negative", "label_0"]:
            labels.append("negative")
        else:
            labels.append("neutral")
    return labels

# ✅ Parallelized prediction using joblib
def parallel_predict(texts, n_jobs=4, batch_size=64):
    batches = [texts[i:i+batch_size] for i in range(0, len(texts), batch_size)]
    results = Parallel(n_jobs=n_jobs)(delayed(process_batch)(b) for b in batches)
    return [label for batch in results for label in batch]

# ----------------- Validation -----------------
print("\nRunning DistilBERT on Validation set...")
y_val_pred = parallel_predict(list(X_val), n_jobs=4, batch_size=64)
print("\nDistilBERT Validation Report:")
print("Accuracy:", accuracy_score(y_val, y_val_pred))
print("Macro-F1:", f1_score(y_val, y_val_pred, average="macro"))
print(classification_report(y_val, y_val_pred, target_names=encoder.classes_))

# ----------------- Test -----------------
print("\nRunning DistilBERT on Test set...")
y_test_pred = parallel_predict(list(X_test), n_jobs=4, batch_size=64)
print("\nDistilBERT Test Report:")
print("Accuracy:", accuracy_score(y_test, y_test_pred))
print("Macro-F1:", f1_score(y_test, y_test_pred, average="macro"))
print(classification_report(y_test, y_test_pred, target_names=encoder.classes_))


Device set to use cpu



Running DistilBERT on Validation set...
