In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, classification_report
import joblib


In [None]:
!pip install --upgrade --force-reinstall transformers==4.44.2
!pip install --upgrade --force-reinstall accelerate datasets


Collecting transformers==4.44.2
  Using cached transformers-4.44.2-py3-none-any.whl.metadata (43 kB)
Collecting filelock (from transformers==4.44.2)
  Using cached filelock-3.19.1-py3-none-any.whl.metadata (2.1 kB)
Collecting huggingface-hub<1.0,>=0.23.2 (from transformers==4.44.2)
  Using cached huggingface_hub-0.35.1-py3-none-any.whl.metadata (14 kB)
Collecting numpy>=1.17 (from transformers==4.44.2)
  Using cached numpy-2.3.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (62 kB)
Collecting packaging>=20.0 (from transformers==4.44.2)
  Using cached packaging-25.0-py3-none-any.whl.metadata (3.3 kB)
Collecting pyyaml>=5.1 (from transformers==4.44.2)
  Using cached PyYAML-6.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.1 kB)
Collecting regex!=2019.12.17 (from transformers==4.44.2)
  Using cached regex-2025.9.18-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (40 kB)
Collecting requests (from tr

KeyboardInterrupt: 

In [2]:
import pandas as pd
from google.colab import files

# This opens a file picker in Colab → select your local CSV
uploaded = files.upload()

# Get the first uploaded filename
filename = list(uploaded.keys())[0]

# Read into pandas
df = pd.read_csv(filename)

# Quick sanity check
print("Rows:", len(df))
print("Columns:", df.columns.tolist())
print(df.head())



Saving data.csv to data.csv
Rows: 2129
Columns: ['reply', 'label']
                                               reply     label
0                           Can we discuss pricing??   NEUTRAL
1  Im excited to explore this further, plz send c...  POSITIVE
2                We not looking for new solutions.    negative
3                 Could u clarify features included?   neutral
4           lets,, schedule a meeting to dive deeper  positive


In [3]:
df = df.rename(columns={"reply": "text", "label": "label"})
df["text"] = df["text"].astype(str).str.strip().str.replace(r"\s+", " ", regex=True)
df["label"] = df["label"].str.lower().str.strip()

# Quick check
print("Rows:", len(df))
print("Columns:", df.columns.tolist())
print("\nLabel distribution:\n", df["label"].value_counts())
df.head()

Rows: 2129
Columns: ['text', 'label']

Label distribution:
 label
positive    710
negative    710
neutral     709
Name: count, dtype: int64


Unnamed: 0,text,label
0,Can we discuss pricing??,neutral
1,"Im excited to explore this further, plz send c...",positive
2,We not looking for new solutions.,negative
3,Could u clarify features included?,neutral
4,"lets,, schedule a meeting to dive deeper",positive


In [4]:
train_df, test_df = train_test_split(
    df, test_size=0.2, stratify=df['label'], random_state=42
)

print("Train size:", len(train_df), "Test size:", len(test_df))


Train size: 1703 Test size: 426


In [5]:
pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(
        lowercase=True,
        strip_accents="unicode",
        stop_words="english",
        ngram_range=(1, 2),
        max_features=20000
    )),
    ("clf", LogisticRegression(
        max_iter=200,
        solver="liblinear",
        random_state=42
    ))
])

param_grid = {"clf__C": [0.25, 0.5, 1.0, 2.0, 4.0]}
grid = GridSearchCV(
    pipeline, param_grid=param_grid,
    cv=5, n_jobs=-1, scoring="f1_macro", verbose=1
)

grid.fit(train_df['text'], train_df['label'])


Fitting 5 folds for each of 5 candidates, totalling 25 fits


In [6]:
y_pred = grid.predict(test_df['text'])
print("Accuracy:", accuracy_score(test_df['label'], y_pred))
print("Macro F1:", f1_score(test_df['label'], y_pred, average="macro"))
print("\nClassification Report:\n", classification_report(test_df['label'], y_pred))


Accuracy: 0.9882629107981221
Macro F1: 0.9882781717888101

Classification Report:
               precision    recall  f1-score   support

    negative       0.99      0.98      0.99       142
     neutral       1.00      0.99      0.99       142
    positive       0.97      1.00      0.99       142

    accuracy                           0.99       426
   macro avg       0.99      0.99      0.99       426
weighted avg       0.99      0.99      0.99       426



In [7]:
import joblib
joblib.dump(grid.best_estimator_, "baseline_model.joblib")


['baseline_model.joblib']

In [8]:
from google.colab import files
files.download("baseline_model.joblib")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
!pip install -q transformers datasets accelerate evaluate


In [None]:
!pip install -q --upgrade transformers


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)


In [None]:

# Ensure columns are "text" and "label" with lowercase labels
df["label"] = df["label"].str.lower().str.strip()

# Train / validation split
train_df, val_df = train_test_split(
    df, test_size=0.2, stratify=df["label"], random_state=42
)

# Hugging Face dataset objects
train_ds = Dataset.from_pandas(train_df)
val_ds = Dataset.from_pandas(val_df)

# Label mapping
labels = ["negative", "neutral", "positive"]
label2id = {l: i for i, l in enumerate(labels)}
id2label = {i: l for l, i in label2id.items()}

def encode_labels(example):
    example["labels"] = label2id[example["label"]]
    return example

train_ds = train_ds.map(encode_labels)
val_ds = val_ds.map(encode_labels)


Map:   0%|          | 0/1703 [00:00<?, ? examples/s]

Map:   0%|          | 0/426 [00:00<?, ? examples/s]

In [None]:
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(examples):
    return tokenizer(examples["text"], truncation=True, padding=False, max_length=256)

train_ds = train_ds.map(tokenize, batched=True)
val_ds = val_ds.map(tokenize, batched=True)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)




Map:   0%|          | 0/1703 [00:00<?, ? examples/s]

Map:   0%|          | 0/426 [00:00<?, ? examples/s]

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average="macro")
    return {"accuracy": acc, "f1": f1}


In [None]:
from transformers import TrainingArguments
training_args = TrainingArguments(
    output_dir="./distilbert-reply-clf",
    evaluation_strategy="epoch",   # or "steps"
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    logging_dir="./logs",
    logging_steps=20,
)




In [None]:
!pip install --upgrade transformers

In [None]:
import transformers
print(transformers.__version__)

In [None]:
from datasets import Dataset
from sklearn.model_selection import train_test_split

# 1. Normalize labels
df["label"] = df["label"].str.lower().str.strip()

# 2. Train/validation split
train_df, val_df = train_test_split(
    df, test_size=0.2, stratify=df["label"], random_state=42
)

# 3. Define mappings
labels = ["negative", "neutral", "positive"]
label2id = {l: i for i, l in enumerate(labels)}
id2label = {i: l for l, i in label2id.items()}

# 4. Convert pandas → Dataset, keeping only needed columns
train_ds = Dataset.from_pandas(train_df[["text", "label"]].reset_index(drop=True))
val_ds = Dataset.from_pandas(val_df[["text", "label"]].reset_index(drop=True))

# 5. Encode labels → integers
def encode_labels(example):
    return {"labels": label2id[example["label"]]}

train_ds = train_ds.map(encode_labels)
val_ds = val_ds.map(encode_labels)

# 6. Tokenizer
from transformers import AutoTokenizer
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    return tokenizer(batch["text"], truncation=True, padding=True, max_length=256)

train_ds = train_ds.map(tokenize, batched=True)
val_ds = val_ds.map(tokenize, batched=True)

# 7. Remove original string label column
train_ds = train_ds.remove_columns(["label"])
val_ds = val_ds.remove_columns(["label"])

# 8. Verify
print(train_ds[0])


Map:   0%|          | 0/1703 [00:00<?, ? examples/s]

Map:   0%|          | 0/426 [00:00<?, ? examples/s]



Map:   0%|          | 0/1703 [00:00<?, ? examples/s]

Map:   0%|          | 0/426 [00:00<?, ? examples/s]

{'text': 'Please share the details, I’m interested.', 'labels': 2, 'input_ids': [101, 3531, 3745, 1996, 4751, 1010, 1045, 1521, 1049, 4699, 1012, 102, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]}


In [None]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./distilbert-reply-clf",
    eval_strategy="epoch",        # use "eval_strategy" (new name)
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    logging_dir="./logs",
    logging_steps=20,
    report_to="none",             # ✅ turn off wandb prompt
)


In [None]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average="macro")
    return {"accuracy": acc, "f1": f1}


In [None]:
from transformers import Trainer, DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


In [None]:
trainer.train()




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.0249,0.025807,0.995305,0.995305
2,0.006,0.004189,1.0,1.0
3,0.0038,0.002724,1.0,1.0
4,0.0031,0.002383,1.0,1.0




TrainOutput(global_step=428, training_loss=0.07845190939467366, metrics={'train_runtime': 1104.7997, 'train_samples_per_second': 6.166, 'train_steps_per_second': 0.387, 'total_flos': 24674562830736.0, 'train_loss': 0.07845190939467366, 'epoch': 4.0})

In [None]:
metrics = trainer.evaluate()
print(metrics)




{'eval_loss': 0.004188899882137775, 'eval_accuracy': 1.0, 'eval_f1': 1.0, 'eval_runtime': 19.1051, 'eval_samples_per_second': 22.298, 'eval_steps_per_second': 1.413, 'epoch': 4.0}


In [None]:
trainer.save_model("./distilbert-reply-clf")
tokenizer.save_pretrained("./distilbert-reply-clf")


('./distilbert-reply-clf/tokenizer_config.json',
 './distilbert-reply-clf/special_tokens_map.json',
 './distilbert-reply-clf/vocab.txt',
 './distilbert-reply-clf/added_tokens.json',
 './distilbert-reply-clf/tokenizer.json')