In [7]:
import random
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, classification_report
import joblib
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset
import evaluate

# One seed shared by every random number generator the notebook relies on.
SEED = 42
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    _seed_rng(SEED)

# Input CSV, and the directory that receives the trained artifacts.
DATA_PATH = 'reply_classification_dataset.csv'
os.makedirs('models', exist_ok=True)


In [10]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split

SEED = 42
DATA_PATH = "reply_classification_dataset.csv"

# Keep the frame exactly as read under its own name so the raw shape stays
# inspectable after the cleaning steps below.
raw_df = pd.read_csv(DATA_PATH)
print('Raw shape:', raw_df.shape)

# Standardise the text column name, then discard rows that have no reply text.
df = raw_df.rename(columns={'reply': 'text'}).dropna(subset=['text'])

# Fail fast when the dataset has no label column at all.
if 'label' not in df.columns:
    raise ValueError('Dataset must contain a "label" column')

# Record which labels are missing BEFORE casting to str: the cast can turn NaN
# into the literal string 'nan', after which a fillna() never matches and the
# 'neutral' default is silently lost.
label_missing = df['label'].isna()

# Normalise spelling: trim whitespace, lower-case, drop stray commas.
label_normalised = (
    df['label']
    .astype(str)
    .str.strip()
    .str.lower()
    .str.replace(',', '', regex=False)
)

# Rows with no label fall back to 'neutral', as the original fillna intended.
df['label'] = label_normalised.mask(label_missing, 'neutral')

def clean_text(s):
    """Normalise one reply string for modelling.

    Steps, in order:
      1. Cast the value to ``str``.
      2. Remove every character that is not a word character, whitespace, or
         one of ``@ # ' . , ! ? -``.
      3. Collapse each run of whitespace to a single space and trim the ends.

    Whitespace is normalised *after* the character filter so that removing a
    symbol between two spaces (``"a & b"``) or at an edge (``"*** hi"``) cannot
    leave double, leading, or trailing spaces behind.

    Args:
        s: The raw value; anything accepted by ``str()``.

    Returns:
        The cleaned string.
    """
    s = re.sub(r"[^\w\s@#'.,!?-]", '', str(s))
    return re.sub(r'\s+', ' ', s).strip()

# Run every reply through clean_text.
df['text'] = df['text'].apply(clean_text)

# Class name <-> integer id, in both directions.
label2id = {
    'negative': 0,
    'neutral': 1,
    'positive': 2,
}
id2label = {idx: name for name, idx in label2id.items()}

# Keep only rows whose label is one of the three known classes.
has_known_label = df['label'].isin(list(label2id))
df = df.loc[has_known_label].reset_index(drop=True)

print('After cleaning shape:', df.shape)
print(df['label'].value_counts())

# NOTE(review): rows are not de-duplicated before splitting; if the CSV holds
# repeated replies they can land in more than one split - worth confirming.

# Stage 1: hold out 20% as the test set (stratified by label).
train_val_df, test_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['label'],
    random_state=SEED,
)
# Stage 2: take 12.5% of the remainder as validation (stratified by label).
train_df, val_df = train_test_split(
    train_val_df,
    test_size=0.125,
    stratify=train_val_df['label'],
    random_state=SEED,
)

print('Train/Val/Test sizes:', len(train_df), len(val_df), len(test_df))


Raw shape: (2129, 2)
After cleaning shape: (2129, 2)
label
positive    710
negative    710
neutral     709
Name: count, dtype: int64
Train/Val/Test sizes: 1490 213 426


In [11]:
# Baseline: TF-IDF over unigrams and bigrams feeding a logistic regression.
tfidf_step = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))
logreg_step = LogisticRegression(max_iter=1000, random_state=SEED)
baseline_pipe = Pipeline(steps=[('tfidf', tfidf_step), ('clf', logreg_step)])

# Fit on the training split only.
baseline_pipe.fit(train_df['text'], train_df['label'])

# Score on the held-out test split.
baseline_true = test_df['label']
preds = baseline_pipe.predict(test_df['text'])
acc = accuracy_score(baseline_true, preds)
f1 = f1_score(baseline_true, preds, average='weighted')
print('Baseline LogisticRegression Accuracy: {:.4f}, F1-weighted: {:.4f}'.format(acc, f1))
print(classification_report(baseline_true, preds))

# Persist the fitted pipeline; the returned path list is the cell's output.
joblib.dump(baseline_pipe, 'models/baseline_model.joblib')

Baseline LogisticRegression Accuracy: 1.0000, F1-weighted: 1.0000
              precision    recall  f1-score   support

    negative       1.00      1.00      1.00       142
     neutral       1.00      1.00      1.00       142
    positive       1.00      1.00      1.00       142

    accuracy                           1.00       426
   macro avg       1.00      1.00      1.00       426
weighted avg       1.00      1.00      1.00       426



['models/baseline_model.joblib']

In [13]:
from lightgbm import LGBMClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline

# Fit the TF-IDF vocabulary on the training split only, then reuse it to
# transform the validation and test splits.
vectorizer = TfidfVectorizer(max_features=15000, ngram_range=(1, 2))
X_train = vectorizer.fit_transform(train_df['text'])
X_val, X_test = (
    vectorizer.transform(split['text']) for split in (val_df, test_df)
)

# Integer-encode the string labels through label2id for each split.
y_train, y_val, y_test = (
    split['label'].map(label2id).values
    for split in (train_df, val_df, test_df)
)


from lightgbm import LGBMClassifier, early_stopping, log_evaluation

# Gradient-boosted trees on the TF-IDF features.
lgb_params = {
    'n_estimators': 500,
    'learning_rate': 0.05,
    'random_state': SEED,
}
lgb = LGBMClassifier(**lgb_params)

# Monitor the validation split: early_stopping(50) and log_evaluation(0) are
# passed as callbacks.
fit_callbacks = [early_stopping(50), log_evaluation(0)]
lgb.fit(X_train, y_train, eval_set=[(X_val, y_val)], callbacks=fit_callbacks)

# Evaluate on the held-out test split.
y_pred = lgb.predict(X_test)

lgb_accuracy = accuracy_score(y_test, y_pred)
print("LightGBM Accuracy:", lgb_accuracy)
lgb_f1 = f1_score(y_test, y_pred, average="weighted")
print("LightGBM F1:", lgb_f1)
print(classification_report(y_test, y_pred))


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001342 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1043
[LightGBM] [Info] Number of data points in the train set: 1490, number of used features: 183
[LightGBM] [Info] Start training from score -1.097941
[LightGBM] [Info] Start training from score -1.099955
[LightGBM] [Info] Start training from score -1.097941
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[90]	valid_0's multi_logloss: 0.0324751
LightGBM Accuracy: 0.9929577464788732
LightGBM F1: 0.9929577464788732
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       142
           1       0.99      0.99      0.99       142
           2       0.99      0.99      0.99       142

    accuracy                           0.99       426
   macr



In [18]:
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset
import evaluate
!pip install evaluate
MODEL_NAME = "distilbert-base-uncased"
NUM_LABELS = len(label2id)
import os
os.environ["WANDB_DISABLED"] = "true"
# tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Build Hugging Face datasets from the text/label columns, renaming `label` to
# `labels`. The split frames carry a shuffled, non-default index;
# preserve_index=False stops it from being stored as an extra
# `__index_level_0__` column.
train_ds = Dataset.from_pandas(
    train_df[['text', 'label']].rename(columns={'label': 'labels'}),
    preserve_index=False,
)
val_ds = Dataset.from_pandas(
    val_df[['text', 'label']].rename(columns={'label': 'labels'}),
    preserve_index=False,
)

def map_labels(batch):
    """Replace the string labels of a batch with their integer ids.

    Args:
        batch: Mapping with a ``'labels'`` entry holding label names.

    Returns:
        The same ``batch`` object, with ``batch['labels']`` overwritten by the
        list of ids looked up in ``label2id``. A name missing from
        ``label2id`` raises ``KeyError``.
    """
    encoded = []
    for label_name in batch['labels']:
        encoded.append(label2id[label_name])
    batch['labels'] = encoded
    return batch

# Integer-encode the labels of both splits, training split first.
train_ds, val_ds = (
    split_ds.map(map_labels, batched=True) for split_ds in (train_ds, val_ds)
)

def tokenize_fn(batch):
    """Tokenize the ``'text'`` entries of a batch with the module tokenizer.

    Every sequence is truncated and padded to exactly 128 tokens.

    Args:
        batch: Mapping with a ``'text'`` entry holding the raw strings.

    Returns:
        Whatever ``tokenizer`` returns for the batch.
    """
    tokenizer_options = {
        'truncation': True,
        'padding': 'max_length',
        'max_length': 128,
    }
    return tokenizer(batch['text'], **tokenizer_options)

# Tokenize both splits, training split first.
train_ds, val_ds = (
    split_ds.map(tokenize_fn, batched=True) for split_ds in (train_ds, val_ds)
)

# Expose only the listed columns, as torch tensors.
torch_columns = ['input_ids', 'attention_mask', 'labels']
for split_ds in (train_ds, val_ds):
    split_ds.set_format(type='torch', columns=torch_columns)

# Load the pretrained encoder with a fresh NUM_LABELS-way classification head.
# Passing id2label/label2id stores the class names in the model config, so the
# saved checkpoint is self-describing instead of falling back to LABEL_0..N.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS,
    id2label=id2label,
    label2id=label2id,
)

training_args = TrainingArguments(
    output_dir='models/distilbert',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    learning_rate=2e-5,
    # Evaluate and checkpoint once per epoch; the two strategies must match
    # for load_best_model_at_end to work.
    eval_strategy='epoch',
    save_strategy='epoch',
    logging_steps=50,
    # Reload the checkpoint with the highest validation F1 after training.
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    greater_is_better=True,
    seed=SEED,
    # Mixed precision only when a CUDA device is available.
    fp16=torch.cuda.is_available(),
    # Explicitly disable external experiment trackers rather than relying on
    # the deprecated WANDB_DISABLED environment variable.
    report_to='none',
)

# Metric objects used by compute_metrics.
metric_acc, metric_f1 = (
    evaluate.load(metric_name) for metric_name in ("accuracy", "f1")
)

def compute_metrics(p):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        p: Object exposing ``predictions`` (the scores, or a tuple whose first
            element is the scores) and ``label_ids`` (the reference ids).

    Returns:
        ``{"accuracy": ..., "f1": ...}`` where the predicted class is the
        argmax over the last axis of the scores.
    """
    scores = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    predicted_ids = np.argmax(scores, axis=-1)
    reference_ids = p.label_ids

    accuracy = metric_acc.compute(
        predictions=predicted_ids, references=reference_ids
    )["accuracy"]
    weighted_f1 = metric_f1.compute(
        predictions=predicted_ids, references=reference_ids, average='weighted'
    )["f1"]
    return {"accuracy": accuracy, "f1": weighted_f1}

# Wire the model, arguments, datasets and metric function into a Trainer.
trainer_kwargs = {
    'model': model,
    'args': training_args,
    'train_dataset': train_ds,
    'eval_dataset': val_ds,
    'compute_metrics': compute_metrics,
}
trainer = Trainer(**trainer_kwargs)

trainer.train()

# Save the model and the tokenizer into the same directory so the pair can be
# reloaded together with from_pretrained.
checkpoint_dir = 'models/distilbert'
trainer.save_model(checkpoint_dir)
tokenizer.save_pretrained(checkpoint_dir)

print("Training finished. Model saved to models/distilbert")




Map:   0%|          | 0/1490 [00:00<?, ? examples/s]

Map:   0%|          | 0/213 [00:00<?, ? examples/s]

Map:   0%|          | 0/1490 [00:00<?, ? examples/s]

Map:   0%|          | 0/213 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.5608,0.018974,0.995305,0.995305
2,0.0128,0.006136,1.0,1.0




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.5608,0.018974,0.995305,0.995305
2,0.0128,0.006136,1.0,1.0
3,0.007,0.004764,1.0,1.0


Training finished. Model saved to models/distilbert


In [19]:
# Smoke test: classify two hand-written replies with the fine-tuned model.
texts = ["I love this product!", "This is the worst service ever."]

# Apply the same clean_text normalisation the training data went through, so
# inference inputs match what the model was fine-tuned on.
cleaned_texts = [clean_text(t) for t in texts]
inputs = tokenizer(cleaned_texts, truncation=True, padding=True, max_length=128, return_tensors='pt')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Switch off dropout explicitly; without this the mode depends on whatever the
# Trainer last left the model in, and predictions may not be deterministic.
model.eval()
inputs = {k: v.to(device) for k, v in inputs.items()}

# No gradients are needed for prediction.
with torch.no_grad():
    outputs = model(**inputs)
    probs = torch.softmax(outputs.logits, dim=-1)
    preds = torch.argmax(probs, dim=-1)

# Translate predicted ids back to label names.
id2label = {v: k for k, v in label2id.items()}
pred_labels = [id2label[i.item()] for i in preds]

print(pred_labels)


['positive', 'negative']


app.py

In [None]:
!pip install fastapi uvicorn nest-asyncio pyngrok

from fastapi import FastAPI
from pydantic import BaseModel
from pyngrok import ngrok
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the fine-tuned model and its tokenizer from the same directory.
MODEL_DIR = 'models/distilbert'
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model.eval()

# Class id -> label name used in API responses.
id2label = dict(enumerate(('negative', 'neutral', 'positive')))

app = FastAPI()

# Request body for POST /predict: an object with a single string field.
class InputText(BaseModel):
    text: str  # the reply text to classify

@app.post("/predict")
def predict(input: InputText):
    """Classify one reply and return its label with a confidence score.

    Args:
        input: Request body whose ``text`` field holds the reply to classify.

    Returns:
        ``{"label": <name from id2label>, "confidence": <softmax probability
        of the predicted class, rounded to 2 decimals>}``.
    """
    # Cap the sequence at 128 tokens, the same length used when the model was
    # fine-tuned, instead of the tokenizer's much larger default limit.
    inputs = tokenizer(
        input.text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=128,
    )
    # Inference only: skip gradient tracking.
    with torch.no_grad():
        outputs = model(**inputs)
        probs = F.softmax(outputs.logits, dim=-1)
        pred = torch.argmax(probs, dim=-1).item()
        # The batch holds a single text, so row 0 is the only row.
        confidence = probs[0][pred].item()
    return {"label": id2label[pred], "confidence": round(confidence, 2)}

import nest_asyncio, uvicorn

# Port shared by the ngrok tunnel and the uvicorn server.
PORT = 8000

# Patch asyncio via nest_asyncio before starting the server from the notebook.
nest_asyncio.apply()

# Open the ngrok tunnel to the local port and show it.
public_url = ngrok.connect(PORT)
print("Public URL:", public_url)

# Start serving the app.
uvicorn.run(app, host="0.0.0.0", port=PORT)
