In [None]:

"""
Created as part of Liberty Infospace hiring test – not for production use without candidate's consent
Author: Sayantan Ghosh
GitHub: https://github.com/lazy-coder-03
"""

# Banner for the Google Colab setup cell: a headline followed by a 60-character rule.
print(
    "🚀 Setting up Customer Query Classifier in Google Colab",
    "=" * 60,
    sep="\n",
)

# Install required packages
import subprocess
import sys

def install_package(package):
    """Install a single requirement with pip, using the current interpreter.

    The install is best-effort: any failure is reported on stdout instead of
    being raised, so that one bad requirement does not abort the whole setup
    cell.

    Args:
        package: A pip requirement specifier, e.g. ``"torch>=2.0.0"``.

    Returns:
        bool: ``True`` when pip exited successfully, ``False`` when the
        install failed (the error has already been printed).
    """
    # Run pip through sys.executable so the package lands in the environment
    # of the running kernel rather than whichever `pip` is first on PATH.
    command = [sys.executable, "-m", "pip", "install", package, "-q"]
    try:
        subprocess.check_call(command)
    except Exception as e:
        print(f"❌ Failed to install {package}: {e}")
        return False

    print(f"✅ Installed {package}")
    return True

# Required packages (minimum versions; pip picks a compatible release)
packages = [
    "torch>=2.0.0",
    "transformers>=4.35.0",
    "datasets>=2.12.0",
    "scikit-learn>=1.3.0",
    "pandas>=2.0.0",
    "numpy>=1.24.0",
    "matplotlib>=3.7.0",
    "seaborn>=0.12.0",
    "accelerate>=0.21.0",
    "evaluate>=0.4.0",
    "tqdm>=4.65.0",
    "joblib>=1.3.0"
]

print("📦 Installing required packages...")
for package in packages:
    install_package(package)

print("\n🔧 Setup completed!")
print("You can now run the training script.")

# Import smoke test: the real proof that the environment is usable.
# `imports_ok` records the outcome so the closing message cannot claim
# readiness after an ImportError.
imports_ok = False
try:
    import torch
    import transformers
    import sklearn
    import pandas as pd
except ImportError as e:
    print(f"❌ Import error: {e}")
else:
    imports_ok = True
    print("\n✅ All packages imported successfully!")
    print(f"🔥 PyTorch version: {torch.__version__}")
    print(f"🤗 Transformers version: {transformers.__version__}")
    print(f"🧠 Scikit-learn version: {sklearn.__version__}")
    print(f"📊 Pandas version: {pd.__version__}")

    # Check if GPU is available
    if torch.cuda.is_available():
        print(f"🚀 GPU available: {torch.cuda.get_device_name(0)}")
    else:
        print("💻 Running on CPU")

print("\n" + "=" * 60)
if imports_ok:
    print("🎯 Ready to train the Customer Query Classifier!")
    print("Run the training script to start fine-tuning the model.")
else:
    # Previously the "ready" banner was printed even when imports failed.
    print("⚠️ Setup incomplete: fix the import error above before training.")

🚀 Setting up Customer Query Classifier in Google Colab
📦 Installing required packages...
✅ Installed torch>=2.0.0
✅ Installed transformers>=4.35.0
✅ Installed datasets>=2.12.0
✅ Installed scikit-learn>=1.3.0
✅ Installed pandas>=2.0.0
✅ Installed numpy>=1.24.0
✅ Installed matplotlib>=3.7.0
✅ Installed seaborn>=0.12.0
✅ Installed accelerate>=0.21.0
✅ Installed evaluate>=0.4.0
✅ Installed tqdm>=4.65.0
✅ Installed joblib>=1.3.0

🔧 Setup completed!
You can now run the training script.

✅ All packages imported successfully!
🔥 PyTorch version: 2.8.0+cu126
🤗 Transformers version: 4.55.2
🧠 Scikit-learn version: 1.6.1
📊 Pandas version: 2.2.2
🚀 GPU available: Tesla T4

🎯 Ready to train the Customer Query Classifier!
Run the training script to start fine-tuning the model.


In [None]:
"""
Created as part of Liberty Infospace hiring test – not for production use without candidate's consent
Author: Sayantan Ghosh
GitHub: https://github.com/lazy-coder-03
"""

# Colab-optimized training script with preprocessing improvements
import logging
import os
import re
import string
import warnings

import joblib
import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    EarlyStoppingCallback
)

# Silence Python warnings so the notebook output stays readable
warnings.filterwarnings("ignore")

# Timestamped INFO-level logging, plus a logger named after this module
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

os.environ["WANDB_DISABLED"] = "true"

# Detect the runtime: the google.colab package is only importable inside Colab
try:
    import google.colab
except ImportError:
    IN_COLAB = False
else:
    IN_COLAB = True

print("🚀 Running in Google Colab" if IN_COLAB else "💻 Running locally")


# -------------------------
# 🔧 Preprocessing function
# -------------------------
def clean_text(text):
    """Normalize a raw query string before tokenization.

    Steps, in order:
      1. lowercase the text;
      2. replace URLs (``http...`` / ``www...``) with the token ``url``;
      3. replace every run of digits with the token ``number``;
      4. strip ASCII punctuation (``string.punctuation``);
      5. collapse whitespace runs to a single space and trim the ends.

    Args:
        text: The raw query. ``None`` and float NaN (what pandas yields for
            an empty cell) are treated as an empty query; any other
            non-string value is converted with ``str()`` first.

    Returns:
        str: The cleaned text, possibly empty.
    """
    # Missing values: `text != text` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()
    text = re.sub(r"http\S+|www\S+", " url ", text)  # replace URLs
    text = re.sub(r"\d+", " number ", text)          # replace numbers
    text = text.translate(str.maketrans("", "", string.punctuation))  # remove punctuation
    text = re.sub(r"\s+", " ", text).strip()         # remove extra spaces
    return text


class ColabCustomerQueryClassifier:
    """Fine-tune a pretrained sequence classifier on labelled customer queries.

    The pipeline (see ``run_training``) reads ``data/train_data.csv``, cleans
    the ``query`` column with ``clean_text``, encodes the ``category`` column
    with a ``LabelEncoder``, fine-tunes ``model_name`` with the Hugging Face
    ``Trainer`` and writes the model, tokenizer and label encoder to
    ``models/fine_tuned_classifier``.

    Attributes:
        model_name: Checkpoint name passed to ``from_pretrained``.
        tokenizer: Loaded tokenizer, or ``None`` before loading.
        model: Loaded model, or ``None`` before loading.
        label_encoder: ``LabelEncoder`` fitted on the ``category`` column.
        labels: Fallback label list; only its length is used, and only when
            the model is loaded before the encoder has been fitted.
    """

    def __init__(self, model_name="distilbert-base-uncased"):
        """Store the checkpoint name and create an unfitted label encoder."""
        self.model_name = model_name
        self.tokenizer = None
        self.model = None
        self.label_encoder = LabelEncoder()
        self.labels = ["Billing Issue", "Technical Problem", "Compliment", "Product Question", "Complaint"]

    def create_directories(self):
        """Ensure necessary directories exist (relative to the working dir)."""
        directories = ["data", "models", "output/logs", "output/metrics", "output/visualizations"]
        for directory in directories:
            os.makedirs(directory, exist_ok=True)
        print("📁 Created project directories")

    def prepare_data(self):
        """Load the CSV, clean queries, encode labels and split the data.

        Reads ``data/train_data.csv``, which must provide ``query`` and
        ``category`` columns. Rows missing either value are dropped, since
        they can be neither cleaned nor labelled.

        Returns:
            tuple: ``(train_texts, val_texts, train_labels, val_labels)`` as
            lists, from a stratified 80/20 split with ``random_state=42``.
        """
        df = pd.read_csv("data/train_data.csv")
        print(f"💾 Loaded {len(df)} training samples")

        # Drop incomplete rows before cleaning/encoding.
        complete = df.dropna(subset=["query", "category"]).copy()
        dropped = len(df) - len(complete)
        if dropped:
            print(f"⚠️ Dropped {dropped} rows with a missing query or category")
        df = complete

        # Apply cleaning to queries (cast first so numeric cells are handled)
        df["query"] = df["query"].astype(str).apply(clean_text)

        # Encode labels; this also fixes the id <-> label mapping used later
        df["labels"] = self.label_encoder.fit_transform(df["category"])

        # Stratified train-validation split
        train_texts, val_texts, train_labels, val_labels = train_test_split(
            df["query"].tolist(),
            df["labels"].tolist(),
            test_size=0.2,
            random_state=42,
            stratify=df["labels"]
        )

        print(f"📊 Training samples: {len(train_texts)}, Validation samples: {len(val_texts)}")
        return train_texts, val_texts, train_labels, val_labels

    def _label_config(self):
        """Build the label-related keyword arguments for the model config.

        When the label encoder has been fitted, the head size and the
        ``id2label`` / ``label2id`` maps come from its ``classes_`` so they
        always agree with the ids used in training. Otherwise only
        ``num_labels`` is returned, taken from the fallback ``self.labels``.
        """
        classes = getattr(self.label_encoder, "classes_", None)
        if classes is None:
            return {"num_labels": len(self.labels)}

        id2label = {index: str(name) for index, name in enumerate(classes)}
        return {
            "num_labels": len(id2label),
            "id2label": id2label,
            "label2id": {name: index for index, name in id2label.items()},
        }

    def load_model_and_tokenizer(self):
        """Load the pretrained tokenizer and classification model.

        The classification head is sized from the fitted label encoder when
        available (see ``_label_config``).

        Raises:
            Exception: Any loading error is logged and re-raised unchanged.
        """
        try:
            print(f"🤖 Loading {self.model_name}...")
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            self.model = AutoModelForSequenceClassification.from_pretrained(
                self.model_name, **self._label_config()
            )
            print("✅ Model and tokenizer loaded successfully")

            # Some tokenizers ship without a pad token; reuse EOS so padding works.
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token

        except Exception as e:
            logger.error(f"Error loading model: {e}")
            raise

    def tokenize_data(self, texts, labels):
        """Convert texts and integer labels into a tokenized ``Dataset``.

        Sequences are truncated to 128 tokens but deliberately left unpadded:
        the ``DataCollatorWithPadding`` passed to the Trainer pads each batch
        to its own longest sequence.

        Args:
            texts: List of cleaned query strings.
            labels: Integer class ids aligned with ``texts``.

        Returns:
            Dataset: Columns ``input_ids``, ``attention_mask`` and ``labels``.
        """
        print("🔤 Tokenizing data...")
        encodings = self.tokenizer(
            texts,
            truncation=True,
            max_length=128,
        )

        dataset = Dataset.from_dict({
            "input_ids": encodings["input_ids"],
            "attention_mask": encodings["attention_mask"],
            "labels": labels
        })
        return dataset

    def compute_metrics(self, eval_pred):
        """Compute accuracy and weighted F1 for the Trainer's evaluation loop.

        Args:
            eval_pred: A ``(predictions, labels)`` pair; predictions are
                per-class scores and are reduced with argmax over axis 1.

        Returns:
            dict: ``{"accuracy": ..., "f1": ...}``.
        """
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)

        from sklearn.metrics import accuracy_score, f1_score
        accuracy = accuracy_score(labels, predictions)
        f1 = f1_score(labels, predictions, average="weighted")

        return {"accuracy": accuracy, "f1": f1}

    def train_model(self, train_dataset, val_dataset):
        """Fine-tune the model with Colab-optimized hyperparameters.

        Evaluates and checkpoints every epoch, keeps the checkpoint with the
        best F1, and stops early after 2 evaluations without improvement.

        Args:
            train_dataset: Tokenized training ``Dataset``.
            val_dataset: Tokenized validation ``Dataset``.

        Returns:
            Trainer: The trainer after ``train()`` has finished.
        """
        print("🏋️ Starting model training...")

        training_args = TrainingArguments(
            output_dir="./output/logs",
            num_train_epochs=10,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            warmup_steps=100,
            weight_decay=0.05,
            learning_rate=5e-5,
            logging_dir="./output/logs",
            logging_steps=10,
            eval_strategy="epoch",
            save_strategy="epoch",
            load_best_model_at_end=True,
            metric_for_best_model="f1",
            greater_is_better=True,
            save_total_limit=2,
            report_to="none",
            fp16=torch.cuda.is_available()  # mixed precision only when a GPU is present
        )

        # NOTE(review): recent transformers releases appear to deprecate the
        # `tokenizer=` argument in favour of `processing_class=` — confirm
        # against the installed version before upgrading.
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            tokenizer=self.tokenizer,
            data_collator=DataCollatorWithPadding(tokenizer=self.tokenizer),
            compute_metrics=self.compute_metrics,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
        )

        trainer.train()
        print("✅ Training completed!")
        return trainer

    def save_model(self, trainer):
        """Save trained model, tokenizer, and label encoder.

        Everything is written to ``models/fine_tuned_classifier``; the label
        encoder is serialized with joblib as ``label_encoder.pkl``.
        """
        output_path = "models/fine_tuned_classifier"
        print(f"💾 Saving model to {output_path}...")

        trainer.save_model(output_path)
        self.tokenizer.save_pretrained(output_path)

        import joblib
        joblib.dump(self.label_encoder, os.path.join(output_path, "label_encoder.pkl"))

        print("✅ Model saved successfully!")

    def run_training(self):
        """Run the end-to-end pipeline and return the trainer.

        Data is prepared *before* the model is loaded so that the label
        encoder is already fitted and the classification head matches the
        categories actually present in the dataset.

        Raises:
            Exception: Any failure is logged and re-raised unchanged.
        """
        try:
            print("🚀 Starting Customer Query Classifier Training Pipeline")
            print("=" * 60)

            self.create_directories()
            train_texts, val_texts, train_labels, val_labels = self.prepare_data()
            self.load_model_and_tokenizer()
            train_dataset = self.tokenize_data(train_texts, train_labels)
            val_dataset = self.tokenize_data(val_texts, val_labels)
            trainer = self.train_model(train_dataset, val_dataset)
            self.save_model(trainer)

            print("=" * 60)
            print("🎉 Training pipeline completed successfully!")
            print(f"📁 Model saved in: models/fine_tuned_classifier")
            return trainer

        except Exception as e:
            logger.error(f"Training failed: {e}")
            raise


🚀 Running in Google Colab


In [None]:
classifier = ColabCustomerQueryClassifier()

# run_training() finishes by calling save_model(), which writes the model,
# tokenizer and label encoder to models/fine_tuned_classifier. Saving the same
# three artifacts again here was redundant, so the duplicate save was removed.
trainer = classifier.run_training()

print("✅ Training done and model saved!")

🚀 Starting Customer Query Classifier Training Pipeline
📁 Created project directories
🤖 Loading distilbert-base-uncased...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ Model and tokenizer loaded successfully
💾 Loaded 990 training samples
📊 Training samples: 792, Validation samples: 198
🔤 Tokenizing data...
🔤 Tokenizing data...
🏋️ Starting model training...


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.3569,1.15679,0.752525,0.683755
2,0.5566,0.368624,0.833333,0.808965
3,0.3759,0.329759,0.853535,0.848585
4,0.2188,0.324302,0.848485,0.845842
5,0.2809,0.438027,0.833333,0.829695


✅ Training completed!
💾 Saving model to models/fine_tuned_classifier...
✅ Model saved successfully!
🎉 Training pipeline completed successfully!
📁 Model saved in: models/fine_tuned_classifier
✅ Training done and model saved!


In [None]:
def quick_test(classifier, model, tokenizer, query):
    """Classify a single query, print the result and return it.

    The query is cleaned with ``clean_text`` and tokenized with truncation at
    128 tokens, the same limit used for the training data. The model is run
    in eval mode without gradient tracking.

    Args:
        classifier: Object exposing a fitted ``label_encoder`` used to map
            the predicted class id back to its label.
        model: Sequence-classification model whose output has ``.logits``.
        tokenizer: Tokenizer matching ``model``.
        query: Raw user text.

    Returns:
        tuple | None: ``(predicted_label, confidence)`` where confidence is
        the softmax probability of the predicted class, or ``None`` when the
        prediction failed (the error is printed, not raised).
    """
    try:
        query = clean_text(query)
        inputs = tokenizer(
            query,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=128,
        )

        # Keep inputs on the same device as the model (matters when the model
        # lives on a GPU); skipped when the model exposes no `device`.
        device = getattr(model, "device", None)
        if device is not None:
            inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

        model.eval()  # disable dropout so predictions are deterministic
        with torch.no_grad():
            outputs = model(**inputs)
            predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
            predicted_class_id = predictions.argmax().item()
            confidence = predictions.max().item()
            predicted_label = classifier.label_encoder.inverse_transform([predicted_class_id])[0]
    except Exception as e:
        print(f"⚠️ Prediction failed: {e}")
        return None

    print(f"  '{query}' → {predicted_label} ({confidence:.2%})")
    return predicted_label, confidence


# Load back model + tokenizer + encoder
tokenizer = AutoTokenizer.from_pretrained("models/fine_tuned_classifier")
model = AutoModelForSequenceClassification.from_pretrained("models/fine_tuned_classifier")
# SECURITY: joblib.load unpickles the file, which can execute arbitrary code.
# Only load a label_encoder.pkl that this notebook produced itself.
classifier.label_encoder = joblib.load("models/fine_tuned_classifier/label_encoder.pkl")

print("🚀 Real-time Customer Query Classifier")
print("Type 'exit' to quit.\n")

while True:
    try:
        review = input("Write a review to classify: ")
    except (EOFError, KeyboardInterrupt):
        # No interactive stdin (e.g. a headless "Run All") or the user
        # interrupted the cell: leave the loop cleanly instead of crashing.
        print("\n👋 Exiting classifier. Goodbye!")
        break

    if review.lower().strip() == "exit":
        print("👋 Exiting classifier. Goodbye!")
        break
    if not review.strip():
        # Nothing to classify; prompt again rather than scoring empty text.
        continue
    quick_test(classifier, model, tokenizer, review)

🚀 Real-time Customer Query Classifier
Type 'exit' to quit.

Write a review to classify: what is payment method?
  'what is payment method' → Product Question (95.57%)
Write a review to classify: awesome product
  'awesome product' → Compliment (49.19%)
Write a review to classify: i cant login
  'i cant login' → Technical Problem (69.89%)
Write a review to classify: exit
👋 Exiting classifier. Goodbye!
