In [12]:
import os
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, classification_report
from transformers import DistilBertModel, DistilBertTokenizerFast
import torch
from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from datasets import Dataset
from sklearn.metrics import precision_recall_fscore_support

In [13]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [14]:
df = pd.read_csv('/content/drive/MyDrive/Grammar_autocorrection_df1_new.csv')

# DistilBERT

In [15]:
# ── 0) Configuration ────────────────────────────────────────────────────────
MODEL_DIR = 'distilbert_saved_model'

# ── 1) If already saved, just reload ────────────────────────────────────────
if os.path.isdir(MODEL_DIR):
    tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_DIR)
    model     = DistilBertForSequenceClassification.from_pretrained(MODEL_DIR)
    print(f"Loaded existing model from '{MODEL_DIR}', skipping training.")

# ── 2) Otherwise: 5-fold CV with metrics + retrain full head ────────────────
else:
    # Prepare texts & labels
    texts  = df['Ungrammatical Statement'].tolist() + df['Standard English'].tolist()
    labels = [0]*len(df) + [1]*len(df)

    # Initialize tokenizer & pre-tokenize
    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
    encodings = tokenizer(texts, truncation=True, padding=True, max_length=128)

    # Build Hugging Face Dataset
    dataset = Dataset.from_dict({
        'input_ids':      encodings['input_ids'],
        'attention_mask': encodings['attention_mask'],
        'labels':         labels
    })

    # Load model & freeze encoder layers
    model = DistilBertForSequenceClassification.from_pretrained(
        'distilbert-base-uncased',
        num_labels=2
    )
    for param in model.distilbert.parameters():
        param.requires_grad = False

    # Data collator for dynamic padding
    data_collator = DataCollatorWithPadding(tokenizer)

    # Metric computation function
    def compute_metrics(p):
        preds = p.predictions.argmax(-1)
        acc   = accuracy_score(p.label_ids, preds)
        prec, rec, f1, _ = precision_recall_fscore_support(
            p.label_ids, preds, average='binary'
        )
        return {
            'accuracy':  acc,
            'precision': prec,
            'recall':    rec,
            'f1':        f1
        }

    # Prepare lists to collect per-fold metrics
    accuracy_scores  = []
    precision_scores = []
    recall_scores    = []
    f1_scores        = []

    # 5-fold CV, 1 epoch per fold for speed
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    for fold, (train_idx, val_idx) in enumerate(cv.split(texts, labels), start=1):
        print(f"=== Fold {fold}/5 ===")
        train_ds = dataset.select(train_idx)
        val_ds   = dataset.select(val_idx)

        training_args = TrainingArguments(
            output_dir=f'./cv_fold{fold}',
            num_train_epochs=1,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=32,
            eval_strategy='epoch',
            logging_strategy='epoch',
            save_strategy='no',
            fp16=True,               # set to False if no GPU
            report_to=['none'],
            seed=42
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_ds,
            eval_dataset=val_ds,
            data_collator=data_collator,
            compute_metrics=compute_metrics
        )

        trainer.train()
        metrics = trainer.evaluate()

        # Collect metrics
        accuracy_scores.append(metrics['eval_accuracy'])
        precision_scores.append(metrics['eval_precision'])
        recall_scores.append(metrics['eval_recall'])
        f1_scores.append(metrics['eval_f1'])

        # Print per-fold results
        print(f"Fold {fold}: "
              f"Acc {metrics['eval_accuracy']:.4f}, "
              f"Prec {metrics['eval_precision']:.4f}, "
              f"Rec {metrics['eval_recall']:.4f}, "
              f"F1 {metrics['eval_f1']:.4f}")

    # Summarize CV
    def report(name, vals):
        mean, std = np.mean(vals), np.std(vals)
        print(f"{name}: {mean:.4f} ± {std:.4f}")

    print("\n=== CV summary over 5 folds ===")
    report("Accuracy",  accuracy_scores)
    report("Precision", precision_scores)
    report("Recall",    recall_scores)
    report("F1-score",  f1_scores)

    # Retrain head on full dataset
    print("\nRetraining classifier head on the full dataset…")
    full_args = TrainingArguments(
        output_dir='./full_train',
        num_train_epochs=2,
        per_device_train_batch_size=16,
        save_strategy='no',
        fp16=True,
        report_to=['none']
    )
    trainer_full = Trainer(
        model=model,
        args=full_args,
        train_dataset=dataset,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )
    trainer_full.train()

    # Save the fine-tuned model & tokenizer
    model.save_pretrained(MODEL_DIR)
    tokenizer.save_pretrained(MODEL_DIR)
    print(f"Saved model in '{MODEL_DIR}'")


In [16]:
# 1) Load your saved model & tokenizer
MODEL_DIR = 'distilbert_saved_model'
tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_DIR)
model     = DistilBertForSequenceClassification.from_pretrained(MODEL_DIR)
model.eval()

# 2) Prompt the user for input
sentence = input("Enter a sentence to check (grammar):\n> ")

# 3) Tokenize & forward pass
enc = tokenizer([sentence], padding=True, truncation=True, return_tensors='pt', max_length=128)
with torch.no_grad():
    logits = model(**enc).logits

# 4) Convert to probabilities & predicted label
probs = torch.softmax(logits, dim=-1)[0]
pred  = probs.argmax().item()
label = "Correct" if pred == 1 else "Ungrammatical"
score = probs[pred].item()

# 5) Print result
print(f"\n\"{sentence}\" → {label} (confidence: {score:.2f})")

In [17]:
import os
import numpy as np
import torch
from transformers import (
    T5Tokenizer, T5ForConditionalGeneration,
    Trainer, TrainingArguments,
    DataCollatorForSeq2Seq
)
from datasets import Dataset
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

MODEL_DIR = 't5_saved_model'

# ─────────────────────────────────────────────────────────────
# 0. Load model if already saved
# ─────────────────────────────────────────────────────────────
if os.path.isdir(MODEL_DIR):
    tokenizer = T5Tokenizer.from_pretrained(MODEL_DIR)
    model = T5ForConditionalGeneration.from_pretrained(MODEL_DIR)
    print(f"Loaded existing model from '{MODEL_DIR}', skipping training.")

else:
    # ─────────────────────────────────────────────────────────
    # 1. Prepare data
    # ─────────────────────────────────────────────────────────
    texts = df['Ungrammatical Statement'].tolist() + df['Standard English'].tolist()
    class_labels = ['ungrammatical'] * len(df) + ['standard'] * len(df)

    tokenizer = T5Tokenizer.from_pretrained("t5-small")

    input_texts = [f"classify: {t}" for t in texts]
    label_texts = class_labels

    encodings = tokenizer(input_texts, padding="max_length", truncation=True, max_length=64, return_tensors="pt")
    targets = tokenizer(label_texts, padding="max_length", truncation=True, max_length=5, return_tensors="pt")

    labels = targets["input_ids"]
    labels[labels == tokenizer.pad_token_id] = -100

    dataset = Dataset.from_dict({
        'input_ids': encodings['input_ids'],
        'attention_mask': encodings['attention_mask'],
        'labels': labels
    })

    model = T5ForConditionalGeneration.from_pretrained("t5-small")
    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

    # ─────────────────────────────────────────────────────────
    # 2. Metrics
    # ─────────────────────────────────────────────────────────
    def compute_metrics(p):
        # Unpack logits from predictions
        preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
        pred_ids = np.argmax(preds, axis=-1)

        decoded_preds = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
        decoded_labels = tokenizer.batch_decode(
            np.where(p.label_ids != -100, p.label_ids, tokenizer.pad_token_id),
            skip_special_tokens=True
        )

        preds_bin = [1 if pred.strip() == 'standard' else 0 for pred in decoded_preds]
        labels_bin = [1 if label.strip() == 'standard' else 0 for label in decoded_labels]

        acc = accuracy_score(labels_bin, preds_bin)
        prec, rec, f1, _ = precision_recall_fscore_support(labels_bin, preds_bin, average='binary')
        return {"accuracy": acc, "precision": prec, "recall": rec, "f1": f1}


    # ─────────────────────────────────────────────────────────
    # 3. 5-Fold CV
    # ─────────────────────────────────────────────────────────
    accuracy_scores, precision_scores, recall_scores, f1_scores = [], [], [], []

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    for fold, (train_idx, val_idx) in enumerate(cv.split(input_texts, class_labels), start=1):
        print(f"=== Fold {fold}/5 ===")
        train_ds = dataset.select(train_idx)
        val_ds = dataset.select(val_idx)

        training_args = TrainingArguments(
            output_dir=f'./cv_fold{fold}',
            num_train_epochs=1,
            per_device_train_batch_size=8,
            per_device_eval_batch_size=16,
            save_strategy='no',
            report_to=['none'],
            eval_steps=500,               # Optional: manually trigger eval every 500 steps
            logging_steps=500,
            fp16=torch.cuda.is_available()  # Only if you're using GPU
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_ds,
            eval_dataset=val_ds,
            data_collator=data_collator,
            compute_metrics=compute_metrics
        )

        trainer.train()
        metrics = trainer.evaluate()

        accuracy_scores.append(metrics['eval_accuracy'])
        precision_scores.append(metrics['eval_precision'])
        recall_scores.append(metrics['eval_recall'])
        f1_scores.append(metrics['eval_f1'])

        print(f"Fold {fold}: Acc {metrics['eval_accuracy']:.4f}, "
              f"Prec {metrics['eval_precision']:.4f}, Rec {metrics['eval_recall']:.4f}, "
              f"F1 {metrics['eval_f1']:.4f}")

    def report(name, vals):
        mean, std = np.mean(vals), np.std(vals)
        print(f"{name}: {mean:.4f} ± {std:.4f}")

    print("\n=== CV summary over 5 folds ===")
    report("Accuracy", accuracy_scores)
    report("Precision", precision_scores)
    report("Recall", recall_scores)
    report("F1-score", f1_scores)

    # ─────────────────────────────────────────────────────────
    # 4. Retrain on Full Dataset
    # ─────────────────────────────────────────────────────────
    print("\nRetraining classifier head on the full dataset…")

    full_args = TrainingArguments(
        output_dir='./full_train',
        num_train_epochs=2,
        per_device_train_batch_size=8,
        save_strategy='no',
        report_to=['none'],
        fp16=torch.cuda.is_available()
    )

    trainer_full = Trainer(
        model=model,
        args=full_args,
        train_dataset=dataset,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    trainer_full.train()

    # ─────────────────────────────────────────────────────────
    # 5. Save final model
    # ─────────────────────────────────────────────────────────
    model.save_pretrained(MODEL_DIR)
    tokenizer.save_pretrained(MODEL_DIR)
    print(f"Saved model in '{MODEL_DIR}'")


=== Fold 1/5 ===


Step,Training Loss


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Fold 1: Acc 0.5000, Prec 0.0000, Rec 0.0000, F1 0.0000
=== Fold 2/5 ===


Step,Training Loss


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Fold 2: Acc 0.5006, Prec 0.0000, Rec 0.0000, F1 0.0000
=== Fold 3/5 ===


Step,Training Loss


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Fold 3: Acc 0.5006, Prec 0.0000, Rec 0.0000, F1 0.0000
=== Fold 4/5 ===


Step,Training Loss


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Fold 4: Acc 0.4994, Prec 0.0000, Rec 0.0000, F1 0.0000
=== Fold 5/5 ===


Step,Training Loss


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Fold 5: Acc 0.4994, Prec 0.0000, Rec 0.0000, F1 0.0000

=== CV summary over 5 folds ===
Accuracy: 0.5000 ± 0.0006
Precision: 0.0000 ± 0.0000
Recall: 0.0000 ± 0.0000
F1-score: 0.0000 ± 0.0000

Retraining classifier head on the full dataset…


Step,Training Loss
500,0.2294
1000,0.2146


Saved model in 't5_saved_model'
