In [33]:
import os
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, classification_report
from transformers import DistilBertModel, DistilBertTokenizerFast
import torch
from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from datasets import Dataset
from sklearn.metrics import precision_recall_fscore_support

In [34]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [35]:
df = pd.read_csv('/content/drive/MyDrive/Grammar_autocorrection_df1_new.csv')

# DistilBERT

In [36]:
# ── 0) Configuration ────────────────────────────────────────────────────────
MODEL_DIR = 'distilbert_saved_model'

# ── 1) If already saved, just reload ────────────────────────────────────────
if os.path.isdir(MODEL_DIR):
    tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_DIR)
    model     = DistilBertForSequenceClassification.from_pretrained(MODEL_DIR)
    print(f"Loaded existing model from '{MODEL_DIR}', skipping training.")

# ── 2) Otherwise: 5-fold CV with metrics + retrain full head ────────────────
else:
    # Prepare texts & labels
    texts  = df['Ungrammatical Statement'].tolist() + df['Standard English'].tolist()
    labels = [0]*len(df) + [1]*len(df)

    # Initialize tokenizer & pre-tokenize
    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
    encodings = tokenizer(texts, truncation=True, padding=True, max_length=128)

    # Build Hugging Face Dataset
    dataset = Dataset.from_dict({
        'input_ids':      encodings['input_ids'],
        'attention_mask': encodings['attention_mask'],
        'labels':         labels
    })

    # Load model & freeze encoder layers
    model = DistilBertForSequenceClassification.from_pretrained(
        'distilbert-base-uncased',
        num_labels=2
    )
    for param in model.distilbert.parameters():
        param.requires_grad = False

    # Data collator for dynamic padding
    data_collator = DataCollatorWithPadding(tokenizer)

    # Metric computation function
    def compute_metrics(p):
        preds = p.predictions.argmax(-1)
        acc   = accuracy_score(p.label_ids, preds)
        prec, rec, f1, _ = precision_recall_fscore_support(
            p.label_ids, preds, average='binary'
        )
        return {
            'accuracy':  acc,
            'precision': prec,
            'recall':    rec,
            'f1':        f1
        }

    # Prepare lists to collect per-fold metrics
    accuracy_scores  = []
    precision_scores = []
    recall_scores    = []
    f1_scores        = []

    # 5-fold CV, 1 epoch per fold for speed
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    for fold, (train_idx, val_idx) in enumerate(cv.split(texts, labels), start=1):
        print(f"=== Fold {fold}/5 ===")
        train_ds = dataset.select(train_idx)
        val_ds   = dataset.select(val_idx)

        training_args = TrainingArguments(
            output_dir=f'./cv_fold{fold}',
            num_train_epochs=1,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=32,
            eval_strategy='epoch',
            logging_strategy='epoch',
            save_strategy='no',
            fp16=True,               # set to False if no GPU
            report_to=['none'],
            seed=42
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_ds,
            eval_dataset=val_ds,
            data_collator=data_collator,
            compute_metrics=compute_metrics
        )

        trainer.train()
        metrics = trainer.evaluate()

        # Collect metrics
        accuracy_scores.append(metrics['eval_accuracy'])
        precision_scores.append(metrics['eval_precision'])
        recall_scores.append(metrics['eval_recall'])
        f1_scores.append(metrics['eval_f1'])

        # Print per-fold results
        print(f"Fold {fold}: "
              f"Acc {metrics['eval_accuracy']:.4f}, "
              f"Prec {metrics['eval_precision']:.4f}, "
              f"Rec {metrics['eval_recall']:.4f}, "
              f"F1 {metrics['eval_f1']:.4f}")

    # Summarize CV
    def report(name, vals):
        mean, std = np.mean(vals), np.std(vals)
        print(f"{name}: {mean:.4f} ± {std:.4f}")

    print("\n=== CV summary over 5 folds ===")
    report("Accuracy",  accuracy_scores)
    report("Precision", precision_scores)
    report("Recall",    recall_scores)
    report("F1-score",  f1_scores)

    # Retrain head on full dataset
    print("\nRetraining classifier head on the full dataset…")
    full_args = TrainingArguments(
        output_dir='./full_train',
        num_train_epochs=2,
        per_device_train_batch_size=16,
        save_strategy='no',
        fp16=True,
        report_to=['none']
    )
    trainer_full = Trainer(
        model=model,
        args=full_args,
        train_dataset=dataset,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )
    trainer_full.train()

    # Save the fine-tuned model & tokenizer
    model.save_pretrained(MODEL_DIR)
    tokenizer.save_pretrained(MODEL_DIR)
    print(f"Saved model in '{MODEL_DIR}'")


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


=== Fold 1/5 ===


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6894,0.690914,0.534653,0.524648,0.737624,0.613169


Fold 1: Acc 0.5347, Prec 0.5246, Rec 0.7376, F1 0.6132
=== Fold 2/5 ===


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6857,0.681561,0.596035,0.597964,0.583127,0.590452


Fold 2: Acc 0.5960, Prec 0.5980, Rec 0.5831, F1 0.5905
=== Fold 3/5 ===


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6802,0.674387,0.587361,0.585784,0.593052,0.589396


Fold 3: Acc 0.5874, Prec 0.5858, Rec 0.5931, F1 0.5894
=== Fold 4/5 ===


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6764,0.66577,0.612144,0.598272,0.685644,0.638985


Fold 4: Acc 0.6121, Prec 0.5983, Rec 0.6856, F1 0.6390
=== Fold 5/5 ===


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.672,0.658086,0.635688,0.622222,0.693069,0.655738


Fold 5: Acc 0.6357, Prec 0.6222, Rec 0.6931, F1 0.6557

=== CV summary over 5 folds ===
Accuracy: 0.5932 ± 0.0336
Precision: 0.5858 ± 0.0328
Recall: 0.6585 ± 0.0603
F1-score: 0.6175 ± 0.0263

Retraining classifier head on the full dataset…


Step,Training Loss
500,0.667


Saved model in 'distilbert_saved_model'


In [37]:
# 1) Load your saved model & tokenizer
MODEL_DIR = 'distilbert_saved_model'
tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_DIR)
model     = DistilBertForSequenceClassification.from_pretrained(MODEL_DIR)
model.eval()

# 2) Prompt the user for input
sentence = input("Enter a sentence to check (grammar):\n> ")

# 3) Tokenize & forward pass
enc = tokenizer([sentence], padding=True, truncation=True, return_tensors='pt', max_length=128)
with torch.no_grad():
    logits = model(**enc).logits

# 4) Convert to probabilities & predicted label
probs = torch.softmax(logits, dim=-1)[0]
pred  = probs.argmax().item()
label = "Correct" if pred == 1 else "Ungrammatical"
score = probs[pred].item()

# 5) Print result
print(f"\n\"{sentence}\" → {label} (confidence: {score:.2f})")

Enter a sentence to check (grammar):
> she have a dog

"she have a dog" → Ungrammatical (confidence: 0.54)
