
# Task 2 – Complex vs simple classifier (free analysis)

This script trains a simple classifier to predict Label from Sentence
(using the English corpus), evaluates it on a held-out test set, and
prints some misclassified examples for basic error analysis.

Assumed label semantics: Label = 0 (simple), Label = 1 (complex)

In [3]:
from google.colab import drive
# Mount Google Drive at /content/drive (Colab-only helper) so that files stored
# on Drive can be read through the local filesystem.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
from pathlib import Path
import argparse

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split


# Default directory searched for the dataset CSV; it lives under the
# /content/drive mount point, so Google Drive must be mounted first.
DATA_DIR_DEFAULT = Path("/content/drive/MyDrive/idem-candidate-task-data/data")

In [6]:
def load_en_dataset(data_dir: Path) -> pd.DataFrame:
    """Load the English dataset and check required columns.

    Args:
        data_dir: Directory containing ``En-Dataset.csv``. A plain string
            path is accepted as well as a ``Path``.

    Returns:
        The CSV contents as a DataFrame, with every row that has a missing
        ``Sentence`` or ``Label`` removed. Other columns are kept untouched.

    Raises:
        FileNotFoundError: If ``En-Dataset.csv`` does not exist in ``data_dir``.
        ValueError: If the CSV lacks the ``Sentence`` or ``Label`` column.
    """
    # Coerce so that callers passing a str (e.g. from argparse) do not crash
    # on the "/" operator below.
    path = Path(data_dir) / "En-Dataset.csv"
    if not path.exists():
        raise FileNotFoundError(f"Expected file not found: {path}")

    df = pd.read_csv(path)

    required_cols = {"Sentence", "Label"}
    # Sorted so the error message is stable from run to run (set order is not).
    missing = sorted(required_cols.difference(df.columns))
    if missing:
        raise ValueError(f"{path} is missing required columns: {missing}")

    # Drop rows with missing sentence or label
    return df.dropna(subset=["Sentence", "Label"])



# Complex vs simple classifier

Train a simple model (e.g. logistic regression or a small transformer) to predict the Label from Sentence (EN or FR).

Evaluate on a held-out test set.

Include at least a little error analysis: show some misclassified sentences and comment on patterns.

In [7]:
def train_classifier(df: pd.DataFrame):
    """
    Fit a TF-IDF + logistic-regression model on a stratified 80/20 split.

    The ``Sentence`` column is cast to ``str`` and the ``Label`` column to
    ``int`` before splitting. The split uses a fixed seed (42), so repeated
    calls on the same frame produce the same partition.

    Returns:
        A 5-tuple ``(vectorizer, clf, X_test, y_test, y_pred)``: the fitted
        TF-IDF vectorizer, the fitted classifier, the held-out sentences and
        labels (Series), and the predictions for the held-out sentences
        (ndarray).
    """
    sentences = df["Sentence"].astype(str)
    labels = df["Label"].astype(int)

    # Stratify on the label so both partitions keep the class proportions.
    X_train, X_test, y_train, y_test = train_test_split(
        sentences, labels, test_size=0.2, random_state=42, stratify=labels
    )

    # Unigram + bigram TF-IDF features, capped at 20k terms. The vocabulary is
    # learned from the training partition only.
    vectorizer = TfidfVectorizer(
        lowercase=True,
        ngram_range=(1, 2),
        max_features=20000,
    )

    # class_weight="balanced" re-weights classes inversely to their frequency.
    clf = LogisticRegression(class_weight="balanced", max_iter=1000)
    clf.fit(vectorizer.fit_transform(X_train), y_train)

    y_pred = clf.predict(vectorizer.transform(X_test))

    return vectorizer, clf, X_test, y_test, y_pred

In [12]:
def print_metrics(y_test, y_pred):
    """Print classification report and confusion matrix.

    Args:
        y_test: True labels of the held-out set (0 = simple, 1 = complex).
        y_pred: Predicted labels, aligned element-wise with ``y_test``.

    The label order is pinned to ``[0, 1]`` for both the report and the
    matrix. This keeps each display name attached to the right label and the
    matrix at 2x2 even if one class is absent from ``y_test`` / ``y_pred``.
    """
    label_ids = [0, 1]
    label_names = ["simple (Label=0)", "complex (Label=1)"]

    print("\nClassification report (EN) ")
    print(
        classification_report(
            y_test,
            y_pred,
            labels=label_ids,
            target_names=label_names,
            digits=3,
        )
    )

    print("=== Confusion matrix (rows=true, cols=predicted) ===")
    print(confusion_matrix(y_test, y_pred, labels=label_ids))


# Misclassified sentences and comment on patterns.

In [13]:
def sample_misclassifications(
    X_test,
    y_test,
    y_pred,
    n_examples: int = 8,
    random_state: int = 42,
):
    """
    Return a small list of misclassified examples for error analysis.

    Args:
        X_test: Sentences of the test set (Series, list or array).
        y_test: True labels, aligned position-by-position with ``X_test``.
        y_pred: Predicted labels, aligned position-by-position with ``X_test``.
        n_examples: Maximum number of examples to return; values <= 0 yield
            an empty list.
        random_state: Seed for the sampler, so the same inputs always give
            the same examples.

    Returns:
        A list of dicts ``{true_label, pred_label, sentence}``, drawn without
        replacement from the positions where ``y_test`` and ``y_pred``
        disagree. The list is empty when there are no disagreements.

    Raises:
        ValueError: If the three inputs do not have the same length.
    """
    # Work on plain arrays so alignment is purely positional: a pandas index
    # left over from train_test_split is deliberately ignored.
    sentences = np.asarray(X_test, dtype=object)
    true_labels = np.asarray(y_test)
    pred_labels = np.asarray(y_pred)

    if not (len(sentences) == len(true_labels) == len(pred_labels)):
        raise ValueError(
            "X_test, y_test and y_pred must have the same length, got "
            f"{len(sentences)}, {len(true_labels)} and {len(pred_labels)}"
        )

    mis_idx = np.flatnonzero(true_labels != pred_labels)

    n_pick = min(max(n_examples, 0), len(mis_idx))
    if n_pick == 0:
        return []

    rng = np.random.RandomState(random_state)
    chosen = rng.choice(mis_idx, size=n_pick, replace=False)

    return [
        {
            "true_label": int(true_labels[idx]),
            "pred_label": int(pred_labels[idx]),
            "sentence": sentences[idx],
        }
        for idx in chosen
    ]

In [15]:
def print_error_analysis(examples):
    """
    Display misclassified examples followed by a fixed block of observations.

    Args:
        examples: Iterable of dicts with the keys ``true_label``,
            ``pred_label`` and ``sentence``. Labels 0 and 1 are shown as
            "simple (0)" and "complex (1)"; any other value is printed as-is.

    When ``examples`` is empty, a single notice is printed and nothing else.
    Patterns worth checking by hand in the printed sentences:
    - short but dense/technical sentences predicted as complex
    - long but structurally simple sentences predicted as simple
    - label noise in the original annotations
    """
    if not examples:
        print("\nNo misclassifications found in the sampled test set.")
        return

    # Built once rather than per example.
    readable = {0: "simple (0)", 1: "complex (1)"}
    separator = "-" * 80

    print("\n Sample misclassified sentences (for error analysis) \n")
    for item in examples:
        actual = item["true_label"]
        predicted = item["pred_label"]
        text = item["sentence"]
        print(f"True label: {readable.get(actual, actual)}")
        print(f"Predicted : {readable.get(predicted, predicted)}")
        print(f"Sentence  : {text}")
        print(separator)

    observations = (
        "\nMy Observations\n"
        "When we look at these examples above\n"
        "- Short sentences with dense or specialised vocabulary classified as complex.\n"
        "- Long sentences with simple, repetitive structure classified as simple.\n"
        "- Apparent label noise: some sentences the model gets 'wrong' may actually\n"
        "  look more like the opposite class on inspection.\n"
        "This supports the idea that labels are noisy and that complexity depends on\n"
        "both structure and vocabulary, not just length.\n"
    )
    print(observations)


In [16]:
def main(data_dir: "str | Path" = DATA_DIR_DEFAULT):
    """Run the full pipeline: load data, train, report metrics, show errors.

    Args:
        data_dir: Directory containing ``En-Dataset.csv``, given either as a
            string or as a ``Path``. Defaults to ``DATA_DIR_DEFAULT``.
    """
    df_en = load_en_dataset(Path(data_dir))

    print(f"Loaded English dataset with {len(df_en)} sentences.")

    # The fitted vectorizer and classifier are returned too, but only the
    # held-out split and its predictions are needed for reporting.
    _vectorizer, _clf, X_test, y_test, y_pred = train_classifier(df_en)

    print_metrics(y_test, y_pred)

    mis_examples = sample_misclassifications(X_test, y_test, y_pred, n_examples=8)
    print_error_analysis(mis_examples)


if __name__ == "__main__":
    # Command-line entry point: the only option is the dataset directory,
    # which falls back to DATA_DIR_DEFAULT when omitted.
    parser = argparse.ArgumentParser(description="Task 2 – Complex vs simple classifier (free analysis).")
    parser.add_argument("--data-dir", default=str(DATA_DIR_DEFAULT), type=str, help="Directory containing En-Dataset.csv")
    # parse_known_args (rather than parse_args) tolerates the extra arguments
    # a Jupyter kernel passes on its command line; they land in `unknown`.
    args, unknown = parser.parse_known_args()
    main(data_dir=args.data_dir)

Loaded English dataset with 290708 sentences.

 Classification report (EN) 
                   precision    recall  f1-score   support

 simple (Label=0)      0.230     0.711     0.348      3594
complex (Label=1)      0.978     0.843     0.906     54548

         accuracy                          0.835     58142
        macro avg      0.604     0.777     0.627     58142
     weighted avg      0.932     0.835     0.871     58142

=== Confusion matrix (rows=true, cols=predicted) ===
[[ 2556  1038]
 [ 8543 46005]]

 Sample misclassified sentences (for error analysis) 

True label: complex (1)
Predicted : simple (0)
Sentence  : He died owing medical bills and back taxes, and contributions were solicited from the public.
--------------------------------------------------------------------------------
True label: complex (1)
Predicted : simple (0)
Sentence  : The "lozenge" paintings are square canvases tilted 45 degrees, so that they have a diamond shape.
------------------------------------