Step 1: Computes the naïve proportion of simple sentences

Step 2: Builds a simple classifier on clean subsets

Step 3: Uses it to estimate an adjusted true proportion

Step 4: Detects Vikidia-style simple sentences inside the complex-labelled set

Step 5: Prints + saves a small summary

Schema: Label = 1 (simple), Label = 0 (complex)

In [1]:
# Mount Google Drive at /content/drive so that files stored under it
# (the default data/output directories used below live in MyDrive) are
# reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Notebook shell escape: install scikit-learn, which provides the
# sklearn imports used by the cells below.
! pip install scikit-learn




In [1]:
from pathlib import Path
import argparse

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Default input and output locations; both point at the same folder, so
# enriched files are written next to the source CSVs.
DATA_DIR_DEFAULT = Path("/content/drive/MyDrive") / "idem-candidate-task-data" / "data"
OUTPUT_DIR_DEFAULT = Path("/content/drive/MyDrive") / "idem-candidate-task-data" / "data"

# CSV file name for each supported language code.
LANG_FILES = dict(
    en="En-Dataset.csv",
    fr="Fr-Dataset.csv",
)

# Columns every dataset must contain, in the order they are kept.
EXPECTED_COLUMNS = ["ID", "Name", "Sentence", "Label", "LengthWords", "LengthChars"]


def load_dataset(data_dir: Path, lang: str) -> pd.DataFrame:
    """
    Read the CSV for one language and check it has the expected schema.

    Parameters
    ----------
    data_dir : directory that holds the per-language CSV files.
    lang : key into LANG_FILES ('en' or 'fr').

    Returns
    -------
    DataFrame restricted to EXPECTED_COLUMNS, in that order.

    Raises
    ------
    KeyError : if `lang` is not a key of LANG_FILES.
    FileNotFoundError : if the CSV file does not exist.
    ValueError : if any expected column is absent from the file.
    """
    csv_path = data_dir / LANG_FILES[lang]
    if not csv_path.exists():
        raise FileNotFoundError(f"Expected file not found: {csv_path}")

    frame = pd.read_csv(csv_path)

    absent = set(EXPECTED_COLUMNS) - set(frame.columns)
    if absent:
        raise ValueError(f"{csv_path} is missing expected columns: {absent}")

    # Selecting by the canonical list both drops extra columns and fixes
    # the column order.
    return frame[EXPECTED_COLUMNS]

In [3]:
def select_clean_subsets(
    df: pd.DataFrame,
    min_per_class: int = 200,
    simple_max_len: int = 10,
    complex_min_len: int = 20,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Define high-confidence "clean" subsets for simple and complex.

    Heuristic:
    - Confident simple: Label == 1 and LengthWords <= simple_max_len
    - Confident complex: Label == 0 and LengthWords >= complex_min_len

    If either subset ends up with fewer than `min_per_class` rows, both
    subsets are recomputed from the quartiles of LengthWords instead
    (simple: <= 25th percentile, complex: >= 75th percentile).

    Parameters
    ----------
    df : data with at least the columns "Label" and "LengthWords".
    min_per_class : minimum rows required in each subset before the
        quartile fallback is triggered.
    simple_max_len : fixed upper word-count bound for confident simple
        sentences (default 10).
    complex_min_len : fixed lower word-count bound for confident complex
        sentences (default 20).

    Returns
    -------
    (clean_simple, clean_complex) as row subsets of `df`.

    Note: the fallback result is returned as-is; it is not re-checked
    against `min_per_class`.
    """
    lengths = df["LengthWords"]
    is_simple = df["Label"] == 1
    is_complex = df["Label"] == 0

    # First try the fixed thresholds.
    clean_simple = df[is_simple & (lengths <= simple_max_len)]
    clean_complex = df[is_complex & (lengths >= complex_min_len)]

    if len(clean_simple) < min_per_class or len(clean_complex) < min_per_class:
        # Fall back to quantile-based thresholds to get enough data.
        q1 = lengths.quantile(0.25)
        q3 = lengths.quantile(0.75)
        clean_simple = df[is_simple & (lengths <= q1)]
        clean_complex = df[is_complex & (lengths >= q3)]

    return clean_simple, clean_complex

# Compute the naive estimate

In [4]:
def train_simple_classifier(
    clean_simple: pd.DataFrame,
    clean_complex: pd.DataFrame,
    max_features: int = 20000,
) -> tuple[TfidfVectorizer, LogisticRegression]:
    """
    Fit a TF-IDF + logistic-regression model on the clean examples.

    The two subsets are stacked into one training frame; the "Sentence"
    column supplies the text and the "Label" column the target
    (1 = simple, 0 = complex).

    Parameters
    ----------
    clean_simple : high-confidence simple rows.
    clean_complex : high-confidence complex rows.
    max_features : vocabulary size cap passed to the TF-IDF vectorizer.

    Returns
    -------
    (fitted vectorizer, fitted classifier)
    """
    training = pd.concat([clean_simple, clean_complex], ignore_index=True)

    # Lower-cased unigram + bigram TF-IDF features.
    tfidf = TfidfVectorizer(
        lowercase=True,
        ngram_range=(1, 2),
        max_features=max_features,
    )
    features = tfidf.fit_transform(training["Sentence"])
    targets = training["Label"].astype(int)  # 1 = simple, 0 = complex

    # class_weight="balanced" reweights the classes by inverse frequency,
    # since the two clean subsets can differ a lot in size.
    model = LogisticRegression(class_weight="balanced", max_iter=1000)
    model.fit(features, targets)
    return tfidf, model

# Design a better estimation method, using any reasonable approach.

In [5]:
def estimate_vikidia_like(
    df: pd.DataFrame,
    vectorizer: TfidfVectorizer,
    p_simple_col: str = "p_simple",
    label_col: str = "Label",
    simple_label: int = 1,
    complex_label: int = 0,
    top_k_simple: int = 2000,
    similarity_quantile: float = 0.95,
) -> tuple[float, np.ndarray]:
    """
    Estimate the proportion of Vikidia-style simple sentences
    among complex-labelled sentences.

    Operationalisation:
    - Among sentences labelled `simple_label`, take the `top_k_simple`
      with the highest `p_simple_col` as prototypical simple sentences.
    - Represent sentences using the same TF-IDF vectorizer.
    - For each complex-labelled sentence, compute its maximum cosine
      similarity to these prototypical simple ones.
    - Mark as Vikidia-like if the maximum similarity is at or above a
      high quantile of those maxima (e.g. the 95th percentile).

    Parameters
    ----------
    df : data with the columns "Sentence", `label_col` and `p_simple_col`.
    vectorizer : fitted TF-IDF vectorizer used to embed sentences.
    p_simple_col : column holding the classifier's P(simple).
    label_col : column holding the dataset label.
    simple_label, complex_label : label values of the two classes.
    top_k_simple : size of the prototypical-simple reference set.
    similarity_quantile : quantile of the max-similarity distribution
        used as the decision threshold.

    Returns
    -------
    (proportion, mask) where `proportion` is the share of complex-labelled
    rows flagged as Vikidia-like and `mask` is a boolean array aligned
    positionally with the rows of `df` (False for every non-complex row).
    If there are no complex-labelled rows, or no simple-labelled rows to
    compare against, returns (0.0, all-False mask).

    NOTE(review): because the threshold is itself a quantile of `max_sim`,
    roughly (1 - similarity_quantile) of the complex rows are flagged by
    construction, so the returned proportion mostly reflects the chosen
    quantile rather than the data. An absolute similarity threshold would
    be needed for a data-driven estimate.
    """
    vikidia_like_mask = np.zeros(len(df), dtype=bool)

    # Positional boolean selector; using positions (not index labels)
    # keeps the output mask correct even when df has a non-default index.
    complex_mask = (df[label_col] == complex_label).to_numpy()
    df_complex = df[complex_mask]

    # Reference set: only simple-labelled rows. Drawing it from all rows
    # would let complex-labelled sentences enter the reference set and
    # then match themselves with similarity 1.0.
    simple_rows = df[(df[label_col] == simple_label).to_numpy()]
    ref_simple = simple_rows.sort_values(p_simple_col, ascending=False).head(
        top_k_simple
    )

    if df_complex.empty or ref_simple.empty:
        return 0.0, vikidia_like_mask

    X_ref = vectorizer.transform(ref_simple["Sentence"])
    X_complex = vectorizer.transform(df_complex["Sentence"])

    # One row per complex sentence, one column per reference sentence.
    sims = cosine_similarity(X_complex, X_ref)
    max_sim = sims.max(axis=1)

    # Threshold picked adaptively based on the similarity distribution.
    threshold = np.quantile(max_sim, similarity_quantile)
    vikidia_like_complex = max_sim >= threshold

    vikidia_like_mask[complex_mask] = vikidia_like_complex

    return float(vikidia_like_complex.mean()), vikidia_like_mask


# Produce final estimates

In [8]:
def analyse_language(
    lang: str,
    data_dir: Path,
    output_dir: Path,
) -> dict:
    """
    Run the whole analysis for a single language and report the results.

    Steps, in order:
    1. Load the dataset and print the naive share of Label == 1 rows.
    2. Pick the clean simple/complex subsets and print their sizes.
    3. Train the classifier on them, score every sentence, and use the
       mean predicted probability of "simple" as the adjusted share.
    4. Flag complex-labelled sentences that look Vikidia-style simple.
    5. Write the enriched table to
       `<output_dir>/<lang>_with_probs_and_vikidia_like.csv`.

    Parameters
    ----------
    lang : language code understood by `load_dataset`.
    data_dir : directory containing the input CSV.
    output_dir : directory for the enriched CSV (created if missing).

    Returns
    -------
    dict with keys "language", "n_sentences", "naive_prop_simple",
    "adjusted_prop_simple" and "prop_vikidia_like_in_complex".
    """
    frame = load_dataset(data_dir, lang)

    print(f"\n{lang.upper()} Dataset – Task 1 Analysis")

    # Step 1: share of rows labelled simple, taken at face value.
    naive_share = (frame["Label"] == 1).mean()
    print(f"Naïve simple proportion (Label == 1): {naive_share:.4f}")

    # Step 2: high-confidence training examples.
    simple_part, complex_part = select_clean_subsets(frame)
    print(
        f"Clean subset sizes: simple={len(simple_part)}, complex={len(complex_part)}"
    )

    # Step 3: fit on the clean examples, then score the full dataset.
    tfidf, model = train_simple_classifier(simple_part, complex_part)
    all_features = tfidf.transform(frame["Sentence"])
    # Column 1 of predict_proba is taken as P(simple); this assumes the
    # model was fitted with both classes 0 and 1 present.
    frame = frame.assign(p_simple=model.predict_proba(all_features)[:, 1])

    adjusted_share = float(frame["p_simple"].mean())
    print(f"Adjusted simple proportion (mean p_simple): {adjusted_share:.4f}")

    # Step 4: complex-labelled rows that resemble prototypical simple ones.
    vikidia_share, flagged = estimate_vikidia_like(
        frame,
        tfidf,
        p_simple_col="p_simple",
        label_col="Label",
        simple_label=1,
        complex_label=0,
        top_k_simple=2000,
        similarity_quantile=0.95,
    )
    frame["vikidia_like_in_complex"] = flagged

    print(
        f"Proportion of complex-labelled sentences that look Vikidia-style simple: "
        f"{vikidia_share:.4f}"
    )

    # Step 5: persist the enriched table for manual inspection.
    output_dir.mkdir(parents=True, exist_ok=True)
    enriched_path = output_dir / f"{lang}_with_probs_and_vikidia_like.csv"
    frame.to_csv(enriched_path, index=False)
    print(f"Enriched dataset saved to: {enriched_path}")

    return {
        "language": lang,
        "n_sentences": len(frame),
        "naive_prop_simple": naive_share,
        "adjusted_prop_simple": adjusted_share,
        "prop_vikidia_like_in_complex": vikidia_share,
    }



In [9]:
def main(data_dir: str = DATA_DIR_DEFAULT, output_dir: str = OUTPUT_DIR_DEFAULT):
    """
    Run the analysis for English and French, then print and save a summary.

    Parameters
    ----------
    data_dir : directory containing the per-language CSV files; a str or
        Path is accepted (it is converted with `Path(...)`).
    output_dir : directory for the enriched datasets and the summary CSV;
        a str or Path is accepted.

    Side effects
    ------------
    Prints one summary row per language and writes the same table to
    `<output_dir>/task1_summary.csv`.
    """
    data_dir = Path(data_dir)
    output_dir = Path(output_dir)

    summaries = [
        analyse_language(lang, data_dir, output_dir) for lang in ("en", "fr")
    ]
    summary_df = pd.DataFrame(summaries)

    print("\nFinal Summary")
    with pd.option_context("display.max_columns", None):
        print(
            summary_df.to_string(
                index=False,
                float_format=lambda x: f"{x:.4f}",
            )
        )

    # The summary was previously only printed, although the notebook's
    # notes and the --output-dir help text both describe it as saved.
    output_dir.mkdir(parents=True, exist_ok=True)
    summary_path = output_dir / "task1_summary.csv"
    summary_df.to_csv(summary_path, index=False)
    print(f"Summary saved to: {summary_path}")


if __name__ == "__main__":
    # Command-line entry point: resolve the data/output directories and
    # run the full analysis.
    parser = argparse.ArgumentParser(
        description="Task 1 — Estimate true proportion of simple sentences."
    )
    parser.add_argument(
        "--data-dir",
        type=str,
        default=str(DATA_DIR_DEFAULT),
        help="Directory containing En-Dataset.csv and Fr-Dataset.csv",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default=str(OUTPUT_DIR_DEFAULT),
        help="Directory to save enriched datasets and summary CSV.",
    )
    # parse_known_args() honours --data-dir/--output-dir when run as a
    # script, while tolerating unrelated arguments that a notebook kernel
    # typically puts in sys.argv (e.g. "-f <connection file>").
    # parse_args([]) ignored every command-line flag.
    args, _unrecognised = parser.parse_known_args()
    main(data_dir=args.data_dir, output_dir=args.output_dir)



EN Dataset – Task 1 Analysis
Naïve simple proportion (Label == 1): 0.9382
Clean subset sizes: simple=8955, complex=6166
Adjusted simple proportion (mean p_simple): 0.3770
Proportion of complex-labelled sentences that look Vikidia-style simple: 0.0500
Enriched dataset saved to: /content/drive/MyDrive/idem-candidate-task-data/data/en_with_probs_and_vikidia_like.csv

FR Dataset – Task 1 Analysis
Naïve simple proportion (Label == 1): 0.8656
Clean subset sizes: simple=60039, complex=102665
Adjusted simple proportion (mean p_simple): 0.3690
Proportion of complex-labelled sentences that look Vikidia-style simple: 0.0500
Enriched dataset saved to: /content/drive/MyDrive/idem-candidate-task-data/data/fr_with_probs_and_vikidia_like.csv

Final Summary
language  n_sentences  naive_prop_simple  adjusted_prop_simple  prop_vikidia_like_in_complex
      en       290708             0.9382                0.3770                        0.0500
      fr      1699063             0.8656                0.3690