In [None]:
import pandas as pd

# Keyword lists for the 8 categories.
# Insertion order doubles as match priority: predict_category_by_keywords
# walks this dict in order and returns the first category with a hit.
# "PGWP" therefore has to stay ahead of "Work Permit", because the phrase
# "post graduation work permit" also contains the keyword "work permit".
category_keywords = {
    "PGWP": [
        "pgwp", "post graduation work permit"
    ],
    "Express Entry": [
        "express entry", "crs score", "ita", "ee profile", "federal skilled worker", "cec"
    ],
    "Student Permit": [
        "study permit", "student visa", "dli"
    ],
    "PNP": [
        "pnp", "provincial nominee", "oinp", "bc pnp", "mpnp", "sinp"
    ],
    "Work Permit": [
        "lmia", "closed work permit", "gts", "global talent stream",
        "work permit", "intra-company transfer", "lmia-exempt"
    ],
    "Family Sponsorship": [
        "spousal sponsorship", "common law sponsorship",
        "parents sponsorship", "family sponsorship"
    ],
    "Refugee": [
        "refugee", "asylum", "protected person", "h&c"
    ],
    # Fallback bucket: it has no keywords of its own and is only returned
    # when nothing else matches.
    "Other": []
}

def predict_category_by_keywords(text, cat_keywords):
    """
    Guess a category for ``text`` using whole-word keyword matching.

    Categories are tried in the iteration order of ``cat_keywords`` and the
    first one with a matching keyword wins, so more specific categories
    should come before more general ones.

    Args:
        text: Value to classify. It is converted with ``str()`` and
            lower-cased, so non-string values (e.g. NaN) are accepted.
        cat_keywords: Mapping of category name -> iterable of keywords.
            The 'Other' entry, if present, is never matched against.

    Returns:
        The name of the first matching category, or 'Other' if no keyword
        is found.
    """
    import re  # local import so the function does not rely on a file-level one

    text_lower = str(text).lower()
    for cat, kw_list in cat_keywords.items():
        if cat == "Other":
            continue
        for kw in kw_list:
            kw_lower = str(kw).lower().strip()
            if not kw_lower:
                # An empty keyword would match every text.
                continue
            # Require word boundaries (allowing a trailing plural "s") so
            # short acronyms do not fire inside unrelated words, e.g. "ita"
            # in "capital" or "dli" in "deadline".
            pattern = r"(?<!\w)" + re.escape(kw_lower) + r"s?(?!\w)"
            if re.search(pattern, text_lower):
                return cat
    return "Other"

def guess_categories(df, text_col="clean_text"):
    """
    Label every row of ``df`` with a keyword-based category guess.

    The guess for each value of ``df[text_col]`` is written to a
    "predicted_cat" column on ``df`` itself (the frame is modified in
    place), and the same frame is returned.
    """
    def _label(cell):
        # Classify one cell against the module-level keyword table.
        return predict_category_by_keywords(cell, category_keywords)

    predictions = df[text_col].apply(_label)
    df["predicted_cat"] = predictions
    return df

def stratified_sample_with_min_quota(df, group_col, n_total=1000, min_per_cat=None,
                                     random_state=42):
    """
    Draw a proportional stratified sample with optional per-category minimums.

    Each category found in ``df[group_col]`` is allocated
    ``round(share * n_total)`` rows, raised to its minimum quota when one is
    given, and finally capped at the number of rows the category actually
    has. The resulting size can therefore differ from ``n_total``.

    Args:
        df: Source DataFrame.
        group_col: Name of the column holding the category labels. Rows whose
            label is missing (NaN) are never sampled.
        n_total: Desired total sample size. If the per-category allocations
            add up to more than this, the total is expanded and a warning is
            printed.
        min_per_cat: Optional dict like {"Refugee": 50, "PNP": 50} giving the
            minimum number of rows wanted for those categories. A quota that
            exceeds the rows available, or that names a category absent from
            the data, cannot be met; a warning is printed in both cases.
        random_state: Seed used for the per-category draws and for the final
            shuffle.

    Returns:
        A new, shuffled DataFrame with a fresh 0..n-1 index. It is empty
        (same columns as ``df``) when there are no labelled rows.
    """
    min_quota = min_per_cat or {}
    group_counts = df[group_col].value_counts()
    total_count = int(group_counts.sum())

    if total_count == 0:
        # Nothing to stratify; pd.concat([]) would raise further down.
        print("[WARNING] No labelled rows to sample from; returning an empty sample.")
        return df.iloc[0:0].reset_index(drop=True)

    missing = [cat for cat in min_quota if cat not in group_counts.index]
    if missing:
        print(f"[WARNING] min_per_cat categories not found in '{group_col}': {missing}")

    cat_sampling_numbers = {}
    for cat_value, cat_count in group_counts.items():
        # value_counts already gives the group size, so no re-filtering needed.
        cat_count = int(cat_count)
        ratio = cat_count / total_count
        n_group = int(round(ratio * n_total))

        min_q = min_quota.get(cat_value, 0)
        if n_group < min_q:
            n_group = min_q
        if min_q > cat_count:
            print(f"[WARNING] '{cat_value}' has only {cat_count} rows; "
                  f"cannot meet minimum quota of {min_q}.")

        cat_sampling_numbers[cat_value] = min(n_group, cat_count)

    sum_of_requested = sum(cat_sampling_numbers.values())
    if sum_of_requested > n_total:
        print(f"[WARNING] sum_of_requested({sum_of_requested}) > n_total({n_total}). "
              f"Will expand total sample size to {sum_of_requested}.")
        n_total = sum_of_requested

    all_samples = []
    for cat_value, n_req in cat_sampling_numbers.items():
        subset_df = df[df[group_col] == cat_value]
        if len(subset_df) <= n_req:
            # The whole category is needed; take it as-is.
            sampled_df = subset_df
        else:
            sampled_df = subset_df.sample(n_req, random_state=random_state)
        all_samples.append(sampled_df)

    final_sample = (
        pd.concat(all_samples)
        .sample(frac=1, random_state=random_state)
        .reset_index(drop=True)
    )
    print(f"[INFO] Final sample size: {len(final_sample)} (target was {n_total})")
    print("[INFO] Sample distribution:")
    print(final_sample[group_col].value_counts())
    return final_sample

def main():
    """
    Build the manual-labeling sample.

    Reads the cleaned CSV, adds keyword-based category guesses, draws a
    stratified sample of about 1000 rows with minimum quotas for three
    categories, and writes it to 'sample_for_manual_label.csv'.
    """
    raw = pd.read_csv("/content/cleaned_merged_data_Final.csv")
    print("[INFO] Original data size:", len(raw))

    text_col = "clean_text"
    labelled = guess_categories(raw, text_col=text_col)

    # Categories that must get at least 50 rows each in the sample.
    quotas = dict.fromkeys(("Refugee", "Family Sponsorship", "PNP"), 50)

    sample = stratified_sample_with_min_quota(
        labelled,
        group_col="predicted_cat",
        n_total=1000,  # desired total
        min_per_cat=quotas,
    )

    sample.to_csv("sample_for_manual_label.csv", index=False)
    print("[DONE] 'sample_for_manual_label.csv' is ready for manual labeling.")

# Run the pipeline only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()


[INFO] Original data size: 10664
[INFO] Final sample size: 1090 (target was 1090)
[INFO] Sample distribution:
predicted_cat
Other                 503
Express Entry         164
PGWP                   96
Student Permit         90
Work Permit            87
Family Sponsorship     50
PNP                    50
Refugee                50
Name: count, dtype: int64
[DONE] 'sample_for_manual_label.csv' is ready for manual labeling.


In [None]:
import pandas as pd

# Keyword table for the 8 categories.
# Insertion order doubles as match priority: predict_category_by_keywords
# walks this dict in order and returns the first category with a hit.
category_keywords = {
    "PGWP": ["pgwp", "post graduation work permit"],
    "Express Entry": [
        "express entry",
        "crs score",
        "ita",
        "ee profile",
        "federal skilled worker",
        "cec",
    ],
    "Student Permit": ["study permit", "student visa", "dli"],
    "PNP": [
        "pnp",
        "provincial nominee",
        "oinp",
        "bc pnp",
        "mpnp",
        "sinp",
    ],
    "Work Permit": [
        "lmia",
        "closed work permit",
        "gts",
        "global talent stream",
        "work permit",
        "intra-company transfer",
        "lmia-exempt",
    ],
    "Family Sponsorship": [
        "spousal sponsorship",
        "common law sponsorship",
        "parents sponsorship",
        "family sponsorship",
    ],
    "Refugee": ["refugee", "asylum", "protected person", "h&c"],
    # Fallback bucket: no keywords of its own.
    "Other": [],
}

def predict_category_by_keywords(text, cat_keywords):
    """
    Guess a category for ``text`` using whole-word keyword matching.

    Categories are tried in the iteration order of ``cat_keywords`` and the
    first one with a matching keyword wins, so more specific categories
    should come before more general ones.

    Args:
        text: Value to classify. It is converted with ``str()`` and
            lower-cased, so non-string values (e.g. NaN) are accepted.
        cat_keywords: Mapping of category name -> iterable of keywords.
            The 'Other' entry, if present, is never matched against.

    Returns:
        The name of the first matching category, or 'Other' if no keyword
        is found.
    """
    import re  # local import so the function does not rely on a file-level one

    text_lower = str(text).lower()
    for cat, kw_list in cat_keywords.items():
        if cat == "Other":
            continue
        for kw in kw_list:
            kw_lower = str(kw).lower().strip()
            if not kw_lower:
                # An empty keyword would match every text.
                continue
            # Require word boundaries (allowing a trailing plural "s") so
            # short acronyms do not fire inside unrelated words, e.g. "ita"
            # in "capital" or "dli" in "deadline".
            pattern = r"(?<!\w)" + re.escape(kw_lower) + r"s?(?!\w)"
            if re.search(pattern, text_lower):
                return cat
    return "Other"

def guess_categories(df, text_col="clean_text"):
    """
    Label every row of ``df`` with a keyword-based category guess.

    The guess for each value of ``df[text_col]`` is written to a
    "predicted_cat" column on ``df`` itself (the frame is modified in
    place), and the same frame is returned.
    """
    def _label(cell):
        # Classify one cell against the module-level keyword table.
        return predict_category_by_keywords(cell, category_keywords)

    predictions = df[text_col].apply(_label)
    df["predicted_cat"] = predictions
    return df

def stratified_sample_with_min_quota(df, group_col, n_total=1000, min_per_cat=None,
                                     random_state=42):
    """
    Draw a proportional stratified sample with optional per-category minimums.

    Each category found in ``df[group_col]`` is allocated
    ``round(share * n_total)`` rows, raised to its minimum quota when one is
    given, and finally capped at the number of rows the category actually
    has. The resulting size can therefore differ from ``n_total``.

    Args:
        df: Source DataFrame.
        group_col: Name of the column holding the category labels. Rows whose
            label is missing (NaN) are never sampled.
        n_total: Desired total sample size. If the per-category allocations
            add up to more than this, the total is expanded and a warning is
            printed.
        min_per_cat: Optional dict like {"Refugee": 50, "PNP": 50} giving the
            minimum number of rows wanted for those categories. A quota that
            exceeds the rows available, or that names a category absent from
            the data, cannot be met; a warning is printed in both cases.
        random_state: Seed used for the per-category draws and for the final
            shuffle.

    Returns:
        A new, shuffled DataFrame with a fresh 0..n-1 index. It is empty
        (same columns as ``df``) when there are no labelled rows.
    """
    min_quota = min_per_cat or {}
    group_counts = df[group_col].value_counts()
    total_count = int(group_counts.sum())

    if total_count == 0:
        # Nothing to stratify; pd.concat([]) would raise further down.
        print("[WARNING] No labelled rows to sample from; returning an empty sample.")
        return df.iloc[0:0].reset_index(drop=True)

    missing = [cat for cat in min_quota if cat not in group_counts.index]
    if missing:
        print(f"[WARNING] min_per_cat categories not found in '{group_col}': {missing}")

    cat_sampling_numbers = {}
    for cat_value, cat_count in group_counts.items():
        # value_counts already gives the group size, so no re-filtering needed.
        cat_count = int(cat_count)
        ratio = cat_count / total_count
        n_group = int(round(ratio * n_total))

        min_q = min_quota.get(cat_value, 0)
        if n_group < min_q:
            n_group = min_q
        if min_q > cat_count:
            print(f"[WARNING] '{cat_value}' has only {cat_count} rows; "
                  f"cannot meet minimum quota of {min_q}.")

        cat_sampling_numbers[cat_value] = min(n_group, cat_count)

    sum_of_requested = sum(cat_sampling_numbers.values())
    if sum_of_requested > n_total:
        print(f"[WARNING] sum_of_requested({sum_of_requested}) > n_total({n_total}). "
              f"Will expand total sample size to {sum_of_requested}.")
        n_total = sum_of_requested

    all_samples = []
    for cat_value, n_req in cat_sampling_numbers.items():
        subset_df = df[df[group_col] == cat_value]
        if len(subset_df) <= n_req:
            # The whole category is needed; take it as-is.
            sampled_df = subset_df
        else:
            sampled_df = subset_df.sample(n_req, random_state=random_state)
        all_samples.append(sampled_df)

    final_sample = (
        pd.concat(all_samples)
        .sample(frac=1, random_state=random_state)
        .reset_index(drop=True)
    )
    print(f"[INFO] Final sample size: {len(final_sample)} (target was {n_total})")
    print("[INFO] Sample distribution:")
    print(final_sample[group_col].value_counts())
    return final_sample

def main():
    """
    Build the manual-labeling sample plus an Express Entry supplement.

    Step 1 reads the cleaned CSV, adds keyword-based category guesses, draws
    a stratified sample of about 1000 rows and writes it to
    'sample_for_manual_label.csv'. Step 2 picks up to 50 further rows
    guessed as 'Express Entry' whose text is not already in that sample and
    writes them to 'supplement_express_entry.csv'.
    """
    raw = pd.read_csv("/content/cleaned_merged_data_Final.csv")
    print("[INFO] Original data size:", len(raw))

    text_col = "clean_text"

    # Step 1a: keyword-based category guess for every row.
    labelled = guess_categories(raw, text_col=text_col)

    # Step 1b: stratified sample with a floor of 50 rows for three categories.
    quotas = dict.fromkeys(("Refugee", "Family Sponsorship", "PNP"), 50)
    main_sample = stratified_sample_with_min_quota(
        labelled,
        group_col="predicted_cat",
        n_total=1000,
        min_per_cat=quotas,
    )

    main_sample.to_csv("sample_for_manual_label.csv", index=False)
    print("[DONE] 'sample_for_manual_label.csv' is ready for manual labeling (~1000 rows).")

    # Step 2: supplement with extra 'Express Entry' rows not in the main sample.
    print(f"[INFO] Already used {len(main_sample)} rows for the main sample.")

    # Rows are excluded by text value, so a remaining row whose text
    # duplicates a sampled row is dropped as well.
    already_sampled = labelled[text_col].isin(main_sample[text_col])
    leftover = labelled[~already_sampled]
    print(f"[INFO] Remaining rows after removing used: {len(leftover)}")

    express_pool = leftover[leftover["predicted_cat"] == "Express Entry"]
    print(f"[INFO] Found {len(express_pool)} rows predicted as Express Entry in the remainder.")

    num_needed = 50
    if len(express_pool) < num_needed:
        # Not enough candidates: keep them all, in their existing order.
        supplement = express_pool
    else:
        supplement = express_pool.sample(num_needed, random_state=42)

    supplement.to_csv("supplement_express_entry.csv", index=False)
    print(f"[DONE] 'supplement_express_entry.csv' created with {len(supplement)} rows for extra Express Entry labeling.")

# Run the pipeline only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()


[INFO] Original data size: 10664
[INFO] Final sample size: 1090 (target was 1090)
[INFO] Sample distribution:
predicted_cat
Other                 503
Express Entry         164
PGWP                   96
Student Permit         90
Work Permit            87
Family Sponsorship     50
PNP                    50
Refugee                50
Name: count, dtype: int64
[DONE] 'sample_for_manual_label.csv' is ready for manual labeling (~1000 rows).
[INFO] Already used 1090 rows for the main sample.
[INFO] Remaining rows after removing used: 9540
[INFO] Found 1583 rows predicted as Express Entry in the remainder.
[DONE] 'supplement_express_entry.csv' created with 50 rows for extra Express Entry labeling.
