# In [3]:
import json
import re
import csv
from pathlib import Path
from typing import Dict, Any, List
from bs4 import BeautifulSoup

# ---------- CONFIG ----------
# The input is parsed one JSON document per line; both output names are
# relative paths, so they resolve against the current working directory.
INPUT_JSON = r"C:\Users\navya\Desktop\IBY\data\emails_raw.jsonl"
OUT_JSON = "labeled_emails.json"
OUT_CSV = "labeled_emails.csv"
# NOTE(review): named as a switch for requiring an OCS keyword hit on the
# OCS-specific internship label -- confirm where it is actually consumed.
REQUIRE_OCS_FOR_OCS_SPECIFIC = True

# ---------- Patterns ----------
# Each list holds regex *source* strings for one keyword family. They are
# compiled case-insensitively further down in this file.

# Coursework: assignments, labs, submissions.
ASSIGNMENT_KEYWORDS = [
    r"\bassign(?:ment|ments)\b",
    r"\bhomework\b",
    r"\bproject\b",
    r"\blab\b",
    r"\bsubmission\b",
    r"\bsubmit\b",
    r"\bdue date\b",
    r"\bturnitin\b",
]

# Exams, quizzes and timetables.
EXAM_QUIZ_KEYWORDS = [
    r"\bexam\b",
    r"\bexamination\b",
    r"\bquiz\b",
    r"\bmid[-\s]?sem\b",
    r"\bmidterm\b",
    r"\bend[-\s]?sem\b",
    r"\bfinals?\b",
    r"\btest\b",
    r"\btimetable\b",
]

# Shortlisting and interview invitations.
SHORTLIST_KEYWORDS = [
    r"\bshortlist(?:ed)?\b",
    r"\byou (?:have )?been selected\b",
    r"\byou (?:have )?been shortlisted\b",
    r"\bnext round\b",
    r"\bround[-\s]?\d+\b",
    r"\binterview(?:s)?\b",
    r"\bcall for interview\b",
    r"\binvite(?:d)? for interview\b",
    r"\bcongratulations\b",
]

# Registration / application calls.
REGISTRATION_KEYWORDS = [
    r"\bregister\b",
    r"\bregistration\b",
    r"\bapply\b",
    r"\bapplication\b",
    r"\bportal\b",
    r"\bonline registration\b",
    r"\bregister here\b",
    r"\bapply by\b",
    r"\bapply before\b",
    r"\bopen for applications\b",
    r"\bdeadline\b",
]

# Mentions of the career-services office (OCS).
OCS_KEYWORDS = [
    r"\bocs\b",
    r"\boffice of career services\b",
    r"\bcareer services\b",
    r"\bcareer cell\b",
    r"\bocs portal\b",
    r"\bocs-portal\b",
    r"\bocsp\b",
]

def compile_patterns(list_of_regex):
    """Compile regex source strings into case-insensitive pattern objects.

    The returned list has one compiled pattern per input string, in the
    same order as ``list_of_regex``.
    """
    compiled = []
    for source in list_of_regex:
        compiled.append(re.compile(source, re.IGNORECASE))
    return compiled

# One compiled pattern list per keyword family, built once at import time.
ASSIGNMENT_RE = compile_patterns(ASSIGNMENT_KEYWORDS)
EXAM_RE = compile_patterns(EXAM_QUIZ_KEYWORDS)
SHORTLIST_RE = compile_patterns(SHORTLIST_KEYWORDS)
REG_RE = compile_patterns(REGISTRATION_KEYWORDS)
OCS_RE = compile_patterns(OCS_KEYWORDS)

def find_matches(text: str, regex_list: List[re.Pattern]) -> List[str]:
    """Return the distinct substrings of ``text`` matched by any pattern.

    Args:
        text: The string to scan.
        regex_list: Compiled patterns; every match of every pattern is
            collected.

    Returns:
        The matched substrings (original casing kept, surrounding
        whitespace stripped), de-duplicated and sorted so that the result
        is identical from run to run. An empty list when nothing matches.
    """
    found = {
        m.group(0).strip()
        for rx in regex_list
        for m in rx.finditer(text)
    }
    # A bare list(set) follows string hashing, which is randomised per
    # process; sorting keeps the exported "explain" data reproducible.
    return sorted(found)

# --- 2. NEW FUNCTION TO CLEAN HTML ---
def clean_html_body(html_content: str) -> str:
    """Reduce an HTML string to plain text with normalised whitespace.

    Falsy input (``None`` or an empty string) produces an empty string
    without invoking the parser.
    """
    if html_content:
        # Parse with the "html.parser" backend and pull out the text,
        # putting a space between adjacent text fragments.
        parsed = BeautifulSoup(html_content, 'html.parser')
        extracted = parsed.get_text(separator=' ')
        # Collapse every whitespace run (newlines included) to one space.
        return re.sub(r'\s+', ' ', extracted).strip()
    return ""

def label_email(email: Dict[str, Any]) -> Dict[str, Any]:
    """Label one email with keyword-based category flags.

    The subject and the HTML-stripped body are joined and scanned against
    the five compiled keyword families (assignment, exam/quiz, shortlist,
    registration, OCS).

    Args:
        email: Mapping read for the keys ``"subject"``, ``"body"`` and
            ``"from"``. A missing or ``None`` subject/body is treated as
            an empty string.

    Returns:
        A dict with the subject, sender, cleaned body, one boolean per
        category, ``label_true_if_any``, a ``confidence`` score and an
        ``explain`` dict listing the matched text per keyword family.
    """
    subject = email.get("subject", "") or ""

    # The body is expected to be HTML; reduce it to plain text first.
    raw_body = email.get("body", "") or ""
    body = clean_html_body(raw_body)

    text = (subject + "\n" + body).strip()
    text_lower = text.lower()

    assignment_matches = find_matches(text, ASSIGNMENT_RE)
    exam_matches = find_matches(text, EXAM_RE)
    shortlist_matches = find_matches(text, SHORTLIST_RE)
    reg_matches = find_matches(text, REG_RE)
    ocs_matches = find_matches(text, OCS_RE)

    assignment_deadline = bool(assignment_matches)
    exam_or_quiz_date = bool(exam_matches)
    shortlisted_or_interview = bool(shortlist_matches)
    # A registration keyword only counts when the text also mentions a
    # job/internship-style term.
    internship_registration = bool(
        reg_matches
        and re.search(r"\b(opportunity|role|job|internship|intern)\b", text_lower)
    )
    # Honour the module-level REQUIRE_OCS_FOR_OCS_SPECIFIC switch, which was
    # previously declared but never read. With the switch on (the current
    # setting) an OCS keyword hit is mandatory, exactly as before; with it
    # off, any internship registration qualifies.
    ocs_requirement_met = bool(ocs_matches) or not REQUIRE_OCS_FOR_OCS_SPECIFIC
    internship_registration_ocs_specific = bool(
        internship_registration and ocs_requirement_met
    )

    # 0.2 per keyword family with at least one hit, capped at 1.0.
    match_count = sum(
        bool(x)
        for x in [assignment_matches, exam_matches, shortlist_matches, reg_matches, ocs_matches]
    )
    confidence = min(1.0, 0.2 * match_count)

    explanation = {
        "assignment_matches": assignment_matches,
        "exam_matches": exam_matches,
        "shortlist_matches": shortlist_matches,
        "registration_matches": reg_matches,
        "ocs_matches": ocs_matches,
    }

    result = {
        "subject": subject,
        "from": email.get("from", ""),
        "body": body,
        "assignment_deadline": assignment_deadline,
        "exam_or_quiz_date": exam_or_quiz_date,
        "shortlisted_or_interview": shortlisted_or_interview,
        "internship_registration": internship_registration,
        "internship_registration_ocs_specific": internship_registration_ocs_specific,
        # The plain internship_registration flag is deliberately not part
        # of this aggregate; only the OCS-specific variant is.
        "label_true_if_any": any([
            assignment_deadline,
            exam_or_quiz_date,
            shortlisted_or_interview,
            internship_registration_ocs_specific,
        ]),
        "confidence": round(confidence, 2),
        "explain": explanation,
    }
    return result

def load_emails(path: str) -> List[Dict[str, Any]]:
    """Load email records from a JSON Lines file.

    Blank lines are ignored. Lines that are not valid JSON, or that are
    valid JSON but not an object, are skipped with a warning printed to
    stdout instead of aborting the whole load.

    Args:
        path: Location of the ``.jsonl`` file.

    Returns:
        The parsed JSON objects, in file order.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    p = Path(path)
    if not p.exists():
        raise FileNotFoundError(f"{path} not found")

    emails: List[Dict[str, Any]] = []
    # "utf-8-sig" reads plain UTF-8 and also strips a leading byte-order
    # mark; with plain "utf-8" the BOM stays attached to the first line,
    # json.loads rejects it, and the first record is silently lost.
    with p.open("r", encoding="utf-8-sig") as f:
        for line in f:
            if not line.strip():
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"Warning: Skipping a malformed line: {e}")
                continue
            # Callers use dict methods on every record, so a bare number,
            # string or array would crash them later.
            if not isinstance(record, dict):
                print(f"Warning: Skipping a non-object JSON line of type {type(record).__name__}")
                continue
            emails.append(record)
    return emails

def write_outputs(results: List[Dict[str, Any]], out_json: str, out_csv: str):
    """Write the labelled records to a JSON file and to a CSV file.

    The JSON file receives ``results`` as-is (indented, non-ASCII kept).
    The CSV file has a fixed column set; keys absent from a record become
    empty cells and the ``explain`` value is stored as a JSON string.
    """
    columns = [
        "subject",
        "from",
        "body",
        "assignment_deadline",
        "exam_or_quiz_date",
        "shortlisted_or_interview",
        "internship_registration",
        "internship_registration_ocs_specific",
        "label_true_if_any",
        "confidence",
        "explain",
    ]

    def as_csv_row(record):
        # Restrict to the known columns and flatten the nested explain dict.
        cells = {column: record.get(column, "") for column in columns}
        cells["explain"] = json.dumps(record.get("explain", {}))
        return cells

    with open(out_json, "w", encoding="utf-8") as json_file:
        json.dump(results, json_file, ensure_ascii=False, indent=2)

    # newline="" leaves line-ending handling to the csv module.
    with open(out_csv, "w", encoding="utf-8", newline="") as csv_file:
        table = csv.DictWriter(csv_file, fieldnames=columns)
        table.writeheader()
        table.writerows(as_csv_row(record) for record in results)

def main():
    """Load the input emails, label each one and write both output files.

    Errors are not propagated: a missing input file and any other
    exception are each reported with a one-line message on stdout.
    """
    try:
        results = list(map(label_email, load_emails(INPUT_JSON)))
        write_outputs(results, OUT_JSON, OUT_CSV)
        print(f"Processed {len(results)} emails. Outputs written to {OUT_JSON} and {OUT_CSV}.")
    except FileNotFoundError as e:
        print(f"Error: {e}")
    except Exception as e:
        # Top-level boundary: report the failure rather than raising.
        print(f"An unexpected error occurred: {e}")


# Run the pipeline only when executed directly, not when imported.
if __name__ == "__main__":
    main()

# Output: Processed 500 emails. Outputs written to labeled_emails.json and labeled_emails.csv.
