In [None]:
!pip install spacy transformers sentencepiece accelerate
!python -m spacy download en_core_web_sm


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.wh

In [None]:
from transformers import pipeline
import re
import spacy
from dateutil import parser
import json

# Model identifiers are kept in one place so they are easy to swap.
SPACY_MODEL_NAME = "en_core_web_sm"
QA_MODEL_NAME = "deepset/roberta-base-squad2"

# spaCy supplies the PERSON / DATE entities; the QA pipeline is the fallback extractor.
nlp = spacy.load(SPACY_MODEL_NAME)
qa_pipeline = pipeline(task="question-answering", model=QA_MODEL_NAME)

# Capitalised-word phrase used to recognise department names on a single line,
# e.g. "Computer Engineering" or "Electronics & Telecommunication".
_DEPT_WORD = r"[A-Z][A-Za-z]*"
_DEPT_PHRASE = rf"{_DEPT_WORD}(?:[ \t]+(?:(?:and|of|&)[ \t]+)?{_DEPT_WORD})*"
_DEPT_PATTERNS = (
    # "Department of Computer Engineering", "Dept. of IT", "Department: Physics"
    rf"(?i:\b(?:department|dept)\b\.?(?:\s+of\s+|\s*[:\-]\s*))({_DEPT_PHRASE})",
    # "Computer Engineering department"
    rf"({_DEPT_PHRASE})[ \t]+(?i:department|dept)\b",
    # Loose fallback (same keywords as the original heuristic), limited to one
    # clause so it cannot swallow the following sentences.
    r"(?i:department of|student of|dept)[^\n]*?([A-Za-z &]+)",
)


def _extract_department(email_text):
    """Return the department named in ``email_text``, or None if none is found."""
    for pattern in _DEPT_PATTERNS:
        match = re.search(pattern, email_text)
        if not match:
            continue
        # Drop filler words as whole words only, then normalise whitespace.
        cleaned = re.sub(r"\b(?:the|year|student)\b", " ", match.group(1), flags=re.IGNORECASE)
        cleaned = " ".join(cleaned.split()).strip(" .&")
        if cleaned:
            return cleaned
    return None


def _parse_leave_dates(date_texts):
    """Parse each text independently and return the successfully parsed dates, sorted."""
    parsed = []
    for text in date_texts:
        try:
            parsed.append(parser.parse(text, fuzzy=True, dayfirst=True).date())
        except (ValueError, OverflowError):
            # One unparseable DATE entity must not discard the others.
            continue
    return sorted(parsed)


def extract_email_data(email_text):
    """Extract structured leave-request fields from a free-text email.

    Fields are filled by spaCy entities and regular expressions first; any of
    name, roll_no, department, reason, leave_start and leave_end still missing
    afterwards is asked of the question-answering pipeline.

    Args:
        email_text: Raw email body as a string.

    Returns:
        dict with the keys ``name``, ``roll_no``, ``department``, ``division``,
        ``email``, ``reason``, ``leave_start`` and ``leave_end``. A value is
        ``None`` when the field could not be extracted. Dates found via spaCy
        are ISO formatted (``YYYY-MM-DD``); dates from the QA fallback are ISO
        formatted when they can be parsed and returned verbatim otherwise.
    """
    import logging

    data = {
        "name": None,
        "roll_no": None,
        "department": None,
        "division": None,
        "email": None,
        "reason": None,
        "leave_start": None,
        "leave_end": None
    }

    # Nothing to extract from empty / missing input; skip the models entirely.
    if not email_text or not email_text.strip():
        return data

    doc = nlp(email_text)

    # ✅ Name: first non-empty PERSON entity, then a "My name is ..." fallback.
    data["name"] = next(
        (ent.text.strip() for ent in doc.ents
         if ent.label_ == "PERSON" and ent.text.strip()),
        None,
    )
    if not data["name"]:
        match = re.search(r"My name is ([A-Za-z ]+)", email_text, re.IGNORECASE)
        if match:
            data["name"] = match.group(1).strip() or None

    # ✅ Email: trim sentence punctuation that the character class also accepts.
    email_match = re.search(r'[\w\.-]+@[\w\.-]+', email_text)
    if email_match:
        data["email"] = email_match.group(0).strip(".-")

    # ✅ Roll Number: accepts "roll number 24", "Roll No. 24", "roll number is 24";
    # the value must contain a digit so filler words like "is" are not captured.
    roll_match = re.search(
        r'\broll\s*(?:number|no|num)\b\.?\s*(?:is\b|[:\-#])?\s*([A-Za-z0-9\-]*\d[A-Za-z0-9\-]*)',
        email_text,
        re.IGNORECASE,
    )
    if roll_match:
        data["roll_no"] = roll_match.group(1).strip("-") or None

    # ✅ Department
    data["department"] = _extract_department(email_text)

    # ✅ Division: whole-word keyword followed by a single capital letter, so
    # "individual" or the tail of the sentence can no longer produce a match.
    div_match = re.search(r'(?i:\bdivision\b|\bdiv\b\.?)\s*[:\-]?\s*([A-Z])\b', email_text)
    if div_match:
        data["division"] = div_match.group(1)

    # ✅ Dates: earliest parsed date is the start, latest (if distinct entry) the end.
    parsed_dates = _parse_leave_dates(
        ent.text for ent in doc.ents if ent.label_ == "DATE"
    )
    if len(parsed_dates) > 0:
        data["leave_start"] = str(parsed_dates[0])
    if len(parsed_dates) > 1:
        data["leave_end"] = str(parsed_dates[-1])

    # ✅ Reason
    reason_match = re.search(r'(due to|because of|reason[:\- ]+)([^\.\n]+)', email_text, re.IGNORECASE)
    if reason_match:
        data["reason"] = reason_match.group(2).strip() or None

    # ✅ Fallback with AI if missing critical fields
    questions = {
        "name": "What is the student's name?",
        "roll_no": "What is the student's roll number?",
        "department": "Which department is the student from?",
        "reason": "What is the reason for leave?",
        "leave_start": "What is the start date of leave?",
        "leave_end": "What is the end date of leave?"
    }

    for field, question in questions.items():
        if data[field] is not None:
            continue
        try:
            # handle_impossible_answer lets the SQuAD2 model return an empty
            # answer instead of a forced guess when the email has no answer.
            prediction = qa_pipeline(
                question=question,
                context=email_text,
                handle_impossible_answer=True,
            )
        except Exception:
            # Best effort: a QA failure leaves the field as None.
            logging.getLogger(__name__).warning(
                "QA fallback failed for field %r", field, exc_info=True
            )
            continue

        answer = (prediction.get("answer") or "").strip()
        if not answer or answer.lower() == "no answer":
            continue
        if field in ("leave_start", "leave_end"):
            # Keep date fields in the same ISO format as the spaCy path when possible.
            normalised = _parse_leave_dates([answer])
            if normalised:
                answer = str(normalised[0])
        data[field] = answer

    return data


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

Device set to use cpu


In [None]:
# Sample leave-request email used as a smoke test for the extractor.
email_text = """To whom it may concern, I am writing to request medical leave for 18-08-2025 to 20-08-2025.
My name is Sneha Reddy (roll number 24), and I am a TY year student of Btech, Computer Engineering department.
I am unable to attend classes during this period due to Viral fever. I hope you understand my situation. Warm regards, Sneha Reddy"""

# Run the extractor, then pretty-print the resulting dict as JSON.
result = extract_email_data(email_text)
pretty_result = json.dumps(result, indent=4)
print(pretty_result)


{
    "name": "Sneha Reddy",
    "roll_no": "24",
    "department": "Btech",
    "division": null,
    "email": null,
    "reason": "Viral fever",
    "leave_start": "2025-08-18",
    "leave_end": "2025-08-20"
}


In [None]:
%%writefile extractor.py
import re
import spacy
from dateutil import parser
from transformers import pipeline

# Model identifiers are kept in one place so they are easy to swap.
SPACY_MODEL_NAME = "en_core_web_sm"
QA_MODEL_NAME = "deepset/roberta-base-squad2"

# Both models are loaded once, when this module is first imported.
nlp = spacy.load(SPACY_MODEL_NAME)
qa_pipeline = pipeline(task="question-answering", model=QA_MODEL_NAME)

# Capitalised-word phrase used to recognise department names on a single line,
# e.g. "Computer Engineering" or "Electronics & Telecommunication".
_DEPT_WORD = r"[A-Z][A-Za-z]*"
_DEPT_PHRASE = rf"{_DEPT_WORD}(?:[ \t]+(?:(?:and|of|&)[ \t]+)?{_DEPT_WORD})*"
_DEPT_PATTERNS = (
    # "Department of Computer Engineering", "Dept. of IT", "Department: Physics"
    rf"(?i:\b(?:department|dept)\b\.?(?:\s+of\s+|\s*[:\-]\s*))({_DEPT_PHRASE})",
    # "Computer Engineering department"
    rf"({_DEPT_PHRASE})[ \t]+(?i:department|dept)\b",
    # Loose fallback (same keywords as the original heuristic), limited to one
    # clause so it cannot swallow the following sentences.
    r"(?i:department of|student of|dept)[^\n]*?([A-Za-z &]+)",
)


def _extract_department(email_text):
    """Return the department named in ``email_text``, or None if none is found."""
    for pattern in _DEPT_PATTERNS:
        match = re.search(pattern, email_text)
        if not match:
            continue
        # Drop filler words as whole words only, then normalise whitespace.
        cleaned = re.sub(r"\b(?:the|year|student)\b", " ", match.group(1), flags=re.IGNORECASE)
        cleaned = " ".join(cleaned.split()).strip(" .&")
        if cleaned:
            return cleaned
    return None


def _parse_leave_dates(date_texts):
    """Parse each text independently and return the successfully parsed dates, sorted."""
    parsed = []
    for text in date_texts:
        try:
            parsed.append(parser.parse(text, fuzzy=True, dayfirst=True).date())
        except (ValueError, OverflowError):
            # One unparseable DATE entity must not discard the others.
            continue
    return sorted(parsed)


def extract_email_data(email_text):
    """Extract structured leave-request fields from a free-text email.

    Fields are filled by spaCy entities and regular expressions first; any of
    name, roll_no, department, reason, leave_start and leave_end still missing
    afterwards is asked of the question-answering pipeline.

    Args:
        email_text: Raw email body as a string.

    Returns:
        dict with the keys ``name``, ``roll_no``, ``department``, ``division``,
        ``email``, ``reason``, ``leave_start`` and ``leave_end``. A value is
        ``None`` when the field could not be extracted. Dates found via spaCy
        are ISO formatted (``YYYY-MM-DD``); dates from the QA fallback are ISO
        formatted when they can be parsed and returned verbatim otherwise.
    """
    import logging

    data = {
        "name": None,
        "roll_no": None,
        "department": None,
        "division": None,
        "email": None,
        "reason": None,
        "leave_start": None,
        "leave_end": None
    }

    # Nothing to extract from empty / missing input; skip the models entirely.
    if not email_text or not email_text.strip():
        return data

    doc = nlp(email_text)

    # ✅ Name: first non-empty PERSON entity, then a "My name is ..." fallback.
    data["name"] = next(
        (ent.text.strip() for ent in doc.ents
         if ent.label_ == "PERSON" and ent.text.strip()),
        None,
    )
    if not data["name"]:
        match = re.search(r"My name is ([A-Za-z ]+)", email_text, re.IGNORECASE)
        if match:
            data["name"] = match.group(1).strip() or None

    # ✅ Email: trim sentence punctuation that the character class also accepts.
    email_match = re.search(r'[\w\.-]+@[\w\.-]+', email_text)
    if email_match:
        data["email"] = email_match.group(0).strip(".-")

    # ✅ Roll Number: accepts "roll number 24", "Roll No. 24", "roll number is 24";
    # the value must contain a digit so filler words like "is" are not captured.
    roll_match = re.search(
        r'\broll\s*(?:number|no|num)\b\.?\s*(?:is\b|[:\-#])?\s*([A-Za-z0-9\-]*\d[A-Za-z0-9\-]*)',
        email_text,
        re.IGNORECASE,
    )
    if roll_match:
        data["roll_no"] = roll_match.group(1).strip("-") or None

    # ✅ Department
    data["department"] = _extract_department(email_text)

    # ✅ Division: whole-word keyword followed by a single capital letter, so
    # "individual" or the tail of the sentence can no longer produce a match.
    div_match = re.search(r'(?i:\bdivision\b|\bdiv\b\.?)\s*[:\-]?\s*([A-Z])\b', email_text)
    if div_match:
        data["division"] = div_match.group(1)

    # ✅ Leave Dates: earliest parsed date is the start, latest (if distinct entry) the end.
    parsed_dates = _parse_leave_dates(
        ent.text for ent in doc.ents if ent.label_ == "DATE"
    )
    if len(parsed_dates) > 0:
        data["leave_start"] = str(parsed_dates[0])
    if len(parsed_dates) > 1:
        data["leave_end"] = str(parsed_dates[-1])

    # ✅ Reason
    reason_match = re.search(r'(due to|because of|reason[:\- ]+)([^\.\n]+)', email_text, re.IGNORECASE)
    if reason_match:
        data["reason"] = reason_match.group(2).strip() or None

    # ✅ AI Fallback
    questions = {
        "name": "What is the student's name?",
        "roll_no": "What is the student's roll number?",
        "department": "Which department is the student from?",
        "reason": "What is the reason for leave?",
        "leave_start": "What is the start date of leave?",
        "leave_end": "What is the end date of leave?"
    }

    for field, question in questions.items():
        if data[field] is not None:
            continue
        try:
            # handle_impossible_answer lets the SQuAD2 model return an empty
            # answer instead of a forced guess when the email has no answer.
            prediction = qa_pipeline(
                question=question,
                context=email_text,
                handle_impossible_answer=True,
            )
        except Exception:
            # Best effort: a QA failure leaves the field as None.
            logging.getLogger(__name__).warning(
                "QA fallback failed for field %r", field, exc_info=True
            )
            continue

        answer = (prediction.get("answer") or "").strip()
        if not answer or answer.lower() == "no answer":
            continue
        if field in ("leave_start", "leave_end"):
            # Keep date fields in the same ISO format as the spaCy path when possible.
            normalised = _parse_leave_dates([answer])
            if normalised:
                answer = str(normalised[0])
        data[field] = answer

    return data


Writing extractor.py


In [None]:
%%writefile app.py
import streamlit as st
import json
from extractor import extract_email_data

# Page header and short usage hint.
st.title("📧 Email Leave Data Extractor")
st.write("Paste the email text below to extract details like Name, Roll No, Department, Dates, and Reason.")

# Input widgets: the email body and the button that triggers extraction.
email_text = st.text_area("Enter Email Text:")
extract_clicked = st.button("Extract Details")

# Only act on a click; blank input gets a warning instead of an extraction run.
if extract_clicked:
    if not email_text.strip():
        st.warning("Please enter some email text.")
    else:
        result = extract_email_data(email_text)
        st.subheader("Extracted Data:")
        st.json(result)


Overwriting app.py


In [None]:
!pip install pyngrok
from pyngrok import ngrok




In [None]:
!kill $(ps -fA | grep streamlit | awk '{print $2}') 2>/dev/null


^C


In [None]:
# Start Streamlit in the background
!nohup streamlit run app.py --server.port 8501 &

# Authenticate and create a tunnel
ngrok.set_auth_token("30lU2extiBojyrSaadCgpoUwwOa_7coyERzDkF6V1NWY1XrnY")  # Replace with your token from https://dashboard.ngrok.com/get-started/your-authtoken
public_url = ngrok.connect(8501)
print("Access your app here:", public_url)


nohup: appending output to 'nohup.out'
Access your app here: NgrokTunnel: "https://98f03a2d5715.ngrok-free.app" -> "http://localhost:8501"
