In [None]:
!pip install playwright
!playwright install

In [None]:
pip install boto3

In [None]:
import asyncio
import base64
import boto3
import nest_asyncio

# Patch asyncio so an event loop can be re-entered while the notebook's own
# loop is already running.
nest_asyncio.apply()

# AWS Textract OCR function
def extract_text_from_image_aws(image_path: str, region_name: str = "ap-south-1") -> str:
    """Run AWS Textract text detection on an image file and return its lines.

    No credentials are passed to boto3 explicitly. boto3 resolves them through
    its standard lookup (for example the ``AWS_ACCESS_KEY_ID`` /
    ``AWS_SECRET_ACCESS_KEY`` environment variables or the shared credentials
    file), so no secret has to be typed into the notebook.

    Args:
        image_path: Path of the image file whose bytes are sent to Textract.
        region_name: AWS region used for the Textract client.

    Returns:
        The ``Text`` of every ``LINE`` block in the response, joined with
        newlines and stripped of surrounding whitespace. Empty string when the
        response contains no ``LINE`` blocks.

    Raises:
        OSError: If ``image_path`` cannot be opened.
        Exception: Errors raised by boto3/botocore propagate unchanged.
    """
    # Read the file first so a bad path fails before any AWS client is built.
    with open(image_path, "rb") as image_file:
        image_bytes = image_file.read()

    client = boto3.client("textract", region_name=region_name)
    response = client.detect_document_text(Document={"Bytes": image_bytes})

    # Keep only LINE blocks; the other block types in the response are ignored.
    lines = [
        block.get("Text", "")
        for block in response.get("Blocks", [])
        if block.get("BlockType") == "LINE"
    ]
    return "\n".join(lines).strip()

In [None]:
import asyncio
import base64
import nest_asyncio
import requests
from playwright.async_api import async_playwright

# Patch asyncio so an event loop can be re-entered while the notebook's own
# loop is already running.
nest_asyncio.apply()

def extract_text_from_image_ocr_space(image_path: str, api_key="helloworld", timeout: float = 30) -> str:
    """OCR an image file through the OCR.Space HTTP API (best effort).

    Args:
        image_path: Path of the image file to upload.
        api_key: OCR.Space API key sent in the ``apikey`` form field.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The stripped ``ParsedText`` of the first parsed result, or ``""`` when
        the service reports an error, returns no parsed result, or any
        exception occurs (the problem is printed, never raised).
    """
    try:
        with open(image_path, 'rb') as f:
            response = requests.post(
                'https://api.ocr.space/parse/image',
                files={'filename': f},
                data={
                    'apikey': api_key,
                    'OCREngine': 2,
                    'isOverlayRequired': False,
                    'scale': True,
                    'language': 'eng'
                },
                timeout=timeout,
            )
        result = response.json()
        if result['IsErroredOnProcessing']:
            print("⚠️ OCR.Space error:", result.get("ErrorMessage"))
            return ""

        # A non-errored response can still carry no results; report that
        # explicitly instead of failing on the index/key lookup.
        parsed_results = result.get('ParsedResults') or []
        if not parsed_results:
            print("⚠️ OCR.Space error:", "no parsed results in response")
            return ""
        return (parsed_results[0].get('ParsedText') or "").strip()
    except Exception as e:
        # Deliberately best-effort: callers receive "" rather than an exception.
        print("❌ OCR.Space Exception:", e)
        return ""


In [None]:
import asyncio
import base64
from playwright.async_api import async_playwright
import nest_asyncio

# Patch asyncio so an event loop can be re-entered while the notebook's own
# loop is already running.
nest_asyncio.apply()

# Substring looked for in every outgoing request URL; a request whose URL
# contains it is the one inspected for the "localname" header.
TARGET_URL_KEYWORD = "getmenulist"

async def get_all_text(user_id=None, password=None):
    """Log in to the student portal and print the ``localname`` request header.

    Steps: open the portal, fill in the user id, save the CAPTCHA image to
    ``captcha.jpg``, OCR it with ``extract_text_from_image_aws``, submit, fill
    in the password and submit again. While this happens every outgoing
    request whose URL contains ``TARGET_URL_KEYWORD`` is inspected for a
    ``localname`` header. If none is seen within 30 seconds the page's body
    text is printed instead.

    Credentials are never stored in the notebook. Each one is taken from the
    argument, then from the ``JUIT_USER_ID`` / ``JUIT_PASSWORD`` environment
    variable, and finally from an interactive prompt.

    Args:
        user_id: Portal user id; ``None`` means "look it up" as described above.
        password: Portal password; ``None`` means "look it up" as described above.

    Raises:
        ValueError: If no user id or password could be obtained.
        Exception: Playwright errors (for example the timeout while waiting
            for the password field) propagate after the browser is closed.
    """
    # Local imports keep this cell self-contained.
    import getpass
    import os

    user_id = user_id or os.environ.get("JUIT_USER_ID") or input("Portal user id: ")
    password = password or os.environ.get("JUIT_PASSWORD") or getpass.getpass("Portal password: ")
    if not user_id or not password:
        raise ValueError("A portal user id and password are required")

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        try:
            context = await browser.new_context()
            page = await context.new_page()

            # Flag to track if we found localname
            found_localname = False

            # Capture request details
            def handle_request(request):
                nonlocal found_localname
                if TARGET_URL_KEYWORD in request.url:
                    print(f"\n📤 Matched Request to: {request.url}")
                    local_name = request.headers.get("localname")  # header keys are lowercase
                    if local_name:
                        print(f"🔍 LocalName: {local_name}")
                        found_localname = True
                        print("✅ LocalName found! Exiting...")
                    else:
                        print("⚠️ LocalName header not found.")

            page.on("request", handle_request)

            # Go to login page
            await page.goto("https://webportal.juit.ac.in:6011/studentportal/#/")
            await page.wait_for_selector('input[formcontrolname="userid"]')
            await page.fill('input[formcontrolname="userid"]', user_id)

            # Handle CAPTCHA image. Best effort: a failure is reported and the
            # flow continues to the password step, which will then time out.
            try:
                captcha_data_url = await page.get_attribute('img[src^="data:image"]', 'src')
                base64_data = captcha_data_url.split(",", 1)[1]

                with open("captcha.jpg", "wb") as f:
                    f.write(base64.b64decode(base64_data))
                print("✅ CAPTCHA image saved as captcha.jpg")

                captcha_text = extract_text_from_image_aws("captcha.jpg").strip().lower()
                print("🔍 Detected CAPTCHA Text:", captcha_text or "[none]")

                await page.fill('input[formcontrolname="captcha"]', captcha_text)
                await page.click('button[aria-label="LOGIN"]')
            except Exception as e:
                print("❌ Failed to fetch or OCR CAPTCHA:", e)

            print("⏳ Waiting for password field to appear...")
            await page.wait_for_selector('input[formcontrolname="password"]', timeout=10000)
            await page.fill('input[formcontrolname="password"]', password)
            print("🔑 Password filled.")
            await page.click('button[aria-label="LOGIN"]')

            # Poll the flag set by handle_request until it flips or time runs out.
            max_wait_time = 30  # Maximum wait time in seconds
            check_interval = 0.5  # Check every 0.5 seconds

            for _ in range(int(max_wait_time / check_interval)):
                if found_localname:
                    break
                await asyncio.sleep(check_interval)

            if not found_localname:
                print("⚠️ LocalName not found within timeout period")
                # Still get page content if localname wasn't found
                content = await page.inner_text('body')
                print("📝 Page Text:\n")
                print(content)
        finally:
            # Close the browser even when a step above raised.
            await browser.close()

# Run the login flow. Top-level `await` works here because the notebook
# executes cells inside an already-running event loop.
await get_all_text()

# Captcha handling  

In [None]:
import asyncio
import base64
from playwright.async_api import async_playwright
import nest_asyncio

# Patch asyncio so an event loop can be re-entered while the notebook's own
# loop is already running.
nest_asyncio.apply()

async def download_captcha(
    output_path="captcha.png",
    url="https://webportal.juit.ac.in:6011/studentportal/#/",
):
    """Open the portal page and save its CAPTCHA image to disk.

    The CAPTCHA is read from the first ``<img>`` whose ``src`` is a ``data:``
    image URL. The base64 payload is decoded and written unchanged, so
    ``output_path``'s extension does not convert the image format.

    Args:
        output_path: File the decoded image bytes are written to.
        url: Page to load before looking for the CAPTCHA image.

    Raises:
        ValueError: If the image's ``src`` is missing or has no
            comma-separated payload.
        Exception: Playwright and file errors propagate after the browser is
            closed.
    """
    captcha_selector = 'img[src^="data:image"]'

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        try:
            context = await browser.new_context()
            page = await context.new_page()

            await page.goto(url)
            await page.wait_for_selector(captcha_selector)

            # Get CAPTCHA data URL and keep only the base64 part after the comma.
            captcha_data_url = await page.get_attribute(captcha_selector, 'src')
            if not captcha_data_url or "," not in captcha_data_url:
                raise ValueError("CAPTCHA <img> has no usable data: URL in its src attribute")
            base64_data = captcha_data_url.split(",", 1)[1]

            with open(output_path, "wb") as f:
                f.write(base64.b64decode(base64_data))

            print(f"✅ CAPTCHA image saved as {output_path}")
        finally:
            # Close the browser even when a step above raised.
            await browser.close()

# Fetch the CAPTCHA once so the saved image can be inspected by hand.
await download_captcha()

✅ CAPTCHA image saved as captcha.png
