In [None]:
"""
Clinical Trials Registry - India (CTRI) Web Scraper

These functions, which are part of the big data platform I helped build during my internship at PureFDA,
let us access the CTRI database by recognizing and entering the CAPTCHA value.
"""

In [1]:
import requests
from bs4 import BeautifulSoup
import json
import pytesseract
from PIL import Image
import time

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException

# Tell pytesseract which Tesseract executable to run. This is an absolute Windows
# path, so it is machine-specific and must be adjusted on any other machine.
pytesseract.pytesseract.tesseract_cmd = r'D:\Tesseract-OCR\tesseract'

import cv2
import numpy as np
import io

In [None]:
"""
extract captcha value from an image

this is a method that I found online
I fiddled with the parameters, but the overall steps are unchanged
it first changes the image color to gray
then adds weights to strokes in order to increase contrast
then it uses a series of functions to further define the shapes of the characters 
eventually the captcha can be extracted from the image
"""

def get_captcha(img_arr):
    """Run OCR on a captcha image and return the recognised text.

    The image is converted to grayscale, rescaled, blurred, Otsu-thresholded
    (inverted), cleaned with a morphological opening, inverted back and then
    handed to Tesseract.

    Parameters
    ----------
    img_arr : array
        Colour image array that ``cv2.cvtColor`` can convert with
        ``COLOR_BGR2GRAY``.
        NOTE(review): the caller in this notebook builds the array from a PIL
        image, which is presumably RGB(A) rather than BGR - confirm whether the
        channel order matters for recognition quality.

    Returns
    -------
    str
        The OCR output with every newline and space removed and the remaining
        leading/trailing whitespace stripped.
    """
    grayscale = cv2.cvtColor(img_arr, cv2.COLOR_BGR2GRAY)
    # The second input has weight 0, so this is just 0.9 * gray + 0.1.
    weighted = cv2.addWeighted(grayscale, 0.9, grayscale, 0, 0.1)
    smoothed = cv2.GaussianBlur(weighted, (3, 3), 0)

    # Otsu picks the threshold automatically; the explicit 0 is ignored.
    _, binary = cv2.threshold(
        smoothed, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
    )

    # Opening with a 3x3 rectangle removes small specks from the inverted mask.
    rect_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
    cleaned = cv2.morphologyEx(binary, cv2.MORPH_OPEN, rect_kernel, iterations=1)

    # Flip back so the OCR input is the inverse of the opened mask.
    ocr_input = 255 - cleaned

    raw_text = pytesseract.image_to_string(
        ocr_input, lang="eng", config=r'--oem 3 --psm 6'
    )
    for unwanted in ("\n", " "):
        raw_text = raw_text.replace(unwanted, "")
    return raw_text.strip()

In [None]:
"""
take a screenshot of the captcha

it first takes a screenshot of the entire webpage
then crops only the captcha part out
"""

def screenshot_captcha(demo_driver, crop_box=(845, 445, 960, 485)):
    """Screenshot the current page and crop out the captcha region.

    Parameters
    ----------
    demo_driver : selenium webdriver
        Driver whose current page is captured via ``get_screenshot_as_png``.
    crop_box : tuple of int, optional
        ``(left, top, right, bottom)`` pixel rectangle passed to
        ``Image.crop``. The default is the rectangle this notebook was written
        against; it will need updating if the website layout or the browser
        window size changes.

    Returns
    -------
    tuple
        ``(captcha_image, demo_driver)`` - the cropped PIL image and the same
        driver object that was passed in.
    """
    # Screenshot the entire page, then decode the PNG bytes in memory.
    page_png = demo_driver.get_screenshot_as_png()
    page_image = Image.open(io.BytesIO(page_png))

    # Keep only the captcha rectangle.
    captcha_image = page_image.crop(tuple(crop_box))

    return captcha_image, demo_driver

In [2]:
"""
request the website

because get_captcha does not always return the correct value,
we give it 10 tries to get it right.
once the captcha value is entered correctly,
the webdriver will navigate through the database and read the page source for scraping later
"""

def request(max_attempts=10):
    """Open the CTRI advanced-search page, solve the captcha and run a search.

    Each attempt starts a fresh headless Chrome session, screenshots and OCRs
    the captcha, fills in the captcha and the search keyword ("0"), and clicks
    the search button. An attempt counts as successful when no alert pops up
    and the resulting page contains exactly two ``<table>`` elements.

    Parameters
    ----------
    max_attempts : int, optional
        How many fresh sessions to try before giving up (default 10), because
        the OCR does not always read the captcha correctly.

    Returns
    -------
    str
        The page source of the results page from the first successful attempt.

    Raises
    ------
    Exception
        If none of the attempts produced a valid results page.
    """
    # Imported here so this cell does not depend on edits to the import cell.
    from selenium.common.exceptions import NoAlertPresentException

    url = "http://ctri.nic.in/Clinicaltrials/advancesearchmain.php"

    for _ in range(max_attempts):
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        # The captcha crop coordinates depend on this window size.
        chrome_options.add_argument("--window-size=1920x1200")
        # Raw string: "\c" is not a valid escape sequence in a normal literal.
        demo_driver = webdriver.Chrome(r'D:\chromedriver', options=chrome_options)

        # try/finally guarantees the browser is closed on every path
        # (success, failed attempt, or unexpected exception).
        try:
            demo_driver.get(url)
            time.sleep(5)

            # take screenshot of the captcha
            screenshot_io, demo_driver = screenshot_captcha(demo_driver)
            # convert the image into numpy.ndarray
            img_arr = np.array(screenshot_io, dtype=np.uint8)
            # recognize the captcha value
            captcha_value = get_captcha(img_arr)
            print(captcha_value)

            # The three form steps are best-effort: a failure is reported and
            # the attempt continues, so the page check below decides the outcome.

            # insert captcha value
            try:
                captcha_insert = WebDriverWait(demo_driver, 20).until(EC.presence_of_element_located((By.XPATH, "//*[@id=\"T4\"]")))
                captcha_insert.send_keys(captcha_value)
            except Exception as e:
                print(f"Failed to insert captcha value. detail \n {e}")

            # insert search value
            try:
                search_keyword = WebDriverWait(demo_driver, 20).until(EC.presence_of_element_located((By.XPATH, "//*[@id=\"searchword\"]")))
                search_keyword.send_keys("0")
            except Exception as e:
                print(f"Failed to insert search keyword. detail \n {e}")

            # click search
            try:
                search_button = WebDriverWait(demo_driver, 20).until(EC.element_to_be_clickable((By.XPATH, "/html/body/div/center/div[3]/form/div/center/table[1]/tbody/tr[13]/td/input")))
                search_button.click()
            except Exception as e:
                print(f"Failed to search. detail \n {e}")

            # check if the database is loaded: an alert means the site rejected
            # the request, so dismiss it and retry with a new session.
            try:
                demo_driver.switch_to.alert.dismiss()
            except NoAlertPresentException:
                # No alert: inspect the page that was returned.
                page_source = demo_driver.page_source
                soup = BeautifulSoup(page_source)
                if len(soup.find_all("table")) == 2:
                    return page_source
            else:
                print("CTRI failed to load database")
        finally:
            demo_driver.quit()

    raise Exception("CTRI requester failed to recognize captcha...")

In [None]:
# Run the scraper. request() raises if every captcha attempt fails,
# so "done" is only printed after a successful run.
request()
print("done")