In [1]:
import json 
import pandas as pd 

In [2]:
# Read the raw scrape from disk and wrap it in a DataFrame for inspection.
# The preview below shows one row per entry with `Orgname` and `file_urls`.
filepath = './pdf_urls_uncleaned.json'

with open(filepath) as raw_json:
    data = json.load(raw_json)

df = pd.DataFrame(data)
df.head()

Unnamed: 0,Orgname,file_urls
0,Baazar Style Retail Ltd.,[https://trendlyne.com/get-document/post/pdf/4...
1,Sequent Scientific Ltd.,[https://trendlyne.com/get-document/post/pdf/4...
2,Indostar Capital Finance Ltd.,[https://trendlyne.com/get-document/post/pdf/4...
3,Northern Arc Capital Ltd.,[https://trendlyne.com/get-document/post/pdf/4...
4,Vodafone Idea Ltd.,[https://trendlyne.com/get-document/post/pdf/4...


In [3]:
# Count missing values per column to see whether any rows need dropping.
df.isna().sum()

Orgname      0
file_urls    0
dtype: int64

In [4]:
# Summarise both columns (count / unique / top / freq) before cleaning.
df.describe()

Unnamed: 0,Orgname,file_urls
count,191,191
unique,162,191
top,Vodafone Idea Ltd.,[https://trendlyne.com/get-document/post/pdf/4...
freq,3,1


In [5]:
# Confirm the column names that the cleaning and download steps rely on.
print(df.columns)


Index(['Orgname', 'file_urls'], dtype='object')


In [6]:

# Keep the first row for each distinct `file_urls` value, then discard rows
# whose `file_urls` is missing.
df = (
    df
    .drop_duplicates(subset=['file_urls'], keep='first')
    .dropna(subset=['file_urls'])
)

In [7]:
# Re-run the summary after cleaning to see how many rows survived.
df.describe()

Unnamed: 0,Orgname,file_urls
count,191,191
unique,162,191
top,Vodafone Idea Ltd.,[https://trendlyne.com/get-document/post/pdf/4...
freq,3,1


In [8]:
# Persist the cleaned organisation -> links records as a single JSON array
# (one record per line) in data.json, which main() later reads with json.load.
# The file is written exactly once; an earlier JSON-lines write to the same
# path was immediately overwritten by this one and has been dropped.
json_str = ',\n'.join(json.dumps(row) for row in df.to_dict(orient='records'))
with open('data.json', 'w') as f:
    f.write('[' + json_str + ']')


In [12]:
import json
import os
import time
import shutil
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException

# Trendlyne credentials. They are read from the environment so real secrets
# never need to be saved in the notebook; the original placeholder strings are
# kept as fallbacks when the variables are not set.
USERNAME = os.environ.get("TRENDLYNE_USERNAME", "username")
PASSWORD = os.environ.get("TRENDLYNE_PASSWORD", "password")

def setup_driver(download_directory):
    """Create a Chrome WebDriver configured to save files into a directory.

    Args:
        download_directory: Path handed to Chrome as its default download
            directory.

    Returns:
        A new ``webdriver.Chrome`` instance built from the options below.
    """
    # Command-line switches, applied in this order.
    launch_flags = (
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--start-maximized",
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    )

    # Profile preferences: where downloads go, no save prompt, and the
    # "always open PDF externally" plugin setting switched on.
    download_prefs = {
        "download.default_directory": download_directory,
        "download.prompt_for_download": False,
        "plugins.always_open_pdf_externally": True,
    }

    options = Options()
    for flag in launch_flags:
        options.add_argument(flag)
    options.add_experimental_option("prefs", download_prefs)

    return webdriver.Chrome(options=options)

def login_to_trendlyne(driver, login_url):
    """Open ``login_url``, submit the login form, and report whether it worked.

    Args:
        driver: Selenium WebDriver used to drive the browser.
        login_url: URL to open first. The notebook passes the PDF URL itself;
            its recorded output shows the site answering with
            ``/accounts/login/?next=<pdf path>``.

    Returns:
        True when, after the form is submitted, the current URL contains
        ``'pdf'``; False on a wait timeout, a missing form element, an
        unexpected landing page, or any other error. Failures are printed,
        never raised.
    """
    try:
        driver.get(login_url)
        print(f"Current URL: {driver.current_url}")

        # The login form is identified by its "login" (username) input.
        username_field = WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.NAME, "login"))
        )
        username_field.send_keys(USERNAME)
        driver.find_element(By.NAME, "password").send_keys(PASSWORD)
        driver.find_element(By.CSS_SELECTOR, "button[type='submit']").click()

        # Wait until the browser is on a URL containing 'pdf' or has left the
        # URL that was requested.
        # NOTE(review): the notebook's recorded output prints a
        # "/accounts/login/?next=/get-document/post/pdf/..." URL as the
        # post-login URL, i.e. this condition is already satisfied on the
        # login page because its `next=` query contains 'pdf'. It therefore
        # does not prove that authentication succeeded - confirm against the
        # site's real behaviour before tightening it.
        WebDriverWait(driver, 20).until(
            lambda d: 'pdf' in d.current_url or d.current_url != login_url
        )

        landed_url = driver.current_url
        if 'pdf' not in landed_url:
            # Not a timeout: the browser moved somewhere unrelated to the PDF,
            # so say so instead of reusing the timeout message.
            print(f"Login failed - unexpected page after login: {landed_url}")
            return False

        print(f"Logged in successfully! New URL: {landed_url}")
        return True

    except TimeoutException:
        print("Login failed - timeout occurred")
        return False
    except NoSuchElementException as e:
        print(f"Element not found: {e}")
        return False
    except Exception as e:
        # Best-effort by design: the caller only needs a True/False answer.
        print(f"Login failed: {e}")
        return False

def check_download_complete(download_directory, timeout=30, poll_interval=0.5):
    """Poll a directory until it holds a PDF and no in-progress download file.

    Args:
        download_directory: Directory the browser downloads into.
        timeout: Maximum number of seconds to keep polling.
        poll_interval: Seconds to sleep before each directory check.

    Returns:
        True as soon as at least one ``.pdf`` file is present and no partial
        download file remains; False if that never happens within ``timeout``.
        This only inspects file names, so a PDF left over from an earlier
        download also satisfies it.
    """
    # Suffixes that mark a download still being written. Chrome names partial
    # downloads "*.crdownload"; ".tmp" is kept from the original check.
    in_progress_suffixes = (".tmp", ".crdownload")

    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        # Sleep first so a download that was just triggered has time to start.
        time.sleep(poll_interval)
        names = [name.lower() for name in os.listdir(download_directory)]

        has_pdf = any(name.endswith(".pdf") for name in names)
        still_downloading = any(name.endswith(in_progress_suffixes) for name in names)

        if has_pdf and not still_downloading:
            return True

    return False

def _pdf_names(directory):
    """Return the set of PDF file names currently present in ``directory``."""
    return {name for name in os.listdir(directory) if name.lower().endswith(".pdf")}


def _wait_for_new_pdf(directory, known_pdfs, timeout=30):
    """Wait until a PDF that is not in ``known_pdfs`` has finished downloading."""
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        # check_download_complete() only answers "some PDF exists and nothing
        # is still being written"; a short timeout makes it a single poll step
        # so we can additionally require that a *new* PDF has appeared.
        if check_download_complete(directory, timeout=1) and _pdf_names(directory) - known_pdfs:
            return True
    return False


def main(orgname, output_directory='downloads'):
    """Download the first PDF of every ``data.json`` entry for one organisation.

    Args:
        orgname: Organisation name; matched exactly against each record's
            ``"Orgname"`` value.
        output_directory: Directory for the downloads; created if missing and
            converted to an absolute path.

    Returns:
        None. Progress and failures are printed.
    """
    output_directory = os.path.abspath(output_directory)
    os.makedirs(output_directory, exist_ok=True)

    with open('data.json', 'r') as f:
        data = json.load(f)

    # First URL of each matching record; records with an empty URL list are
    # skipped instead of raising IndexError.
    organization_data = [
        item["file_urls"][0]
        for item in data
        if item["Orgname"] == orgname and item["file_urls"]
    ]
    print(organization_data)

    if not organization_data:
        print(f"No data found for organization: {orgname}")
        return

    # One fresh browser session per URL.
    for index, url in enumerate(organization_data, start=1):
        # Snapshot before the download starts so PDFs from earlier iterations
        # (or earlier runs) cannot be mistaken for this one.
        known_pdfs = _pdf_names(output_directory)
        driver = setup_driver(output_directory)
        try:
            if login_to_trendlyne(driver, url):
                if _wait_for_new_pdf(output_directory, known_pdfs):
                    print(f"Downloaded PDF for {orgname} (file {index}) to {output_directory}")
                else:
                    print(f"PDF download for {orgname} (file {index}) timed out or did not complete")
            else:
                print(f"Failed to log in or download for {orgname} (file {index})")
        finally:
            # Always release the browser, even if a step above raises.
            driver.quit()



['https://trendlyne.com/get-document/post/pdf/4789405/', 'https://trendlyne.com/get-document/post/pdf/4775518/', 'https://trendlyne.com/get-document/post/pdf/4721419/']
Current URL: https://trendlyne.com/accounts/login/?next=/get-document/post/pdf/4789405/
Logged in successfully! New URL: https://trendlyne.com/accounts/login/?next=/get-document/post/pdf/4789405/
Downloaded PDF for Vodafone Idea Ltd. (file 1) to c:\Users\HP\Desktop\Noqs\QNLP-main\QNLP-main\pdfScraper\pdfScraper\downloads
Current URL: https://trendlyne.com/accounts/login/?next=/get-document/post/pdf/4775518/
Logged in successfully! New URL: https://trendlyne.com/accounts/login/?next=/get-document/post/pdf/4775518/
Downloaded PDF for Vodafone Idea Ltd. (file 2) to c:\Users\HP\Desktop\Noqs\QNLP-main\QNLP-main\pdfScraper\pdfScraper\downloads
Current URL: https://trendlyne.com/accounts/login/?next=/get-document/post/pdf/4721419/
Logged in successfully! New URL: https://trendlyne.com/accounts/login/?next=/get-document/post/pd

In [None]:
# Ask which organisation to download. Surrounding whitespace is stripped
# because main() compares the name exactly against "Orgname" in data.json.
organization_name = input("Type the company name: ").strip()
main(organization_name)