In [6]:
# Import necessary libraries
import re

import numpy as np
import pandas as pd
import selenium
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Set up Chrome WebDriver options
options = Options()
options.add_argument('--blink-settings=imagesEnabled=false')  # Disable loading images
# options.add_argument('--enable-print-browser')  # Uncomment if you want to enable printing from the browser (optional)
options.add_argument("--headless")  # Run the browser in headless mode (without a GUI)

# DesiredCapabilities.CHROME is a dict stored on the class itself, so it is shared by
# everything that reads it. Work on a copy so that adding the page-load strategy here
# does not silently alter that shared template.
caps = DesiredCapabilities.CHROME.copy()
caps["pageLoadStrategy"] = "eager"  # Set page load strategy to eager

# Import additional libraries and modules
import time
import os

from bs4 import BeautifulSoup
import requests

# Custom User-Agent header passed to the requests.get() calls in this notebook
headers = {
    "User-Agent": (
        "Mozilla/5.0 (X11; CrOS x86_64 12871.102.0) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/81.0.4044.141 Safari/537.36"
    ),
}
import urllib.request

# Load the 'نماد' column of the Excel workbook and keep its values as a sorted list
tickers = sorted(pd.read_excel('لیست صندوق_های سهامی.xlsx')['نماد'].values)

# Define a function to extract the date and check for "اصلاحیه" (correction) in a text item
def date_gen(item):
    """Extract the first ``YYYY/MM/DD`` date from an element's text.

    Parameters
    ----------
    item : object with a ``.text`` attribute
        Any object whose ``text`` attribute is a string; the callers in this
        notebook pass BeautifulSoup elements.

    Returns
    -------
    tuple of (int, bool)
        ``date`` is the matched date with the slashes removed, e.g.
        ``'1398/01/31'`` becomes ``13980131``. ``eslahie`` is True when the
        text contains the word 'اصلاحیه' (correction).

    Raises
    ------
    ValueError
        If the text contains no ``YYYY/MM/DD`` pattern.
    """
    text = item.text
    # Search for the whole date pattern instead of slicing around the first '/':
    # a slash that appears earlier in the text (or no slash at all) would
    # otherwise yield a garbage slice and an opaque int() failure or wrong number.
    match = re.search(r'\d{4}/\d{2}/\d{2}', text)
    if match is None:
        raise ValueError('no YYYY/MM/DD date found in {!r}'.format(text))
    date = int(match.group().replace('/', ''))
    eslahie = 'اصلاحیه' in text
    return date, eslahie

# Define a function to check that a report was sent in the month right after its reporting date
def date_check(date, sent):
    """Tell whether ``sent`` lies in the calendar month right after ``date``.

    Both arguments are integers laid out as YYYYMMDD (the form produced by
    ``date_gen``). The day part is ignored. Months are compared modulo 12,
    so month 12 is represented as 0, and a reporting month of 12 is expected
    to be followed by month 1 of the next year.

    Returns True when the month after ``date`` equals the month of ``sent``
    and the corresponding years agree, otherwise False.
    """
    report_year, report_month = divmod(date // 100, 100)
    sent_year, sent_month = divmod(sent // 100, 100)

    expected_month = (report_month + 1) % 12
    if expected_month != sent_month % 12:
        return False

    # An expected month of 1 means the report covered month 12, so the
    # sending date belongs to the following year.
    expected_year = report_year + 1 if expected_month == 1 else report_year
    return expected_year == sent_year


In [11]:
# Define the starting URLs and web scraping configuration
link_start = 'https://codal.ir'  # Base URL; relative report links are appended to it
start_url = "https://codal.ir/ReportList.aspx?search&Symbol="  # Report-list URL up to the ticker symbol
end_url = "&LetterType=-1&FromDate=1398%2F01%2F01&Isic=46430170&AuditorRef=-1&Audited&NotAudited&IsNotAudited=false&Childs&Mains&Publisher=false&CompanyState=2&Category=3&CompanyType=-1&Consolidatable&NotConsolidatable&PageNumber="  # Fixed search filters; the page number is appended last

# The `desired_capabilities=` argument of webdriver.Chrome was deprecated in Selenium 4 and is
# no longer accepted by recent releases, so the page-load strategy is forwarded through the
# Options object instead.
options.set_capability("pageLoadStrategy", caps["pageLoadStrategy"])
driver = webdriver.Chrome(options=options)  # Initialize a headless Chrome WebDriver

try:
    # Iterate over the list of tickers
    for ticker in tickers:
        # One output directory per ticker. makedirs(exist_ok=True) also creates a missing
        # 'New_Data' parent and lets the cell be re-run without FileExistsError.
        ticker_dir = os.path.join(os.getcwd(), 'New_Data', ticker)
        os.makedirs(ticker_dir, exist_ok=True)

        a = []  # <a class="letter-title"> elements (report titles) from all result pages
        sent = []  # 'زمان ارسال' (sending time) cells from all result pages

        page_num = 1  # Total number of result pages; refined after the first page is parsed
        page = 1
        while page <= page_num:
            driver.get(start_url + ticker + end_url + str(page))  # Open the result page in the browser
            # Wait (up to 10 s) until at least one report title is present in the DOM
            WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'letter-title')))
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            if page == 1:
                try:
                    # The "last page" pager link carries the page count as its final query value
                    page_num = int(soup.find_all('li', {'title': 'آخرین صفحه'})[0].find('a')['href'].split('=')[-1])
                except (IndexError, TypeError, KeyError, ValueError):
                    # No usable pager link: treat the result list as a single page
                    page_num = 1

            a += soup.find_all('a', class_='letter-title')
            sent += soup.find_all('td', {'data-heading': 'زمان ارسال'})
            page += 1

        # Parse every title once, then split the (date, correction-flag) pairs
        parsed_titles = [date_gen(x) for x in a]
        dates = [pair[0] for pair in parsed_titles]
        eslahie = [pair[1] for pair in parsed_titles]

        # Sending dates come from the first child element of each sending-time cell
        sent_dates = [date_gen(td.findChild())[0] for td in sent]

        # Build absolute report URLs from the relative hrefs
        links = [link_start + x['href'] for x in a]

        j = 0  # Running index that keeps '_check' file names unique within this ticker
        for i, link in enumerate(links):
            r = requests.get(link, headers=headers, allow_redirects=False)  # Fetch the report page
            soup2 = BeautifulSoup(r.text, 'html.parser')

            for img in soup2.find_all('img'):
                # .get() tolerates <img> tags that have no src attribute
                if 'xls' not in img.get('src', ''):
                    continue

                # The download path is the first quoted argument of the onclick handler
                # found two levels above the Excel icon
                download_link = 'https://codal.ir/Reports/' + img.parent.parent['onclick'].split("'")[1]
                excel_file = requests.get(download_link, headers=headers, allow_redirects=True)

                if date_check(dates[i], sent_dates[i]) and not eslahie[i]:
                    # Sent in the month after the reporting date and not a correction:
                    # store under the plain reporting date
                    file_name = str(dates[i]) + '.xlsx'
                else:
                    # Correction, or sent outside the expected month: mark the file with '_check'
                    file_name = str(dates[i]) + '_check' + str(j) + '.xlsx'
                    j += 1

                with open(os.path.join(ticker_dir, file_name), 'wb') as out_file:
                    out_file.write(excel_file.content)
                break  # Only the first Excel attachment of a report is downloaded
finally:
    # Always shut the browser down, even when scraping fails part-way through
    driver.quit()


In [34]:
# Print a ticker once for every file in its New_Data folder whose name contains 'check'
for ticker in tickers:
    ticker_dir = os.getcwd() + '/New_Data/' + ticker + '/'
    flagged_files = [file_name for file_name in os.listdir(ticker_dir) if 'check' in file_name]
    for _flagged_file in flagged_files:
        print(ticker)