In [4]:
import os
import re
import requests
import pandas as pd
from tqdm import tqdm
from PIL import Image 
from bs4 import BeautifulSoup

## Data Pipeline

### Extract Links

In [19]:
def extract_links(year_start:int, year_end:int) -> list[str]:
    """Extract links to the specific SEC case from directory based on year

    Args:
        year_start (_type_): start year of extracted document
        year_end (_type_): end year of extracted document

    Returns:
        _type_: list of links for SEC documents
    """
    site_list_cleaned = []
    for year in tqdm(range(year_start, year_end)):
        page = 0
        while True:
            links = f'https://www.sec.gov/litigation/litreleases?aId=edit-year&year={year}&page={page}'
            response = requests.get(links)
            soup = BeautifulSoup(response.text, 'html.parser')

            site_list = list(set(soup.find_all("tr", {"class": "pr-list-page-row"})))
            if len(site_list) == 0 :
                break

            for site in site_list:
                if len(site.find_all("a", {"type": "application/pdf"}, href=True)) == 1: #With only one pdf file
                    site_list_cleaned.append('https://www.sec.gov/' + site.find_all("a", href=True)[0].get('href'))
            page += 1
        
    return site_list_cleaned


In [None]:
def extract_text(site_lists:list[str])->pd.DataFrame():
    """Extract summary and pdf links from the website

    Args:
        site_lists (list[str]): _description_

    Returns:
        DataFrame: First Columns contains links of SEC file documents
                   Second Columns contains extracted summary of SEC file documents 
    """
    pdf_summary_df = pd.DataFrame(columns=['pdf_link', 'summary'])
    for site_list in tqdm(site_lists):
        response_site = requests.get(site_list)
        soup_site = BeautifulSoup(response_site.text, 'html.parser')
        paragraph_list = soup_site.find_all("p", attrs = {'class' :None})
        paragraph_combine = ''.join([str(paragraph) for paragraph in paragraph_list])
        if len(soup_site.find_all("a", href=re.compile("complaints"))) != 0:
            pdf_link = 'https://www.sec.gov/' + soup_site.find_all("a", href=re.compile("complaints"))[-1].get('href')
            pdf_summary_df.loc[len(pdf_summary_df.index)] = [pdf_link, paragraph_combine]
    return pdf_summary_df

### Apply OCR to extract text from pdf link

In [10]:
from pathlib import Path
import pytesseract as tess  
from pdf2image import convert_from_path
from PIL import Image
tess.pytesseract.tesseract_cmd = r'C:/Program Files/Tesseract-OCR/tesseract.exe'

In [37]:
def read_pdf(file_name, txt_folder):
    """Extract text from pdf using OCR

    Args:
        file_name (_type_): input document path
        txt_folder (_type_): output text file target path

    Returns:
        None
    """
    # Store all pages of one file here:
    pages = []
    output_file_name = txt_folder + '/' + file_name.split('/')[-1] + ".txt"  # Generating output file name
    if not os.path.isfile(output_file_name):
      try:
          # Convert the PDF file to a list of PIL images:
          images = convert_from_path(file_name)  

          # Extract text from each image:
          for i, image in enumerate(images):
            # Generating filename for each image
              filename = "page_" + str(i) + "_" + os.path.basename(file_name) + ".jpeg"  
              image.save(filename, "JPEG")  
            # Saving each image as JPEG
              text = tess.image_to_string(Image.open(filename))  # Extracting text from each image using pytesseract
              os.remove(filename)
              pages.append(text)
            # Appending extracted text to pages list

      except Exception as e:
          print(e)

      # Write the extracted text to a file:
      with open(output_file_name, "w") as f:
          f.write("\n".join(pages))  
        # Writing extracted text to output file

    return output_file_name

In [38]:
def read_cropped_pdf(file_name, txt_folder):
    """Extract text from pdf using OCR with cropped pictures

    Args:
        file_name (_type_): input document path
        txt_folder (_type_): output text file target path

    Returns:
        None
    """
    # Store all pages of one file here:
    pages = []
    output_file_name = txt_folder + '/' + file_name.split('/')[-1] + ".txt"  # Generating output file name
    if not os.path.isfile(output_file_name):
      try:
          # Convert the PDF file to a list of PIL images:
          images = convert_from_path(file_name)  

          # Extract text from each image:
          for i, image in enumerate(images):
              width, height = image.size
              left = 180
              top = 130
              right = width - 100
              bottom = height - 230
              im1 = image.crop((left, top, right, bottom))
            # Generating filename for each image
              filename = "page_" + str(i) + "_" + os.path.basename(file_name) + ".jpeg"  
              im1.save(filename, "JPEG")  
            # Saving each image as JPEG
              text = tess.image_to_string(Image.open(filename))  # Extracting text from each image using pytesseract
              os.remove(filename)
              pages.append(text)
            # Appending extracted text to pages list

      except Exception as e:
          print(e)

      # Write the extracted text to a file:
      with open(output_file_name, "w") as f:
          f.write("\n".join(pages))  
        # Writing extracted text to output file

    return output_file_name

### Small Demo on OCR

In [39]:
pdf_summary_df = pd.read_csv('final_version_withouttext.csv', index_col=[0])
pdf_summary_df.head(5)

Unnamed: 0,pdf_link,summary
0,https://www.sec.gov//litigation/complaints/200...,"<p align=""right"">CORRECTED</p><p>The Securitie..."
1,https://www.sec.gov//litigation/complaints/200...,<p>The United States Securities and Exchange C...
2,https://www.sec.gov//litigation/complaints/200...,<p>The Securities and Exchange Commission anno...
3,https://www.sec.gov//litigation/complaints/200...,"<p>The Securities and Exchange Commission (""Co..."
4,https://www.sec.gov//litigation/complaints/200...,"<p>The Securities and Exchange Commission (""Co..."


### Cropped Image first 100

In [34]:
pdf_summary_df_cropped_100 = pd.read_csv('final_version_withtext.csv', index_col=[0])
pdf_summary_df_cropped_100 = pdf_summary_df_cropped_100.iloc[:1000,:]
pdf_summary_df_cropped_100 = pdf_summary_df_cropped_100.reset_index(drop=True)
pdf_summary_df_cropped_100['text_extracted'] = ''
pdf_summary_df_cropped_100

Unnamed: 0,pdf_link,summary,text_extracted
0,https://www.sec.gov//litigation/complaints/200...,"<p align=""right"">CORRECTED</p><p>The Securitie...",
1,https://www.sec.gov//litigation/complaints/200...,<p>The United States Securities and Exchange C...,
2,https://www.sec.gov//litigation/complaints/200...,<p>The Securities and Exchange Commission anno...,
3,https://www.sec.gov//litigation/complaints/200...,"<p>The Securities and Exchange Commission (""Co...",
4,https://www.sec.gov//litigation/complaints/200...,"<p>The Securities and Exchange Commission (""Co...",
...,...,...,...
995,https://www.sec.gov//litigation/complaints/201...,<p>The Securities and Exchange Commission toda...,
996,https://www.sec.gov//litigation/complaints/201...,"<p>On November 14, the Securities and Exchange...",
997,https://www.sec.gov//litigation/complaints/201...,<p>The Securities and Exchange Commission toda...,
998,https://www.sec.gov//litigation/complaints/201...,"<p>On October 10, 2013, the Securities and Exc...",


In [35]:
tess.pytesseract.tesseract_cmd = 'C:/Program Files/Tesseract-OCR/tesseract.exe'
for index, row in tqdm(pdf_summary_df_cropped_100.iterrows()):
    
    pdf_path = os.path.join("Documents_pdf_cropped", (row['pdf_link'].split('/')[-1]))
    pdf_path = pdf_path.replace('\\', '/')
    try:
        response_pdf = requests.get(row['pdf_link'])
        with open(pdf_path, 'wb') as f:
            f.write(response_pdf.content)

        read_cropped_pdf(pdf_path, 'Documents_txt_cropped')
        
        row['text_extracted'] = open('./Documents_txt_cropped/'+pdf_path.split('/')[-1] +'.txt', 'r').read()
        pdf_summary_df_cropped_100.iloc[index, -1] =  row['text_extracted']
    except Exception as e:
        print(e)
        

988it [2:42:38, 10.99s/it]

Unable to get page count.
Syntax Error (2): Illegal character <21> in hex string
Syntax Error (4): Illegal character <4f> in hex string
Syntax Error (6): Illegal character <54> in hex string
Syntax Error (7): Illegal character <59> in hex string
Syntax Error (8): Illegal character <50> in hex string
Syntax Error (11): Illegal character <68> in hex string
Syntax Error (12): Illegal character <74> in hex string
Syntax Error (13): Illegal character <6d> in hex string
Syntax Error (14): Illegal character <6c> in hex string
Syntax Error (18): Illegal character <68> in hex string
Syntax Error (19): Illegal character <74> in hex string
Syntax Error (20): Illegal character <6d> in hex string
Syntax Error (21): Illegal character <6c> in hex string
Syntax Error (23): Illegal character <6c> in hex string
Syntax Error (25): Illegal character <6e> in hex string
Syntax Error (26): Illegal character <67> in hex string
Syntax Error (27): Illegal character <3d> in hex string
Syntax Error (28): Illegal 

1000it [2:47:01, 10.02s/it]


### Cleaning

In [40]:
def line_break(sample:str) ->str:
    """Delete extra line breaks

    Args:
        sample (str): original text

    Returns:
        str: cleaned text
    """
    pattern = r'(?<!\n)\n(?!\\n)'
    sample = re.sub(pattern, ' ', sample)
    return sample

def pages(sample:str) ->str:
    """Delete the page number

    Args:
        sample (str): original text

    Returns:
        str: cleaned text
    """
    pattern = r'\nCase\s*\d+:\s*\d+-cv-\d+\s*Document\s*\d+\s*Filed\s*\d+/\d+/\d+\s*Page\s*\d+\s*of\s*\d+\s*\n'
    sample = re.sub(pattern, ' ', sample)
    sample = sample.replace('|', '')
    return sample

def html_to_text(sample):
    """Delete the HTML symbols
    Args:
        sample (str): original text

    Returns:
        str: cleaned text
    """
    soup = BeautifulSoup(sample, 'html.parser')
    plain_text = soup.get_text()
    return plain_text

In [7]:
pdf_summary_df_cropped_100 = pd.read_csv('../data/final_version_cropped_first1000.csv')
pdf_summary_df_cropped_100['text_extracted'] = [line_break(str(ele)) for ele in pdf_summary_df_cropped_100['text_extracted']]
pdf_summary_df_cropped_100['text_extracted'] = [pages(ele) for ele in pdf_summary_df_cropped_100['text_extracted']]
pdf_summary_df_cropped_100['text_extracted'] = [html_to_text(ele) for ele in pdf_summary_df_cropped_100['text_extracted']]
pdf_summary_df_cropped_100['summary'] = [line_break(str(ele)) for ele in pdf_summary_df_cropped_100['summary']]
pdf_summary_df_cropped_100['summary'] = [pages(ele) for ele in pdf_summary_df_cropped_100['summary']]
pdf_summary_df_cropped_100['summary'] = [html_to_text(ele) for ele in pdf_summary_df_cropped_100['summary']]

pdf_summary_df_cropped_100.to_csv('../data/final_version_cropped_first1000.csv')