# Web scraping of tribunal case transcripts

This notebook demonstrates how tribunal case transcripts are retrieved through web scraping and then cleaned.

Following general ethical guidelines for web scraping, the crawling permissions (`robots.txt`) of the three tribunals' websites were checked:
- [International Criminal Tribunal for the former Yugoslavia](https://www.icty.org/), permission can be fetched [here](https://www.icty.org/robots.txt)
- [Extraordinary Chambers in the Courts of Cambodia](https://www.eccc.gov.kh/), permission can be fetched [here](https://www.eccc.gov.kh/robots.txt)
- [International Criminal Tribunal for Rwanda](https://ucr.irmct.org/), no robots.txt file was found

### Imports

In [None]:
#import sys  
#!{sys.executable} -m pip install PyPDF2

In [1]:
%load_ext autoreload
%autoreload 2

import requests
from bs4 import BeautifulSoup
from os import listdir
from os.path import isfile, join
from PyPDF2 import PdfFileReader

import src.cleaning_transcripts as cleaning_transc

### Globals

In [2]:
# File handling: text encoding and open() modes used when writing files
GLB_FILE_ENCODING_UTF8 = "utf8"
GLB_FILE_WRITE_OP = "w"
# Binary write mode, used to store the downloaded PDF bytes
GLB_FILE_BINARY_OP = "wb"

# HTML parsing: tag collected from each page and the parser given to BeautifulSoup
GLB_HTML_P_TAG = "p"
GLB_HTML_PARSER = "html.parser"

# Directories (relative to the notebook) for downloaded inputs and cleaned outputs
GLB_PATH_OUTPUT_DIRECTORY_ICTY = "output/clean_transcripts/icty"
GLB_PATH_INPUT_DIRECTORY_ECCC = "input/transcripts/eccc"
GLB_PATH_OUTPUT_DIRECTORY_ECCC = "output/clean_transcripts/eccc"
GLB_PATH_INPUT_DIRECTORY_ICTR = "input/transcripts/ictr"
# Extension of the cleaned transcript files
GLB_EXTENSION_TXT = ".txt"

# Line terminator appended after every cleaned paragraph/sentence written to disk
GLB_CHAR_NEWLINE = "\n"

# Per-tribunal prefixes of the cleaned output file names
GLB_COURT_PREFIX_FILE_ICTY = "ICTY_"
GLB_COURT_PREFIX_FILE_ECCC = "ECCC_"

# When True, the exploratory cells guarded by `if DEBUG:` are executed
DEBUG = True

## International Criminal Tribunal for the former Yugoslavia

In [None]:
# URL of the ICTY transcript page (HTML) used as the example in the cells below
transcript_icty_case_url = "https://www.icty.org/x/cases/tadic/trans/en/960719ed.htm"

In [None]:
if DEBUG:
    # Exploratory run for the "International Criminal Tribunal for the former
    # Yugoslavia": download one transcript page, collect its <p> elements and
    # print every paragraph that is non-empty after cleaning.
    response = requests.get(transcript_icty_case_url)
    # Fail loudly on HTTP errors instead of silently parsing an error page.
    response.raise_for_status()

    # Same parser and tag constants as the "Save Documents" cell, so both
    # cells stay in sync if the configuration changes.
    content_soup = BeautifulSoup(response.content, GLB_HTML_PARSER)

    # Uncomment to inspect the raw page in HTML format
    #print(content_soup.prettify())

    # The transcript text is carried by the paragraph elements of the page
    list_p = list(content_soup.find_all(GLB_HTML_P_TAG))
    print(f'Number of retrieved paragraphs of transcript {transcript_icty_case_url} is {len(list_p)}')
    print("="*50)

    # Number of paragraphs that survive cleaning
    counter = 0
    for paragraph in list_p:
        clean_paragraph = cleaning_transc.cleanParagraphsICFYtranscript(str(paragraph))
        # Paragraphs reduced to the empty string by the cleaner are dropped
        if clean_paragraph != cleaning_transc.GLB_EMPTY_STRING:
            counter += 1
            print(clean_paragraph)

#### Save Documents

In [None]:
# URLs of the ICTY transcript pages to download, clean and store as text files
list_url_cases_icty = [transcript_icty_case_url]

for index_case, url_html_case in enumerate(list_url_cases_icty):
    response = requests.get(url_html_case)
    # Fail loudly on HTTP errors instead of cleaning and saving an error page.
    response.raise_for_status()

    content_soup = BeautifulSoup(response.content, GLB_HTML_PARSER)
    list_p = list(content_soup.find_all(GLB_HTML_P_TAG))

    # Case id = last path segment of the URL without its extension
    id_case = url_html_case[url_html_case.rindex("/")+1: url_html_case.rindex(".")]
    output_path = join(GLB_PATH_OUTPUT_DIRECTORY_ICTY, GLB_COURT_PREFIX_FILE_ICTY + id_case + GLB_EXTENSION_TXT)

    # Number of paragraphs that survive cleaning and are written to the file
    counter = 0
    # `with` closes the file even if cleaning a paragraph raises.
    with open(output_path, GLB_FILE_WRITE_OP, encoding=GLB_FILE_ENCODING_UTF8) as f:
        for paragraph in list_p:
            clean_paragraph = cleaning_transc.cleanParagraphsICFYtranscript(str(paragraph))
            # Paragraphs reduced to the empty string by the cleaner are dropped
            if clean_paragraph != cleaning_transc.GLB_EMPTY_STRING:
                counter += 1
                f.write(clean_paragraph + GLB_CHAR_NEWLINE)

    # Report the URL actually processed in this iteration (the loop variable),
    # not the single example URL defined earlier in the notebook.
    print(f'{index_case+1}) Number of retrieved paragraphs of transcript {url_html_case} is {len(list_p)} was reduced to {counter}')
    

## Extraordinary Chambers in the Courts of Cambodia

In [3]:
# URL of the ECCC transcript (PDF) used as the example in the cells below
transcript_eccc_case_url = "https://www.eccc.gov.kh/sites/default/files/documents/courtdoc/%5Bdate-in-tz%5D/E1_41.1_TR001_20090701_Final_EN_Pub.pdf"

In [8]:
if DEBUG:
    # Exploratory run for the "Extraordinary Chambers in the Courts of Cambodia":
    # download one transcript PDF, extract its text page by page and print
    # every sentence that is non-empty after cleaning.
    response = requests.get(transcript_eccc_case_url)
    # Fail loudly on HTTP errors instead of saving an error page as a PDF.
    response.raise_for_status()

    # The PDF is stored under the last path segment of its URL
    doc_name = transcript_eccc_case_url[transcript_eccc_case_url.rindex("/")+1:]
    pdf_path = join(GLB_PATH_INPUT_DIRECTORY_ECCC, doc_name)
    # `with` closes the file even if the write fails.
    with open(pdf_path, GLB_FILE_BINARY_OP) as f:
        f.write(response.content)

    # Get content of the PDF
    pdf = PdfFileReader(pdf_path)
    #print(f"Number of pages {len(pdf.pages)}")

    list_all_sentences = list()
    # 1-based position of the current page among the pages of interest;
    # passed to the page cleaner together with the page text.
    index_page_of_interest = 1
    pattern_was_found = False
    for page_pdf in pdf.pages:
        text_page = page_pdf.extract_text()

        # Every page before the first one containing the begin-of-content
        # pattern is skipped; that page and all later ones are processed.
        if not pattern_was_found:
            if cleaning_transc.GLB_ECCC_PATTERN_BEGIN_CONTENT_OF_INTEREST not in text_page:
                continue
            pattern_was_found = True

        # extend() appends in place instead of rebuilding the list per page
        list_all_sentences.extend(cleaning_transc.cleanPagePdfECCCtranscript(text_page, index_page_of_interest))
        index_page_of_interest += 1

    print(f"Total num of sentences from PDF file {len(list_all_sentences)}")
    print("="*50)

    # Number of sentences that survive cleaning
    counter = 0
    for e in list_all_sentences:
        clean_sent = cleaning_transc.cleanSentenceECCCtranscript(e)
        # Sentences reduced to the empty string by the cleaner are dropped
        if clean_sent != cleaning_transc.GLB_EMPTY_STRING:
            counter += 1
            print(clean_sent)

Total num of sentences from PDF file 2600
1   P R O C E E D I N G S
2   (Judges enter courtroom)
3
4   MR. PRESIDENT:
5   Please be seated.  The Court is now in session.
6   According to our schedule, today we're going to hear the
7   testimony of another survivor; the third person among the nine
8   survivors of S-21.
9   The lawyer, I note your presence.  Would you like to make any
10   comments?
11   MS. STUDZINSKY:
12   Mr. President, good morning.  Your Honours, good morning, dear
13   colleagues.
14   Yes, I would like to make some observations and also I'm seeking
15   for clarification before we hear the next survivor.
16   We have observed that Mr. Chum Mey yesterday was overwhelmed
17   sometimes when he accounted his story and he had to cry, and he
18   could not control his emotions any more.  He shares his
19   traumatization as well as the next survivor, who is my client
20   together with Cambodian colleagues, and he shares this situation
21   with other survivors, victi

#### Save Documents

In [None]:
# URLs of the ECCC transcript PDFs to download, clean and store as text files
list_url_cases_eccc = [transcript_eccc_case_url]

for index_case, url_html_case in enumerate(list_url_cases_eccc):
    response = requests.get(url_html_case)
    # Fail loudly on HTTP errors instead of saving an error page as a PDF.
    response.raise_for_status()

    # Name the PDF after the URL processed in THIS iteration; deriving it from
    # the single example URL would make every case overwrite the same file.
    doc_name = url_html_case[url_html_case.rindex("/")+1:]
    pdf_path = join(GLB_PATH_INPUT_DIRECTORY_ECCC, doc_name)

    # Write PDF; `with` closes the file even if the write fails.
    with open(pdf_path, GLB_FILE_BINARY_OP) as f:
        f.write(response.content)

    # Get content of the PDF
    pdf = PdfFileReader(pdf_path)

    list_all_sentences = list()
    # 1-based position of the current page among the pages of interest;
    # passed to the page cleaner together with the page text.
    index_page_of_interest = 1
    pattern_was_found = False
    for page_pdf in pdf.pages:
        text_page = page_pdf.extract_text()

        # Every page before the first one containing the begin-of-content
        # pattern is skipped; that page and all later ones are processed.
        if not pattern_was_found:
            if cleaning_transc.GLB_ECCC_PATTERN_BEGIN_CONTENT_OF_INTEREST not in text_page:
                continue
            pattern_was_found = True

        # extend() appends in place instead of rebuilding the list per page
        list_all_sentences.extend(cleaning_transc.cleanPagePdfECCCtranscript(text_page, index_page_of_interest))
        index_page_of_interest += 1

    # Case id = last path segment of the URL without its extension
    id_case = url_html_case[url_html_case.rindex("/")+1: url_html_case.rindex(".")]
    output_path = join(GLB_PATH_OUTPUT_DIRECTORY_ECCC, GLB_COURT_PREFIX_FILE_ECCC + id_case + GLB_EXTENSION_TXT)

    # Number of sentences that survive cleaning and are written to the file
    counter = 0
    with open(output_path, GLB_FILE_WRITE_OP, encoding=GLB_FILE_ENCODING_UTF8) as f:
        for sentence in list_all_sentences:
            clean_paragraph = cleaning_transc.cleanSentenceECCCtranscript(sentence)
            # Sentences reduced to the empty string by the cleaner are dropped
            if clean_paragraph != cleaning_transc.GLB_EMPTY_STRING:
                counter += 1
                f.write(clean_paragraph + GLB_CHAR_NEWLINE)

    print(f'{index_case+1}) Number of retrieved paragraphs of transcript {url_html_case} is {len(list_all_sentences)} was reduced to {counter}')
    

## International Criminal Tribunal for Rwanda

In [None]:
# URL of the ICTR transcript (PDF) used as the example in the cell below
transcript_ictr_case_url = "https://ucr.irmct.org/LegalRef/CMSDocStore/Public/English/Transcript/NotIndexable/ICTR-96-04/TRS13317R0000613662.PDF"
#list_url_cases_ictr = [transcript_ictr_case_url]

In [None]:
if DEBUG:
    # Exploratory run for the "International Criminal Tribunal for Rwanda":
    # download one transcript PDF and print the cleaned text of every page.
    response = requests.get(transcript_ictr_case_url)
    # Fail loudly on HTTP errors instead of saving an error page as a PDF.
    response.raise_for_status()

    # The PDF is stored under the last path segment of its URL
    doc_name = transcript_ictr_case_url[transcript_ictr_case_url.rindex("/")+1:]
    pdf_path = join(GLB_PATH_INPUT_DIRECTORY_ICTR, doc_name)
    # `with` closes the file even if the write fails.
    with open(pdf_path, GLB_FILE_BINARY_OP) as f:
        f.write(response.content)

    # Get content of the PDF
    pdf = PdfFileReader(pdf_path)

    # To inspect the raw text of a single page instead:
    #print(pdf.pages[0].extract_text())

    for page_pdf in pdf.pages:
        text_page = page_pdf.extract_text()
        print(cleaning_transc.cleanPagePdfICTRtranscript(text_page))
        # Visual separator between pages
        print("*"*40)
    
    