# WebScraping of tribunal transcript cases

This notebook demonstrates how transcripts of tribunal cases are retrieved through web scraping and then cleaned.

Following general ethical guidelines for web scraping, the scraping permissions (`robots.txt`) of the three tribunals were checked:
- [International Criminal Tribunal for the former Yugoslavia](https://www.icty.org/): the permissions can be consulted [here](https://www.icty.org/robots.txt).
- [Extraordinary Chambers in the Courts of Cambodia](https://www.eccc.gov.kh/): the permissions can be consulted [here](https://www.eccc.gov.kh/robots.txt).
- [International Criminal Tribunal for Rwanda](https://ucr.irmct.org/): no `robots.txt` file was found.

### Imports

In [6]:
# Optional one-time setup: uncomment and run these lines to install PyPDF2 and
# openpyxl with the same Python interpreter that runs this kernel.
# NOTE(review): the import cell uses PyPDF2's `PdfFileReader`, which PyPDF2 3.x
# no longer supports -- presumably PyPDF2 needs to be pinned below 3; confirm
# against the project's environment.
#import sys  
#!{sys.executable} -m pip install PyPDF2
#!{sys.executable} -m pip install openpyxl

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.org/simple, https://packagecloud.io/github/git-lfs/pypi/simple
Collecting openpyxl
  Downloading openpyxl-3.0.10-py2.py3-none-any.whl (242 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m242.1/242.1 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m[31m4.5 MB/s[0m eta [36m0:00:01[0m
[?25hCollecting et-xmlfile
  Downloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.0.10


In [7]:
%load_ext autoreload
%autoreload 2

import requests
from bs4 import BeautifulSoup
from os import listdir
from os.path import isfile, join
from PyPDF2 import PdfFileReader
import pandas as pd

import src.cleaning_transcripts as cleaning_transc

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Globals

In [8]:
# --- File handling -----------------------------------------------------------
GLB_FILE_ENCODING_UTF8 = "utf8"
GLB_FILE_WRITE_OP = "w"      # text mode; used when writing cleaned transcripts
GLB_FILE_BINARY_OP = "wb"    # binary mode; used when storing downloaded PDFs
GLB_EXTENSION_TXT = ".txt"
GLB_CHAR_NEWLINE = "\n"

# --- HTML parsing ------------------------------------------------------------
GLB_HTML_PARSER = "html.parser"
GLB_HTML_P_TAG = "p"

# --- Directories (relative paths) --------------------------------------------
GLB_PATH_OUTPUT_DIRECTORY_ICTY = "output/clean_transcripts/icty"
GLB_PATH_INPUT_DIRECTORY_ECCC = "input/transcripts/eccc"
GLB_PATH_OUTPUT_DIRECTORY_ECCC = "output/clean_transcripts/eccc"
GLB_PATH_INPUT_DIRECTORY_ICTR = "input/transcripts/ictr"

# --- Prefixes prepended to the names of the cleaned output files -------------
GLB_COURT_PREFIX_FILE_ICTY = "ICTY_"
GLB_COURT_PREFIX_FILE_ECCC = "ECCC_"

# --- Behaviour switches ------------------------------------------------------
# When True, the single-document preview cells (guarded by `if DEBUG:`) run.
DEBUG = True
# When True, the ECCC URL list is read from the Excel file; otherwise only the
# sample ECCC transcript URL is used.
GLB_GET_LIST_FILES_ECCC_FROM_EXCEL_FILE = True

## International Criminal Tribunal for the former Yugoslavia

In [None]:
# Sample ICTY transcript page (HTML). It is fetched by the DEBUG preview cell and
# is the only entry of `list_url_cases_icty` in the "Save Documents" cell.
transcript_icty_case_url = "https://www.icty.org/x/cases/tadic/trans/en/960719ed.htm"

In [None]:
if DEBUG:
    # Preview: download one ICTY transcript page, extract its <p> elements and
    # print every paragraph that is not empty after cleaning.
    # The timeout (seconds) keeps the cell from hanging on an unresponsive server.
    response = requests.get(transcript_icty_case_url, timeout=60)
    # Stop here on an HTTP error instead of parsing an error page as a transcript.
    response.raise_for_status()

    content_soup = BeautifulSoup(response.content, GLB_HTML_PARSER)

    # The transcript text is carried by the paragraph tags of the page.
    list_p = list(content_soup.find_all(GLB_HTML_P_TAG))
    print(f'Number of retrieved paragraphs of transcript {transcript_icty_case_url} is {len(list_p)}')
    print("="*50)

    counter = 0
    for paragraph in list_p:
        clean_paragraph = cleaning_transc.cleanParagraphsICFYtranscript(str(paragraph))
        # Paragraphs that are empty after cleaning are skipped.
        if clean_paragraph != cleaning_transc.GLB_EMPTY_STRING:
            counter += 1
            print(clean_paragraph)

    # Report how many paragraphs survived (previously counted but never shown).
    print(f"Num paragraphs kept <<{counter}>> reduced from {len(list_p)}")

#### Save Documents

In [None]:
list_url_cases_icty = [transcript_icty_case_url]

# For every ICTY transcript URL: download the HTML page, clean each <p> element
# and write the non-empty results, one per line, to
# <GLB_PATH_OUTPUT_DIRECTORY_ICTY>/ICTY_<id_case>.txt.
for index_case, url_html_case in enumerate(list_url_cases_icty):
    try:
        # The timeout (seconds) keeps the loop from hanging on an unresponsive server.
        response = requests.get(url_html_case, timeout=60)
        response.raise_for_status()
    except requests.RequestException as error:
        # Skip this document rather than saving an error page as a transcript.
        print(f"{index_case+1}) Could not download transcript {url_html_case}: {error}")
        continue

    content_soup = BeautifulSoup(response.content, GLB_HTML_PARSER)
    list_p = list(content_soup.find_all(GLB_HTML_P_TAG))

    # The case id is the file name of the URL without its extension.
    id_case = url_html_case[url_html_case.rindex("/")+1: url_html_case.rindex(".")]
    path_output_file = join(GLB_PATH_OUTPUT_DIRECTORY_ICTY, GLB_COURT_PREFIX_FILE_ICTY + id_case + GLB_EXTENSION_TXT)

    counter = 0
    # `with` guarantees the file is closed even if cleaning raises.
    with open(path_output_file, GLB_FILE_WRITE_OP, encoding=GLB_FILE_ENCODING_UTF8) as output_file:
        for paragraph in list_p:
            clean_paragraph = cleaning_transc.cleanParagraphsICFYtranscript(str(paragraph))
            if clean_paragraph != cleaning_transc.GLB_EMPTY_STRING:
                counter += 1
                output_file.write(clean_paragraph + GLB_CHAR_NEWLINE)

    # Report the URL actually processed in this iteration (not the sample URL).
    print(f'{index_case+1}) Number of retrieved paragraphs of transcript {url_html_case} is {len(list_p)} was reduced to {counter}')
    

## Extraordinary Chambers in the Courts of Cambodia

In [12]:
# Sample ECCC transcript (PDF). It is downloaded by the DEBUG preview cell and is
# the fallback URL list when GLB_GET_LIST_FILES_ECCC_FROM_EXCEL_FILE is False.
transcript_eccc_case_url = "https://www.eccc.gov.kh/sites/default/files/documents/courtdoc/%5Bdate-in-tz%5D/E1_41.1_TR001_20090701_Final_EN_Pub.pdf"

In [25]:
pattern_was_found = False

if DEBUG:
    # Preview: download one ECCC transcript PDF, locate the page on which the
    # transcript content begins, collect the sentences of that page and of all
    # following pages, and print every sentence that is not empty after cleaning.

    # The timeout (seconds) keeps the cell from hanging on an unresponsive server.
    response = requests.get(transcript_eccc_case_url, timeout=60)
    # Stop here on an HTTP error instead of storing an error page as a PDF.
    response.raise_for_status()
    doc_name = transcript_eccc_case_url[transcript_eccc_case_url.rindex("/")+1:]
    path_pdf = join(GLB_PATH_INPUT_DIRECTORY_ECCC, doc_name)

    # Keep a local copy of the PDF; `with` guarantees the handle is closed.
    with open(path_pdf, GLB_FILE_BINARY_OP) as pdf_file:
        pdf_file.write(response.content)

    # Extract the text of every page once, so that trying several start
    # patterns does not re-run the text extraction.
    pdf = PdfFileReader(path_pdf)
    list_text_pages = [page_pdf.extract_text() for page_pdf in pdf.pages]

    list_all_sentences = list()
    index_page_of_interest = 1
    # Try the known start-of-content patterns in order; the first one that
    # matches a page decides where the transcript begins.
    for PATTERN_BEGIN in cleaning_transc.GLB_ECCC_PATTERN_BEGIN_CONTENT_LIST:
        if pattern_was_found:
            break

        for text_page in list_text_pages:
            if not pattern_was_found and PATTERN_BEGIN not in text_page:
                continue  # still in front of the first content page

            list_aux = cleaning_transc.cleanPagePdfECCCtranscript(text_page, index_page_of_interest)

            if not pattern_was_found:
                # Same guard as in the "Save Documents" cell: a None result for
                # the candidate first page means the page is rejected and the
                # search continues (previously this raised a TypeError here).
                if list_aux is None:
                    continue
                pattern_was_found = True

            list_all_sentences.extend(list_aux)
            index_page_of_interest += 1

    if not pattern_was_found:
        print(f"Pattern was never found for document: {transcript_eccc_case_url}")
    else:
        print(f"Total num of sentences from PDF file {len(list_all_sentences)}")
        print("="*50)

        counter = 0
        for sentence in list_all_sentences:
            clean_sent = cleaning_transc.cleanSentenceECCCtranscript(sentence)
            # Sentences that are empty after cleaning are skipped.
            if clean_sent != cleaning_transc.GLB_EMPTY_STRING:
                counter += 1
                print(clean_sent)

        print(f"Num new sentences <<{counter}>> reduced from {len(list_all_sentences)}")

Total num of sentences from PDF file 2600
P R O C E E D I N G S
(Judges enter courtroom)
MR. PRESIDENT:
Please be seated.  The Court is now in session.
According to our schedule, today we're going to hear the
testimony of another survivor; the third person among the nine
survivors of S-21.
The lawyer, I note your presence.  Would you like to make any
comments?
MS. STUDZINSKY:
Mr. President, good morning.  Your Honours, good morning, dear
colleagues.
Yes, I would like to make some observations and also I'm seeking
for clarification before we hear the next survivor.
We have observed that Mr. Chum Mey yesterday was overwhelmed
sometimes when he accounted his story and he had to cry, and he
could not control his emotions any more.  He shares his
traumatization as well as the next survivor, who is my client
together with Cambodian colleagues, and he shares this situation
with other survivors, victims, civil parties and witnesses.
I would like to make a proposal.  I would like that the Chamb

#### Get list of documents (either from the Excel file or the default value)

In [36]:
# Build the list of ECCC transcript URLs to process.
if not GLB_GET_LIST_FILES_ECCC_FROM_EXCEL_FILE:
    # Default: only the single sample transcript.
    list_url_cases_eccc = [transcript_eccc_case_url]
else:
    PATH_ECCC_EXCEL_LIST_FILE_ECCC_TRANSCRIPTS = "input/html-links-ECCC.xlsx"

    # The sheet is read without a header row, so the column names are supplied here.
    df = pd.read_excel(
        PATH_ECCC_EXCEL_LIST_FILE_ECCC_TRANSCRIPTS,
        index_col=None,
        header=None,
        names=["url", "court", "case", "id_transcript", "date", "person_name"],
    )

    # Only the "url" column is needed by the download loop.
    list_url_cases_eccc = list(df["url"])

print(f"Length of elements in list_url_cases_eccc is {len(list_url_cases_eccc)}")

Length of elements in list_url_cases_eccc is 50


#### Save Documents

In [40]:
# Documents that are skipped on purpose.
# NOTE(review): the reason for the exception is not recorded in this notebook.
list_doc_names_to_skip = ["E1_300.1_TR002_20150518_Final_EN_Pub.pdf"]

# For every ECCC transcript URL: download the PDF, locate the page on which the
# transcript content begins, clean every sentence from that page onwards and
# write the non-empty results, one per line, to
# <GLB_PATH_OUTPUT_DIRECTORY_ECCC>/ECCC_<id_case>.txt.
for index_case, url_html_case in enumerate(list_url_cases_eccc):
    doc_name = url_html_case[url_html_case.rindex("/")+1:]
    # Check the skip list before downloading, so no request is wasted.
    if doc_name in list_doc_names_to_skip:
        continue

    try:
        # The timeout (seconds) keeps the loop from hanging on an unresponsive server.
        response = requests.get(url_html_case, timeout=60)
        response.raise_for_status()
    except requests.RequestException as error:
        # Skip this document rather than storing an error page as a PDF.
        print(f"{index_case+1}) Could not download transcript {url_html_case}: {error}")
        continue

    # Keep a local copy of the PDF; `with` guarantees the handle is closed.
    path_pdf = join(GLB_PATH_INPUT_DIRECTORY_ECCC, doc_name)
    with open(path_pdf, GLB_FILE_BINARY_OP) as pdf_file:
        pdf_file.write(response.content)

    # Extract the text of every page once, so that trying several start
    # patterns does not re-run the text extraction.
    pdf = PdfFileReader(path_pdf)
    list_text_pages = [page_pdf.extract_text() for page_pdf in pdf.pages]

    list_all_sentences = list()
    index_page_of_interest = 1
    pattern_was_found = False

    # Try the known start-of-content patterns in order; the first one that
    # matches a page decides where the transcript begins.
    for PATTERN_BEGIN in cleaning_transc.GLB_ECCC_PATTERN_BEGIN_CONTENT_LIST:
        if pattern_was_found:
            break

        for text_page in list_text_pages:
            if not pattern_was_found and PATTERN_BEGIN not in text_page:
                continue  # still in front of the first content page

            list_aux = cleaning_transc.cleanPagePdfECCCtranscript(text_page, index_page_of_interest)

            if not pattern_was_found:
                # A None result for the candidate first page means the page is
                # rejected; keep searching the following pages.
                if list_aux is None:
                    continue
                pattern_was_found = True

            list_all_sentences.extend(list_aux)
            index_page_of_interest += 1

    if not pattern_was_found:
        print(f"Pattern was never found for document: {url_html_case}")
        continue

    # The case id is the file name of the URL without its extension.
    id_case = url_html_case[url_html_case.rindex("/")+1: url_html_case.rindex(".")]
    path_output_file = join(GLB_PATH_OUTPUT_DIRECTORY_ECCC, GLB_COURT_PREFIX_FILE_ECCC + id_case + GLB_EXTENSION_TXT)

    counter = 0
    with open(path_output_file, GLB_FILE_WRITE_OP, encoding=GLB_FILE_ENCODING_UTF8) as output_file:
        for sentence in list_all_sentences:
            clean_paragraph = cleaning_transc.cleanSentenceECCCtranscript(sentence)
            # Sentences that are empty after cleaning are not written.
            if clean_paragraph != cleaning_transc.GLB_EMPTY_STRING:
                counter += 1
                output_file.write(clean_paragraph + GLB_CHAR_NEWLINE)

    print(f'{index_case+1}) Number of retrieved paragraphs of transcript {url_html_case} is {len(list_all_sentences)} was reduced to {counter}')
    

1) Number of retrieved paragraphs of transcript http://www.eccc.gov.kh/sites/default/files/documents/courtdoc/%5Bdate-in-tz%5D/E1_67.1_TR001_20090824_Final_EN_Pub.pdf is 2650 was reduced to 2545
2) Number of retrieved paragraphs of transcript http://www.eccc.gov.kh/sites/default/files/documents/courtdoc/%5Bdate-in-tz%5D/E1_66.1_TR001_20090820_Final_EN_Pub.pdf is 2250 was reduced to 2141
3) Number of retrieved paragraphs of transcript http://www.eccc.gov.kh/sites/default/files/documents/courtdoc/%5Bdate-in-tz%5D/E1_63.1_TR001_20090817_Final_EN_Pub.pdf is 3000 was reduced to 2874
4) Number of retrieved paragraphs of transcript http://www.eccc.gov.kh/sites/default/files/documents/courtdoc/%5Bdate-in-tz%5D/E1_42.1_TR001_20090702_Final_EN_Pub.pdf is 2475 was reduced to 2378
5) Number of retrieved paragraphs of transcript http://www.eccc.gov.kh/sites/default/files/documents/courtdoc/%5Bdate-in-tz%5D/E1_41.1_TR001_20090701_Final_EN_Pub.pdf is 2600 was reduced to 2481
6) Number of retrieved pa

43) Number of retrieved paragraphs of transcript http://www.eccc.gov.kh/sites/default/files/documents/courtdoc/2015-08-06%2021%3A13/E1_304.1_TR002_20150525_Final_EN_Pub.pdf is 2750 was reduced to 2587
45) Number of retrieved paragraphs of transcript http://www.eccc.gov.kh/sites/default/files/documents/courtdoc/2015-06-11%2016%3A05/E1_297.1_TR002_20150505_Final_EN_Pub.pdf is 1950 was reduced to 1810
46) Number of retrieved paragraphs of transcript http://www.eccc.gov.kh/sites/default/files/documents/courtdoc/%5Bdate-in-tz%5D/E1_278.1_TR002_20150317_Final_EN_Pub.pdf is 2475 was reduced to 2322
47) Number of retrieved paragraphs of transcript http://www.eccc.gov.kh/sites/default/files/documents/courtdoc/%5Bdate-in-tz%5D/E1_290.1_TR002_20150422_Final_EN_Pub.pdf is 2250 was reduced to 2135
48) Number of retrieved paragraphs of transcript http://www.eccc.gov.kh/sites/default/files/documents/courtdoc/%5Bdate-in-tz%5D/E1_277.1_TR002_20150316_Final_EN_Pub_0.pdf is 2450 was reduced to 2311
49) N

## International Criminal Tribunal for Rwanda

In [None]:
# Sample ICTR transcript (PDF); downloaded and printed by the DEBUG preview cell.
transcript_ictr_case_url = "https://ucr.irmct.org/LegalRef/CMSDocStore/Public/English/Transcript/NotIndexable/ICTR-96-04/TRS13317R0000613662.PDF"
# The URL list for a batch run is currently disabled.
#list_url_cases_ictr = [transcript_ictr_case_url]

In [None]:
if DEBUG:
    # Preview: download one ICTR transcript PDF and print the cleaned text of
    # every page, separated by a line of asterisks.

    # The timeout (seconds) keeps the cell from hanging on an unresponsive server.
    response = requests.get(transcript_ictr_case_url, timeout=60)
    # Stop here on an HTTP error instead of storing an error page as a PDF.
    response.raise_for_status()
    doc_name = transcript_ictr_case_url[transcript_ictr_case_url.rindex("/")+1:]
    path_pdf = join(GLB_PATH_INPUT_DIRECTORY_ICTR, doc_name)

    # Keep a local copy of the PDF; `with` guarantees the handle is closed.
    with open(path_pdf, GLB_FILE_BINARY_OP) as pdf_file:
        pdf_file.write(response.content)

    pdf = PdfFileReader(path_pdf)
    for page_pdf in pdf.pages:
        text_page = page_pdf.extract_text()
        print(cleaning_transc.cleanPagePdfICTRtranscript(text_page))
        print("*"*40)
    
    