# Web scraping of tribunal case transcripts

This notebook shows how transcripts of tribunal cases are retrieved through web scraping and then cleaned.

Following general ethical guidelines for web scraping, the crawling permissions (`robots.txt`) of the three tribunals' websites were checked:
- [International Criminal Tribunal for the former Yugoslavia](https://www.icty.org/): the permissions can be found [here](https://www.icty.org/robots.txt).
- [Extraordinary Chambers in the Courts of Cambodia](https://www.eccc.gov.kh/): the permissions can be found [here](https://www.eccc.gov.kh/robots.txt).
- [International Criminal Tribunal for Rwanda](https://ucr.irmct.org/): no `robots.txt` file was found.

### Imports

In [None]:
%load_ext autoreload
%autoreload 2

import requests
from bs4 import BeautifulSoup
from os import listdir
from os.path import isfile, join
from PyPDF2 import PdfFileReader
import pandas as pd
import docx
import textract

import src.cleaning_transcripts as cleaning_transc

### Globals

In [None]:
# Encoding and modes passed to open(): text files are written as UTF-8,
# downloaded documents are written in binary mode.
GLB_FILE_ENCODING_UTF8 = "utf8"
GLB_FILE_WRITE_OP = "w"
GLB_FILE_BINARY_OP = "wb"

# BeautifulSoup settings: the tag that is searched for and the parser backend.
GLB_HTML_P_TAG = "p"
GLB_HTML_PARSER = "html.parser"

# Relative directories. Input directories receive the downloaded source
# documents; output directories receive the cleaned .txt transcripts.
# ICTY has no input directory because its HTML pages are parsed in memory.
GLB_PATH_OUTPUT_DIRECTORY_ICTY = "output/clean_transcripts/icty"
GLB_PATH_INPUT_DIRECTORY_ECCC = "input/transcripts/eccc"
GLB_PATH_OUTPUT_DIRECTORY_ECCC = "output/clean_transcripts/eccc"
GLB_PATH_INPUT_DIRECTORY_ICTR = "input/transcripts/ictr"
GLB_PATH_OUTPUT_DIRECTORY_ICTR = "output/clean_transcripts/ictr"
GLB_EXTENSION_TXT = ".txt"

# Separator appended after every cleaned paragraph/sentence written to disk.
GLB_CHAR_NEWLINE = "\n"

# Prefixes of the output file names, one per tribunal.
GLB_COURT_PREFIX_FILE_ICTY = "ICTY_"
GLB_COURT_PREFIX_FILE_ECCC = "ECCC_"
GLB_COURT_PREFIX_FILE_ICTR = "ICTR_"

# Enables the single-document preview cells (the `if DEBUG:` blocks).
DEBUG = True
# When True, the URL list of the tribunal is read from its Excel file;
# otherwise a one-element list holding an example URL is used.
GLB_GET_LIST_FILES_ECCC_FROM_EXCEL_FILE = True
GLB_GET_LIST_FILES_ICTR_FROM_EXCEL_FILE = True
GLB_GET_LIST_FILES_ICTY_FROM_EXCEL_FILE = True

## International Criminal Tribunal for the former Yugoslavia

In [None]:
# Example ICTY transcript page (HTML); the DEBUG preview cell downloads and cleans this single document.
transcript_icty_case_url = "https://www.icty.org/x/cases/milan_lukic_sredoje_lukic/trans/en/080919ED.htm" #Example

In [None]:
if DEBUG:
    # Preview: download the example ICTY transcript and print its cleaned paragraphs.
    response = requests.get(transcript_icty_case_url)
    content_soup = BeautifulSoup(response.content, "html.parser")

    # The transcript text is read from the <p> elements of the page.
    list_p = list(content_soup.find_all("p"))
    print(f'Number of retrieved paragraphs of transcript {transcript_icty_case_url} is {len(list_p)}')
    print("=" * 50)

    # Print every paragraph that is not reduced to the empty-string marker by the cleaner.
    counter = 0
    for raw_paragraph in list_p:
        cleaned = cleaning_transc.cleanParagraphsICFYtranscript(str(raw_paragraph))
        if cleaned == cleaning_transc.GLB_EMPTY_STRING:
            continue
        counter += 1
        print(f"{cleaned}")

#### Get list of documents (either from excel file or default values)

In [None]:
# Build the list of ICTY transcript URLs to process.
if GLB_GET_LIST_FILES_ICTY_FROM_EXCEL_FILE:
    PATH_ICTY_EXCEL_LIST_FILE_ICTY_TRANSCRIPTS = "input/html-links-ICTY.xlsx"

    # The sheet is read without a header row; the column names are supplied here.
    df = pd.read_excel(PATH_ICTY_EXCEL_LIST_FILE_ICTY_TRANSCRIPTS, index_col=None, header=None, names=["url", "court", "accused", "case", "date", "witness"])

    list_url_cases_icty = list(df["url"])
else:
    # Fall back to the single ICTY example. The ECCC example URL must not be
    # used here: it points to a PDF of another tribunal and is only defined
    # further down, so it raises NameError on a fresh kernel.
    list_url_cases_icty = [transcript_icty_case_url]

print(f"Length of elements in list_url_cases_icty is {len(list_url_cases_icty)}")

#### Save Documents

In [None]:
# Download every ICTY transcript page, clean its paragraphs and write the
# non-empty ones to "<output dir>/ICTY_<document id>.txt", one per line.
for index_case, url_html_case in enumerate(list_url_cases_icty):
    response = requests.get(url_html_case)
    if not response.ok:
        # Do not store an HTTP error page as if it were a transcript.
        print(f'{index_case+1}) Skipping {url_html_case}: HTTP status {response.status_code}')
        continue

    content_soup = BeautifulSoup(response.content, GLB_HTML_PARSER)
    list_p = list(content_soup.find_all(GLB_HTML_P_TAG))

    # The document id is the file name of the URL without its extension.
    id_case = url_html_case[url_html_case.rindex("/")+1: url_html_case.rindex(".")]
    path_output = join(GLB_PATH_OUTPUT_DIRECTORY_ICTY, GLB_COURT_PREFIX_FILE_ICTY + id_case + GLB_EXTENSION_TXT)

    counter = 0
    # The context manager closes the file even when cleaning a paragraph raises.
    with open(path_output, GLB_FILE_WRITE_OP, encoding=GLB_FILE_ENCODING_UTF8) as f:
        for paragraph in list_p:
            clean_paragraph = cleaning_transc.cleanParagraphsICFYtranscript(str(paragraph))
            if clean_paragraph != cleaning_transc.GLB_EMPTY_STRING:
                counter += 1
                f.write(clean_paragraph + GLB_CHAR_NEWLINE)

    print(f'{index_case+1}) Number of retrieved paragraphs of transcript {url_html_case} is {len(list_p)} was reduced to {counter}')
    

In [None]:
# URLs in `df` (the most recently loaded URL table) that repeat an earlier row.
duplicated_urls = df.loc[df["url"].duplicated(), "url"]
print(f'Duplicated values: {duplicated_urls.count()}')
duplicated_urls

## Extraordinary Chambers in the Courts of Cambodia

In [None]:
# Example ECCC transcript (PDF); used by the DEBUG preview cell and as the fallback URL list when the Excel list is disabled.
transcript_eccc_case_url = "https://www.eccc.gov.kh/sites/default/files/documents/courtdoc/%5Bdate-in-tz%5D/E1_41.1_TR001_20090701_Final_EN_Pub.pdf" #Example

In [None]:
str_tmp = ''
pattern_was_found = False

if DEBUG:
    # Preview: download the example ECCC transcript (PDF), extract its text
    # starting at the first page that contains one of the "begin of content"
    # patterns, and print the cleaned sentences.
    counter = 0
    list_all_sentences = list()

    response = requests.get(transcript_eccc_case_url)
    doc_name = transcript_eccc_case_url[transcript_eccc_case_url.rindex("/")+1:]
    path_pdf = join(GLB_PATH_INPUT_DIRECTORY_ECCC, doc_name)

    # The context manager closes the file even if the write fails.
    with open(path_pdf, GLB_FILE_BINARY_OP) as f:
        f.write(response.content)

    # Extract the text of every page once instead of once per pattern.
    pdf = PdfFileReader(path_pdf)
    page_texts = [page_pdf.extract_text() for page_pdf in pdf.pages]

    index_page_of_interest = 1
    for PATTERN_BEGIN in cleaning_transc.GLB_ECCC_PATTERN_BEGIN_CONTENT_LIST:
        if pattern_was_found:
            break

        for text_page in page_texts:
            if not pattern_was_found:
                # Still looking for the page on which the content starts.
                if PATTERN_BEGIN not in text_page:
                    continue

                list_aux = cleaning_transc.cleanPagePdfECCCtranscript(text_page, index_page_of_interest)
                if list_aux is None:
                    # Same rule as the save loop: a start page that the cleaner
                    # rejects does not count as a match, so keep searching.
                    continue
                pattern_was_found = True
            else:
                # Every page after the start page belongs to the content.
                list_aux = cleaning_transc.cleanPagePdfECCCtranscript(text_page, index_page_of_interest)

            # Extend in place; rebuilding the list per page is quadratic.
            list_all_sentences.extend(list_aux)
            index_page_of_interest += 1

    if not pattern_was_found:
        print(f"Pattern was never found for document: {transcript_eccc_case_url}")
    else:
        print(f"Total num of sentences from PDF file {len(list_all_sentences)}")
        print("="*50)

        for e in list_all_sentences:
            clean_sent = cleaning_transc.cleanSentenceECCCtranscript(e)
            if clean_sent != cleaning_transc.GLB_EMPTY_STRING:
                counter += 1
                print(clean_sent)

        print(f"Num new sentences <<{counter}>> reduced from {len(list_all_sentences)}")

#### Get list of documents (either from excel file or default values)

In [None]:
# Build the list of ECCC transcript URLs: a single example when the Excel
# list is disabled, otherwise the "url" column of the Excel sheet.
if not GLB_GET_LIST_FILES_ECCC_FROM_EXCEL_FILE:
    list_url_cases_eccc = [transcript_eccc_case_url]
else:
    PATH_ECCC_EXCEL_LIST_FILE_ECCC_TRANSCRIPTS = "input/html-links-ECCCv1.xlsx"

    # The sheet is read without a header row; the column names are supplied here.
    eccc_column_names = ["url", "court", "case", "id_transcript", "date", "person_name"]
    df = pd.read_excel(
        PATH_ECCC_EXCEL_LIST_FILE_ECCC_TRANSCRIPTS,
        index_col=None,
        header=None,
        names=eccc_column_names,
    )

    list_url_cases_eccc = list(df["url"])

print(f"Length of elements in list_url_cases_eccc is {len(list_url_cases_eccc)}")

#### Save Documents

In [None]:
counter = 0
pattern_was_found = False

# For every ECCC transcript: download the PDF, extract the text starting at
# the first page that contains one of the "begin of content" patterns, clean
# the sentences and write the non-empty ones to
# "<output dir>/ECCC_<document id>.txt", one per line.
for index_case, url_html_case in enumerate(list_url_cases_eccc):
    list_all_sentences = list()

    doc_name = url_html_case[url_html_case.rindex("/")+1:]
    # Checked before downloading so that excluded documents cost no request.
    # NOTE(review): the reason for the exclusion is not recorded in this notebook.
    if doc_name in ["E1_300.1_TR002_20150518_Final_EN_Pub.pdf"]: #Exceptions
        continue

    response = requests.get(url_html_case)
    if not response.ok:
        # Do not store an HTTP error page as if it were a PDF.
        print(f'{index_case+1}) Skipping {url_html_case}: HTTP status {response.status_code}')
        continue

    # Write PDF (the context manager closes the file even if the write fails)
    path_pdf = join(GLB_PATH_INPUT_DIRECTORY_ECCC, doc_name)
    with open(path_pdf, GLB_FILE_BINARY_OP) as f:
        f.write(response.content)

    # Extract the text of every page once instead of once per pattern.
    pdf = PdfFileReader(path_pdf)
    page_texts = [page_pdf.extract_text() for page_pdf in pdf.pages]

    index_page_of_interest = 1
    pattern_was_found = False

    for PATTERN_BEGIN in cleaning_transc.GLB_ECCC_PATTERN_BEGIN_CONTENT_LIST:
        if pattern_was_found:
            break

        for text_page in page_texts:
            if not pattern_was_found:
                # Still looking for the page on which the content starts.
                if PATTERN_BEGIN not in text_page:
                    continue

                list_aux = cleaning_transc.cleanPagePdfECCCtranscript(text_page, index_page_of_interest)
                if list_aux is None:
                    # A start page that the cleaner rejects does not count
                    # as a match, so keep searching on the following pages.
                    continue
                pattern_was_found = True
            else:
                # Every page after the start page belongs to the content.
                list_aux = cleaning_transc.cleanPagePdfECCCtranscript(text_page, index_page_of_interest)

            # Extend in place; rebuilding the list per page is quadratic.
            list_all_sentences.extend(list_aux)
            index_page_of_interest += 1

    if not pattern_was_found:
        print(f"Pattern was never found for document: {url_html_case}")
        continue

    # The document id is the file name of the URL without its extension.
    id_case = url_html_case[url_html_case.rindex("/")+1: url_html_case.rindex(".")]
    path_output = join(GLB_PATH_OUTPUT_DIRECTORY_ECCC, GLB_COURT_PREFIX_FILE_ECCC + id_case + GLB_EXTENSION_TXT)

    counter = 0
    with open(path_output, GLB_FILE_WRITE_OP, encoding=GLB_FILE_ENCODING_UTF8) as f:
        for sentence in list_all_sentences:
            clean_paragraph = cleaning_transc.cleanSentenceECCCtranscript(sentence)
            if clean_paragraph != cleaning_transc.GLB_EMPTY_STRING:
                counter += 1
                f.write(clean_paragraph + GLB_CHAR_NEWLINE)

    print(f'{index_case+1}) Number of retrieved paragraphs of transcript {url_html_case} is {len(list_all_sentences)} was reduced to {counter}')
    

## International Criminal Tribunal for Rwanda

In [None]:
# Example ICTR transcript (.DOC); the DEBUG preview cell downloads and cleans this single document.
transcript_ictr_case_url = "https://ucr.irmct.org/LegalRef/CMSDocStore/Public/English/Transcript/NotIndexable/ICTR-98-44/TRS15978R0000616228.DOC" #Example

In [None]:
if DEBUG:
    # Preview: download the example ICTR transcript (.DOC), extract its text
    # and report how many lines remain after cleaning.
    response = requests.get(transcript_ictr_case_url)
    doc_name = transcript_ictr_case_url[transcript_ictr_case_url.rindex("/")+1:]
    path_doc = join(GLB_PATH_INPUT_DIRECTORY_ICTR, doc_name)

    # The context manager closes the file even if the write fails.
    with open(path_doc, GLB_FILE_BINARY_OP) as f:
        f.write(response.content)

    # Get content of the DOC
    print(f"Path {path_doc}")
    text = textract.process(path_doc)
    text = text.decode("utf-8")
    print("Num of sentences", len(text.split('\n')))

    # Consider from PROCEEDINGS: drop everything before the first pattern
    # that occurs in the text. When no pattern occurs the whole text is kept.
    # NOTE(review): the ECCC pattern list is reused for ICTR - confirm this is intended.
    patternWasFound = False
    for PATTERN_PROCEEDINGS in cleaning_transc.GLB_ECCC_PATTERN_BEGIN_CONTENT_LIST:
        index_pattern = text.find(PATTERN_PROCEEDINGS)
        if index_pattern >= 0:
            patternWasFound = True
            text = text[index_pattern:]
            break

    # Clean every non-empty line; from here on `text` is a list of cleaned lines.
    text = [cleaning_transc.cleanPagePdfICTRtranscript(elem) for elem in text.split("\n") if elem != ""]
    print(f"Num of sentences with cleaning {len(text)}\n")
    

#### Get list of documents (either from excel file or default values)

In [None]:
# Build the list of ICTR transcript URLs to process.
if GLB_GET_LIST_FILES_ICTR_FROM_EXCEL_FILE:
    PATH_ICTR_EXCEL_LIST_FILE_ICTR_TRANSCRIPTS = "input/html-links-ICTR.xlsx"

    # Read with pandas' default header handling, so the sheet must provide
    # a column named "url".
    df = pd.read_excel(PATH_ICTR_EXCEL_LIST_FILE_ICTR_TRANSCRIPTS, index_col=None)

    list_url_cases_ictr = list(df["url"])
else:
    # Fall back to the single ICTR example. The ECCC example URL must not be
    # used here: it points to a PDF of another tribunal.
    list_url_cases_ictr = [transcript_ictr_case_url]

print(f"Length of elements in list_url_cases_ictr is {len(list_url_cases_ictr)}")

#### Save documents

In [None]:
counter = 0

# For every ICTR transcript: download the document, extract its text, keep
# the part starting at the first "begin of content" pattern, clean it and
# write the non-empty sentences to "<output dir>/ICTR_<document id>.txt",
# one per line.
for index_case, url_html_case in enumerate(list_url_cases_ictr):
    response = requests.get(url_html_case)
    if not response.ok:
        # Do not store an HTTP error page as if it were a transcript document.
        print(f'{index_case+1}) Skipping {url_html_case}: HTTP status {response.status_code}')
        continue

    doc_name = url_html_case[url_html_case.rindex("/")+1:]
    path_doc = join(GLB_PATH_INPUT_DIRECTORY_ICTR, doc_name)

    # The context manager closes the file even if the write fails.
    with open(path_doc, GLB_FILE_BINARY_OP) as f:
        f.write(response.content)

    text = textract.process(path_doc)
    text = text.decode("utf-8")
    counter_list_all_sentences = len(text.split('\n'))

    # Consider from PROCEEDINGS: drop everything before the first pattern
    # that occurs in the text. When no pattern occurs the whole text is kept.
    # NOTE(review): the ECCC pattern list is reused for ICTR - confirm this is intended.
    patternWasFound = False
    for PATTERN_PROCEEDINGS in cleaning_transc.GLB_ECCC_PATTERN_BEGIN_CONTENT_LIST:
        index_pattern = text.find(PATTERN_PROCEEDINGS)
        if index_pattern >= 0:
            patternWasFound = True
            text = text[index_pattern:]
            break

    # Clean every non-empty line; from here on `text` is a list of cleaned lines.
    text = [cleaning_transc.cleanPagePdfICTRtranscript(elem) for elem in text.split("\n") if elem != ""]

    # The document id is the file name of the URL without its extension.
    id_case = url_html_case[url_html_case.rindex("/")+1: url_html_case.rindex(".")]
    path_output = join(GLB_PATH_OUTPUT_DIRECTORY_ICTR, GLB_COURT_PREFIX_FILE_ICTR + id_case + GLB_EXTENSION_TXT)

    counter = 0
    with open(path_output, GLB_FILE_WRITE_OP, encoding=GLB_FILE_ENCODING_UTF8) as f:
        for sentence in text:
            # NOTE(review): the ECCC sentence cleaner is applied to ICTR text - confirm this is intended.
            clean_paragraph = cleaning_transc.cleanSentenceECCCtranscript(sentence)
            if clean_paragraph != cleaning_transc.GLB_EMPTY_STRING:
                counter += 1
                f.write(clean_paragraph + GLB_CHAR_NEWLINE)

    print(f'{index_case+1}) Number of retrieved paragraphs of transcript {url_html_case} is {counter_list_all_sentences} was reduced to {counter}')