In [2]:
import os
from tqdm import tqdm
import pandas as pd
import requests
from selenium import webdriver 
from selenium.webdriver.common.by import By 
# from pypdf import PdfReader

In [3]:
df = pd.read_csv("scraped_data.csv")

In [4]:
df = df.drop([0]).reset_index(drop=True)

In [5]:
df.head()

Unnamed: 0,Level and Score,URL,Year_round,Subjects,Question Format,Theme,Language Family,Author
0,Breakthrough_ [72%],https://www.uklo.org/wp-content/uploads/2024/0...,2024_R1_1 Old English and West Frisian,_*Ph_,Text,"Classical, Comparative","Indo-European, Germanic",Babette Verhoeven
1,Breakthrough / Foundation [41% // 45%],https://www.uklo.org/wp-content/uploads/2024/0...,2024_R1_2 Warlpiri,_*Mo*Ph_,Pattern,,Pama-Nyungan,Mary Laughren
2,Breakthrough / Foundation [56% // 62%],https://www.uklo.org/wp-content/uploads/2024/0...,2024_R1_3 Khmer,_*Sy*Wr_,Rosetta,,Austroasiatic,Babette Verhoeven
3,Foundation / Intermediate [58% // 81%],https://www.uklo.org/wp-content/uploads/2024/0...,2024_R1_4 Xhosa,_*Mo_,Rosetta,,"Altantic-Congo, Bantu",Babette Verhoeven
4,Foundation / Intermediate [27% // 46%],https://www.uklo.org/wp-content/uploads/2024/0...,2024_R1_5 Tariana,_*Se*Mo_,Rosetta,Story,Arawakan,"Babette Verhoeven, Simi Hellsten"


### Download pdfs

In [34]:
driver = webdriver.Chrome(executable_path="chromedriver.exe")
# Create the folder if it does not exist
if not os.path.exists("pdfs/"):
    os.makedirs("pdfs/")

progress_bar = tqdm(range(len(df)))
for i in range(len(df)):
    driver.get(df['URL'][i])
    pdf_url = driver.current_url
    
    # Downloading pdf from the web version of the pdf file
    response = requests.get(pdf_url)
    with open(f"pdfs/{df['Year_round'][i]}.pdf", 'wb') as file:
        file.write(response.content)
        progress_bar.update(1)
        
    
driver.close()

100%|████████████████████████████████████████████████████████████████████████████████| 220/220 [05:59<00:00,  1.30s/it]

## Experimenting and research
1. Understanding which libraries work best
2. Trying to figure out what strategies work for splitting all years

In [1]:
import os
from pathlib import Path
import re
from tqdm import tqdm
import pandas as pd
import requests
from selenium import webdriver 
from selenium.webdriver.common.by import By 
import fitz
# from PyPDF2 import PdfReader, PdfWriter

#### PyMuPDF

In [9]:
doc = fitz.open('pdfs/2019_pdfs/2019_R1_2 Japanese.pdf')
for page in doc:
    print(page.get_text())
    
print(len(doc))

 
Your name:                                                                                                                                             
 
The UK Linguistics Olympiad   2019 
Round 1 
 
 
 
Problem 2. Japanese characters (5 marks) 
In our alphabet, letters stand for sounds. Japanese is different. Most Japanese 
words are written in characters called ‘kanji’, which the Japanese adapted 
from Chinese. Kanji characters represent meanings, not sounds. For example, 
火 means ‘fire’ and is generally pronounced hi. In borrowings from Chinese, 
however, it is pronounced ka: ‘Tuesday’, for instance, is 火曜日ka-you-bi (‘fire day’, after the 
Chinese philosophy of the Five Elements). Here are some more Japanese words in kanji with their 
English translations (and literal translations in quotation marks in brackets). You may like to know 
that each kanji has just one form, without anything like our distinction between capital (‘upper case’) 
and small (‘lower case’) letters. 
 
日 
su

In [None]:
# Directory where your PDFs are stored
input_directory = "demo pdfs/"
question_output_directory = "demo pdfs/questions/"
answer_output_directory = "demo pdfs/answers/"

# Make sure output directories exist
os.makedirs(question_output_directory, exist_ok=True)
os.makedirs(answer_output_directory, exist_ok=True)

# Function to check if a page contains the keywords 'Answer' or 'Solutions'
def is_answer_page(text):
    return re.search(r'\b(Answer|Solutions)\b', text) is not None

for filename in os.listdir(input_directory):
    if filename.endswith(".pdf"):
        input_path = os.path.join(input_directory, filename)
        
        last_ques_page = 0
        doc = fitz.open(input_path)
        # reader = PdfReader(input_path)
        num_pages = len(doc)
        ques_pdf = fitz.open()
        ans_pdf = fitz.open()
        
        question_output_path = os.path.join(question_output_directory, f"questions_{filename}")
        for ind, page in enumerate(doc):
            text = page.get_text()
            if text and is_answer_page(text):
                last_ques_page = ind
                break
            else:
                ques_pdf.insert_pdf(doc, from_page = ind, to_page = ind)
            
        answer_output_path = os.path.join(answer_output_directory, f"answers_{filename}")
        ans_pdf.insert_pdf(doc, from_page = last_ques_page, to_page = num_pages - 1)
        
        
        ques_pdf.save(question_output_path)  
        ans_pdf.save(answer_output_path)  
        
        doc.close()
        ques_pdf.close()
        ans_pdf.close()
        print("Pdf processed")

Pdf processed
Pdf processed
Pdf processed
Pdf processed
Pdf processed


#### Split pdfs year-wise

In [10]:
import os
import shutil

for i in range(2010, 2025):
    os.makedirs(f"pdfs/{i}_pdfs/", exist_ok=True)
    for pdf in os.listdir("pdfs/"):
        if pdf.endswith(".pdf") and pdf[:4] == str(i):
            src_file = os.path.join("pdfs/", pdf)
            dest_file = os.path.join(f"pdfs/{i}_pdfs/", pdf)
            shutil.copy(src_file, dest_file)

## Year-wise splitting of questions and solutions
1. A common strategy for splitting questions and answers in one single pdf is not possible since the pdfs were created manually, without a consistent structure over 14 years.

##### 2010

In [18]:
import os
from pathlib import Path
import re
from tqdm import tqdm
import pandas as pd
import requests
from selenium import webdriver 
from selenium.webdriver.common.by import By 
import fitz
# from PyPDF2 import PdfReader, PdfWriter

In [19]:
# Directory where your PDFs are stored
input_directory = "pdfs/2010_pdfs"
question_output_directory = "pdfs/2010_pdfs/2010_questions/"
answer_output_directory = "pdfs/2010_pdfs/2010_answers/"


# Function to check if a page contains the keywords 'Answer' or 'Solutions'
def criteria(text):
    return re.search(r'\b(Solutions)\b', text) is not None


def split_qna(input_dir, q_dir, a_dir):
    # Make sure output directories exist
    os.makedirs(q_dir, exist_ok=True)
    os.makedirs(a_dir, exist_ok=True)
    
    for filename in os.listdir(input_dir):
        print(filename)
        if filename.endswith(".pdf"):
            input_path = os.path.join(input_dir, filename)

            last_ques_page = 0
            doc = fitz.open(input_path)
            num_pages = len(doc)
            ques_pdf = fitz.open()
            ans_pdf = fitz.open()

            q_path = os.path.join(q_dir, f"questions_{filename}")
            for ind, page in enumerate(doc):
                text = page.get_text()
                if text and criteria(text):
                    last_ques_page = ind
                    break
                else:
                    ques_pdf.insert_pdf(doc, from_page = ind, to_page = ind)

            a_path = os.path.join(a_dir, f"answers_{filename}")
            # for i in range(last_ques_page+1,num_pages):
            ans_pdf.insert_pdf(doc, from_page = last_ques_page, to_page = num_pages - 1)


            ques_pdf.save(q_path)  
            ans_pdf.save(a_path)  

            doc.close()
            ques_pdf.close()
            ans_pdf.close()
            print(f"{filename} processed")
            

split_qna(input_directory, question_output_directory, answer_output_directory)

2010_answers
2010_questions
2010_R1_1 French.pdf
2010_R1_1 French.pdf processed
2010_R1_2 English.pdf
2010_R1_2 English.pdf processed
2010_R1_3 Abma.pdf
2010_R1_3 Abma.pdf processed
2010_R1_4 Armenian.pdf
2010_R1_4 Armenian.pdf processed
2010_R1_5 Turkish.pdf
2010_R1_5 Turkish.pdf processed
2010_R1_6 Tangkhul.pdf
2010_R1_6 Tangkhul.pdf processed
2010_R1_7 English.pdf
2010_R1_7 English.pdf processed
2010_R2_1 Minangkabau.pdf
2010_R2_1 Minangkabau.pdf processed
2010_R2_2 Cree.pdf
2010_R2_2 Cree.pdf processed
2010_R2_3 English.pdf
2010_R2_3 English.pdf processed
2010_R2_4 Vietnamese.pdf
2010_R2_4 Vietnamese.pdf processed
2010_R2_5 Tanna.pdf
2010_R2_5 Tanna.pdf processed


##### 2017 - 2022

In [22]:
# Function to check
def criteria(text):
    return re.search(r'\b(Solution and marking)\b', text) is not None


def split_qna(input_dir, q_dir, a_dir):
    # Make sure output directories exist
    os.makedirs(q_dir, exist_ok=True)
    os.makedirs(a_dir, exist_ok=True)
    
    for filename in os.listdir(input_dir):
        if filename.endswith(".pdf"):
            input_path = os.path.join(input_dir, filename)

            last_ques_page = 0
            doc = fitz.open(input_path)
            num_pages = len(doc)
            ques_pdf = fitz.open()
            ans_pdf = fitz.open()

            q_path = os.path.join(q_dir, f"questions_{filename}")
            for ind, page in enumerate(doc):
                text = page.get_text()
                if text and criteria(text):
                    last_ques_page = ind
                    break
                else:
                    ques_pdf.insert_pdf(doc, from_page = ind, to_page = ind)

            a_path = os.path.join(a_dir, f"answers_{filename}")
            # for i in range(last_ques_page+1,num_pages):
            ans_pdf.insert_pdf(doc, from_page = last_ques_page, to_page = num_pages - 1)


            ques_pdf.save(q_path)  
            ans_pdf.save(a_path)  

            doc.close()
            ques_pdf.close()
            ans_pdf.close()
        print(f"{filename} processed")
            

for i in range(2017,2023):
    # Directory where your PDFs are stored
    input_directory = f"pdfs/{i}_pdfs"
    question_output_directory = f"pdfs/{i}_pdfs/{i}_questions/"
    answer_output_directory = f"pdfs/{i}_pdfs/{i}_answers/"
    split_qna(input_directory, question_output_directory, answer_output_directory)

2017_answers processed
2017_questions processed
2017_R1_1 Italian.pdf processed
2017_R1_2 Inuktitut.pdf processed
2017_R1_3 European.pdf processed
2017_R1_4 Tshiluba.pdf processed
2017_R1_5 Basque.pdf processed
2017_R1_6 Maori.pdf processed
2017_R1_7 Tamil.pdf processed
2017_R1_8 Choctaw.pdf processed
2017_R1_9 Abkhaz.pdf processed
2017_R1_x10 Kaytetye.pdf processed
2017_R2_1 Nepali.pdf processed
2017_R2_2 Proto-Algonquian.pdf processed
2017_R2_3 Vietnamese.pdf processed
2017_R2_4 Hieroglyphs.pdf processed
2017_R2_5 Yupik.pdf processed
2018_answers processed
2018_questions processed
2018_R1_1 Romanian.pdf processed
2018_R1_2 Lithuanian.pdf processed
2018_R1_3 Bulgarian.pdf processed
2018_R1_4 Fijian.pdf processed
2018_R1_5 Gilbertese.pdf processed
2018_R1_6 Nko.pdf processed
2018_R1_7 Icelandic.pdf processed
2018_R1_8 Vietnamese.pdf processed
2018_R1_9 Pame.pdf processed
2018_R1_x10 Albanian.pdf processed
2018_R2_1 Blazon.pdf processed
2018_R2_2 Nivkh.pdf processed
2018_R2_3 Menya.pdf 

##### 2023

In [1]:
import os
from pathlib import Path
import re
from tqdm import tqdm
import pandas as pd
import requests
from selenium import webdriver 
from selenium.webdriver.common.by import By 
import fitz
# from PyPDF2 import PdfReader, PdfWriter

In [2]:
# Directory where your PDFs are stored
input_directory = "pdfs/2023_pdfs"
question_output_directory = "pdfs/2023_pdfs/2023_questions/"
answer_output_directory = "pdfs/2023_pdfs/2023_answers/"


# Function to check if a page contains the keywords 'Answer' or 'Solutions'
def criteria(text):
    return re.search(r'\b(Answers and Explanation)\b', text) is not None


def split_qna(input_dir, q_dir, a_dir):
    # Make sure output directories exist
    os.makedirs(q_dir, exist_ok=True)
    os.makedirs(a_dir, exist_ok=True)
    
    for filename in os.listdir(input_dir):
        print(filename)
        if filename.endswith(".pdf"):
            input_path = os.path.join(input_dir, filename)

            last_ques_page = 0
            doc = fitz.open(input_path)
            num_pages = len(doc)
            ques_pdf = fitz.open()
            ans_pdf = fitz.open()

            q_path = os.path.join(q_dir, f"questions_{filename}")
            for ind, page in enumerate(doc):
                text = page.get_text()
                if text and criteria(text):
                    last_ques_page = ind
                    break
                else:
                    ques_pdf.insert_pdf(doc, from_page = ind, to_page = ind)

            a_path = os.path.join(a_dir, f"answers_{filename}")
            # for i in range(last_ques_page+1,num_pages):
            ans_pdf.insert_pdf(doc, from_page = last_ques_page, to_page = num_pages - 1)


            ques_pdf.save(q_path)  
            ans_pdf.save(a_path)  

            doc.close()
            ques_pdf.close()
            ans_pdf.close()
            print(f"{filename} processed")
            

split_qna(input_directory, question_output_directory, answer_output_directory)

2023_answers
2023_questions
2023_R1_1 Umbrian.pdf
2023_R1_1 Umbrian.pdf processed
2023_R1_2 Jam Sai.pdf
2023_R1_2 Jam Sai.pdf processed
2023_R1_3 Gilbertese.pdf
2023_R1_3 Gilbertese.pdf processed
2023_R1_4 Swedish Runes.pdf
2023_R1_4 Swedish Runes.pdf processed
2023_R1_5 Permyak.pdf
2023_R1_5 Permyak.pdf processed
2023_R1_6 Albanian.pdf
2023_R1_6 Albanian.pdf processed
2023_R1_7 Lardil.pdf
2023_R1_7 Lardil.pdf processed
2023_R1_8 Meroitic.pdf
2023_R1_8 Meroitic.pdf processed
2023_R1_9 Kiche.pdf
2023_R1_9 Kiche.pdf processed
2023_R1_x10 Filomeno Mata Totonac.pdf
2023_R1_x10 Filomeno Mata Totonac.pdf processed
2023_R2_1 Abawiri.pdf
2023_R2_1 Abawiri.pdf processed
2023_R2_2 Roon.pdf
2023_R2_2 Roon.pdf processed
2023_R2_3 Pular.pdf
2023_R2_3 Pular.pdf processed
2023_R2_4 Komnzo.pdf
2023_R2_4 Komnzo.pdf processed
2023_R2_5 Mongo.pdf
2023_R2_5 Mongo.pdf processed


##### 2024

In [3]:
# Directory where your PDFs are stored
input_directory = "pdfs/2024_pdfs"
question_output_directory = "pdfs/2024_pdfs/2024_questions/"
answer_output_directory = "pdfs/2024_pdfs/2024_answers/"


# Function to check if a page contains the keywords 'Answer' or 'Solutions'
def criteria(text):
    return re.search(r'\b(Answers)\b', text) is not None


def split_qna(input_dir, q_dir, a_dir):
    # Make sure output directories exist
    os.makedirs(q_dir, exist_ok=True)
    os.makedirs(a_dir, exist_ok=True)
    
    for filename in os.listdir(input_dir):
        print(filename)
        if filename.endswith(".pdf"):
            input_path = os.path.join(input_dir, filename)

            last_ques_page = 0
            doc = fitz.open(input_path)
            num_pages = len(doc)
            ques_pdf = fitz.open()
            ans_pdf = fitz.open()

            q_path = os.path.join(q_dir, f"questions_{filename}")
            for ind, page in enumerate(doc):
                text = page.get_text()
                if text and criteria(text):
                    last_ques_page = ind
                    break
                else:
                    ques_pdf.insert_pdf(doc, from_page = ind, to_page = ind)

            a_path = os.path.join(a_dir, f"answers_{filename}")
            # for i in range(last_ques_page+1,num_pages):
            ans_pdf.insert_pdf(doc, from_page = last_ques_page, to_page = num_pages - 1)


            ques_pdf.save(q_path)  
            ans_pdf.save(a_path)  

            doc.close()
            ques_pdf.close()
            ans_pdf.close()
            print(f"{filename} processed")
            

split_qna(input_directory, question_output_directory, answer_output_directory)

2024_answers
2024_questions
2024_R1_1 Old English and West Frisian.pdf
2024_R1_1 Old English and West Frisian.pdf processed
2024_R1_2 Warlpiri.pdf
2024_R1_2 Warlpiri.pdf processed
2024_R1_3 Khmer.pdf
2024_R1_3 Khmer.pdf processed
2024_R1_4 Xhosa.pdf
2024_R1_4 Xhosa.pdf processed
2024_R1_5 Tariana.pdf
2024_R1_5 Tariana.pdf processed
2024_R1_6 Adinkra Symbols.pdf
2024_R1_6 Adinkra Symbols.pdf processed
2024_R1_7 Kannada.pdf
2024_R1_7 Kannada.pdf processed
2024_R1_8 Georgian.pdf
2024_R1_8 Georgian.pdf processed
2024_R1_9 Zou.pdf
2024_R1_9 Zou.pdf processed
2024_R1_x10 Guna.pdf
2024_R1_x10 Guna.pdf processed
2024_R2_1 Yawalapiti.pdf
2024_R2_1 Yawalapiti.pdf processed
2024_R2_2 Taa.pdf
2024_R2_2 Taa.pdf processed
2024_R2_3 Stodsde.pdf
2024_R2_3 Stodsde.pdf processed
2024_R2_4 Coptic.pdf
2024_R2_4 Coptic.pdf processed
2024_R2_5 Maonan.pdf
2024_R2_5 Maonan.pdf processed


##### 2011 and 2012

<i><b>R1</b></i>

In [14]:
# Function to check
def criteria(text, year):
    if year == 2011:
        return re.search(r'\b(Solutions)\b', text) is not None
    else:
        return re.search(r'\b(SOLUTION)\b', text) is not None


def split_qna(input_dir, q_dir, a_dir, year):
    # Make sure output directories exist
    os.makedirs(q_dir, exist_ok=True)
    os.makedirs(a_dir, exist_ok=True)
    
    for filename in os.listdir(input_dir):
        if filename.endswith(".pdf") and "_R1_" in filename:
            input_path = os.path.join(input_dir, filename)

            last_ques_page = 0
            doc = fitz.open(input_path)
            num_pages = len(doc)
            ques_pdf = fitz.open()
            ans_pdf = fitz.open()

            q_path = os.path.join(q_dir, f"questions_{filename}")
            for ind, page in enumerate(doc):
                text = page.get_text()
                if text and criteria(text, year):
                    last_ques_page = ind
                    break
                else:
                    ques_pdf.insert_pdf(doc, from_page = ind, to_page = ind)

            a_path = os.path.join(a_dir, f"answers_{filename}")
            # for i in range(last_ques_page+1,num_pages):
            ans_pdf.insert_pdf(doc, from_page = last_ques_page, to_page = num_pages - 1)


            ques_pdf.save(q_path)  
            ans_pdf.save(a_path)  

            doc.close()
            ques_pdf.close()
            ans_pdf.close()
            print(f"{filename} processed")
            

for i in range(2011,2013):
    # Directory where your PDFs are stored
    input_directory = f"pdfs/{i}_pdfs"
    question_output_directory = f"pdfs/{i}_pdfs/{i}_questions/"
    answer_output_directory = f"pdfs/{i}_pdfs/{i}_answers/"
    split_qna(input_directory, question_output_directory, answer_output_directory, i)

2011_R1_1 English.pdf processed
2011_R1_2 Japanese.pdf processed
2011_R1_3 Arrernte.pdf processed
2011_R1_4 Ulwa.pdf processed
2011_R1_5 O'odham.pdf processed
2011_R1_6 Indonesian.pdf processed
2011_R1_7 English Braille.pdf processed
2012_R1_1 Yolmo.pdf processed
2012_R1_2 Danish.pdf processed
2012_R1_3d Dutch.pdf processed
2012_R1_3w Welsh.pdf processed
2012_R1_4 Haitian.pdf processed
2012_R1_5 Esperanto.pdf processed
2012_R1_6 Bardi.pdf processed
2012_R1_7 Waorani.pdf processed
2012_R1_8 Arcturan.pdf processed
2012_R1_9 Waanyi.pdf processed


<i><b>R2 - 2011</b></i>

In [26]:
doc = fitz.open('pdfs/2011_pdfs/2011_R2_3 Nahuatl.pdf')
for page in doc:
    text = page.get_text()
    neh = text.splitlines()[:6]
    print(neh)
    
# print(len(doc))

[' ', ' ', '                                                                                                             2011 R2 ', ' ', '3. Axolotl in the water (16 marks) ', ' ']
[' ', ' ', '                                                                                                             2011 R2 ', ' ', 'Axolotl in the water (16 marks) ', ' ']
[' ', ' ', '                                                                                                             2011 R2 ', ' ', 'Axolotl in the water (15 marks) ', ' ']


In [28]:
doc = fitz.open('pdfs/2011_pdfs/2011_R2_5 Tadaksahak.pdf')
for page in doc:
    text = page.get_text()
    neh = text.splitlines()[:]
    print(neh)
    
# print(len(doc))

[' ', ' ', '                                                                                                             2011 R2 ', ' ', ' ', '1 ', '5. Swallow the salt (20) ', 'Tadaksahak is a Songhay language spoken primarily in the Republic of Mali, a ', 'landlocked country in Western Africa. There are approximately 32,000 speakers of ', 'the Tadaksahak language. Given below are several Tadaksahak phrases and their ', 'English translations:  ', ' ', 'aƔagon cidi  ', 'I swallowed the salt. ', 'atezelmez hamu  ', 'He will have the meat swallowed (by ', 'somebody). ', 'atedini a  ', 'He will take it. ', 'hamu anetubuz  ', 'The meat was not taken. ', 'jifa atetukuš  ', 'The corpse will be taken out. ', 'amanokal anešukuš cidi  ', "The chief didn't have the salt taken out. ", 'aƔakaw hamu  ', 'I took out the meat. ', 'itegzem  ', 'They were slaughtered. ', 'aƔasezegzem a  ', "I'm not having him slaughtered. ", 'anešišu aryen  ', "He didn't have the water drunk (by ", 'anybody). ', 'feji 

In [21]:
doc = fitz.open('pdfs/2011_pdfs/2011_R2_1 Warlpiri.pdf')
for page in doc:
    text = page.get_text()
    neh = text.splitlines()[:5]
    print(neh)
    
# print(len(doc))

[' ', ' ', '                                                                                                             2011 R2 ', ' ', '1. Stopping and flapping in Warlpiri (10 marks) ']
[' ', ' ', '                                                                                                             2011 R2 ', ' ', ' ']
[' ', ' ', '                                                                                                             2011 R2 ', ' ', '1. Stopping and flapping in Warlpiri (10 marks) ']


In [29]:
# Analysing the above two cells, we can have a condition where a page that has the first 5 elements same as the first page, would 
# contain the solution. This should work, at least for 2011

# Works for all but the third and the fifth pdf, for Round 2. Will manually split them now.

# Directory where your PDFs are stored
input_directory = "pdfs/2011_pdfs"
question_output_directory = "pdfs/2011_pdfs/2011_questions/"
answer_output_directory = "pdfs/2011_pdfs/2011_answers/"


def split_qna(input_dir, q_dir, a_dir):
    # Make sure output directories exist
    os.makedirs(q_dir, exist_ok=True)
    os.makedirs(a_dir, exist_ok=True)
    
    for filename in os.listdir(input_dir):
        if filename.endswith(".pdf") and "_R2_" in filename:
            input_path = os.path.join(input_dir, filename)

            last_ques_page = 0
            doc = fitz.open(input_path)
            num_pages = len(doc)
            ques_pdf = fitz.open()
            ans_pdf = fitz.open()

            q_path = os.path.join(q_dir, f"questions_{filename}")
            for ind, page in enumerate(doc):
                text = page.get_text()
                if ind == 0:
                    first_page = text.splitlines()[:5]
                    ques_pdf.insert_pdf(doc, from_page = ind, to_page = ind)
                    continue
                if text.splitlines()[:5] == first_page:
                    last_ques_page = ind
                    break
                else:
                    ques_pdf.insert_pdf(doc, from_page = ind, to_page = ind)

            a_path = os.path.join(a_dir, f"answers_{filename}")
            # for i in range(last_ques_page+1,num_pages):
            ans_pdf.insert_pdf(doc, from_page = last_ques_page, to_page = num_pages - 1)


            ques_pdf.save(q_path)  
            ans_pdf.save(a_path)  

            doc.close()
            ques_pdf.close()
            ans_pdf.close()
            print(f"{filename} processed")
            

split_qna(input_directory, question_output_directory, answer_output_directory)

2011_R2_1 Warlpiri.pdf processed
2011_R2_2 Irish.pdf processed
2011_R2_3 Nahuatl.pdf processed
2011_R2_4 Ndyuka.pdf processed
2011_R2_5 Tadaksahak.pdf processed


<i><b>R2 - 2012 - Manual splitting</b></i>

##### 2013

<i><b>R2</b></i>

In [33]:
# Function to check
def criteria(text):
    return re.search(r'\b(solutions and marking)\b', text) is not None


def split_qna(input_dir, q_dir, a_dir):
    # Make sure output directories exist
    os.makedirs(q_dir, exist_ok=True)
    os.makedirs(a_dir, exist_ok=True)
    
    for filename in os.listdir(input_dir):
        if filename.endswith(".pdf") and "_R2_" in filename:
            input_path = os.path.join(input_dir, filename)

            last_ques_page = 0
            doc = fitz.open(input_path)
            num_pages = len(doc)
            ques_pdf = fitz.open()
            ans_pdf = fitz.open()

            q_path = os.path.join(q_dir, f"questions_{filename}")
            for ind, page in enumerate(doc):
                text = page.get_text()
                if text and criteria(text):
                    last_ques_page = ind
                    break
                else:
                    ques_pdf.insert_pdf(doc, from_page = ind, to_page = ind)

            a_path = os.path.join(a_dir, f"answers_{filename}")
            # for i in range(last_ques_page+1,num_pages):
            ans_pdf.insert_pdf(doc, from_page = last_ques_page, to_page = num_pages - 1)


            ques_pdf.save(q_path)  
            ans_pdf.save(a_path)  

            doc.close()
            ques_pdf.close()
            ans_pdf.close()
            print(f"{filename} processed")
            

# Directory where your PDFs are stored
input_directory = f"pdfs/2013_pdfs"
question_output_directory = f"pdfs/2013_pdfs/2013_questions/"
answer_output_directory = f"pdfs/2013_pdfs/2013_answers/"
split_qna(input_directory, question_output_directory, answer_output_directory)

2013_R2_1 Quechua.pdf processed
2013_R2_2 Georgian, Armenian.pdf processed
2013_R2_3 Beja.pdf processed
2013_R2_4 Swedish.pdf processed
2013_R2_5 Indonesian, Swahili.pdf processed


##### 2015 - Done manually

## Breaking up 'Level and Score' into 2 different features 'Level' and 'Score'

In [257]:
import os
from tqdm import tqdm
import pandas as pd
import requests
from selenium import webdriver 
from selenium.webdriver.common.by import By 

In [258]:
df = pd.read_csv("scraped_data.csv")

In [259]:
df = df.drop([0]).reset_index(drop=True)

In [260]:
df.head()

Unnamed: 0,Level and Score,URL,Year_round,Subjects,Question Format,Theme,Language Family,Author
0,Breakthrough_ [72%],https://www.uklo.org/wp-content/uploads/2024/0...,2024_R1_1 Old English and West Frisian,_*Ph_,Text,"Classical, Comparative","Indo-European, Germanic",Babette Verhoeven
1,Breakthrough / Foundation [41% // 45%],https://www.uklo.org/wp-content/uploads/2024/0...,2024_R1_2 Warlpiri,_*Mo*Ph_,Pattern,,Pama-Nyungan,Mary Laughren
2,Breakthrough / Foundation [56% // 62%],https://www.uklo.org/wp-content/uploads/2024/0...,2024_R1_3 Khmer,_*Sy*Wr_,Rosetta,,Austroasiatic,Babette Verhoeven
3,Foundation / Intermediate [58% // 81%],https://www.uklo.org/wp-content/uploads/2024/0...,2024_R1_4 Xhosa,_*Mo_,Rosetta,,"Altantic-Congo, Bantu",Babette Verhoeven
4,Foundation / Intermediate [27% // 46%],https://www.uklo.org/wp-content/uploads/2024/0...,2024_R1_5 Tariana,_*Se*Mo_,Rosetta,Story,Arawakan,"Babette Verhoeven, Simi Hellsten"


In [261]:
inds = []
for i,row in df.iterrows():
    if "//" in row['Level and Score']:
        inds.append(i)


In [262]:
len(inds)

75

In [263]:
df['Level and Score'][:10]

0                       Breakthrough_ [72%]
1    Breakthrough / Foundation [41% // 45%]
2    Breakthrough / Foundation [56% // 62%]
3    Foundation / Intermediate [58% // 81%]
4    Foundation / Intermediate [27% // 46%]
5      Intermediate / Advanced [76% // 78%]
6      Intermediate / Advanced [41% // 39%]
7                           Advanced_ [17%]
8                           Advanced_ [23%]
9                           Advanced_ [12%]
Name: Level and Score, dtype: object

In [264]:
temp = "Breakthrough / Foundation [41% // 45%]"
level_1 = temp.split('/')[0].rstrip() +'_ '
print(level_1)
level_2 = temp.split('/')[1].strip().split()[0] +'_ '
print(level_2)
score_1 = temp.split('/')[1].strip().split()[1] +']'
print(score_1)
score_2 = '[' + temp.split('//')[1].lstrip()
print(score_2)

level_score_1 = level_1 + score_1
level_score_2 = level_2 + score_2

print(level_score_1)
print(level_score_2)

Breakthrough_ 
Foundation_ 
[41%]
[45%]
Breakthrough_ [41%]
Foundation_ [45%]


In [265]:
temp = "Foundation / Intermediate / Advanced [59% // 70% // 72%]"
# temp = "Breakthrough / Foundation [41% // 45%]"

alls = temp.split(' [')
# print(level_1)

levels = alls[0].split(' / ')
scores = alls[1].split(' // ')
scores[-1] = scores[-1][:-1]
print(levels)
print(scores)

strs = []
for l,s in zip(levels,scores):
    strs.append(l + "_ [" + s + "]")

print(strs)
# print(level_1)
# level_2 = temp.split('/')[1].strip().split()[0] +'_ '
# print(level_2)
# score_1 = temp.split('/')[1].strip().split()[1] +']'
# print(score_1)
# score_2 = '[' + temp.split('//')[1].lstrip()
# print(score_2)

# level_score_1 = level_1 + score_1
# level_score_2 = level_2 + score_2

# print(level_score_1)
# print(level_score_2)

['Foundation', 'Intermediate', 'Advanced']
['59%', '70%', '72%']
['Foundation_ [59%]', 'Intermediate_ [70%]', 'Advanced_ [72%]']


In [None]:
# Function to insert row in the dataframe
def insert_row(row_number, df, row_value):
    # Starting value of upper half
    start_upper = 0
    # End value of upper half
    end_upper = row_number
    # Start value of lower half
    start_lower = row_number
    # End value of lower half
    end_lower = df.shape[0]
    # Create a list of upper_half index
    upper_half = [*range(start_upper, end_upper, 1)]
    # Create a list of lower_half index
    lower_half = [*range(start_lower, end_lower, 1)]
    # Increment the value of lower half by 1
    lower_half = [x.__add__(1) for x in lower_half]
    # Combine the two lists
    index_ = upper_half + lower_half
    # Update the index of the dataframe
    df.index = index_
    # Insert a row at the end
    df.loc[row_number] = row_value
    # Sort the index labels
    df = df.sort_index()
    # return the dataframe
    return df

In [None]:
def level_split(level):
    alls = level.split(' [')
    levels = alls[0].split(' / ')
    scores = alls[1].split(' // ')
    scores[-1] = scores[-1][:-1]
    strs = []
    for l,s in zip(levels,scores):
        strs.append(l + "_ [" + s + "]")
    return strs

def split_level_score(df):
    buffer = 0
    new_stuff = []
    new_df = df.copy()
    for i in range(len(df)):
        if '/' in df.iloc[i,0]:
            strs = level_split(df.iloc[i,0])
            print(strs)
            if len(strs) == 2:
                new_df.iloc[i,0] = strs[0]
                row_copy = new_df.iloc[i].copy()
                row_copy[0] = strs[1]
                new_stuff.append([i, list(row_copy)])
            if len(strs) == 3:
                new_df.iloc[i,0] = strs[0]
                row_copy1 = new_df.iloc[i].copy()
                row_copy1[0] = strs[1]
                row_copy2 = new_df.iloc[i].copy()
                row_copy2[0] = strs[2]
                new_stuff.append([i, list(row_copy1), list(row_copy2)])
    return new_df, new_stuff

temp_df = df.copy()
new_temp_df, stuff_to_add = split_level_score(temp_df)
buffer = 0
for stuff in stuff_to_add:
    if len(stuff) == 2:
        new_temp_df = insert_row(stuff[0] + buffer, new_temp_df, stuff[1])
        buffer += 1
    else:
        new_temp_df = insert_row(stuff[0] + buffer, new_temp_df, stuff[1])
        new_temp_df = insert_row(stuff[0] + buffer + 1, new_temp_df, stuff[2])
        buffer += 2
        

['Breakthrough_ [41%]', 'Foundation_ [45%]']
['Breakthrough_ [56%]', 'Foundation_ [62%]']
['Foundation_ [58%]', 'Intermediate_ [81%]']
['Foundation_ [27%]', 'Intermediate_ [46%]']
['Intermediate_ [76%]', 'Advanced_ [78%]']
['Intermediate_ [41%]', 'Advanced_ [39%]']
['Breakthrough_ [18%]', 'Foundation_ [30%]']
['Breakthrough_ [28%]', 'Foundation_ [45%]']
['Foundation_ [42%]', 'Intermediate_ [61%]']
['Foundation_ [25%]', 'Intermediate_ [44%]']
['Intermediate_ [47%]', 'Advanced_ [52%]']
['Intermediate_ [28%]', 'Advanced_ [31%]']
['Breakthrough_ [55%]', 'Foundation_ [65%]']
['Breakthrough_ [38%]', 'Foundation_ [58%]']
['Foundation_ [58%]', 'Intermediate_ [79%]']
['Foundation_ [41%]', 'Intermediate_ [54%]']
['Intermediate_ [50%]', 'Advanced_ [52%]']
['Intermediate_ [22%]', 'Advanced_ [26%]']
['Breakthrough_ [44%]', 'Foundation_ [51%]']
['Breakthrough_ [42%]', 'Foundation_ [54%]']
['Foundation_ [28%]', 'Intermediate_ [51%]']
['Foundation_ [70%]', 'Intermediate_ [40%]']
['Breakthrough_ [64%]'

In [270]:
new_temp_df

Unnamed: 0,Level and Score,URL,Year_round,Subjects,Question Format,Theme,Language Family,Author
0,Breakthrough_ [72%],https://www.uklo.org/wp-content/uploads/2024/0...,2024_R1_1 Old English and West Frisian,_*Ph_,Text,"Classical, Comparative","Indo-European, Germanic",Babette Verhoeven
1,Foundation_ [45%],https://www.uklo.org/wp-content/uploads/2024/0...,2024_R1_2 Warlpiri,_*Mo*Ph_,Pattern,,Pama-Nyungan,Mary Laughren
2,Breakthrough_ [41%],https://www.uklo.org/wp-content/uploads/2024/0...,2024_R1_2 Warlpiri,_*Mo*Ph_,Pattern,,Pama-Nyungan,Mary Laughren
3,Foundation_ [62%],https://www.uklo.org/wp-content/uploads/2024/0...,2024_R1_3 Khmer,_*Sy*Wr_,Rosetta,,Austroasiatic,Babette Verhoeven
4,Breakthrough_ [56%],https://www.uklo.org/wp-content/uploads/2024/0...,2024_R1_3 Khmer,_*Sy*Wr_,Rosetta,,Austroasiatic,Babette Verhoeven
...,...,...,...,...,...,...,...,...
293,Round 2 [No Data],https://www.uklo.org/wp-content/uploads/2022/0...,2010_R2_1 Minangkabau,_*Ph_,Rosetta,,"Austronesian, Malayo-Polynesian",John Henderson
294,Round 2 [No Data],https://www.uklo.org/wp-content/uploads/2022/0...,2010_R2_2 Cree,_*Wr*Mo_,Match-up,,Algic,Patrick Littell & Julia Workman
295,Round 2 [No Data],https://www.uklo.org/wp-content/uploads/2022/0...,2010_R2_3 English,_*Wr_,Match-up,Encrypted,"Indo-European, Germanic",Richard Sproat
296,Round 2 [No Data],https://www.uklo.org/wp-content/uploads/2022/0...,2010_R2_4 Vietnamese,_*Wr*Sy_,Match-up,,Austroasiatic,David Mortensen


In [271]:
new_temp_df.to_csv("split_scraped_data.csv", index = False)