In [1]:
import pdfplumber
import os
import pandas as pd
import re
import contextlib
import io

# Identifying all PDFs
# os.listdir() returns entries in arbitrary order, so the names are sorted to keep
# the processing order (and the progress output) the same on every run.
pdf_folder = 'data/modules'
pdf_files = sorted(f for f in os.listdir(pdf_folder) if f.endswith('.pdf'))
total_files = len(pdf_files)

In [2]:
# Function to extract text from PDF using pdfplumber
def extract_pdf_text(pdf_path):
    """Return the text of every page in the PDF at ``pdf_path`` as one string.

    Each page that yields text contributes that text followed by a newline;
    pages for which ``extract_text()`` returns nothing are left out.
    Anything written to stderr while the PDF is open is discarded.
    """
    # Use of AI to avoid warning messages: stderr is captured in a throwaway buffer
    with contextlib.redirect_stderr(io.StringIO()), pdfplumber.open(pdf_path) as pdf:
        page_texts = [page.extract_text() for page in pdf.pages]

    # Keep only pages that produced text and terminate each with a newline
    return ''.join(page_text + '\n' for page_text in page_texts if page_text)

In [3]:
# Function to extract department name & code
def extract_department_code(text):
    """Find the department heading in ``text`` and return its code and name.

    The heading is located with a regular expression of the form
    ``<name> (<code>) course results`` where ``<code>`` is 2-4 letters.
    The first occurrence in the text is used.

    Parameters:
        text: full text extracted from one PDF.

    Returns:
        A ``(department_code, department_name)`` tuple of strings.

    Raises:
        ValueError: if the text contains no matching heading. (Previously this
        case failed with an UnboundLocalError because the result variables
        were never assigned.)
    """
    # Use of AI to generate generic re code that identifies string.
    # The pattern cannot span a newline, so searching the whole text finds the
    # same match as checking the lines one by one.
    # NOTE(review): the name group is a single run of letters, so for a
    # multi-word department name only the word directly before the bracket
    # would be captured - confirm against the PDFs.
    match = re.search(r'([A-Za-z]+) \(([A-Za-z]{2,4})\) course results', text)
    if match is None:
        raise ValueError("No '<department> (<code>) course results' heading found in text")

    # Use of AI to learn about .group() function
    department_name = match.group(1)
    department_code = match.group(2)
    return department_code, department_name

In [4]:
# Function to extract Marksummary tables from text
def extract_marksummary(text, department_code, department_name, title_lookahead=6):
    """Parse every mark-summary table found in ``text``.

    A table starts at a line beginning with ``Year marks mean sd``; the column
    positions are taken from that header line. Rows are read until a line that
    starts with ``<CODE>:Marksummary`` or ``MarksbyYear``.

    Parameters:
        text: full text extracted from one PDF.
        department_code: code used for a table when no ``<CODE>:Marksummary``
            title is found within ``title_lookahead`` lines after its header.
        department_name: value stored in the ``department`` field of each row.
        title_lookahead: number of lines after the header that are searched
            for the table title (default 6, the previously hard-coded value).

    Returns:
        ``(mark_data, excluded_data)``:
        * ``mark_data`` - list of dicts with keys ``department``, ``code``,
          ``year``, ``marks`` (int) and the float statistics columns.
        * ``excluded_data`` - list of dicts with keys ``course``, ``year`` and
          ``reason``. ``reason`` is ``'no data in year'`` for rows holding a
          single token, ``'incomplete data'`` for rows whose token count
          differs from the header, and ``'non-numeric data'`` for rows with
          the right token count whose values cannot be converted to numbers.

    Raises:
        KeyError: if the header line lacks one of the expected column names.
    """
    header_prefix = 'Year marks mean sd'
    title_pattern = r'([A-Z0-9]+):Marksummary'
    float_columns = ('mean', 'sd', 'min', 'q10', 'q25', 'median',
                     'IQR', 'q75', 'q90', 'q95', 'max')

    lines = text.split('\n')
    mark_data = []
    excluded_data = []

    i = 0
    while i < len(lines):
        line = lines[i].strip()

        # Anything that is not a table header is skipped
        if not line.startswith(header_prefix):
            i += 1  # Moving to next line
            continue

        # Identifying table headers: map each column name to its token index
        header = line.split()
        pos = {col: idx for idx, col in enumerate(header)}

        # Identifying course code (at bottom of table)
        course = department_code  # Setting department code as default
        for candidate in lines[i + 1:i + 1 + title_lookahead]:
            match = re.search(title_pattern, candidate)
            if match:
                course = match.group(1)
                break

        # Moving to first data row (skipping header)
        i += 1
        while i < len(lines):
            line = lines[i].strip()

            # Break when encountering table title (at bottom of each table)
            if re.match(title_pattern, line) or line.startswith('MarksbyYear'):
                break

            # Parsing data
            if line:
                values = line.split()
                year = values[pos['Year']]

                if len(values) == len(pos):
                    # Row has one token per header column
                    try:
                        record = {
                            'department': department_name,
                            'code': course,
                            'year': year,
                            'marks': int(values[pos['marks']]),
                        }
                        for col in float_columns:
                            record[col] = float(values[pos[col]])
                    except ValueError:
                        # A placeholder such as 'NA' would otherwise abort the
                        # whole scrape; record the row as excluded instead.
                        excluded_data.append({'course': course, 'year': year, 'reason': 'non-numeric data'})
                    else:
                        mark_data.append(record)

                elif len(values) == 1:
                    # Separating excluded rows between empty rows and incomplete rows
                    excluded_data.append({'course': course, 'year': year, 'reason': 'no data in year'})

                else:
                    # Missing values shift the remaining tokens, so the row cannot be aligned
                    excluded_data.append({'course': course, 'year': year, 'reason': 'incomplete data'})

            i += 1  # Moving to next line

    return mark_data, excluded_data

In [5]:
# Function to scrape all PDFs in the folder
def process_pdfs(pdf_folder):
    """Scrape the mark-summary tables from every PDF in ``pdf_folder``.

    Parameters:
        pdf_folder: directory whose ``.pdf`` files are processed.

    Returns:
        ``(df, df_excl)`` - a DataFrame of all parsed rows and a DataFrame of
        all excluded rows, both collected across the PDFs in the folder.
    """
    # List the folder that was actually passed in rather than relying on
    # notebook-level variables, and sort because os.listdir() order is arbitrary.
    pdf_files = sorted(f for f in os.listdir(pdf_folder) if f.endswith('.pdf'))
    total_files = len(pdf_files)

    all_data = []
    all_excl_data = []
    for i, pdf_file in enumerate(pdf_files, 1):
        pdf_path = os.path.join(pdf_folder, pdf_file)
        # '\r' keeps the progress message on a single, continuously updated line
        print(f'Processing ({i}/{total_files}): {pdf_file}...', end='\r', flush=True)

        # Extracting text from the PDF
        text = extract_pdf_text(pdf_path)

        # Extracting department code and name
        department_code, department_name = extract_department_code(text)

        # Extracting mark summary data
        mark_data, excluded_data = extract_marksummary(text, department_code, department_name)

        # Appending data
        all_data.extend(mark_data)
        all_excl_data.extend(excluded_data)

    # Converting data
    df = pd.DataFrame(all_data)
    df_excl = pd.DataFrame(all_excl_data)

    return df, df_excl

In [6]:
# Scraping all PDFs & sorting data
df, df_excl = process_pdfs(pdf_folder)
df = df.sort_values(by=['code', 'year'], ascending=[True, True]).reset_index(drop=True)

# Counting the excluded rows for each reason recorded during parsing
empty_rows = int((df_excl['reason'] == 'no data in year').sum())
misaligned_rows = int((df_excl['reason'] == 'incomplete data').sum())

# Summary of the scrape
print('\n'+f'Data scraping complete, {len(df)} rows of data extracted.'+'\n')
print(f'{empty_rows} rows deleted due to empty rows for years prior to introduction of new modules.')
print(f'{misaligned_rows} rows deleted due to missing values resulting in misalignment.')

Processing (20/20): ST-results-2023-24-All-Sittings.pdf...
Data scraping complete, 1934 rows of data extracted.

516 rows deleted due to empty rows for years prior to introduction of new modules.
49 rows deleted due to missing values resulting in misalignment.


In [7]:
# Separating modules and department data
# Rows with a two-character code are treated as department-level, longer codes as modules.
# NOTE(review): extract_department_code accepts department codes of 2-4 letters, so a
# 3- or 4-letter department code would end up in modules_df - confirm none exist.
code_length = df['code'].str.len()
departments_df = df[code_length == 2].reset_index(drop=True)
modules_df = df[code_length > 2].reset_index(drop=True)

# Rows whose code is missing or shorter than two characters match neither filter;
# report them instead of letting them disappear silently.
unclassified_rows = len(df) - len(departments_df) - len(modules_df)
if unclassified_rows:
    print(f'Warning: {unclassified_rows} rows matched neither the department nor the module filter and were not saved.')

# Saving DataFrames to CSV files (creating the output folders first so to_csv cannot
# fail on a missing directory)
modules_csv = "data/modules/marks_summary_modules.csv"
departments_csv = "data/departments/marks_summary_departments.csv"
for csv_path in (modules_csv, departments_csv):
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)

modules_df.to_csv(modules_csv, index=False)
departments_df.to_csv(departments_csv, index=False)

print('Dataframes seperated and saved as CSV files')

Dataframes seperated and saved as CSV files


In [8]:
# Displaying the module rows (codes longer than two characters); the notebook shows
# only the first and last rows of the frame
modules_df

Unnamed: 0,department,code,year,marks,mean,sd,min,q10,q25,median,IQR,q75,q90,q95,max
0,Accounting,AC100,2019/20,116,76.9,9.2,45.0,65.0,72.8,79.0,10.5,83.2,86.0,87.0,91.0
1,Accounting,AC100,2020/21,145,67.4,10.4,32.0,54.0,61.0,69.0,14.0,75.0,79.0,81.0,85.0
2,Accounting,AC100,2021/22,114,65.8,15.6,0.0,47.3,58.0,68.0,19.0,77.0,81.7,85.7,88.0
3,Accounting,AC100,2022/23,112,63.9,15.7,0.0,48.0,55.0,66.0,19.2,74.2,80.0,83.4,90.0
4,Accounting,AC102,2019/20,524,86.8,10.3,35.0,76.0,83.0,90.0,11.0,94.0,96.0,97.0,99.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1836,Statistics,ST330,2019/20,69,72.0,11.8,33.0,56.0,64.0,74.0,17.0,81.0,86.2,89.6,92.0
1837,Statistics,ST330,2020/21,70,63.1,16.4,7.0,40.9,56.2,65.5,18.8,75.0,82.0,83.5,90.0
1838,Statistics,ST330,2021/22,65,60.8,15.7,0.0,41.6,51.0,63.0,20.0,71.0,79.4,83.0,91.0
1839,Statistics,ST330,2022/23,64,56.4,20.7,0.0,28.0,43.8,56.0,27.2,71.0,82.7,85.8,89.0


In [9]:
# Displaying the department rows (two-character codes); the notebook shows
# only the first and last rows of the frame
departments_df

Unnamed: 0,department,code,year,marks,mean,sd,min,q10,q25,median,IQR,q75,q90,q95,max
0,Accounting,AC,2019/20,1442,78.2,12.8,0.0,61.0,68.0,80.0,20.0,88.0,94.0,96.0,99.0
1,Accounting,AC,2020/21,1897,68.5,12.2,0.0,52.0,61.0,70.0,16.0,77.0,84.0,86.0,98.0
2,Accounting,AC,2021/22,1607,60.0,12.9,0.0,42.6,52.0,62.0,17.0,69.0,75.0,78.0,94.0
3,Accounting,AC,2022/23,1706,64.5,13.8,0.0,47.0,57.0,66.0,15.0,72.0,81.0,86.0,98.0
4,Accounting,AC,2023/24,1705,65.5,15.2,0.0,47.0,57.0,66.0,18.0,75.0,86.0,90.0,100.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88,Statistics,ST,2019/20,2660,72.0,12.6,0.0,57.0,65.0,72.0,16.0,81.0,87.0,91.0,100.0
89,Statistics,ST,2020/21,2783,67.0,15.2,0.0,47.0,59.0,68.0,19.0,78.0,84.0,89.0,100.0
90,Statistics,ST,2021/22,2589,64.3,16.3,0.0,45.0,56.0,66.0,19.0,75.0,84.0,88.0,100.0
91,Statistics,ST,2022/23,2951,63.4,17.0,0.0,42.0,54.0,66.0,21.0,75.0,83.0,87.0,100.0
