In [1]:
import pdfplumber
import os
import pandas as pd
import re
import contextlib
import io

# Identifying all PDFs
pdf_folder = 'data/modules'
pdf_files = [f for f in os.listdir(pdf_folder) if f.endswith('.pdf')] 

In [15]:
# Function to extract text from PDF using pdfplumber
def extract_pdf_text(pdf_path):
    all_text = ''
    with contextlib.redirect_stderr(io.StringIO()): # Use of AI to avoid warning messages
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    all_text += page_text + '\n'
    return all_text

In [17]:
# Function to extract department name & code
def extract_department_code(text):
    lines = text.split('\n')
    
    for line in lines:
        match = re.search(r'([A-Za-z]+) \(([A-Za-z]{2,4})\) course results', line) # Use of AI to generate generic re code that identifies string
        if match:
            department_name = match.group(1) # Use of AI to learn about .group() function
            department_code = match.group(2)
            break
    return department_code, department_name

In [4]:
# Function to extract Marksummary tables from text
def extract_marksummary(text, department_code, department_name):
    lines = text.split('\n')
    mark_data = []
    excluded_data = []

    i = 0
    while i < len(lines):
        line = lines[i].strip()

        # Identifying table headers
        if line.startswith('Year marks mean sd'):
            header = line.split()
            pos = {col: idx for idx, col in enumerate(header)}

            # Identifying course code (at bottom of table)
            course = department_code # Setting department code as default
            for k in range(1, 7):
                if i + k < len(lines):
                    match = re.search(r'([A-Z0-9]+):Marksummary', lines[i + k])
                    if match:
                        course = match.group(1)
                        break

            # Moving to first data row (skipping header)
            i += 1
            course_data = []
            skipped_rows = []
            
            while i < len(lines):
                line = lines[i].strip()
                # Break when encountering table title (at bottom of each table)
                if re.match(r'([A-Z0-9]+):Marksummary', line) or line.startswith('MarksbyYear'):
                    break

                # Parsing data
                if line:
                    values = line.split()
                    if len(values) == len(pos): # cleaning data from incomplete and misaligned rows due to missing values
                        course_data.append({
                            'department': department_name,
                            'code': course,
                            'year': values[pos['Year']],
                            'marks': int(values[pos['marks']]),
                            'mean': float(values[pos['mean']]),
                            'sd': float(values[pos['sd']]),
                            'min': float(values[pos['min']]),
                            'q10': float(values[pos['q10']]),
                            'q25': float(values[pos['q25']]),
                            'median': float(values[pos['median']]),
                            'IQR': float(values[pos['IQR']]),
                            'q75': float(values[pos['q75']]),
                            'q90': float(values[pos['q90']]),
                            'q95': float(values[pos['q95']]),
                            'max': float(values[pos['max']])
                        })
                        
                    elif len(values) == 1: # Seperating excluded rows between empty rows and incomplete rows
                        skipped_rows.append({'course': course, 'year': values[pos['Year']], 'reason': 'no data in year'})
                        
                    else:
                        skipped_rows.append({'course': course, 'year': values[pos['Year']], 'reason': 'incomplete data'})
                
                            
                i += 1  # Moving to next line

            mark_data.extend(course_data)
            excluded_data.extend(skipped_rows)

        else:
            i += 1  # Moving to next line

    return mark_data, excluded_data

In [5]:
# Function to scrape all PDFs in the folder
def process_pdfs(pdf_folder):
    all_data = []
    all_excl_data = []
    for pdf_file in pdf_files:
        pdf_path = os.path.join(pdf_folder, pdf_file)
        print(f"Processing {pdf_file}...")

        # Extracting text from the PDF
        text = extract_pdf_text(pdf_path)
        
        # Extracting department code and name
        department_code, department_name = extract_department_code(text)

        # Extracting mark summary data
        mark_data, excluded_data = extract_marksummary(text, department_code, department_name)
        
        # Appending data
        all_data.extend(mark_data)
        all_excl_data.extend(excluded_data)

    # Converting data
    df = pd.DataFrame(all_data)
    df_excl = pd.DataFrame(all_excl_data)
    
    return df, df_excl

In [19]:
# Scraping all PDFs & sorting data
df, df_excl = process_pdfs(pdf_folder)
df = df.sort_values(by=['code', 'year'], ascending=[True, True])
df.reset_index(drop=True, inplace=True)
empty_rows = len(df_excl[df_excl['reason'] == 'no data in year'])
misaligned_rows = len(df_excl[df_excl['reason'] == 'incomplete data'])

print('\n'+'Data extraction complete'+'\n')
print(f'{empty_rows} rows deleted due to empty rows for years prior to introduction of new modules.')
print(f'{misaligned_rows} rows deleted due to missing values resulting in misalignment.')

Processing AC-results-2023-24-All-Sittings.pdf...
Processing SP-results-2023-24-All-Sittings.pdf...
Processing ST-results-2023-24-All-Sittings.pdf...
Processing GV-results-2023-24-All-Sittings.pdf...
Processing PH-results-2023-24-All-Sittings.pdf...
Processing SO-results-2023-24-All-Sittings.pdf...
Processing HY-results-2023-24-All-Sittings.pdf...
Processing FM-results-2023-24-All-Sittings.pdf...
Processing MG-results-2023-24-All-Sittings.pdf...
Processing LL-results-2023-24-All-Sittings.pdf...
Processing IR-results-2023-24-All-Sittings.pdf...
Processing LS-results-2023-24-All-Sittings.pdf...
Processing MA-results-2023-24-All-Sittings.pdf...
Processing LN-results-2023-24-All-Sittings.pdf...
Processing AN-results-2023-24-All-Sittings.pdf...
Processing EH-results-2023-24-All-Sittings.pdf...
Processing EC-results-2023-24-All-Sittings.pdf...
Processing MY-results-2023-24-All-Sittings.pdf...
Processing PB-results-2023-24-All-Sittings.pdf...
Processing DS-results-2023-24-All-Sittings.pdf...


In [7]:
# Separating modules and department data
departments_df = df[df['code'].str.len() == 2]
departments_df.reset_index(drop=True, inplace=True)

modules_df = df[df['code'].str.len() > 2]
modules_df.reset_index(drop=True, inplace=True)

# Saving DataFrames to CSV files
modules_df.to_csv("data/modules/marks_summary_modules.csv", index=False)
departments_df.to_csv("data/departments/marks_summary_departments.csv", index=False)

print('Dataframes seperated and saved as CSV files')

Dataframes seperated and saved as CSV files
