In [141]:
# Necessary pip installs: 
# pip install pandas
# pip install pdfminer.six

In [1]:
# Create a data frame from PDF module catalogues

import re
from pdfminer.high_level import extract_text
import pandas as pd

# Read PDF file

text_Module_Catalogue = extract_text("BA_WI_allModules.pdf")

In [152]:

def extract_first_match(text, regex_pattern):
    pattern = re.compile(regex_pattern)
    match = pattern.search(text)
    if match:
        return match.group(1)
    else:
        return None
    
def extract_LineMatch(text, regex_pattern):
    pattern = re.compile(regex_pattern, re.MULTILINE)
    match = pattern.search(text)
    if match:
        return match.group()
    else:
        return None
    
# Generic method to remove all unnessary text to have a clean list of courses
# Might have to be adapted for each PDF later

def clean_entries(matches):
    patternRemove1 = r"Module Catalogue for the Subject\nInformation Systems\nMaster’s with 1 major, 120 ECTS credits"
    patternRemove2 = r"JMU\sWürzburg\s•\sgenerated\s\d{1,2}-[A-Za-z]+-\d{4}\s•\sexam\.\sreg\.\sda-\nta\srecord\sMaster\s\(120\sECTS\)\sInformation\sSystems\s-\s\d{4}"
    patternRemove3 = r"Master’s\s+with\s+1\s+major\s+Information\s+Systems\s+\(\d{4}\)"
    patternRemove4 = r"page\s+\d+\s+/\s+\d+"
    patternRemove5 = r'^\s*$'

    cleaned_entries = []
    for entry in matches:
        cleaned_text1 = re.sub(patternRemove1, "", entry, flags=re.MULTILINE)
        cleaned_text2 = re.sub(patternRemove2, "", cleaned_text1, flags=re.MULTILINE)
        cleaned_text3 = re.sub(patternRemove3, "", cleaned_text2, flags=re.MULTILINE)
        cleaned_text4 = re.sub(patternRemove4, "", cleaned_text3, flags=re.MULTILINE)
        cleaned_text5 = re.sub(patternRemove5, "", cleaned_text4, flags=re.MULTILINE)
        cleaned_entries.append(cleaned_text5)

    return cleaned_entries

In [153]:
# Extract each module 
patternEntireModule = r"Module title[\s\S]*?(?=Module title|$)"

foundModules = re.findall(patternEntireModule, text_Module_Catalogue)

print("Number of modules: ", len(foundModules))

Number of modules:  163


In [154]:
# Remove all unnecessary text for easier mapping to data model

foundModules = clean_entries(foundModules)


In [155]:
# prepare data frame
data = pd.read_csv('modulesCSV.csv')
column_names = data.columns
print(column_names)

Index(['Module title;Abbrevation;Module coordinator;Module offered by;ETCS;Method of grading;Only after succ. compl. of module(s);Duration;Module level;Other prerequisites;Contents;Intended learning outcomes;;Courses;Method of assessment ;Allocation of places;Additional information;Workload;Teaching cycle;Referred to in LPO I;Module appears in'], dtype='object')


In [159]:
# Mapping of module attributes to data model
# Each attribute is searched by a regex pattern 

# Methods for testing and printing the result of a regex pattern

def find_none_entries(lst):
    none_indexes = []
    for index, item in enumerate(lst):
        if item is None:
            none_indexes.append(index)
    return none_indexes

def patternMatches (pattern, method):
    foundTitles = []
    for entry in foundModules:
        if (method == "first"):
            foundTitles.append(extract_first_match(entry, pattern))
        if (method == "line"):
            foundTitles.append(extract_LineMatch(entry, pattern))
    return foundTitles

# regex for module title -> Always located after "Module title"
patternModuleTitle = r'Module title\s*\n*\s*(.*)'

# regex for abbreviaton -> Always located after "Abbreviation"
patternAbbreviation = r'Abbreviation\s*\n*\s*(.*)'

# regex for Module offered by -> random location .. but line is always starting with "Faculty" or "Institute"
patternModuleOfferedBy = r"^(Faculty|Institute).*"

# regex for Module coordinator -> random location .. but line is always starting with "Holder" or "Dean"
# some modules don't have a coordinator -> so the value will be None for this column
patternModuleCoordinator = r"^(Holder|holder|Dean).*"

# regex for ETCS
patternETCS = r"^\d{1,2}$"

# regex for Method of grading
patternMethodGrading = r".*(not\s)?successfully completed|numerical grade.*"

# regex for duration
patternDuration = r"^\d\ssemester$"

# regex for module level, some might have no level given, so the value will be None
patternModuleLevel = r"^(?:graduate|undergraduate)$"

# test regex for each attribute (method for regexEx might differ between first match or line match)
print (patternMatches(patternModuleTitle, "first"))
print (patternMatches(patternAbbreviation, "first"))
print (patternMatches(patternModuleOfferedBy, "line"))
print (patternMatches(patternModuleCoordinator, "line"))
print (patternMatches(patternETCS, "line"))
print (patternMatches(patternMethodGrading, "line"))

print (len(patternMatches(patternDuration, "line")))
print (patternMatches(patternDuration, "line"))

print (len(patternMatches(patternModuleLevel, "line")))
print (patternMatches(patternModuleLevel, "line"))










['Business Informatics', 'E-Business', 'Data Management and Analysis', 'Integrated Business Processes', 'Organization', 'Accounting', 'Managerial Accounting', 'Marketing', 'Supply, Production and Operations Management', 'Entrepreneurship', 'Differential Calculus for Economics and Management', 'Linear Algebra for Economics and Management', 'Statistics', 'Econometrics', 'Algorithms and Data Structures Level One Course', 'Fundamentals of Programming', 'Practical Course in Programming for Business Informatics', 'Software Technology', 'Data Bases', 'IT-Law for Business Informatics', 'Forward and Reverse Business Engineering', 'Seminar: Information Systems', 'Web Programming', 'Advanced Web Engineering', 'E-Business Project', 'Business Intelligence', 'Planning and Decision Making in Business Information Systems', 'Primer in Data Science', 'Supply Chain Management', 'Seminar: Logistics & Supply Chain Management', 'Toyota Supply Chain Management', 'Information Economics - Software Project', 'A

In [161]:
print (find_none_entries(patternMatches(patternModuleLevel, "line")))

print (foundModules[16])

[16, 44, 45, 46, 47, 48, 49, 150]
Module title

Practical Course in Programming for Business Informatics

Abbreviation

10-I-PPWI-202-m01

Module coordinator

--

Module offered by

Institute of Computer Science

ECTS Method of grading

Only after succ. compl. of module(s)

5

(not) successfully completed

--

Duration

Module level

Other prerequisites

1 semester

--

--

Contents

--

Intended learning outcomes

--

Courses (type, number of weekly contact hours, language — if other than German)

P (6)

Method of assessment (type, scope, language — if other than German, examination offered — if not every semester, information on whether 
module is creditable for bonus)

practical examination (programming exercises, approx. 240 hours) and written examination (approx. 60 to 120 
minutes); If announced by the lecturer at the beginning of the course, the written examination may be replaced 
by an oral examination of one candidate each (approx. 20 minutes) or an oral examination in groups