In [None]:
# Necessary pip installs: 
# pip install pandas
# pip install pdfminer.six

In [26]:
import re
import csv
import tempfile
import shutil
from pdfminer.high_level import extract_text
import pandas as pd

# Read the module catalogue PDF with pdfminer's extract_text. The result is the
# raw text that the regex patterns defined in this notebook are applied to.
# The path is relative, so the PDF is looked up relative to the kernel's
# current working directory.
text_Module_Catalogue = extract_text("MA_IS_allModules.pdf")

# Boilerplate patterns for the Master Information Systems catalogue.
# The list is handed to clean_entries (helper_methods) together with the
# extracted module texts; how each pattern is applied is decided there.
removal_patterns_MS_IS = [
    # Three-line catalogue heading ("Module Catalogue for the Subject / ...").
    r"Module Catalogue for the Subject\nInformation Systems\nMaster’s with 1 major, 120 ECTS credits",
    # Footer line with generation date and exam regulation record; "da-\nta"
    # accounts for the word "data" being hyphenated across a line break.
    r"JMU\sWürzburg\s•\sgenerated\s\d{1,2}-[A-Za-z]+-\d{4}\s•\sexam\.\sreg\.\sda-\nta\srecord\sMaster\s\(120\sECTS\)\sInformation\sSystems\s-\s\d{4}",
    # Degree line followed by a four-digit year in parentheses.
    r"Master’s\s+with\s+1\s+major\s+Information\s+Systems\s+\(\d{4}\)",
    # Page counter such as "page 3 / 40".
    r"page\s+\d+\s+/\s+\d+",
    # Empty or whitespace-only text.
    r"^\s*$",
]

In [27]:
# regex patterns to get attributes of Master Information Systems
from enum import Enum

class Patterns_MS_IS(Enum):
    """Regular expressions for the Master Information Systems module catalogue.

    Each member's ``.value`` is a raw regex string. In this notebook the
    members are consumed by ``extract_modules_to_csv``: PATTERN_ENTIRE_MODULE
    goes to ``re.findall``, the line-level members go to ``extract_LineMatch``
    and the remaining members go to ``extract_first_match``.
    """

    # One module description: from a "Module title" label up to (not including)
    # the next "Module title" label or the end of the text.
    PATTERN_ENTIRE_MODULE = r"Module title[\s\S]*?(?=Module title|$)"

    # Value captured on the line following a label.
    MODULE_TITLE = r"Module title\s*\n*\s*(.*)"
    ABBREVIATION = r"Abbreviation\s*\n*\s*(.*)"

    # Line-level patterns.
    MODULE_OFFERED_BY = r"^(Faculty|Institute).*"
    MODULE_COORDINATOR = r"^(Holder|holder|Dean).*"
    # A line holding only a one- or two-digit number.
    # NOTE(review): the name looks like a misspelling of ECTS; it is kept
    # because callers reference Patterns_MS_IS.ETCS.
    ETCS = r"^\d{1,2}$"
    # NOTE(review): "|" binds loosest, so this reads as
    # ".*(not\s)?successfully completed" OR "numerical grade.*" - confirm
    # that this grouping is intended.
    METHOD_GRADING = r".*(not\s)?successfully completed|numerical grade.*"
    DURATION = r"^\d\ssemester$"
    MODULE_LEVEL = r"^(?:graduate|undergraduate)$"

    # Free-text sections: the capture group is the (possibly multi-line) text
    # between one section label and the next.
    CONTENTS = r"Contents([\s\S]*?)Intended learning outcomes"
    INTENDED_LEARNING_OUTCOMES = r"Intended learning outcomes\n\n([\s\S]*?)\n\nCourses \(type"
    COURSES = r"if other than German\)([\s\S]*?)Method of assessment"
    ASSESSMENT = r"whether\s*\nmodule is creditable for bonus\)([\s\S]*?)Allocation of places"
    ALLOCATION = r"Allocation of places([\s\S]*?)Additional information"
    ADDITIONAL_INFORMATION = r"Additional information([\s\S]*?)Workload"
    WORKLOAD = r"Workload([\s\S]*?)Teaching cycle"
    TEACHING_CYCLE = r"Teaching cycle([\s\S]*?)Referred to in LPO I"
    REFERRED_LPO = r"regulations for teaching-degree programmes\)([\s\S]*?)Module appears in"


In [31]:
from helper_methods import extract_first_match, extract_LineMatch, clean_entries, add_row_to_csv

# Extract modules to csv  -> Method shall be used generically for all modules later

def extract_modules_to_csv(text, patternsToRemove, filename, max_modules=2):
    """Split catalogue text into modules and write one CSV row per module.

    The text is cut into per-module chunks with
    ``Patterns_MS_IS.PATTERN_ENTIRE_MODULE``, the chunks are passed through
    ``clean_entries`` together with ``patternsToRemove``, and for each chunk a
    list of 17 attribute values is built, printed and handed to
    ``add_row_to_csv``.

    Args:
        text: Full catalogue text to search for module descriptions.
        patternsToRemove: Patterns forwarded unchanged to ``clean_entries``
            (presumably boilerplate to strip - see helper_methods).
        filename: Target passed unchanged to ``add_row_to_csv`` for every row.
        max_modules: Upper bound on the number of modules processed, applied
            as the slice ``modules[:max_modules]``. The default of 2 keeps the
            previous hard-coded behaviour; pass ``None`` to process every
            module that was found.

    Returns:
        None. Results are emitted via ``print`` and ``add_row_to_csv``.

    Note:
        The attribute patterns are taken from ``Patterns_MS_IS``, so this
        function is currently specific to that catalogue layout.
    """
    modules = re.findall(Patterns_MS_IS.PATTERN_ENTIRE_MODULE.value, text)
    modules = clean_entries(modules, patternsToRemove)

    # (extractor, pattern) pairs in CSV column order. Keeping them in one table
    # makes the column layout explicit and avoids 17 copy-pasted append calls.
    column_extractors = (
        (extract_first_match, Patterns_MS_IS.MODULE_TITLE),
        (extract_first_match, Patterns_MS_IS.ABBREVIATION),
        (extract_LineMatch, Patterns_MS_IS.MODULE_OFFERED_BY),
        (extract_LineMatch, Patterns_MS_IS.MODULE_COORDINATOR),
        (extract_LineMatch, Patterns_MS_IS.ETCS),
        (extract_LineMatch, Patterns_MS_IS.METHOD_GRADING),
        (extract_LineMatch, Patterns_MS_IS.DURATION),
        (extract_LineMatch, Patterns_MS_IS.MODULE_LEVEL),
        (extract_first_match, Patterns_MS_IS.CONTENTS),
        (extract_first_match, Patterns_MS_IS.INTENDED_LEARNING_OUTCOMES),
        (extract_first_match, Patterns_MS_IS.COURSES),
        (extract_first_match, Patterns_MS_IS.ASSESSMENT),
        (extract_first_match, Patterns_MS_IS.ALLOCATION),
        (extract_first_match, Patterns_MS_IS.ADDITIONAL_INFORMATION),
        (extract_first_match, Patterns_MS_IS.WORKLOAD),
        (extract_first_match, Patterns_MS_IS.TEACHING_CYCLE),
        (extract_first_match, Patterns_MS_IS.REFERRED_LPO),
    )

    # A slice bound of None means "no limit", so max_modules=None covers all.
    for module in modules[:max_modules]:
        module_attributes = [
            extract(module, pattern.value) for extract, pattern in column_extractors
        ]
        # Echo the row so the extraction can be checked in the cell output.
        print(module_attributes)
        add_row_to_csv(filename, module_attributes)



In [32]:
# Run the extraction for the Master Information Systems catalogue: the text
# read from the PDF and the removal patterns defined earlier are passed in,
# and every extracted row is handed to add_row_to_csv with the file name
# "MS_IS_all_modules1.csv".
# Note: as written, extract_modules_to_csv only processes the first two
# modules and prints each row, which is the output shown below this cell.
extract_modules_to_csv(text_Module_Catalogue, removal_patterns_MS_IS, "MS_IS_all_modules1.csv")

['Information Processing within Organizations', '12-IV-161-m01', 'Faculty of Business Management and Economics', 'holder of the Chair of Business Management and Business ', '5', 'numerical grade', '1 semester', 'graduate', '\n\nContent:\nThis course provides students with an in-depth overview of the structure and the application areas of business \nmanagement information systems in enterprises and public institutions.\n\nOutline of syllabus:\n1. What is software: concepts, categories, application\n2. Software life cycle: duration, phases, steps\n3. As-is analysis: tasks, problems\n4. To-be concept: system design, data design, dialog design, function design\n5. Object orientation: paradigm shift\n6. Change management: meaning, methodologies, project management\n7. Office automation: tasks, areas of application\n\n', 'After completing the course "Integrated Information Processing", students will be able to\n(i) understand the importance of integration in enterprises, especially in inform