# Parse CSE File to Get Course List

In [None]:
import PyPDF2
import pandas as pd

# Read every page of the CSE course-summary PDF into one string (`text`).
course_type = 'CSE'

# The context manager closes the file even if PyPDF2 raises while parsing.
with open('ComputingScienceSummaries.pdf', 'rb') as pdfFileObj:
    pdfReader = PyPDF2.PdfReader(pdfFileObj)
    n_pages = len(pdfReader.pages)
    # Pages are joined with no separator on purpose: the parsing loop further
    # down handles '©' footer lines that run straight into a course header.
    text = ''.join(pdfReader.pages[n].extract_text() for n in range(n_pages))

# Build {course header: [prerequisite string]} from the extracted PDF text,
# then turn it into a two-column DataFrame (Course, Prerequisites).
courses = {}
use_next_line = False  # True when the previous line was a 'Prerequisites' line
course = None          # most recent course header; None until one is seen
preq = ''              # comma-separated prerequisites of the current course

for line in text.splitlines():
    # Regular course header. Slicing instead of indexing keeps lines shorter
    # than ten characters from raising IndexError.
    if line.strip().startswith(course_type) and line[9:10] == ' ':
        course = line.replace('  ', ' ').strip()
    if '©' in line and course_type in line:
        # Footer line that runs into a course header, e.g.
        # 100 © Alberta Education, Alberta, Canada   Revised 2010  CSE2910:  CSE PROJECT B
        # Split on the first occurrence only, so a title that itself contains
        # the course type ("CSE PROJECT B") is not cut off.
        course = course_type + line.split(course_type, 1)[1]
        course = course.replace('  ', ' ').strip()
    if course is None:
        # Nothing to attach prerequisites to before the first course header.
        continue
    if 'Prerequ' in line:
        parts = line.split(': ')
        # A prerequisite line without ': ' carries no value on this line.
        preq = parts[1].strip().replace(' ', '') if len(parts) > 1 else ''
        if 'None' in preq:
            preq = ''
        courses[course] = [preq]
    if use_next_line:
        # The line after a 'Prerequisites' line may continue the list.
        preq2 = line.split(': ')[0].strip().replace(' ', '')
        if preq2:  # in case there is no prereq on this line
            # Avoid a leading comma when the first line held no prerequisite.
            preq = preq + ',' + preq2 if preq else preq2
        courses[course] = [preq]
        use_next_line = False
    if 'Prerequisites' in line:
        use_next_line = True

df = pd.DataFrame.from_dict(courses, orient='index', columns=['Prerequisites']).reset_index()
df.columns = ['Course', 'Prerequisites']
df

In [None]:
# Debug aid: print every extracted line containing the '©' mark, together
# with its index in text.splitlines().
for line_no, candidate in enumerate(text.splitlines()):
    if '©' not in candidate:
        continue
    print(line_no, candidate)

In [None]:
# Display the rows whose Prerequisites text contains the course type.
mentions_course_type = df['Prerequisites'].str.contains(course_type)
df[mentions_course_type]

In [None]:
# Display the rows whose Course string has '3' as its fourth character.
fourth_char = df['Course'].str.get(3)
df[fourth_char == '3']

In [None]:
# Write the course table to '<course_type>.csv', leaving out the index column.
csv_name = course_type + '.csv'
df.to_csv(csv_name, index=False)

In [None]:
import graphviz
# Draw the prerequisite graph: one node per course and one edge from each
# prerequisite to the course that requires it.
dot = graphviz.Digraph()

for row in df.itertuples():
    # The course identifier is the part of the header before the first colon.
    course_number = row.Course.split(':')[0]
    # Courses whose identifier contains '50' are left out of the graph.
    if '50' in course_number:
        continue
    # A '3' in the fourth position gets a diamond; everything else a box.
    # Slicing keeps identifiers shorter than four characters from raising.
    shape = 'diamond' if course_number[3:4] == '3' else 'box'
    dot.node(course_number, shape=shape)
    for prerequisite in row.Prerequisites.split(','):
        # Skip empty tokens (empty string, stray or doubled commas) so no
        # edge is drawn from a nameless node.
        if prerequisite:
            dot.edge(prerequisite, course_number)

dot

In [None]:
import os

# Render the graph using course_type as the output file name and open the
# result in the default viewer.
dot.render(filename=course_type, view=True)
# Delete the file named course_type that rendering leaves behind.
os.remove(course_type)

# NET Courses

In [None]:
import PyPDF2
import pandas as pd

# Read every page of the NET course-summary PDF into one string (`text`).
course_type = 'NET'

# The context manager closes the file even if PyPDF2 raises while parsing.
with open('NetworkingSummary.pdf', 'rb') as pdfFileObj:
    pdfReader = PyPDF2.PdfReader(pdfFileObj)
    n_pages = len(pdfReader.pages)
    # Pages are joined with no separator, exactly as the parsing loop further
    # down expects (it handles '©' footer lines fused with a course header).
    text = ''.join(pdfReader.pages[n].extract_text() for n in range(n_pages))

# Build {course header: [prerequisite string]} from the extracted PDF text,
# then turn it into a two-column DataFrame (Course, Prerequisites).
courses = {}
use_next_line = False  # True while following lines may continue the list
course = None          # most recent course header; None until one is seen
preq = ''              # comma-separated prerequisites of the current course

for line in text.splitlines():
    # Regular course header: any line starting with the course type.
    if line.strip().startswith(course_type):
        course = line.replace('  ', ' ').strip()
        use_next_line = False
    if '©' in line and course_type in line and ':' in line:
        # Footer line fused with a course header: keep the last word before
        # the first colon plus the text up to the next colon.
        pieces = line.split(':')
        course = pieces[0].strip().split(' ')[-1] + ':' + pieces[1].strip()
        use_next_line = False
    if course is None:
        # Nothing to attach prerequisites to before the first course header.
        continue
    if 'Prerequ' in line:
        parts = line.split(':')
        # A prerequisite line without ':' carries no value on this line.
        preq = parts[1].strip().replace(' ', '') if len(parts) > 1 else ''
        if 'None' in preq:
            preq = ''
        courses[course] = [preq]
    if use_next_line:
        # Lines after a 'Prerequisites' line keep extending the list until
        # the next course header resets use_next_line.
        preq2 = line.split(': ')[0].strip().replace(' ', '')
        if preq2:  # in case there is no prereq on this line
            # Avoid a leading comma when the first line held no prerequisite.
            preq = preq + ',' + preq2 if preq else preq2
        courses[course] = [preq]
    if 'Prerequisites' in line:
        use_next_line = True

df = pd.DataFrame.from_dict(courses, orient='index', columns=['Prerequisites']).reset_index()
df.columns = ['Course', 'Prerequisites']
df

In [None]:
# Expand every 'NET2030' prerequisite into the NET2030-NET2070 group.
# NOTE: the replacement text contains the pattern itself, so running this
# cell a second time on the same df expands the list again.
prerequisite_column = df['Prerequisites']
df['Prerequisites'] = prerequisite_column.str.replace(
    'NET2030', 'NET2030,NET2040,NET2050,NET2060,NET2070'
)

In [None]:
# Display the rows whose Prerequisites text contains the course type.
mentions_course_type = df['Prerequisites'].str.contains(course_type)
df[mentions_course_type]

In [None]:
import graphviz
# Draw the prerequisite graph: one node per course and one edge from each
# prerequisite to the course that requires it.
dot = graphviz.Digraph()

for row in df.itertuples():
    # The course identifier is the part of the header before the first colon.
    course_number = row.Course.split(':')[0]
    # Courses whose identifier contains '50' are left out of the graph.
    if '50' in course_number:
        continue
    # A '3' in the fourth position gets a diamond; everything else a box.
    # Slicing keeps identifiers shorter than four characters from raising.
    shape = 'diamond' if course_number[3:4] == '3' else 'box'
    dot.node(course_number, shape=shape)
    for prerequisite in row.Prerequisites.split(','):
        # Skip empty tokens (empty string, stray or doubled commas) so no
        # edge is drawn from a nameless node.
        if prerequisite:
            dot.edge(prerequisite, course_number)

dot

In [None]:
import os

# Render the graph using course_type as the output file name and open the
# result in the default viewer.
dot.render(filename=course_type, view=True)
# Delete the file named course_type that rendering leaves behind.
os.remove(course_type)

# PDFMiner.six

In [None]:
from io import StringIO
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as a single string.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text pdfminer writes for all pages, in the order
        ``PDFPage.create_pages`` yields them, using default ``LAParams``.

    Raises:
        OSError: If ``pdf_path`` cannot be opened.
    """
    # Both the file and the in-memory buffer are closed on exit, even if a
    # page fails to process.
    with open(pdf_path, 'rb') as fh, StringIO() as text_buffer:
        # Parser reads the raw file; the document object holds its structure.
        parser = PDFParser(fh)
        document = PDFDocument(parser)

        # Connect the parser and document objects
        parser.set_document(document)

        # Shared resources (fonts, images) are cached by the resource manager.
        resource_manager = PDFResourceManager()

        # The converter writes the text of each processed page into the buffer.
        device = TextConverter(resource_manager, text_buffer, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(resource_manager, device)

            # Process each page contained in the document
            for page in PDFPage.create_pages(document):
                interpreter.process_page(page)

            # Read the text while the buffer is still open.
            return text_buffer.getvalue()
        finally:
            device.close()

# Run the pdfminer-based extractor on the course-summary PDF and print the
# raw text.
# NOTE(review): the PyPDF2 cell at the top of this file opens
# 'ComputingScienceSummaries.pdf' - confirm which file name is correct.
pdf_text = extract_text_from_pdf(pdf_path='ComputerScienceSummaries.pdf')

print(pdf_text)
