In [1]:
import os
import fitz

In [2]:
def extract_questions_and_reviews_from_pdf(pdf_path):
    TERM = "(Spring 2024)"

    pdf_data = {}
    pdf_data["questions"] = {}

    try:
        with fitz.open(pdf_path) as pdf:
            for page in pdf:
                text = page.get_text()
                lines = text.splitlines()

                line_idx = 0
                while line_idx < len(lines):
                    line = lines[line_idx]
                    
                    if TERM in line:
                        pdf_data["course_name"] = line
    
                    elif "Instructor: " in line:
                        pdf_data["instructor"] = line.split("Instructor: ")[1]

                    elif "Subject: " in line:
                        pdf_data["subject"] = line.split("Subject: ")[1]
    
                    elif "Catalog & Section: " in line:
                        pdf_data["course_number"] = line.split("Catalog & Section: ")[1].split(" ")[0]

                    elif "Course ID: " in line:
                        pdf_data["crn"] = line.split("Course ID: ")[1]

                    elif "Q: " in line:
                        question = line.split("Q: ")[-1]
                        reviews = []

                        line_idx += 1
                        while line_idx < len(lines):
                            if lines[line_idx].isnumeric():
                                skip_idx = 1
                                while lines[line_idx + skip_idx] == "":
                                    skip_idx += 1
                                    
                                reviews.append(
                                    lines[line_idx + skip_idx]
                                )

                                line_idx += skip_idx
                            elif "Q: " in lines[line_idx]:
                                break
                            else:
                                line_idx += 1

                        pdf_data["questions"][question] = reviews
                        continue
                    
                    line_idx += 1
                        
    except Exception as e:
        print(f"An error occurred: {e}")

    return pdf_data

In [3]:
# Path to the uploaded PDF file
pdf_file_path = './data/39696.pdf'

# Extract questions and reviews
pdf_data = extract_questions_and_reviews_from_pdf(pdf_file_path)
pdf_data

{'questions': {'What were the strengths of this course and/or this instructor?': ['Prof. Maryam is very good at explaining how data structure works. But for the coding part ( c language), we need to spend more time on ourselves.',
   'She is very accommodating to her students. She listens to feedback and ensures the students are in a good learning environment. She is very caring towards her class and shows excellent interpersonal',
   'The instructor is very enthusiastic and explains concepts very well. She is also very interactive and engaging. It was difficult to keep the class attentive for 3 hours, but she was very good at making it',
   "She is very kind and provide a lot help for us, I like her class. She let us join the discussion in canvas in each class, which ensure everyone's engagement. She also has sense of humor.",
   'Maryam is so patient and detail-oriented, and she shows every step of logic process in solving problems as she explains, and she always prepares a lot mater

In [4]:
pdf_data["questions"].keys()

dict_keys(['What were the strengths of this course and/or this instructor?', 'What could the instructor do to make this course better?', 'Please expand on the instructor’s strengths and/or areas for improvement in facilitating inclusive learning.', 'Please comment on your experience of the online course environment in the open-ended text box.', 'What I could have done to make this course better for myself.'])

In [5]:
len(
    pdf_data["questions"]
    ["Please comment on your experience of the online course environment in the open-ended text box."]
)

12

In [26]:
import os
import logging
import fitz

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

class TraceDataScraper:
    def extract_questions_and_reviews_from_pdf(self, pdf_path):
        """
        Extracts structured data, including questions and their reviews, from a PDF.

        Parameters:
            pdf_path (str): The file path of the PDF file.

        Returns:
            dict: A dictionary containing extracted data such as course details and questions with reviews.
        """

        TERM = "(Spring 2024)"  # Term identifier to filter course-specific data
        pdf_data = {}
        pdf_data["questions"] = {}

        try:
            # Open the PDF using fitz (PyMuPDF)
            with fitz.open(pdf_path) as pdf:
                for page_number, page in enumerate(pdf, start=1):
                    text = page.get_text()  # Extract text from the page
                    lines = text.splitlines()  # Split text into lines

                    line_idx = 0
                    while line_idx < len(lines):
                        line = lines[line_idx]

                        if TERM in line:
                            pdf_data["course_name"] = line

                        elif "Instructor: " in line:
                            pdf_data["instructor"] = line.split("Instructor: ")[1]

                        elif "Subject: " in line:
                            pdf_data["subject"] = line.split("Subject: ")[1]

                        elif "Catalog & Section: " in line:
                            pdf_data["course_number"] = line.split("Catalog & Section: ")[1].split(" ")[0]

                        elif "Course ID: " in line:
                            pdf_data["crn"] = line.split("Course ID: ")[1]

                        elif "Q: " in line:
                            question = line.split("Q: ")[-1]
                            reviews = []

                            line_idx += 1
                            while line_idx < len(lines):
                                if lines[line_idx].isnumeric():
                                    skip_idx = 1
                                    while lines[line_idx + skip_idx] == "":
                                        skip_idx += 1

                                    reviews.append(lines[line_idx + skip_idx])

                                    line_idx += skip_idx
                                elif "Q: " in lines[line_idx]:
                                    break
                                else:
                                    line_idx += 1

                            pdf_data["questions"][question] = reviews
                            continue

                        line_idx += 1

        except Exception as e:
            logging.error(f"An error occurred while processing the PDF {pdf_path}: {e}")

        return pdf_data

    def process_directory(self, data_directory):
        """
        Processes all PDF files in the given directory and consolidates review information.

        Parameters:
            data_directory (str): The path to the directory containing PDF files.

        Returns:
            dict: A dictionary containing consolidated data for all PDF files.
        """
        logging.info(f"Starting to process directory: {data_directory}")
        consolidated_data = {}

        try:
            # Iterate through all files in the directory
            for filename in os.listdir(data_directory):
                if filename.endswith(".pdf"):
                    crn = filename.split(".pdf")[0]  # Extract CRN from filename

                    pdf_path = os.path.join(data_directory, filename)

                    # Extract data from the PDF
                    pdf_data = self.extract_questions_and_reviews_from_pdf(pdf_path)
                    consolidated_data[crn] = pdf_data

        except Exception as e:
            logging.error(f"An error occurred while processing the directory {data_directory}: {e}")

        logging.info(f"Completed processing of directory: {data_directory}")
        return consolidated_data

# Example Usage
data_scraper = TraceDataScraper()
data_directory = "./data"
consolidated_reviews = data_scraper.process_directory(data_directory)
# consolidated_reviews

2024-12-26 20:13:01,656 - INFO - Starting to process directory: ./data
2024-12-26 20:13:12,616 - INFO - Completed processing of directory: ./data


In [27]:
consolidated_reviews

{'35056': {'questions': {'What were the strengths of this course and/or this instructor?': ['Timing and Canvas are Organized.',
    'very responsive professor',
    'Passionate, knowledgeable, extremely accommodating.',
    'Scott was a pleasure to have as a professor! I can tell that he really enjoys what he is teaching us, and that enthusiasm is contagious. He is very approachable and flexible, and makes it abundantly clear that',
    'Assignments (homework, labs), and live lectures were very helpful and interesting.',
    'He is enthusiastic about the material which helps to make it more interesting.',
    'Very responsive to student questions and very, very nice attitude',
    'Very responsible and passionate about teaching. I think he offered us great help and some mental support',
    'We can understand the points deeply through several kinds of teaching approaches.',
    'The strengths of this course include lots of contents and interactive lessons, while the instructor excelled

In [29]:
import pandas as pd

def json_to_dataframe(consolidated_data):
    """
    Converts the JSON output from the TraceDataScraper into a pandas DataFrame.

    Parameters:
        consolidated_data (dict): The JSON data output from the TraceDataScraper.

    Returns:
        pd.DataFrame: A pandas DataFrame containing the structured data.
    """
    # Create a list to hold structured data
    rows = []

    for crn, course_data in consolidated_data.items():
        course_name = course_data.get("course_name", "")
        instructor = course_data.get("instructor", "")
        subject = course_data.get("subject", "")
        course_number = course_data.get("course_number", "")
        questions = course_data.get("questions", {})
        
        # Iterate over questions and reviews
        for question, reviews in questions.items():
            for review in reviews:
                rows.append({
                    "CRN": crn,
                    "Course Name": course_name,
                    "Instructor": instructor,
                    "Subject": subject,
                    "Course Number": course_number,
                    "Question": question,
                    "Review": review
                })

    # Create DataFrame from rows
    dataframe = pd.DataFrame(rows)
    return dataframe

df = json_to_dataframe(consolidated_reviews)
df

Unnamed: 0,CRN,Course Name,Instructor,Subject,Course Number,Question,Review
0,35056,"Data Str, Algo App in CmpSys (Spring 2024)","Valcourt, Scott",CS,5008,What were the strengths of this course and/or ...,Timing and Canvas are Organized.
1,35056,"Data Str, Algo App in CmpSys (Spring 2024)","Valcourt, Scott",CS,5008,What were the strengths of this course and/or ...,very responsive professor
2,35056,"Data Str, Algo App in CmpSys (Spring 2024)","Valcourt, Scott",CS,5008,What were the strengths of this course and/or ...,"Passionate, knowledgeable, extremely accommoda..."
3,35056,"Data Str, Algo App in CmpSys (Spring 2024)","Valcourt, Scott",CS,5008,What were the strengths of this course and/or ...,Scott was a pleasure to have as a professor! I...
4,35056,"Data Str, Algo App in CmpSys (Spring 2024)","Valcourt, Scott",CS,5008,What were the strengths of this course and/or ...,"Assignments (homework, labs), and live lecture..."
...,...,...,...,...,...,...,...
12657,35062,Object-Oriented Design (Spring 2024),"Domino, Molly",CS,5004,What I could have done to make this course bet...,Studied more before class so that I could foll...
12658,35062,Object-Oriented Design (Spring 2024),"Domino, Molly",CS,5004,What I could have done to make this course bet...,watch vedios and books ahead may be better.
12659,35062,Object-Oriented Design (Spring 2024),"Domino, Molly",CS,5004,What I could have done to make this course bet...,I can utilize the additional resources provide...
12660,35062,Object-Oriented Design (Spring 2024),"Domino, Molly",CS,5004,What I could have done to make this course bet...,Spent more time outside of course hours review...


In [30]:
df.to_csv("./data/reviews.csv", index=False)