In [1]:
import openai
import pdfplumber as pdfplumber
import re as re 
import reportlab
from functools import reduce
from functools import partial
import os
from reportlab.lib import colors
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Paragraph, Frame, PageTemplate
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle

In [2]:
#pdf to text
def extract_text_from_pdf(pdf_path):
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text += page.extract_text()
    return text

In [146]:
import os

# Specify the directory path
directory = 'docs'
docs = []
names = []
# Iterate through files in the directory
for filename in os.listdir(directory):
    file_path = os.path.join(directory, filename)
    if os.path.isfile(file_path):
        docs.append(extract_text_from_pdf(file_path))
        names.append(filename)

In [148]:
names_doc_dict = dict(zip(docs, names))

In [7]:
#testing
doc = extract_text_from_pdf(r"docs/L3_CSE156_FA24_FFN-1.pdf")

In [4]:
#cleaning text up
# Download necessary NLTK data

def clean_text(text):
    # Preserve math formulas
    math_formulas = re.findall(r'\$.*?\$|\\\(.*?\\\)|\\\[.*?\\\]', text)
    
    # Replace math formulas with placeholders
    for i, formula in enumerate(math_formulas):
        text = text.replace(formula, f'MATHFORMULA{i}')

    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    
    # Remove special characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    
    # Convert to lowercase
    text = text.lower()
    
    # Tokenize
    tokens = text.split()
    
    # Rejoin tokens
    cleaned_text = ' '.join(tokens)
    
    # Restore math formulas
    for i, formula in enumerate(math_formulas):
        cleaned_text = cleaned_text.replace(f'mathformula{i}', formula)
    
    return cleaned_text

In [5]:
#testing
#cleaned_text = clean_text(doc)
cleaned_text = list(map(clean_text, docs))

In [None]:
#function to call api
API_KEY = 

In [None]:
from openai import OpenAI
#Api call to collect summaries
API_KEY = 

def analyze_text_for_students(text, api_key = API_KEY):
    client = OpenAI(api_key=api_key)
    out = []
    prompt = """Analyze the following text and identify exactly 15 core concepts related to text classification and machine learning. Format your response as a numbered list of concepts with their explanations. Use clear, simple language suitable for students.

Text to analyze:
{text}

For each concept, provide:
1. A concise definition (20-30 words)
2. Exactly 2 key points or applications (15-25 words each)

Use this exact format for each concept:

n. Concept Name:
   - Definition: [20-30 word definition]
   - Key points:
     • [15-25 word key point or application]
     • [15-25 word key point or application]
     • [15-25 word key point or application]
     

Include relevant mathematical notations where appropriate, using LaTeX formatting (e.g., \(f(x) = wx + b\)).

Focus on fundamental ideas in text classification, machine learning, and natural language processing.

Example format:
1. Neural Networks:
   - Definition: Computational models inspired by biological neural networks, consisting of interconnected nodes (neurons) that process and transmit information.
   - Key points:
     • Used in various machine learning tasks, including image recognition and natural language processing.
     • Employ backpropagation algorithm for training, adjusting weights to minimize error between predicted and actual outputs.

[Repeat this format for all 15 concepts]
    """
    for doc in text:
        try:
            response = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {"role": "system", "content": "You are a helpful teaching assistant. Your task is to analyze the given text, identify core concepts, and provide detailed explanations to help students understand these concepts."},
                    {"role": "user", "content":prompt.format(text=doc)}
                ],
                max_tokens=16384,
                n=1,
                temperature=0.5,
            )

            analysis = response.choices[0].message.content.strip()
            out.append(analysis)
        except Exception as e:
            return f"Error with OpenAI API: {str(e)}"
    return out

# Example usage
#result = analyze_text_for_students(cleaned_text, API_KEY)
#print(result)

In [30]:
analyzed = analyze_text_for_students(cleaned_text, api_key = API_KEY)
analyzed

["1. Text Classification:\n   - Definition: The process of assigning predefined categories to text data based on its content, enabling automated organization and analysis.\n   - Key points:\n     • Utilizes machine learning algorithms to improve accuracy and efficiency in categorizing large text datasets.\n     • Commonly applied in sentiment analysis, spam detection, and topic categorization.\n\n2. Language Models (LMs):\n   - Definition: Statistical models that predict the likelihood of a sequence of words, enabling tasks such as text generation and completion.\n   - Key points:\n     • Can be autoregressive, predicting the next token based on previous tokens, represented as \\(P(w_n | w_1, w_2, \\ldots, w_{n-1})\\).\n     • Serve as the backbone for applications in chatbots, translation, and content generation.\n\n3. Prompting:\n   - Definition: The technique of providing specific instructions or examples to a language model to guide its output for a particular task.\n   - Key point

In [31]:
analyzed

["1. Text Classification:\n   - Definition: The process of assigning predefined categories to text data based on its content, enabling automated organization and analysis.\n   - Key points:\n     • Utilizes machine learning algorithms to improve accuracy and efficiency in categorizing large text datasets.\n     • Commonly applied in sentiment analysis, spam detection, and topic categorization.\n\n2. Language Models (LMs):\n   - Definition: Statistical models that predict the likelihood of a sequence of words, enabling tasks such as text generation and completion.\n   - Key points:\n     • Can be autoregressive, predicting the next token based on previous tokens, represented as \\(P(w_n | w_1, w_2, \\ldots, w_{n-1})\\).\n     • Serve as the backbone for applications in chatbots, translation, and content generation.\n\n3. Prompting:\n   - Definition: The technique of providing specific instructions or examples to a language model to guide its output for a particular task.\n   - Key point

In [32]:
import re

def extract_numbered_list(texts):
    out = []
    for text in texts:
        pattern = r'(\d+)\.\s+([^:\n]+):\n\s*- Definition:\s*([^\n]+)\n\s*- Key points:\n\s*•\s*([^\n]+)\n\s*•\s*([^\n]+)'
        
        matches = re.findall(pattern, text, re.MULTILINE)
        
        result = ""
        for number, title, definition, point1, point2 in matches:
            result += f"{number}. {title}:\n"
            result += f"   - Definition: {definition}\n"
            result += f"   - Key points:\n"
            result += f"     • {point1}\n"
            result += f"     • {point2}\n\n"
        
        out.append(result.strip())
    return out

# Example usage:
# texts = ['Your long text string here...']
# summaries = extract_numbered_list(texts)
# for summary in summaries:
#     print(summary)

In [33]:
extracted = extract_numbered_list(analyzed)

In [35]:
empty_extracts = [extract for extract in extracted if not extract.strip()]

for empty_extract in empty_extracts:
    print(f"Empty extract found: '{empty_extract}'")

if not empty_extracts:
    print("No empty extracts found.")

No empty extracts found.


In [36]:
extracted[1]

'1. Text Classification:\n   - Definition: The process of assigning predefined categories to text documents based on their content using machine learning algorithms.\n   - Key points:\n     • Commonly used in spam detection, sentiment analysis, and topic categorization.\n     • Involves feature extraction and model training on labeled datasets.\n\n2. Machine Learning:\n   - Definition: A subset of artificial intelligence that enables systems to learn from data and improve their performance on tasks without explicit programming.\n   - Key points:\n     • Utilizes algorithms to identify patterns in data and make predictions or decisions.\n     • Can be supervised, unsupervised, or semi-supervised based on the availability of labeled data.\n\n3. Retrieval-Augmented Generation (RAG):\n   - Definition: A model that enhances text generation by retrieving relevant information from external sources to improve context and accuracy.\n   - Key points:\n     • Combines generative and retrieval-bas

In [51]:
# Regular expression pattern to extract titles and their content
def find_topics(texts):
        # Regular expression pattern to match numbered titles
    out = []
    
    for text in texts:
        pattern = r'^\d+\.\s(.+?):'

        # Find all matches in the text
        matches = re.findall(pattern, text, re.MULTILINE)
        out.append(matches)
    return out

topics = find_topics(extracted)

In [139]:
 # Create a dictionary from the matches
def text_to_dict(texts):
     # Regular expression pattern to extract titles and their content
    out = []
    for text in texts:
        pattern = r'(\d+\.\s*)(.+?):\s*((?:(?!^\d+\.).)+)'

        # Find all matches in the text
        matches = re.findall(pattern, text, re.MULTILINE | re.DOTALL)
        # Create a dictionary from the matches
        result_dict = {title.strip(): f"{number}{title.strip()}:" + "\n" + content.strip() 
                       for number, title, content in matches}
        
        out.append(result_dict)
        
    return out
list_of_dicts = text_to_dict(extracted)
print(list_of_dicts[0])

{'Text Classification': '1. Text Classification:\n- Definition: The process of assigning predefined categories to text data based on its content, enabling automated organization and analysis.\n   - Key points:\n     • Utilizes machine learning algorithms to improve accuracy and efficiency in categorizing large text datasets.\n     • Commonly applied in sentiment analysis, spam detection, and topic categorization.', 'Language Models (LMs)': '2. Language Models (LMs):\n- Definition: Statistical models that predict the likelihood of a sequence of words, enabling tasks such as text generation and completion.\n   - Key points:\n     • Can be autoregressive, predicting the next token based on previous tokens, represented as \\(P(w_n | w_1, w_2, \\ldots, w_{n-1})\\).\n     • Serve as the backbone for applications in chatbots, translation, and content generation.', 'Prompting': "3. Prompting:\n- Definition: The technique of providing specific instructions or examples to a language model to gui

In [140]:
def get_selected_values(list_of_dicts, selected_keys):
    return ["; ".join(str(d.get(key)) for key in selected_keys) for d in list_of_dicts]

In [141]:
selected_keys = topics[0]
test = get_selected_values(list_of_dicts,selected_keys)

In [None]:
#### from reportlab.lib import colors
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Paragraph, Frame, PageTemplate
from reportlab.lib.styles import getSampleStyleSheet

def create_columned_pdf(output_filename, text_content, num_columns=2):
    # Create document with zero margins
    doc = SimpleDocTemplate(output_filename, pagesize=letter, 
                            leftMargin=0, rightMargin=0, topMargin=0, bottomMargin=0)
    styles = getSampleStyleSheet()
    
    # Modify the 'Normal' style to set font size to 10
    styles['Normal'].fontSize = 5
    styles['Normal'].leading = 7  # Set leading to be slightly larger than font size
    styles['Normal'].spaceAfter = 0  # Remove space after paragraphs
    
    
    styles.add(ParagraphStyle(name='Highlight',
                              parent=styles['Normal'],
                              backColor=colors.yellow))
    
    # Split the text content into paragraphs
    paragraphs = []
    for content in text_content:
        paragraphs.extend([Paragraph(p, styles['Normal']) for p in content.split('\n\n')])
    
    # Create frames for columns (now using full page width)
    page_width, page_height = letter
    frame_width = page_width / num_columns
    frames = []
    for i in range(num_columns):
        x = i * frame_width
        frame = Frame(x, 0, frame_width, page_height, leftPadding=0, bottomPadding=0, rightPadding=0, topPadding=0)
        frames.append(frame)
    
    # Create a page template
    page_template = PageTemplate(frames=frames)
    doc.addPageTemplates(page_template)
    
    # Build the document
    doc.build(paragraphs)

create_columned_pdf('output.pdf', extracted[0])

In [16]:
 def process_paragraph(p):
        import re
        pattern = r'(\d+\.\s*[^:]+:)'
        match = re.findall(pattern, p)
        return match
process_paragraph(extracted[3])

['1. Parameter-Efficient Fine-Tuning (PEFT):',
 '2. Transfer Learning:',
 '5.\n• Helps in scenarios with limited labeled data for specific tasks.\n\n3. Sparse Fine-Tuning:',
 '4. Lottery Ticket Hypothesis:',
 '5. Prompt-Based Learning:',
 '6. Adapters:',
 '7. Low-Rank Adaptation (LoRA):',
 '8. Pruning:',
 '9. In-Context Learning:',
 '3.\n• Example:']

In [99]:
from reportlab.lib import colors
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Paragraph, Frame, PageTemplate
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
import re

def create_columned_pdf(output_filename, text_content, num_columns=2):
    doc = SimpleDocTemplate(output_filename, pagesize=letter, 
                            leftMargin=0, rightMargin=0, topMargin=0, bottomMargin=0)
    styles = getSampleStyleSheet()
    
    styles['Normal'].fontSize = 5
    styles['Normal'].leading = 7
    styles['Normal'].spaceAfter = 0
    
    styles.add(ParagraphStyle(name='Highlight',
                              parent=styles['Normal'],
                              backColor=colors.yellow))
    
    def process_paragraph(p, styles):
        elements = []
        pattern = r'^\d+\.\s*[^:]+:'
        matches = re.finditer(pattern, p, re.MULTILINE)
        last_end = 0
        
        for match in matches:
            start, end = match.span()
            
            if start > last_end:
                elements.append(Paragraph(p[last_end:start], styles['Normal']))
            
            highlighted_text = f'<span backColor="yellow">{p[start:end]}</span>'
            elements.append(Paragraph(highlighted_text, styles['Normal']))
            
            last_end = end
        
        if last_end < len(p):
            elements.append(Paragraph(p[last_end:], styles['Normal']))
        
        return elements
    
    elements = []
    for content in text_content:
        elements.extend(process_paragraph(content, styles))
    
    page_width, page_height = letter
    frame_width = page_width / num_columns
    frames = []
    for i in range(num_columns):
        x = i * frame_width
        frame = Frame(x, 0, frame_width, page_height, leftPadding=0, bottomPadding=0, rightPadding=0, topPadding=0)
        frames.append(frame)
    
    page_template = PageTemplate(frames=frames)
    doc.addPageTemplates(page_template)
    
    doc.build(elements)

create_columned_pdf('output.pdf', extracted[1])

In [101]:
extracted[3]

"1. Parameter-Efficient Fine-Tuning (PEFT):\n   - Definition: A technique in machine learning that allows for the fine-tuning of large models while only updating a small subset of parameters, improving efficiency.\n   - Key points:\n     • Reduces computational resources and time needed for model training on specific tasks.\n     • Maintains high performance similar to full fine-tuning by leveraging pre-trained model capabilities.\n\n2. Transfer Learning:\n   - Definition: A machine learning method where a model trained on one task is adapted for a different but related task, facilitating knowledge transfer.\n   - Key points:\n     • Enables faster training and improved performance on new tasks with limited data.\n     • Commonly used in natural language processing (NLP) and image recognition.\n\n3. Lottery Ticket Hypothesis:\n   - Definition: A theory suggesting that within a large neural network, there exist smaller subnetworks (winning tickets) that can be trained to achieve compara

In [143]:
from reportlab.lib import colors
from reportlab.lib.pagesizes import letter
from reportlab.platypus import BaseDocTemplate, Paragraph, Frame, PageTemplate, NextPageTemplate
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
import re

def clean_text(text):
    text = text.replace('&', '&amp;')
    text = text.replace('<', '&lt;')
    text = text.replace('>', '&gt;')
    text = text.replace('\\(', '')
    text = text.replace('\\)', '')
    text = text.replace('\\[', '')
    text = text.replace('\\]', '')
    return text

def create_columned_pdf(output_filename, text_content, num_columns=4):
    # Use BaseDocTemplate instead of SimpleDocTemplate
    doc = BaseDocTemplate(output_filename, pagesize=letter,
                         leftMargin=1, rightMargin=1, 
                         topMargin=1, bottomMargin=1)
    
    styles = getSampleStyleSheet()
    
    # Normal text style
    styles['Normal'].fontSize = 3
    styles['Normal'].leading = 4
    styles['Normal'].leftIndent = 0
    styles['Normal'].spaceBefore = 0
    styles['Normal'].spaceAfter = 0
    
    #highlighted style
    styles.add(ParagraphStyle(name='Highlight',
                            parent=styles['Normal'],
                            backColor=colors.yellow))
        
    # Title style
    styles.add(ParagraphStyle(
        name='CustomTitle',
        parent=styles['Normal'],
        fontSize=3,
        leading=4,
        leftIndent=0,
        bold=True,
        spaceBefore=0,
        spaceAfter=0
    ))
    
    def process_paragraph(p, styles):
        elements = []
        sections = p.split('\n\n')
        
        for section in sections:
            if not section.strip():
                continue
                
            match = re.match(r'(\d+\.\s*[^:]+):', section)
            if match:
                title = clean_text(match.group(1))
                highlighted_text = f'<span backColor="yellow">{title}:</span>'
                elements.append(Paragraph(highlighted_text, styles['Normal']))
                
                content = section[match.end():].strip()
                parts = content.split('- Key points:')
                
                if len(parts) > 1:
                    definition = clean_text(parts[0].replace('- Definition:', '').strip())
                    elements.append(Paragraph(f"<b>Definition:</b> {definition}", 
                                           styles['Normal']))
                    
                    elements.append(Paragraph("<b>Key points:</b>", 
                                           styles['Normal']))
                    
                    key_points = parts[1].strip().split('•')
                    for point in key_points:
                        if point.strip():
                            cleaned_point = clean_text(point.strip())
                            elements.append(Paragraph(f"• {cleaned_point}", 
                                                   styles['Normal']))
        return elements
    
    # Process content
    elements = []
    for content in text_content:
        if content.strip():
            elements.extend(process_paragraph(content, styles))
            # Add NextPageTemplate to maintain two-column layout
            elements.append(NextPageTemplate('TwoCol'))
    
    # Set up frames
    page_width, page_height = letter
    frame_width = (page_width - 20) / num_columns
    frames = []
    
    for i in range(num_columns):
        x = 10 + (i * frame_width)
        frame = Frame(x, 10,
                     frame_width, page_height - 20,
                     leftPadding=5, bottomPadding=5,
                     rightPadding=5, topPadding=5)
        frames.append(frame)
    
    # Create template and build
    template = PageTemplate(id='TwoCol', frames=frames)
    doc.addPageTemplates(template)
    doc.build(elements)

In [144]:
create_columned_pdf("output.pdf", test)

In [113]:
type(extracted)

list

In [114]:
type(test)

list

In [142]:
test

["1. Text Classification:\n- Definition: The process of assigning predefined categories to text data based on its content, enabling automated organization and analysis.\n   - Key points:\n     • Utilizes machine learning algorithms to improve accuracy and efficiency in categorizing large text datasets.\n     • Commonly applied in sentiment analysis, spam detection, and topic categorization.; 2. Language Models (LMs):\n- Definition: Statistical models that predict the likelihood of a sequence of words, enabling tasks such as text generation and completion.\n   - Key points:\n     • Can be autoregressive, predicting the next token based on previous tokens, represented as \\(P(w_n | w_1, w_2, \\ldots, w_{n-1})\\).\n     • Serve as the backbone for applications in chatbots, translation, and content generation.; 3. Prompting:\n- Definition: The technique of providing specific instructions or examples to a language model to guide its output for a particular task.\n   - Key points:\n     • Ca