In [19]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [20]:
!  pip install groq pandas gradio



In [21]:
import groq
import os
import re
import json
import pandas as pd
import gradio as gr
from typing import Dict, List, Optional
from dataclasses import dataclass, asdict
import tempfile

In [22]:
import requests  # Make sure this is properly imported


In [23]:
import os

# Set the GROQ_API_KEY environment variable
os.environ['GROQ_API_KEY'] = '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'

# Verify it's set
print(os.getenv('GROQ_API_KEY'))

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


In [24]:
# Groq API setup
GROQ_API_KEY = "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"
GROQ_API_URL = "https://api.groq.com/openai/v1/chat/completions"


In [25]:
def extract_authors(text):
    """
    Simplified and more robust function to extract author names from scientific articles.
    """
    # Try to extract authors with titles first
    authors = []
    
    # Look for patterns like "Dr. Name" or "Prof. Name"
    title_name_pattern = r'(Dr\.|Prof\.|Professor|Doctor)\s+([A-Z][a-zA-Z\-]+(?:\s+[A-Z][a-zA-Z\-]+)*)'
    title_matches = re.finditer(title_name_pattern, text)
    
    for match in title_matches:
        title = match.group(1)
        name = match.group(2)
        authors.append(f"{title} {name}")
    
    # If we found authors with titles, return them
    if authors:
        return ", ".join(authors)
    
    # Otherwise try more general patterns
    general_patterns = [
        r'by\s+([\w\s\.,]+?)(?:and|&|\.|,|\(|\n)',
        r'author[s]?:?\s+([\w\s\.,]+?)(?:and|&|\.|,|\(|\n)',
        r'([\w\s\.]+? et al\.)',
    ]
    
    for pattern in general_patterns:
        matches = re.search(pattern, text, re.IGNORECASE)
        if matches:
            return matches.group(1).strip()
    
    # If still no authors found, try to extract names that look like they could be authors
    # This is a fallback approach for the specific example
    # if "Vargas" in text and "Takahashi" in text:
    #     return "Dr. Eliza Vargas and Prof. Y. Takahashi"
    
    return "Authors could not be automatically extracted."

In [26]:
def extract_authors_with_llm(text):
    """
    Use the LLM to extract authors as a backup method.
    """
    prompt = f"""
    Extract ONLY the author names from this scientific text. Return ONLY the names without any additional text.
    If no authors are found, respond with "Authors not found".

    Text: {text}
    """
    
    headers = {
        "Authorization": f"Bearer {GROQ_API_KEY}",
        "Content-Type": "application/json"
    }
    
    payload = {
        "model": "llama3-70b-8192",
        "messages": [
            {"role": "system", "content": "You are a precise information extraction tool."},
            {"role": "user", "content": prompt}
        ],
        "temperature": 0.2,
        "max_tokens": 100
    }
    
    try:
        response = requests.post(GROQ_API_URL, headers=headers, json=payload)
        response.raise_for_status()
        
        response_data = response.json()
        extracted_authors = response_data["choices"][0]["message"]["content"].strip()
        
        return extracted_authors
    except Exception as e:
        return f"Error in LLM extraction: {str(e)}"

In [27]:
def generate_summaries_with_groq(article_text):
    """
    Generate summaries for three different audiences using the Groq API.
    Also extracts author names from the article.
    """
    # Try regex-based extraction first
    authors = extract_authors(article_text)
    
    # If regex fails or returns a generic message, try LLM-based extraction
    if authors.startswith("Authors could not") or authors.startswith("Error"):
        authors = extract_authors_with_llm(article_text)
    
    # The prompt template for generating the summaries for scientific articles
    prompt = f"""
    Here is a scientific article that needs to be summarized for three different audiences:

    {article_text}

    Please follow these instructions carefully:
    
    1. Create a summary for an EXPERT IN THE FIELD:
       - Use all relevant technical terminology
       - Maintain scientific precision
       - Focus on novel contributions and methodological details
       - Keep advanced concepts intact
       - Assume deep domain knowledge
    
    2. Create a summary for a HIGH SCHOOL STUDENT:
       - Maintain necessary technical terms but provide clear explanations
       - Use accessible language and analogies where helpful
       - Explain complex concepts step-by-step
       - Focus on the educational value and fundamental scientific principles
       - Assume basic science education
    
    3. Create a summary for a GENERAL ADULT READER:
       - Use everyday language with minimal jargon
       - Only include essential technical terms with simple explanations
       - Focus on the significance and real-world implications
       - Make the core message accessible to someone with no background in the field
       - Use analogies to familiar concepts when possible
    
    Format your response with clear headings for each audience type.
    """
    
    headers = {
        "Authorization": f"Bearer {GROQ_API_KEY}",
        "Content-Type": "application/json"
    }
    
    payload = {
        "model": "llama3-70b-8192",
        "messages": [
            {"role": "system", "content": "You are a professional science communicator skilled at adapting complex scientific content for different audiences while preserving essential information."},
            {"role": "user", "content": prompt}
        ],
        "temperature": 0.5,
        "max_tokens": 2000
    }
    
    try:
        response = requests.post(GROQ_API_URL, headers=headers, json=payload)
        response.raise_for_status()
        
        response_data = response.json()
        summary_text = response_data["choices"][0]["message"]["content"]
        
        return authors, summary_text
    
    except Exception as e:
        error_message = f"An error occurred while generating summaries: {str(e)}"
        return authors, error_message


In [28]:
def process_article(article_text, audience_focus=None):
    """
    Process the article and return the results.
    """
    if not article_text or len(article_text.strip()) < 50:
        return "Please provide a longer article text (at least 50 characters).", ""
    
    authors, summaries = generate_summaries_with_groq(article_text)
    
    # If an audience focus is selected, highlight that section
    if audience_focus and audience_focus != "All Audiences":
        # Extract the specific audience section using regex
        if audience_focus == "Expert":
            pattern = r"(.*?EXPERT IN THE FIELD.*?)((?=.*?HIGH SCHOOL STUDENT)|$)"
        elif audience_focus == "High School":
            pattern = r"(.*?HIGH SCHOOL STUDENT.*?)((?=.*?GENERAL ADULT READER)|$)"
        elif audience_focus == "General Adult":
            pattern = r"(.*?GENERAL ADULT READER.*?$)"
        
        audience_match = re.search(pattern, summaries, re.DOTALL | re.IGNORECASE)
        if audience_match:
            focused_summary = audience_match.group(1).strip()
            summaries = f"**FOCUSED SUMMARY FOR {audience_focus.upper()}:**\n\n{focused_summary}\n\n---\n\nFull summaries are available by selecting 'All Audiences'."
    
    return authors, summaries

In [29]:
# Create a more interactive Gradio interface
with gr.Blocks(title="Article Summarizer") as app:
    gr.Markdown("# 🧪 Article Summarizer")
    gr.Markdown("This tool analyzes articles, extracts author names, and creates tailored summaries for different audiences.")
    
    with gr.Row():
        with gr.Column(scale=2):
            article_input = gr.Textbox(
                lines=12, 
                label="Paste your article here", 
                placeholder="Paste the full text of the article here...",
                value=""
            )
            
            with gr.Row():
                audience_dropdown = gr.Dropdown(
                    choices=["All Audiences", "Expert", "High School", "General Adult"],
                    value="All Audiences",
                    label="Focus on specific audience (optional)"
                )
                submit_btn = gr.Button("Generate Summaries", variant="primary")
            
            with gr.Accordion("Advanced Options", open=False):
                gr.Markdown("""
                * The **Expert** summary focuses on technical precision and advanced concepts
                * The **High School** summary explains technical terms clearly for educational use
                * The **General Adult** summary uses everyday language for accessibility
                """)
        
        with gr.Column(scale=3):
            authors_output = gr.Textbox(label="📝 Extracted Authors")
            
            with gr.Tabs():
                with gr.TabItem("Summaries"):
                    summaries_output = gr.Markdown(label="Summaries for Different Audiences")
                
                with gr.TabItem("About"):
                    gr.Markdown("""
                    ## How this works
                    
                    This tool uses advanced natural language processing to:
                    
                    1. **Extract authors** from scientific articles using pattern recognition and AI assistance
                    2. **Generate tailored summaries** for three different audience types:
                       - **Experts**: Preserving technical depth and precision
                       - **High School Students**: Educational with clear explanations
                       - **General Adults**: Accessible with minimal jargon
                    
                    The application uses Groq's LLM API with the llama3-70b-8192 model.
                    
                    ### Sample Article
                    A sample neutrino physics article is provided. Try it out or paste your own!
                    """)
    
    # Set up the interactive elements
    submit_btn.click(
        fn=process_article,
        inputs=[article_input, audience_dropdown],
        outputs=[authors_output, summaries_output]
    )
    


In [30]:
# Launch the Gradio app
if __name__ == "__main__":
    app.launch()

* Running on local URL:  http://127.0.0.1:7862
It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

* Running on public URL: https://28d92a61a6e3b8f588.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
