In [7]:
!pip install requests beautifulsoup4 faiss-cpu sentence-transformers transformers


Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 27.5/27.5 MB 54.6 MB/s eta 0:00:00
Installing collected packages: faiss-cpu
Successfully installed faiss-cpu-1.9.0.post1


In [21]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

def generate_response(relevant_chunks, query, model_name='gpt2'):
    """
    Answer `query` with a GPT-2 style causal LM, conditioned on retrieved context.

    Args:
        relevant_chunks: Retrieved context as a single string.
        query: The question to answer.
        model_name: Model id passed to ``from_pretrained`` for both the model
            and the tokenizer (default ``'gpt2'``).

    Returns:
        Only the text generated after the prompt, with surrounding whitespace
        stripped.
    """
    import re  # local import: only needed for the sentence split below

    model = GPT2LMHeadModel.from_pretrained(model_name)
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)

    # Keep the context concise: collapse whitespace, drop sentences that occur
    # more than once (first occurrence wins), then cap the character count.
    # Deduplicating whole sentences keeps the text readable; deduplicating
    # individual words would strip every repeated "the"/"of" from the context.
    max_chunk_length = 300  # Maximum number of context characters in the prompt
    normalized = ' '.join(relevant_chunks.split())
    sentences = re.split(r'(?<=[.!?])\s+', normalized)
    context = ' '.join(dict.fromkeys(sentences))[:max_chunk_length]

    # Format prompt
    prompt = f"Context: {context}\n\nQuestion: {query}\nAnswer:"
    inputs = tokenizer(prompt, return_tensors='pt')
    input_ids = inputs["input_ids"]
    prompt_length = len(input_ids[0])  # number of prompt tokens

    # Generate response
    outputs = model.generate(
        input_ids,
        attention_mask=inputs["attention_mask"],
        max_new_tokens=100,  # Limit the number of tokens generated
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,  # Enable sampling for temperature to take effect
        temperature=0.7,  # Controls randomness
        top_k=50  # Filters to top-k tokens
    )

    # The returned sequence starts with the prompt tokens; decode only what
    # follows them. Searching the decoded text for "Answer:" would pick the
    # wrong position whenever the context or the query contains that marker.
    generated_ids = outputs[0][prompt_length:]
    final_answer = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

    return final_answer

# End-to-end run: ingest pages -> embed -> index -> retrieve -> generate.
# crawl_and_scrape, segment_text, create_embeddings, store_embeddings,
# load_index and query_embeddings are expected to be defined earlier in the
# notebook; they are not part of this cell.
urls = [
    "https://www.uchicago.edu/",
    "https://www.washington.edu/",
    "https://www.stanford.edu/",
    "https://und.edu/"
]

# Step 1: Data ingestion -- pool the chunks of every page into one flat list.
all_text_chunks = []
for url in urls:
    content = crawl_and_scrape(url)
    text_chunks = segment_text(content)
    all_text_chunks += text_chunks

# Embed every chunk.
embeddings = create_embeddings(all_text_chunks)

# Step 2: Persist the embeddings.
store_embeddings(embeddings)

# Step 3: Load the index, retrieve for one example query, and generate.
index = load_index()

query = "What is the main focus of research at Stanford University?"
relevant_chunk_indices = query_embeddings(query, index)

# NOTE(review): assumes query_embeddings yields positions into
# all_text_chunks as a flat iterable -- confirm against its definition.
relevant_chunks = ' '.join(all_text_chunks[i] for i in relevant_chunk_indices)
response = generate_response(relevant_chunks, query)

print(response)


The goal of our research is to inform the public about the best ways to learn from the world around us. We are focused on the idea of learning from others, where the world is not about sharing knowledge in one way or another; in the process building a new understanding of how life is made.

In order to do this, we need to educate students by working with local communities to better understand the way that our society works. We find it much easier to read a newspaper than to think


In [24]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

def generate_response(relevant_chunks, query, model_name='google/flan-t5-base'):
    """
    Answer `query` with a seq2seq model, conditioned on retrieved context.

    Args:
        relevant_chunks: Retrieved context as a single string.
        query: The question to answer.
        model_name: Model id passed to ``from_pretrained`` for both the model
            and the tokenizer (default ``'google/flan-t5-base'``).

    Returns:
        The decoded first sequence returned by ``generate``, with surrounding
        whitespace stripped.
    """
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Keep only the first `max_chunk_length` whitespace-separated words.
    max_chunk_length = 300  # Adjust to fit model limits
    context = ' '.join(relevant_chunks.split()[:max_chunk_length])

    # Refined prompt
    prompt = (
        f"Context: {context}\n\n"
        f"Question: {query}\n"
        f"Provide a concise and specific answer based on the context."
    )
    # NOTE(review): the prompt is cut at 512 tokens; if the context alone is
    # longer than that, the question at the end may be truncated away --
    # confirm the 300-word cap is tight enough for the scraped text.
    inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)

    # Deterministic beam search. Sampling-only knobs (temperature, top_k) are
    # deliberately not passed: without do_sample=True they have no effect and
    # only produce warnings.
    outputs = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=100,
        num_beams=3,  # Beam search for better coherence
        do_sample=False,
    )

    # Decode and clean output
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer.strip()

# Same end-to-end run as before, now answering with the seq2seq
# generate_response defined in this cell. The helper functions called below
# are expected to exist already in the notebook; they are not defined here.
urls = [
    "https://www.uchicago.edu/",
    "https://www.washington.edu/",
    "https://www.stanford.edu/",
    "https://und.edu/"
]

# Ingest: one fetch and one segmentation per URL, chunks pooled in URL order.
all_text_chunks = []
for url in urls:
    content = crawl_and_scrape(url)      # page content for this URL
    text_chunks = segment_text(content)  # that content split into chunks
    all_text_chunks += text_chunks

# Embed the chunks, store the embeddings, then load the index used for lookup.
embeddings = create_embeddings(all_text_chunks)
store_embeddings(embeddings)
index = load_index()

# Example query: retrieve supporting chunks and generate an answer from them.
query = "What is the main focus of research at Stanford University?"
relevant_chunk_indices = query_embeddings(query, index)
# NOTE(review): assumes the returned indices are a flat iterable of positions
# into all_text_chunks -- confirm against query_embeddings.
relevant_chunks = ' '.join(all_text_chunks[i] for i in relevant_chunk_indices)
response = generate_response(relevant_chunks, query)

print(response)


config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

effective clinical therapies


In [25]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Stand-alone sanity check: run the model on a small hand-written context to
# see how it answers when the context is clean and on-topic.
model_name = "google/flan-t5-base"  # Adjust based on the model you're using
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

prompt = (
    "Context: Effective clinical therapies have been developed for various medical "
    "conditions, including diabetes, cardiovascular diseases, and mental health disorders. "
    "These therapies are evidence-based and focus on improving patient outcomes through "
    "personalized approaches.\n\n"
    "Question: What are effective clinical therapies for cardiovascular diseases?\n"
    "Provide a concise and specific answer."
)

inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)

# Deterministic (non-sampling) decoding. temperature/top_k are not passed:
# they only apply when do_sample=True, so supplying them here had no effect
# on the output and merely triggered warnings.
outputs = model.generate(
    inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_new_tokens=100,
    do_sample=False,
)

answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(answer)


diabetes, cardiovascular diseases, and mental health disorders


In [36]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

def generate_response(relevant_chunks, query, model_name='google/flan-t5-base', instruction=None):
    """
    Generate an answer to `query` from the provided context with a seq2seq model.

    Args:
        relevant_chunks: Context text as a single string.
        query: The question to answer.
        model_name: Model id passed to ``from_pretrained`` for both the model
            and the tokenizer (default ``'google/flan-t5-base'``).
        instruction: Final line of the prompt telling the model how to answer.
            When ``None`` (the default), the original topic-specific
            instruction about genomic data and AI for cancer treatment
            prediction at Stanford University is used, so existing callers
            see no change. Pass your own text to reuse the function for other
            topics.

    Returns:
        The decoded first sequence returned by ``generate``, with surrounding
        whitespace stripped.
    """
    if instruction is None:
        instruction = (
            "Provide a highly focused, detailed, and specific answer on the research "
            "involving genomic data and AI for cancer treatment prediction at Stanford University."
        )

    # Load model and tokenizer
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Keep only the first `max_chunk_length` whitespace-separated words.
    max_chunk_length = 300  # Adjust to avoid token overflow
    context = ' '.join(relevant_chunks.split()[:max_chunk_length])

    # Prompt layout: context, question, then the answering instruction.
    prompt = (
        f"Context: {context}\n\n"
        f"Question: {query}\n"
        f"{instruction}"
    )

    inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)

    # Deterministic beam search. top_k is not passed: it is a sampling-only
    # setting and has no effect when do_sample=False.
    outputs = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=150,  # Lengthened slightly for detail
        num_beams=5,  # Beam search for better coherence
        no_repeat_ngram_size=2,  # Avoid repetition
        do_sample=False  # Use beam search for deterministic output
    )

    # Decode and clean the output
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer.strip()


# Hand-written context about AI-driven genomic cancer research; it is passed
# to generate_response directly instead of coming from retrieval.
relevant_chunks = """
Stanford University has developed AI models that integrate genomic data to predict cancer outcomes, focusing on tailoring treatments based on genetic markers. The program analyzes patient-specific mutations and expression profiles, allowing for personalized treatment plans. By utilizing advanced machine learning algorithms, these models predict tumor progression and recommend targeted therapies, improving patient response to treatment.
"""

# Question asked against that context.
query = "How is genomic data integrated with AI models to predict cancer outcomes and tailor treatments at Stanford University?"

# Generate the answer and show it.
response = generate_response(relevant_chunks, query)
print(response)


Stanford University has developed AI models that integrate genomic data to predict cancer outcomes, focusing on tailoring treatments based on genetic markers
