In [1]:
from transformers import pipeline
from datasets import load_dataset
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load the question answering pipeline with a model fine-tuned for question answering
question_answerer = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")

# Load the dataset
dataset = load_dataset("Open-Orca/OpenOrca")

# Extract contexts and system prompts and turn them into a DataFrame
contexts = dataset["train"]["response"]
system_prompts = dataset["train"]["system_prompt"]
df = pd.DataFrame({'context': contexts, 'system_prompt': system_prompts})

# Define your question
question = "What are thermal mass systems?"

# Filter for relevant system prompts
def is_relevant_prompt(prompt, keywords):
    return any(keyword.lower() in prompt.lower() for keyword in keywords)

# Define keywords to identify relevant system prompts
prompt_keywords = ["thermal mass system", "heat storage", "thermal energy storage"]

# Filter system prompts for relevance
relevant_prompts = [prompt for prompt in system_prompts if is_relevant_prompt(prompt, prompt_keywords)]

# Print the selected system prompts for debugging
print("Relevant system prompts:")
for i, prompt in enumerate(relevant_prompts[:10]):  # Print only the top 10 for brevity
    print(f"System Prompt {i+1}: {prompt}\n")

# Combine relevant system prompts with contexts
df['combined'] = df.apply(lambda row: ' '.join([row['system_prompt'], row['context']]) 
                          if row['system_prompt'] in relevant_prompts else row['context'], axis=1)

# Retrieve relevant contexts using TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=50000)
tfidf_matrix = tfidf_vectorizer.fit_transform(df['combined'])

# Transform the question into the same vector space
question_vector = tfidf_vectorizer.transform([question])

# Compute cosine similarity between the question and all contexts
cosine_similarities = cosine_similarity(question_vector, tfidf_matrix).flatten()

# Set a high threshold for cosine similarity
threshold = 0.1
top_n_indices = np.where(cosine_similarities > threshold)[0]
top_contexts = df.iloc[top_n_indices]['combined'].drop_duplicates().tolist()

# Filter contexts for relevance and length
def is_relevant_context(context, keywords):
    return any(keyword.lower() in context.lower() for keyword in keywords)

# Define more specific keywords for context filtering
context_keywords = ["thermal mass system", "heat storage", "thermal energy storage"]

# Filter for relevant contexts
relevant_contexts = [context for context in top_contexts if is_relevant_context(context, context_keywords)]

# Print the selected contexts for debugging
print("Relevant contexts:")
for i, context in enumerate(relevant_contexts[:10]):  # Print only the top 10 for brevity
    print(f"Context {i+1}: {context}\n")

# Concatenate the top relevant contexts into a single string
combined_contexts = " ".join(relevant_contexts)

# Create a detailed system prompt for the AI
system_prompt = "You are an AI assistant that helps people find information. Provide a detailed answer so the user doesn’t need to search outside to understand the answer."

# Combine the system prompt with the combined contexts
final_context = system_prompt + " " + combined_contexts

# Perform the question answering on the combined contexts
detailed_result = question_answerer(question=question, context=final_context)

# Print the result for debugging
print("QA Result:", detailed_result)

# Extract the answer span from the context using the provided indices
answer_start = detailed_result['start']
answer_end = detailed_result['end']
answer = final_context[answer_start:answer_end].strip()

# Print the detailed answer
print("Detailed Answer:", answer)



Relevant system prompts:
Relevant contexts:
Context 1: Yes, the question is answered satisfactorily. The answer provided highlights a key function of a thermal mass system, which is to store solar energy in the form of heat at useful temperatures for daily or interseasonal durations.

Context 2: Title: "Harnessing Solar Energy with Thermal Mass Systems: Maximizing Efficiency and Reducing Demand Through High Specific Heat Capacity Materials"

Context 3: Among the options provided, air does not accept heat very well as it is a poor conductor of heat. Now, let's examine each of the options to better understand their thermal conductivity and ability to accept and conduct heat.

1. Corn - Being a plant-based material, corn can absorb some heat, but it is not a particularly good conductor. Its thermal conductivity depends on several factors, such as moisture content and density. However, when compared to metals or other solid materials, it does not conduct heat very well.

2. Air - Air is an

In [2]:
from transformers import pipeline
from datasets import load_dataset
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load the question answering pipeline with a model fine-tuned for question answering
question_answerer = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")

# Load the dataset
dataset = load_dataset("Open-Orca/OpenOrca")

# Extract contexts and turn them into a DataFrame
contexts = dataset["train"]["response"]
df = pd.DataFrame({'context': contexts})

# Define your question
question = "What are thermal mass systems?"

# Retrieve relevant contexts using TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=50000)
tfidf_matrix = tfidf_vectorizer.fit_transform(df['context'])

# Transform the question into the same vector space
question_vector = tfidf_vectorizer.transform([question])

# Compute cosine similarity between the question and all contexts
cosine_similarities = cosine_similarity(question_vector, tfidf_matrix).flatten()

# Set a high threshold for cosine similarity
threshold = 0.1
top_n_indices = np.where(cosine_similarities > threshold)[0]
top_contexts = df.iloc[top_n_indices]['context'].drop_duplicates().tolist()

# Filter contexts for relevance and length
def is_relevant_context(context, keywords):
    return any(keyword.lower() in context.lower() for keyword in keywords)

# Define more specific keywords
keywords = ["thermal mass system", "heat storage", "thermal energy storage"]

# Filter for relevant contexts
relevant_contexts = [context for context in top_contexts if is_relevant_context(context, keywords)]

# Print the selected contexts for debugging
print("Relevant contexts:")
for i, context in enumerate(relevant_contexts[:10]):  # Print only the top 10 for brevity
    print(f"Context {i+1}: {context}\n")

# Concatenate the top relevant contexts into a single string
combined_contexts = " ".join(relevant_contexts)

# Perform the question answering on the combined contexts
result = question_answerer(question=question, context=combined_contexts)

# Print the result for debugging
print("QA Result:", result)

# Extract the answer span from the context using the provided indices
answer_start = result['start']
answer_end = result['end']
answer = combined_contexts[answer_start:answer_end].strip()

# Print the answer
print("Answer:", answer)


Relevant contexts:
Context 1: Yes, the question is answered satisfactorily. The answer provided highlights a key function of a thermal mass system, which is to store solar energy in the form of heat at useful temperatures for daily or interseasonal durations.

Context 2: Title: "Harnessing Solar Energy with Thermal Mass Systems: Maximizing Efficiency and Reducing Demand Through High Specific Heat Capacity Materials"

Context 3: Among the options provided, air does not accept heat very well as it is a poor conductor of heat. Now, let's examine each of the options to better understand their thermal conductivity and ability to accept and conduct heat.

1. Corn - Being a plant-based material, corn can absorb some heat, but it is not a particularly good conductor. Its thermal conductivity depends on several factors, such as moisture content and density. However, when compared to metals or other solid materials, it does not conduct heat very well.

2. Air - Air is an insulator and a poor con