In [13]:
from langchain_community.llms import Ollama
import numpy as np
# Ollama model tag used for every generation call in this notebook.
model = "phi4:latest"

# Build the Ollama-backed LLM client with a temperature of 0.8.
llm = Ollama(
    model=model,
    temperature=0.8,
)




In [14]:
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnableMap, RunnablePassthrough
from langchain_core.output_parsers import JsonOutputParser




# Parses the raw LLM text into a Python object (expected: a dict).
parser = JsonOutputParser()


# Two-context prompt: asks for a question/answer pair grounded in Context A and a
# second question grounded in Context B only. The JSON keys are pinned to English
# because the code that consumes the result looks them up by these exact names;
# only the values are requested in Persian.
prompt_1 = PromptTemplate.from_template("""
You are a helpful assistant.

There are two contexts provided below.

Context A:
{context_a}

Context B:
{context_b}

Your task:
1. Generate one question that is directly relevant to Context A and provide an accurate response based on Context A.
2. Generate another question that is not related to Context A but is relevant to Context B.

Important: Write every question and answer (the JSON values) in Persian (Farsi) language.

Return the result strictly in the following JSON format. Keep the JSON keys exactly as written below, in English; only the values are in Persian:

{{
  "relevant_question_to_context_a": "...",
  "answer_based_on_context_a": "...",
  "irrelevant_question_to_context_a_but_relevant_to_context_b": "..."
}}
""")


# Define chain.
# Invoke with a dict holding "context_a" and "context_b". The dict is fed straight
# into the prompt so each placeholder receives its own value. Wrapping the input in
# a RunnableMap of RunnablePassthrough() would hand the *whole* input dict to every
# key, and both placeholders would then render as the dict's repr.
chain_relevent = prompt_1 | llm | parser




In [15]:
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnableMap, RunnablePassthrough
from langchain_core.output_parsers import JsonOutputParser

# Parses the raw LLM text into a Python object (expected: a dict).
parser = JsonOutputParser()

# Single-context prompt: asks for a question/answer pair grounded in Context A plus
# a general question unrelated to it. The JSON keys are pinned to English because
# the code that consumes the result looks them up by these exact names; only the
# values are requested in Persian.
prompt_2 = PromptTemplate.from_template("""
You are a helpful assistant.

A single context is provided below.

Context A:
{context_a}

Your task:
1. Generate one question that is directly relevant to Context A and provide an accurate answer based only on Context A.
2. Generate a general question about anything in the world that is completely unrelated to Context A.

Important: Write every question and answer (the JSON values) in Persian (Farsi) language.

Return the result strictly in the following JSON format. Keep the JSON keys exactly as written below, in English; only the values are in Persian:

{{
  "relevant_question_to_context_a": "...",
  "answer_based_on_context_a": "...",
  "general_unrelated_question": "..."
}}
""")

# Define the processing chain.
# Invoke with a dict holding "context_a". The dict is fed straight into the prompt;
# a RunnableMap of RunnablePassthrough() would instead substitute the whole input
# dict (its repr) for {context_a}.
chain_general = prompt_2 | llm | parser


In [16]:
import pandas as pd
# Read the spreadsheet into a NumPy array; rows are accessed positionally below.
data = pd.read_excel("MEC-data_question-V0.7.xlsx").to_numpy()

# One context string per row: the first two columns joined by a single space.
dataset = [row[0] + " " + row[1] for row in data]

In [17]:
# Single pass over every context, producing at most one triplet per context:
#   anchor   = answer grounded in the context
#   positive = question relevant to the context
#   negetive = question that is NOT relevant to the context
from langchain_core.exceptions import OutputParserException

final_dataset = []

for iteration in range(1):
    for i in range(len(dataset)):
        decision = np.random.rand()

        # About 70% of samples draw their negative from a second, different context.
        # That needs at least two contexts; with fewer, use the general chain
        # instead of searching forever for an index other than i.
        if decision <= 0.70 and len(dataset) > 1:
            # Offset i by 1..len-1 and wrap around: uniform over every index except
            # i, with no rejection loop.
            rand = (i + np.random.randint(1, len(dataset))) % len(dataset)
            chain = chain_relevent
            payload = {
                "context_a": dataset[i],
                "context_b": dataset[rand],
            }
            negative_key = "irrelevant_question_to_context_a_but_relevant_to_context_b"
            mode = "relevent"
        else:
            chain = chain_general
            payload = {
                "context_a": dataset[i],
            }
            negative_key = "general_unrelated_question"
            mode = "general"

        # The model does not always return valid JSON; skip that sample (and say so)
        # rather than aborting the run and losing the rows collected so far.
        try:
            result = chain.invoke(payload)
        except OutputParserException as exc:
            print(f"Skipping context {i}: model output could not be parsed as JSON ({exc})")
            continue

        # Valid JSON that is not an object (e.g. a list) has no .get(); skip it too.
        if not isinstance(result, dict):
            print(f"Skipping context {i}: expected a JSON object, got {type(result).__name__}")
            continue

        final_dataset.append({
            "anchor": result.get("answer_based_on_context_a"),
            "positive": result.get("relevant_question_to_context_a"),
            "negetive": result.get(negative_key),
            "mode": mode
        })


In [None]:
import time
import numpy as np

# Repeated passes over every context, producing at most one triplet per context
# per pass:
#   anchor   = answer grounded in the context
#   positive = question relevant to the context
#   negetive = question that is NOT relevant to the context
from langchain_core.exceptions import OutputParserException

N_ITERATIONS = 40          # number of full passes over the contexts
PAUSE_SECONDS = 1800       # 30-minute pause between consecutive passes
RELEVANT_PROBABILITY = 0.70  # share of samples whose negative comes from another context

# Starts from an empty list: rows collected by any earlier run are discarded.
final_dataset = []

for iteration in range(N_ITERATIONS):
    for i in range(len(dataset)):
        decision = np.random.rand()

        # A negative taken from another context needs at least two contexts; with
        # fewer, use the general chain instead of searching forever for an index
        # other than i.
        if decision <= RELEVANT_PROBABILITY and len(dataset) > 1:
            # Offset i by 1..len-1 and wrap around: uniform over every index except
            # i, with no rejection loop.
            rand = (i + np.random.randint(1, len(dataset))) % len(dataset)
            chain = chain_relevent
            payload = {
                "context_a": dataset[i],
                "context_b": dataset[rand],
            }
            negative_key = "irrelevant_question_to_context_a_but_relevant_to_context_b"
            mode = "relevent"
        else:
            chain = chain_general
            payload = {
                "context_a": dataset[i],
            }
            negative_key = "general_unrelated_question"
            mode = "general"

        # The model does not always return valid JSON; skip that sample (and say so)
        # rather than aborting a multi-hour run and losing the rows collected so far.
        try:
            result = chain.invoke(payload)
        except OutputParserException as exc:
            print(f"Skipping context {i}: model output could not be parsed as JSON ({exc})")
            continue

        # Valid JSON that is not an object (e.g. a list) has no .get(); skip it too.
        if not isinstance(result, dict):
            print(f"Skipping context {i}: expected a JSON object, got {type(result).__name__}")
            continue

        final_dataset.append({
            "anchor": result.get("answer_based_on_context_a"),
            "positive": result.get("relevant_question_to_context_a"),
            "negetive": result.get(negative_key),
            "mode": mode
        })

    # Pause between passes only; there is nothing to wait for after the last one.
    if iteration + 1 < N_ITERATIONS:
        print(f"Iteration {iteration + 1} complete. Waiting 30 minutes before the next iteration...")
        time.sleep(PAUSE_SECONDS)
    else:
        print(f"Iteration {iteration + 1} complete.")


In [None]:
from datasets import Dataset

# Build a Hugging Face Dataset from the generated rows and write it to disk.
# It gets its own name on purpose: `dataset` already holds the list of context
# strings that the generation cells iterate over, and rebinding it here would make
# any re-run of those cells operate on the generated rows instead.
hf_dataset = Dataset.from_list(final_dataset)

hf_dataset.save_to_disk("APN_dataset_new")


Saving the dataset (1/1 shards): 100%|██████████| 26/26 [00:00<00:00, 14992.01 examples/s]


In [22]:
# Tabulate the generated rows (one row per sample, one column per dict key).
df = pd.DataFrame(data=final_dataset)

# Write the table to an Excel workbook, leaving out the index column.
df.to_excel(excel_writer="output.xlsx", index=False)