In [None]:
!pip install --upgrade --quiet openai langchain langchain_community

In [None]:
from sentence_transformers import SentenceTransformer, util
import torch

def get_first_three_terms(term_list):
    """Return the leading (at most three) entries of ``term_list``.

    Inputs with fewer than three entries are returned in full. The result is
    a new slice; ``term_list`` itself is left untouched.
    """
    core_terms = term_list[:3]
    return core_terms

def find_distant_terms(terms, n, threshold=0.3):
    """Return the terms from position ``n`` onward that are dissimilar to the core topic.

    The core topic is the mean embedding of the first three terms. Every term
    from the ``n``-th (1-based) to the last is compared with it by cosine
    similarity and kept when the similarity is strictly below ``threshold``.

    Args:
        terms: Sequence of term strings.
        n: 1-based position of the first candidate term. Values below 1 are
            treated as 1, so a negative slice start cannot silently restrict
            the candidates to the tail of the list.
        threshold: Terms whose similarity is strictly below this value are
            returned.

    Returns:
        List of distant terms in their original order; an empty list when
        ``terms`` has fewer than ``max(3, n)`` entries.

    Relies on the notebook-level ``sentence_model`` to compute embeddings.
    """
    if len(terms) < max(3, n):
        return []  # Not enough terms to proceed

    # Core-topic vector: average of the first three term embeddings.
    core_embeddings = sentence_model.encode(terms[:3], convert_to_tensor=True)
    core_vector = torch.mean(core_embeddings, dim=0)

    # Clamp the start index so n <= 0 means "from the first term".
    candidates = list(terms[max(n - 1, 0):])

    # Encode all candidates in one batch instead of one model call per term.
    candidate_embeddings = sentence_model.encode(candidates, convert_to_tensor=True)
    similarities = util.pytorch_cos_sim(core_vector, candidate_embeddings)[0].tolist()

    return [
        term
        for term, similarity in zip(candidates, similarities)
        if similarity < threshold
    ]

def find_most_distant_term(terms, n, threshold=0.4):
    """Return the single term from position ``n`` onward least similar to the core topic.

    The core topic is the mean embedding of the first three terms. Every term
    from the ``n``-th (1-based) to the last is compared with it by cosine
    similarity; the term with the lowest similarity is returned, provided that
    similarity is strictly below ``threshold``.

    Args:
        terms: Sequence of term strings.
        n: 1-based position of the first candidate term. Values below 1 are
            treated as 1, so a negative slice start cannot silently restrict
            the candidates to the tail of the list.
        threshold: A term only qualifies when its similarity is strictly below
            this value.

    Returns:
        The qualifying term with the lowest similarity (the earliest one on
        ties), or ``None`` when no term qualifies or ``terms`` has fewer than
        ``max(3, n)`` entries.

    Relies on the notebook-level ``sentence_model`` to compute embeddings.
    """
    if len(terms) < max(3, n):
        return None  # Not enough terms to proceed

    # Core-topic vector: average of the first three term embeddings.
    core_embeddings = sentence_model.encode(terms[:3], convert_to_tensor=True)
    core_vector = torch.mean(core_embeddings, dim=0)

    # Clamp the start index so n <= 0 means "from the first term".
    candidates = list(terms[max(n - 1, 0):])

    # Encode all candidates in one batch instead of one model call per term.
    candidate_embeddings = sentence_model.encode(candidates, convert_to_tensor=True)
    similarities = util.pytorch_cos_sim(core_vector, candidate_embeddings)[0].tolist()

    most_distant_term = None
    lowest_similarity = 1.0  # Initialize with the maximum possible similarity
    for term, similarity in zip(candidates, similarities):
        # Strict comparison keeps the earliest term when similarities tie.
        if similarity < threshold and similarity < lowest_similarity:
            lowest_similarity = similarity
            most_distant_term = term

    return most_distant_term

# 1-based position of the first term compared against the core topic (4 -> start at the 4th term)
nth_term = 4

# Derive the per-topic columns from the 'Representation' column.
weak_signal_df['distant_terms'] = weak_signal_df['Representation'].apply(
    find_distant_terms, args=(nth_term,)
)
weak_signal_df['most_distant_terms'] = weak_signal_df['Representation'].apply(
    find_most_distant_term, args=(nth_term,)
)
weak_signal_df['core_topic'] = weak_signal_df['Representation'].apply(
    lambda representation: get_first_three_terms(representation)
)

# Show the resulting DataFrame
print(weak_signal_df)

In [None]:
from langchain import PromptTemplate, OpenAI

# Instruction prompt (with one worked example) for the topic/risk interpretation step.
# infer_text2 appends the concrete "topic:" and "risk:" lines to this text before
# sending it to the chat model as the user message.
prompt_template = """
You are receiving two lists of terms. The first list represents a topic in the field of food safety. The second list contains terms that might pose a risk to that topic.
Based on your expert knowledge in the food safety domain your task is to infer what kind of food safety risk might occur based on the given data and produce a short statement to illustrate that risk.
Example: Topic: ['cruzi', 'chagas', 'trypanosoma'];
Risk: ['attalea'];
Output: Deforestation effects on Attalea palms and their resident Rhodnius, vectors of Chagas disease, in eastern Amazonia.
"""

In [None]:
# Interpret topic representations: instruction prompt for inferring an emerging
# risk from a topic representation alone (no separate list of risk terms).


from langchain import PromptTemplate, OpenAI

# NOTE(review): prompt_template2 is not referenced by any cell visible here;
# presumably it is consumed by a helper defined elsewhere - confirm.
prompt_template2 = """
You are receiving the following topic representation within the realm of food contamination.
Based on this topic representation and your expert knowledge in the food safety domain your task is to infer an emerging food safety risk.
If you can not infer an emerging food safety risk, just say: no issue found.
"""

In [None]:
import openai
import os
import json
import pandas as pd
from openai import OpenAI

# Sentinel meaning "risk was not passed; fall back to the notebook-level variable".
_RISK_FROM_NAMESPACE = object()


# Function to generate text based on topic
def infer_text2(label, risk=_RISK_FROM_NAMESPACE):
    """Ask the chat model for a short risk statement about a topic.

    Args:
        label: Core topic terms; interpolated into the prompt's ``topic:`` line.
        risk: Terms regarded as a potential risk to the topic; interpolated
            into the prompt's ``risk:`` line. When omitted, the notebook-level
            ``risk`` variable is used, so existing one-argument calls keep
            working.

    Returns:
        The message content of the first completion choice.

    Raises:
        NameError: If ``risk`` is omitted and no notebook-level ``risk`` exists.

    Relies on the notebook-level ``prompt_template`` and ``client``.
    """
    if risk is _RISK_FROM_NAMESPACE:
        # Backward compatibility with callers that set `risk` as a notebook
        # variable right before calling infer_text2(topic).
        try:
            risk = globals()["risk"]
        except KeyError:
            raise NameError("name 'risk' is not defined") from None

    # Use the argument itself instead of a notebook-level `topic` variable.
    prompt = prompt_template + f"\ntopic: {label}\n" + f"\nrisk: {risk}\n"
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
                {"role": "system", "content": "You are an expert in food safety."},
                {"role": "user", "content": prompt}
            ],
        temperature=0.6,
        )
    return response.choices[0].message.content

In [None]:
# Collect one generated interpretation (a string) per row of weak_signal_df
results = []

# For every row, ask the chat model to interpret the core topic together with its distant terms.
# NOTE: `risk` is not passed in the infer_text2 call below; the function picks it up from
# the notebook namespace, so it has to be assigned here before the call.
for index, row in weak_signal_df.iterrows():
    topic = row['core_topic']
    risk = row['distant_terms']
    text = infer_text2(topic)
    results.append(text)


# NOTE(review): the column name is spelled 'interpretaton' (sic). It is left unchanged here
# because it becomes part of the DataFrame/Excel schema - confirm before renaming.
weak_signal_df['odd_term_interpretaton'] = results

In [None]:
# Write the annotated weak-signal table to an Excel file (hard-coded absolute path; row index omitted).
weak_signal_df.to_excel('/home/jupyter/WSM/living_lab/results_final.xlsx', index=False)

In [None]:
def _representation_key(representation):
    """Return a hashable stand-in for a representation (unhashable sequences become tuples)."""
    try:
        hash(representation)
    except TypeError:
        return tuple(representation)
    return representation


# Flag rows of random_samples whose 'Representation' also occurs in weak_signal_df.
# Cells are compared through hashable keys because `Series.isin` hashes its values
# and raises TypeError when the cells are lists.
known_representations = {
    _representation_key(rep) for rep in weak_signal_df['Representation']
}
duplicates = random_samples['Representation'].map(
    lambda rep: _representation_key(rep) in known_representations
).astype(bool)

# If any duplicates are found, duplicates will contain True values
has_duplicates = duplicates.any()

# Display the result
if has_duplicates:
    print("There are identical rows in the 'Representation' column.")
else:
    print("No identical rows in the 'Representation' column.")

In [None]:
# Collect one generated interpretation per row of random_samples
results = []

# For every sampled row, pass its full 'Representation' value to infer_text4.
# NOTE(review): infer_text4 is not defined in the cells visible here - confirm it is
# defined before this cell runs.
for index, row in random_samples.iterrows():
    topic = row['Representation']
    text = infer_text4(topic)
    results.append(text)

random_samples['interpretation'] = results

In [None]:
# Write the interpreted random samples to an Excel file (hard-coded absolute path; row index omitted).
random_samples.to_excel('/home/jupyter/WSM/living_lab/random_results.xlsx', index=False)