In [None]:
!pip install --upgrade --quiet openai langchain langchain_community bertopic

In [9]:
import pandas as pd
# Load the weak-signal table from CSV; later cells read its 'Representation' column.
weak_signal_df = pd.read_csv('/home/jupyter/WSM/data/weak_signal_df.csv')
# Bare last expression so the notebook renders the frame as the cell output.
weak_signal_df

Unnamed: 0,Name,dov,Topic,Count,Representation,Representative_Docs
0,582_foragers_social_collective_colony,0.004269,582,5,"['foragers', 'social', 'collective', 'colony',...","[""Foraging is a result of innate and acquired ..."
1,700_pregnancy_hypospadias_glucose_discourses,0.004397,700,4,"['pregnancy', 'hypospadias', 'glucose', 'disco...",['Overweight and obesity pre pregnancy or duri...
2,924_microscopically_bioinformaticsenabled_libr...,0.008287,924,3,"['microscopically', 'bioinformaticsenabled', '...",['Since the advent of the use of matrix-assist...
3,678_pgamended_peppers_ferralsols_moderately,0.005038,678,4,"['pgamended', 'peppers', 'ferralsols', 'modera...",['Lead (Pb) contamination of soil poses severe...
4,482_afb1_aflatoxin_indonesia_cancer,0.004123,482,6,"['afb1', 'aflatoxin', 'indonesia', 'cancer', '...",['This mini review article described the expos...
5,442_lenses_device_maude_sentinel,0.004003,442,6,"['lenses', 'device', 'maude', 'sentinel', 'str...",['All contact lenses (corrective/noncorrective...
6,453_sanguineus_sachet_salivary_am,0.00516,453,6,"['sanguineus', 'sachet', 'salivary', 'am', 'iv...",['The purpose of this study was to develop an ...
7,847_buttonwood_site_stem_cadmium,0.006593,847,3,"['buttonwood', 'site', 'stem', 'cadmium', 'lea...",['Soil contaminated with cadmium presents a po...
8,690_sfr_bottles_asepticuht_1011,0.004046,690,4,"['sfr', 'bottles', 'asepticuht', '1011', 'stea...",['Aseptic ultra-high-temperature (UHT)-type pr...
9,656_timetemperature_git_options_simulated,0.005065,656,4,"['timetemperature', 'git', 'options', 'simulat...",['Even though a plethora of barriers are emplo...


In [11]:
import getpass
import os
import openai
import json
import pandas as pd

# Prompt for the key (input hidden) only when the environment variable is
# missing or empty; an already-exported key is left untouched.
if not os.getenv("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")

Enter your OpenAI API key:  ········


In [24]:
from langchain import PromptTemplate, OpenAI

# Instruction text for the single-list inference; infer_text(topic) appends a
# "topic: ..." line to it before sending it as the user message.
# NOTE: a later cell rebinds `prompt_template` to a different (two-list) prompt,
# so re-running the single-list inference after that cell picks up the other text.
prompt_template = """
You are receiving the following topic representation within the realm of food safety.
Based on this topic representation and your expert knowledge in the food safety domain your task is to infer an emerging food safety risk. Generate a concise statement about the risk.
If you can not infer an emerging food safety risk, just say: no issue found.
"""

In [25]:
# Hand the key from the environment to the openai module, then use the module
# itself as the client object for the chat-completion calls in this notebook.
openai.api_key = os.environ["OPENAI_API_KEY"]
client = openai

# Ask the chat model for a risk statement derived from one topic representation
def infer_text(topic):
    """Send ``prompt_template`` plus the topic to the chat model and return its reply.

    The module-level names ``prompt_template`` and ``client`` are looked up when
    the function is called, so whatever they are bound to at that moment is used.

    Args:
        topic: Value interpolated after ``topic:`` at the end of the user message.

    Returns:
        The ``content`` of the message in the first choice of the response.
    """
    user_message = prompt_template + f"\ntopic: {topic}\n"

    conversation = [
        {"role": "system", "content": "You are an expert in food safety."},
        {"role": "user", "content": user_message},
    ]

    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=conversation,
        temperature=0.6,
    )

    # Only the first returned choice is used
    first_choice = completion.choices[0]
    return first_choice.message.content

# Query the model once per topic representation, keeping the replies in row order
results = [
    infer_text(representation)
    for representation in weak_signal_df['Representation']
]

# Store the replies next to the topics they were generated from
weak_signal_df['weak_signal_interpretation'] = results

# Show the first six generated statements
weak_signal_df["weak_signal_interpretation"].head(6).tolist()

['Emerging food safety risk: Increased foraging and social interactions among bees could lead to higher transmission of pathogens within colonies, potentially impacting honey production and safety.',
 'Emerging food safety risk: The consumption of high-glucose foods during pregnancy may increase the risk of hypospadias in male offspring, particularly in cases where the mother is overweight or has glucose tolerance issues.',
 'Emerging food safety risk: The detection of strain-level variations of foodborne pathogens, such as Methylobacterium, using advanced techniques like bioinformatics-enabled microdissection and TOF-MALDI, highlights the potential for identifying previously overlooked microbial contaminants in food products, necessitating enhanced monitoring and control measures.',
 'Emerging food safety risk: Elevated levels of heavy metals (Pb, Zn, Cr) in peppers grown in ferralsols and cambisols could pose health risks to consumers.',
 'Emerging food safety risk: The presence of a

In [26]:
# odd term interpretation

from sentence_transformers import SentenceTransformer, util
import torch
import pandas as pd

# Embedding model shared by the term-distance functions below, which call
# sentence_model.encode(); instantiated once here with the "all-mpnet-base-v2" name.
sentence_model = SentenceTransformer("all-mpnet-base-v2")

# Define a function to get the first three terms from the list
def get_first_three_terms(term_list):
    """Return the first three terms of a topic representation.

    A representation read back from CSV is the *text* of a Python list
    (e.g. ``"['foragers', 'social', 'collective']"``). Splitting that text on
    whitespace would yield tokens such as ``"['foragers',"`` that still carry
    brackets, quotes and commas, and would break multi-word terms apart, so a
    string is first parsed as a list literal. A string that is not a list or
    tuple literal is split on whitespace instead.

    Args:
        term_list: A sequence of terms, or a string holding either a list
            literal or whitespace-separated terms.

    Returns:
        The first three terms (fewer if the input has fewer). For string
        input the result is a list of strings; other input is sliced as-is.
    """
    if isinstance(term_list, str):
        import ast  # local import: only needed for the string case

        try:
            parsed = ast.literal_eval(term_list)
        except (ValueError, SyntaxError):
            # Not a Python literal at all, e.g. "alpha beta gamma"
            parsed = None

        if isinstance(parsed, (list, tuple)):
            term_list = [str(term) for term in parsed]
        else:
            term_list = term_list.split()
    return term_list[:3]

def find_distant_terms(terms, n, threshold=0.3):
    """Return the terms, from the n-th onward, that are dissimilar to the core topic.

    The core topic is the mean of the embeddings of the first three terms. Every
    term from position ``n`` (1-based) onward is embedded with the module-level
    ``sentence_model`` and kept when its cosine similarity to that mean is
    below ``threshold``.

    A string argument is parsed as a Python list literal first, because a
    representation read back from CSV looks like ``"['a', 'b', ...]"`` and
    whitespace splitting would embed tokens polluted with brackets, quotes and
    commas. A string that is not a list or tuple literal is split on whitespace.

    Args:
        terms: A sequence of terms, or a string holding either a list literal
            or whitespace-separated terms.
        n: 1-based position of the first term to compare with the core topic.
        threshold: Similarity below which a term counts as distant.

    Returns:
        The distant terms in their original order; an empty list when there
        are fewer than ``max(3, n)`` terms.
    """
    if isinstance(terms, str):
        import ast  # local import: only needed for the string case

        try:
            parsed = ast.literal_eval(terms)
        except (ValueError, SyntaxError):
            # Not a Python literal at all, e.g. "alpha beta gamma"
            parsed = None

        if isinstance(parsed, (list, tuple)):
            terms = [str(term) for term in parsed]
        else:
            terms = terms.split()

    if len(terms) < max(3, n):
        return []  # Not enough terms to proceed

    # Encoding the first three terms and averaging their embeddings
    initial_terms_embeddings = sentence_model.encode(terms[:3], convert_to_tensor=True)

    # Ensure the embedding has a batch dimension
    if len(initial_terms_embeddings.shape) == 1:
        initial_terms_embeddings = initial_terms_embeddings.unsqueeze(0)

    # The core-topic embedding does not change inside the loop, so the batch
    # dimension needed by the similarity call is added once here.
    cumulated_embedding = torch.mean(initial_terms_embeddings, dim=0).unsqueeze(0)

    distant_terms = []
    for term in terms[n - 1:]:  # Start from the nth term
        term_embedding = sentence_model.encode(term, convert_to_tensor=True)

        # Ensure term_embedding also has the correct shape
        if len(term_embedding.shape) == 1:
            term_embedding = term_embedding.unsqueeze(0)

        # Calculate cosine similarity
        similarity = util.pytorch_cos_sim(cumulated_embedding, term_embedding).item()
        if similarity < threshold:
            distant_terms.append(term)
    return distant_terms

def find_most_distant_term(terms, n, threshold=0.4):
    """Return the single term, from the n-th onward, least similar to the core topic.

    The core topic is the mean of the embeddings of the first three terms. Every
    term from position ``n`` (1-based) onward is embedded with the module-level
    ``sentence_model``; among the terms whose cosine similarity to that mean is
    below ``threshold``, the one with the lowest similarity is returned (the
    earliest one on a tie).

    A string argument is parsed as a Python list literal first, because a
    representation read back from CSV looks like ``"['a', 'b', ...]"`` and
    whitespace splitting would embed tokens polluted with brackets, quotes and
    commas. A string that is not a list or tuple literal is split on whitespace.

    Args:
        terms: A sequence of terms, or a string holding either a list literal
            or whitespace-separated terms.
        n: 1-based position of the first term to compare with the core topic.
        threshold: Similarity a term must stay below to be considered.

    Returns:
        The most distant term, or ``None`` when there are fewer than
        ``max(3, n)`` terms or no term falls below ``threshold``.
    """
    if isinstance(terms, str):
        import ast  # local import: only needed for the string case

        try:
            parsed = ast.literal_eval(terms)
        except (ValueError, SyntaxError):
            # Not a Python literal at all, e.g. "alpha beta gamma"
            parsed = None

        if isinstance(parsed, (list, tuple)):
            terms = [str(term) for term in parsed]
        else:
            terms = terms.split()

    if len(terms) < max(3, n):
        return None  # Not enough terms to proceed

    # Encoding the first three terms and averaging their embeddings
    initial_terms_embeddings = sentence_model.encode(terms[:3], convert_to_tensor=True)

    # Ensure the embedding has a batch dimension
    if len(initial_terms_embeddings.shape) == 1:
        initial_terms_embeddings = initial_terms_embeddings.unsqueeze(0)

    # The core-topic embedding does not change inside the loop, so the batch
    # dimension needed by the similarity call is added once here.
    cumulated_embedding = torch.mean(initial_terms_embeddings, dim=0).unsqueeze(0)

    most_distant_term = None
    # Start from +inf so the running minimum never rejects a candidate that
    # already passed the threshold test.
    lowest_similarity = float("inf")

    for term in terms[n - 1:]:  # Start from the nth term
        term_embedding = sentence_model.encode(term, convert_to_tensor=True)

        # Ensure term_embedding also has the correct shape
        if len(term_embedding.shape) == 1:
            term_embedding = term_embedding.unsqueeze(0)

        # Calculate cosine similarity
        similarity = util.pytorch_cos_sim(cumulated_embedding, term_embedding).item()
        if similarity < threshold and similarity < lowest_similarity:
            lowest_similarity = similarity
            most_distant_term = term

    return most_distant_term

# 1-based position of the first term compared against the core topic (4 = fourth term)
nth_term = 4

# One extractor per derived column, applied to every 'Representation' value
term_extractors = {
    'distant_terms': lambda representation: find_distant_terms(representation, nth_term),
    'most_distant_terms': lambda representation: find_most_distant_term(representation, nth_term),
    'core_topic': get_first_three_terms,
}
for column_name, extractor in term_extractors.items():
    weak_signal_df[column_name] = weak_signal_df['Representation'].apply(extractor)

# Print the augmented table
print(weak_signal_df)




                                                 Name       dov  Topic  Count  \
0               582_foragers_social_collective_colony  0.004269    582      5   
1        700_pregnancy_hypospadias_glucose_discourses  0.004397    700      4   
2   924_microscopically_bioinformaticsenabled_libr...  0.008287    924      3   
3         678_pgamended_peppers_ferralsols_moderately  0.005038    678      4   
4                 482_afb1_aflatoxin_indonesia_cancer  0.004123    482      6   
5                    442_lenses_device_maude_sentinel  0.004003    442      6   
6                   453_sanguineus_sachet_salivary_am  0.005160    453      6   
7                    847_buttonwood_site_stem_cadmium  0.006593    847      3   
8                     690_sfr_bottles_asepticuht_1011  0.004046    690      4   
9           656_timetemperature_git_options_simulated  0.005065    656      4   
10              749_fluorescence_urine_hai_microfiber  0.004124    749      4   
11                 548_house

In [27]:
from langchain import PromptTemplate, OpenAI

# Instruction text for the two-list inference; infer_text(topic, risk) appends
# "topic: ..." and "risk: ..." lines to it before sending it as the user message.
# NOTE: this rebinds `prompt_template`, replacing the single-list prompt that an
# earlier cell assigned to the same name.
prompt_template = """
You are receiving two lists of terms. The first list represents a topic in the field of food safety. The second list contains terms that might pose a risk to that topic.
Based on your expert knowledge in the food safety domain your task is to infer what kind of food safety risk might occur based on the given data and produce a short statement to illustrate that risk.
Example: Topic: ['cruzi', 'chagas', 'trypanosoma'];
Risk: ['attalea'];
Output: Deforestation effects on Attalea palms and their resident Rhodnius, vectors of Chagas disease, in eastern Amazonia.
"""

In [28]:
# Ask the chat model for a risk statement derived from a topic and its risk terms
def infer_text(topic, risk):
    """Send ``prompt_template`` plus topic and risk to the chat model and return its reply.

    This definition reuses the name ``infer_text`` and therefore replaces the
    one-argument version defined in an earlier cell. The module-level names
    ``prompt_template`` and ``client`` are looked up when the function is called.

    Args:
        topic: Value interpolated after ``topic:`` in the user message.
        risk: Value interpolated after ``risk:`` in the user message.

    Returns:
        The ``content`` of the message in the first choice of the response.
    """
    user_message = prompt_template + f"\ntopic: {topic}\n" + f"\nrisk: {risk}\n"

    conversation = [
        {"role": "system", "content": "You are an expert in food safety."},
        {"role": "user", "content": user_message},
    ]

    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=conversation,
        temperature=0.6,
    )

    # Only the first returned choice is used
    first_choice = completion.choices[0]
    return first_choice.message.content

# Query the model once per row with its core topic and distant terms, in row order
results = [
    infer_text(core_terms, outlier_terms)
    for core_terms, outlier_terms in zip(
        weak_signal_df['core_topic'], weak_signal_df['distant_terms']
    )
]

# Store the replies next to the rows they were generated from
weak_signal_df['odd_term_interpretation'] = results

# Show the first six generated statements
weak_signal_df["odd_term_interpretation"].head(6).tolist()

['Foragers, when operating in a social and collective manner, may face risks related to immunity. This can occur as close interactions and shared resources increase the likelihood of spreading pathogens, thereby affecting the collective immunity of the group. Additionally, any failure to properly adjust for changing environmental or health conditions can exacerbate these risks.',
 'Inadequate or misleading discourses and unnecessary testing practices during pregnancy, particularly in the context of 1-hour glucose tolerance tests, may lead to misdiagnosis or undue stress, potentially impacting fetal development and increasing the risk of conditions such as hypospadias.',
 'Potential risks in the field of food safety can arise from the limitations and challenges associated with analytical techniques. In this case:\n\nOutput: The resolution limitations of Time-of-Flight (TOF) and Matrix-Assisted Laser Desorption/Ionization (MALDI) mass spectrometry could impact the effectiveness of micros

In [29]:
# Write the table, including the columns added by the cells above, to an Excel
# file; index=False leaves the row index out of the sheet.
weak_signal_df.to_excel('/home/jupyter/WSM/data/results_llm_inference.xlsx', index=False)

In [None]:
# Mark each sampled row whose 'Representation' also occurs in the weak-signal table.
# NOTE(review): `random_samples` is not defined in the cells visible here —
# presumably it is created elsewhere; confirm before running this cell.
duplicates = random_samples['Representation'].isin(weak_signal_df['Representation'])

# True as soon as a single representation is shared between the two frames
has_duplicates = duplicates.any()

# Report the outcome
print(
    "There are identical rows in the 'Representation' column."
    if has_duplicates
    else "No identical rows in the 'Representation' column."
)

In [None]:
# Collect one generated interpretation per sampled representation, in row order.
# NOTE(review): neither `random_samples` nor `infer_text4` is defined in the
# cells visible here — presumably both come from elsewhere; confirm before running.
results = [
    infer_text4(representation)
    for representation in random_samples['Representation']
]

# Store the interpretations next to the samples they were generated from
random_samples['interpretation'] = results

In [None]:
# Write the sampled rows, including the 'interpretation' column added above, to
# an Excel file; index=False leaves the row index out of the sheet.
random_samples.to_excel('/home/jupyter/WSM/living_lab/random_results.xlsx', index=False)