In [1]:
!pip install --upgrade --quiet openai langchain langchain_community bertopic

In [2]:
import pandas as pd
# Path of the CSV file that holds the weak-signal table.
WEAK_SIGNAL_CSV = '/home/jupyter/WSM/data/weak_signal_df.csv'

# Read the table and show it as the cell's output.
weak_signal_df = pd.read_csv(WEAK_SIGNAL_CSV)
weak_signal_df

Unnamed: 0,Name,dov,Topic,Count,Representation,Representative_Docs
0,762_butcher_quetta_shops_groceries,0.002993,762,4,"['butcher', 'quetta', 'shops', 'groceries', 't...","['All foods carry microbes, many of which are ..."
1,901_vitamin_fortification_arich_ofsp,0.002615,901,3,"['vitamin', 'fortification', 'arich', 'ofsp', ...",['Recent recognition of the early onset and hi...
2,354_mlvhrma_typhimurium_tr_tandem,0.002599,354,7,"['mlvhrma', 'typhimurium', 'tr', 'tandem', 're...","['To assess the distribution of Salmonella 4,[..."
3,605_cropping_philippi_horticultural_summer,0.002511,605,5,"['cropping', 'philippi', 'horticultural', 'sum...",['Contamination of the food chain with heavy m...
4,282_variability_rope_amyloliquefaciens_pan,0.002582,282,8,"['variability', 'rope', 'amyloliquefaciens', '...","[""This study aims at the characterisation of g..."
5,1113_lmf_lmfs_lowmoisture_cfd,0.004032,1113,2,"['lmf', 'lmfs', 'lowmoisture', 'cfd', 'lwafs',...",['Low-moisture foods have been responsible for...
6,1130_clavata_dairy_casecase_saprochaete,0.004032,1130,2,"['clavata', 'dairy', 'casecase', 'saprochaete'...",['Few studies have investigated the diversity ...
7,872_ed_alkylphenols_longchain_octylphenols,0.003644,872,3,"['ed', 'alkylphenols', 'longchain', 'octylphen...","['Long-chain alkylphenols, such as nonylphenol..."
8,403_sporulation_differentiation_nprrdependent_...,0.003433,403,6,"['sporulation', 'differentiation', 'nprrdepend...",['The spore-forming bacterium Bacillus thuring...
9,593_pastrami_jerky_beef_grinding,0.002725,593,5,"['pastrami', 'jerky', 'beef', 'grinding', 'tri...",['Intense manipulation during beef jerky produ...


In [3]:
from sentence_transformers import SentenceTransformer, util
import torch
import pandas as pd

# Load the sentence-embedding model once; find_distant_terms and
# find_most_distant_term below read it as a module-level global.
sentence_model = SentenceTransformer("all-mpnet-base-v2")

def get_first_three_terms(term_list):
    """Return the first three terms of a topic representation.

    Parameters
    ----------
    term_list : list or str
        A sequence of terms, or a string. A string that is a Python list
        literal (e.g. "['a', 'b', 'c']" -- the form a list-valued column has
        after being read back from CSV) is parsed into its elements; any
        other string is split on whitespace.

    Returns
    -------
    list
        The leading terms, at most three of them.
    """
    import ast  # local import keeps this block self-contained

    if isinstance(term_list, str):
        text = term_list.strip()
        parsed = None
        # Only attempt literal parsing on something that looks like a list;
        # plain whitespace-separated strings keep the original behaviour.
        if text.startswith("[") and text.endswith("]"):
            try:
                parsed = ast.literal_eval(text)
            except (ValueError, SyntaxError):
                parsed = None
        if isinstance(parsed, (list, tuple)):
            term_list = [str(term) for term in parsed]
        else:
            term_list = term_list.split()
    return term_list[:3]

def find_distant_terms(terms, n, threshold=0.3):
    """Return the terms that are semantically far from a topic's first three terms.

    The embeddings of the first three terms are averaged, and every term
    from position ``n`` (1-based) onwards is compared with that average by
    cosine similarity.

    Parameters
    ----------
    terms : list or str
        Topic terms. A string that is a Python list literal (e.g.
        "['a', 'b', 'c']") is parsed into its elements; any other string is
        split on whitespace.
    n : int
        1-based position of the first term to compare.
    threshold : float, optional
        A term is kept when its cosine similarity is strictly below this value.

    Returns
    -------
    list
        The terms below the threshold, in their original order; empty when
        there are fewer than ``max(3, n)`` terms.
    """
    import ast  # local import keeps this block self-contained

    if isinstance(terms, str):
        text = terms.strip()
        parsed = None
        # A list column read back from CSV arrives as the text of a list
        # literal; a plain split() would leave brackets/quotes in the terms.
        if text.startswith("[") and text.endswith("]"):
            try:
                parsed = ast.literal_eval(text)
            except (ValueError, SyntaxError):
                parsed = None
        if isinstance(parsed, (list, tuple)):
            terms = [str(term) for term in parsed]
        else:
            terms = terms.split()

    if len(terms) < max(3, n):
        return []  # Not enough terms to proceed

    # Average the embeddings of the first three terms into one reference vector.
    core_embeddings = sentence_model.encode(terms[:3], convert_to_tensor=True)
    centroid = torch.mean(core_embeddings, dim=0).unsqueeze(0)

    # Encode all candidates in a single call instead of one call per term.
    candidates = list(terms[n - 1:])  # start from the nth term
    candidate_embeddings = sentence_model.encode(candidates, convert_to_tensor=True)
    similarities = util.pytorch_cos_sim(centroid, candidate_embeddings)[0].tolist()

    return [
        term
        for term, similarity in zip(candidates, similarities)
        if similarity < threshold
    ]

def find_most_distant_term(terms, n, threshold=0.4):
    """Return the single term least similar to a topic's first three terms.

    The embeddings of the first three terms are averaged, and every term
    from position ``n`` (1-based) onwards is compared with that average by
    cosine similarity.

    Parameters
    ----------
    terms : list or str
        Topic terms. A string that is a Python list literal (e.g.
        "['a', 'b', 'c']") is parsed into its elements; any other string is
        split on whitespace.
    n : int
        1-based position of the first term to compare.
    threshold : float, optional
        Only terms whose cosine similarity is strictly below this value
        are eligible.

    Returns
    -------
    str or None
        The eligible term with the lowest similarity (the first one on a
        tie), or ``None`` when there are fewer than ``max(3, n)`` terms or
        no term is below the threshold.
    """
    import ast  # local import keeps this block self-contained

    if isinstance(terms, str):
        text = terms.strip()
        parsed = None
        # A list column read back from CSV arrives as the text of a list
        # literal; a plain split() would leave brackets/quotes in the terms.
        if text.startswith("[") and text.endswith("]"):
            try:
                parsed = ast.literal_eval(text)
            except (ValueError, SyntaxError):
                parsed = None
        if isinstance(parsed, (list, tuple)):
            terms = [str(term) for term in parsed]
        else:
            terms = terms.split()

    if len(terms) < max(3, n):
        return None  # Not enough terms to proceed

    # Average the embeddings of the first three terms into one reference vector.
    core_embeddings = sentence_model.encode(terms[:3], convert_to_tensor=True)
    centroid = torch.mean(core_embeddings, dim=0).unsqueeze(0)

    # Encode all candidates in a single call instead of one call per term.
    candidates = list(terms[n - 1:])  # start from the nth term
    candidate_embeddings = sentence_model.encode(candidates, convert_to_tensor=True)
    similarities = util.pytorch_cos_sim(centroid, candidate_embeddings)[0].tolist()

    most_distant_term = None
    lowest_similarity = 1.0  # Initialize with the maximum possible similarity
    for term, similarity in zip(candidates, similarities):
        if similarity < threshold and similarity < lowest_similarity:
            lowest_similarity = similarity
            most_distant_term = term

    return most_distant_term

# 1-based position of the first term compared against the core topic
# (4 means the comparison starts at the fourth term).
nth_term = 4

# Derive the three helper columns from every topic representation.
representation = weak_signal_df['Representation']
weak_signal_df['distant_terms'] = representation.apply(find_distant_terms, args=(nth_term,))
weak_signal_df['most_distant_terms'] = representation.apply(find_most_distant_term, args=(nth_term,))
weak_signal_df['core_topic'] = representation.apply(get_first_three_terms)

# Print the augmented frame.
print(weak_signal_df)


  from tqdm.autonotebook import tqdm, trange


                                                 Name       dov  Topic  Count  \
0                  762_butcher_quetta_shops_groceries  0.002993    762      4   
1                901_vitamin_fortification_arich_ofsp  0.002615    901      3   
2                   354_mlvhrma_typhimurium_tr_tandem  0.002599    354      7   
3          605_cropping_philippi_horticultural_summer  0.002511    605      5   
4          282_variability_rope_amyloliquefaciens_pan  0.002582    282      8   
5                       1113_lmf_lmfs_lowmoisture_cfd  0.004032   1113      2   
6             1130_clavata_dairy_casecase_saprochaete  0.004032   1130      2   
7          872_ed_alkylphenols_longchain_octylphenols  0.003644    872      3   
8   403_sporulation_differentiation_nprrdependent_...  0.003433    403      6   
9                    593_pastrami_jerky_beef_grinding  0.002725    593      5   
10                    952_cit_dhcit_secreted_citrinin  0.002543    952      3   
11         415_ntm_mycobacte

In [4]:
from langchain import PromptTemplate, OpenAI

# Instruction text that infer_text prepends to the per-row "topic:" / "risk:"
# lines: it states the task and gives one worked example (Topic / Risk / Output).
prompt_template = """
You are receiving two lists of terms. The first list represents a topic in the field of food safety. The second list contains terms that might pose a risk to that topic.
Based on your expert knowledge in the food safety domain your task is to infer what kind of food safety risk might occur based on the given data and produce a short statement to illustrate that risk.
Example: Topic: ['cruzi', 'chagas', 'trypanosoma'];
Risk: ['attalea'];
Output: Deforestation effects on Attalea palms and their resident Rhodnius, vectors of Chagas disease, in eastern Amazonia.
"""

In [15]:
# Interpret topic representations.
# NOTE(review): no visible cell references prompt_template2 -- presumably it is
# meant for infer_text4, which is not defined in the visible cells; confirm.


from langchain import PromptTemplate, OpenAI

prompt_template2 = """
You are receiving the following topic representation within the realm of food contamination.
Based on this topic representation and your expert knowledge in the food safety domain your task is to infer an emerging food safety risk.
If you can not infer an emerging food safety risk, just say: no issue found.
"""

In [5]:
import getpass
import os
import openai
import json
import pandas as pd

# Prompt for the key (input hidden) only when the environment does not
# already provide a non-empty OPENAI_API_KEY.
if not os.getenv("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")

Enter your OpenAI API key:  ········


In [12]:

# Set the OpenAI API key and create the client that infer_text uses.
# `client` was not defined anywhere in the visible notebook, so a fresh
# kernel would fail with NameError on the first call without this line.
openai.api_key = os.environ["OPENAI_API_KEY"]
client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"])


def infer_text(topic, risk, model="gpt-4o", temperature=0.6):
    """Ask the chat model for a short risk statement for one topic.

    Parameters
    ----------
    topic : object
        Topic terms; interpolated into the prompt with ``str()`` formatting.
    risk : object
        Risk term(s); interpolated the same way. ``None`` is rendered as the
        text "None" -- NOTE(review): find_most_distant_term can return None;
        confirm whether such rows should be sent to the model at all.
    model : str, optional
        Chat model name passed to the API.
    temperature : float, optional
        Sampling temperature passed to the API.

    Returns
    -------
    str
        Content of the first choice in the model's response.
    """
    prompt = prompt_template + f"\ntopic: {topic}\n" + f"\nrisk: {risk}\n"

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are an expert in food safety."},
            {"role": "user", "content": prompt}
        ],
        temperature=temperature,
    )

    # Return the generated content
    return response.choices[0].message.content

# Request one interpretation per row, pairing each core topic with the
# most distant term found for it.
results = [
    infer_text(topic, risk)
    for topic, risk in zip(weak_signal_df['core_topic'], weak_signal_df['most_distant_terms'])
]

# Store the generated statements next to the topics they belong to.
weak_signal_df['odd_term_interpretation'] = results

# Print the augmented frame.
print(weak_signal_df)

                                                 Name       dov  Topic  Count  \
0                  762_butcher_quetta_shops_groceries  0.002993    762      4   
1                901_vitamin_fortification_arich_ofsp  0.002615    901      3   
2                   354_mlvhrma_typhimurium_tr_tandem  0.002599    354      7   
3          605_cropping_philippi_horticultural_summer  0.002511    605      5   
4          282_variability_rope_amyloliquefaciens_pan  0.002582    282      8   
5                       1113_lmf_lmfs_lowmoisture_cfd  0.004032   1113      2   
6             1130_clavata_dairy_casecase_saprochaete  0.004032   1130      2   
7          872_ed_alkylphenols_longchain_octylphenols  0.003644    872      3   
8   403_sporulation_differentiation_nprrdependent_...  0.003433    403      6   
9                    593_pastrami_jerky_beef_grinding  0.002725    593      5   
10                    952_cit_dhcit_secreted_citrinin  0.002543    952      3   
11         415_ntm_mycobacte

In [13]:
# Write weak_signal_df to an Excel workbook, leaving out the row index.
weak_signal_df.to_excel('/home/jupyter/WSM/data/results_llm_inference.xlsx', index=False)

In [None]:
# Mark every row of random_samples whose 'Representation' value also occurs
# in weak_signal_df.
duplicates = random_samples['Representation'].isin(weak_signal_df['Representation'])

# True as soon as at least one representation is shared by both frames.
has_duplicates = duplicates.any()

# Report the outcome.
print(
    "There are identical rows in the 'Representation' column."
    if has_duplicates
    else "No identical rows in the 'Representation' column."
)

In [None]:
# NOTE(review): neither `random_samples` nor `infer_text4` is defined in the
# visible cells of this notebook -- confirm where they come from before running.
# Collect one interpretation per sampled topic representation.
results = [infer_text4(topic) for topic in random_samples['Representation']]

# Attach the interpretations as a new column.
random_samples['interpretation'] = results

In [None]:
# Write random_samples to an Excel workbook, leaving out the row index.
random_samples.to_excel('/home/jupyter/WSM/living_lab/random_results.xlsx', index=False)