In [11]:
!pip install --upgrade --quiet openai langchain langchain_community bertopic openpyxl

In [2]:
import pandas as pd
ws_df1 = pd.read_csv('/home/jupyter/WSM/data/weak_signal_df_100k1.csv')
ws_df2 = pd.read_csv('/home/jupyter/WSM/data/weak_signal_df_100k2.csv')
ws_df3 = pd.read_csv('/home/jupyter/WSM/data/weak_signal_df_100k3.csv')

In [3]:
import getpass
import os
import openai
import json
import pandas as pd

if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")

Enter your OpenAI API key:  ········


In [4]:
from langchain import PromptTemplate, OpenAI

prompt_template = """
You are receiving the following topic representation within the realm of food safety.
Based on this topic representation and your expert knowledge in the food safety domain your task is to infer an emerging food safety risk. Generate a concise statement about the risk.
If you can not infer an emerging food safety risk, just say: no issue found.
"""

In [5]:
# Set the OpenAI API key
client = openai
openai.api_key = os.environ["OPENAI_API_KEY"]

# Function to generate text based on topic and risk
def infer_text(topic):
    prompt = prompt_template + f"\ntopic: {topic}\n"
    
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are an expert in food safety."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.6,
    )
    
    # Return the generated content
    return response.choices[0].message.content

# Function to process any DataFrame
def process_dataframe(df):
    # Create an empty list to store the results
    results = []
    
    # Loop through each row in the DataFrame and generate text
    for index, row in df.iterrows():
        topic = row['Representation']
        
        # Generate the text using the infer_text function
        text = infer_text(topic)
        
        # Append the result to the list
        results.append(text)
    
    # Add the generated text to the DataFrame in a new column
    df['weak_signal_interpretation'] = results

    # Display first n statements
    return df["weak_signal_interpretation"].tolist()[:3]

# Process each DataFrame
print("ws_df1:", process_dataframe(ws_df1))
print("ws_df2:", process_dataframe(ws_df2))
print("ws_df3:", process_dataframe(ws_df3))

ws_df1: ['Emerging food safety risk: The co-manufacturing of soft hamburger and hot dog buns by bakeries for the Hostess brand may pose a risk of Listeria contamination.', 'No issue found.', 'No issue found.']
ws_df2: ['No issue found.', 'Emerging food safety risk: There is a potential risk of Listeria contamination in soft hamburger and hotdog buns produced by Hostess or its co-manufacturers and bakeries.', 'No issue found.']
ws_df3: ['no issue found', 'No issue found.', 'No issue found.']


In [12]:
ws_df1.to_excel('/home/jupyter/WSM/data/100k_interpretation1.xlsx', index=False)
ws_df2.to_excel('/home/jupyter/WSM/data/100k_interpretation2.xlsx', index=False)
ws_df3.to_excel('/home/jupyter/WSM/data/100k_interpretation3.xlsx', index=False)

In [13]:
from sentence_transformers import SentenceTransformer, util
import torch
import pandas as pd

# Load the sentence transformer model
sentence_model = SentenceTransformer("all-mpnet-base-v2")

# Define a function to get the first three terms from the list
def get_first_three_terms(term_list):
    # Check if input is a string, if so split it into words
    if isinstance(term_list, str):
        term_list = term_list.split()  # You can customize the splitting logic if needed
    return term_list[:3]

def find_distant_terms(terms, n, threshold=0.3):
    if isinstance(terms, str):
        terms = terms.split()  # Split terms if provided as a single string

    if len(terms) < max(3, n):
        return []  # Not enough terms to proceed

    # Encoding the first three terms and averaging their embeddings
    initial_terms_embeddings = sentence_model.encode(terms[:3], convert_to_tensor=True)
    
    # Ensure the embedding has a batch dimension
    if len(initial_terms_embeddings.shape) == 1:
        initial_terms_embeddings = initial_terms_embeddings.unsqueeze(0)

    cumulated_embedding = torch.mean(initial_terms_embeddings, dim=0)

    distant_terms = []
    for term in terms[n-1:]:  # Start from the nth term
        term_embedding = sentence_model.encode(term, convert_to_tensor=True)

        # Ensure term_embedding also has the correct shape
        if len(term_embedding.shape) == 1:
            term_embedding = term_embedding.unsqueeze(0)

        # Calculate cosine similarity
        similarity = util.pytorch_cos_sim(cumulated_embedding.unsqueeze(0), term_embedding).item()
        if similarity < threshold:
            distant_terms.append(term)
    return distant_terms

def find_most_distant_term(terms, n, threshold=0.4):
    if isinstance(terms, str):
        terms = terms.split()  # Split terms if provided as a single string

    if len(terms) < max(3, n):
        return None  # Not enough terms to proceed

    # Encoding the first three terms and averaging their embeddings
    initial_terms_embeddings = sentence_model.encode(terms[:3], convert_to_tensor=True)

    # Ensure the embedding has a batch dimension
    if len(initial_terms_embeddings.shape) == 1:
        initial_terms_embeddings = initial_terms_embeddings.unsqueeze(0)

    cumulated_embedding = torch.mean(initial_terms_embeddings, dim=0)

    most_distant_term = None
    lowest_similarity = 1.0  # Initialize with the maximum possible similarity

    for term in terms[n-1:]:  # Start from the nth term
        term_embedding = sentence_model.encode(term, convert_to_tensor=True)

        # Ensure term_embedding also has the correct shape
        if len(term_embedding.shape) == 1:
            term_embedding = term_embedding.unsqueeze(0)

        # Calculate cosine similarity
        similarity = util.pytorch_cos_sim(cumulated_embedding.unsqueeze(0), term_embedding).item()
        if similarity < threshold and similarity < lowest_similarity:
            lowest_similarity = similarity
            most_distant_term = term

    return most_distant_term

# Specify the nth term here, e.g., 4th term would be n=4
nth_term = 4

# Function to process any DataFrame
def process_distant_terms(df):
    df['distant_terms'] = df['Representation'].apply(lambda x: find_distant_terms(x, nth_term))
    df['most_distant_terms'] = df['Representation'].apply(lambda x: find_most_distant_term(x, nth_term))
    df['core_topic'] = df['Representation'].apply(get_first_three_terms)
    return df[['distant_terms', 'most_distant_terms', 'core_topic']].head(3)  # Return first 3 rows for inspection

# Apply the function to ws_df1, ws_df2, and ws_df3
print("ws_df1:", process_distant_terms(ws_df1))
print("ws_df2:", process_distant_terms(ws_df2))
print("ws_df3:", process_distant_terms(ws_df3))

  from tqdm.autonotebook import tqdm, trange


ws_df1:   distant_terms most_distant_terms                            core_topic
0            []            'soft',  [['hostess',, 'buns',, 'hamburger',]
1      ['fpm']]             'fpm']     [['gaza',, 'israel',, 'israeli',]
2  ['towards',]         'towards',   [['antwerp',, 'olivieri',, 'crew',]
ws_df2:             distant_terms most_distant_terms  \
0            ['reminds',]         'reminds',   
1                      []            'soft',   
2  ['340',, 'provinces']]             '340',   

                                       core_topic  
0  [['plaintiff',, 'shareholders',, 'korsinsky',]  
1            [['hostess',, 'buns',, 'hamburger',]  
2            [['zucchini',, 'foodle',, 'veggie',]  
ws_df3:                                        distant_terms most_distant_terms  \
0  [whos',, 'reacting, elements',, 'repeated, par...              'whos   
1                                                 []               None   
2                                                 []      

In [27]:
from langchain import PromptTemplate, OpenAI

prompt_template = """
You are receiving two lists of terms. The first list represents a topic in the field of food safety. The second list contains terms that might pose a risk to that topic.
Based on your expert knowledge in the food safety domain your task is to infer what kind of food safety risk might occur based on the given data and produce a short statement to illustrate that risk.
Example: Topic: ['cruzi', 'chagas', 'trypanosoma'];
Risk: ['attalea'];
Output: Deforestation effects on Attalea palms and their resident Rhodnius, vectors of Chagas disease, in eastern Amazonia.
"""

In [16]:
# Set the OpenAI API key
client = openai
openai.api_key = os.environ["OPENAI_API_KEY"]

# Function to generate text based on topic and risk
def infer_text_distant(topic, risk):
    prompt = prompt_template + f"\ntopic: {topic}\n" + f"\nrisk: {risk}\n"
    
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are an expert in food safety."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.6,
    )
    
    # Return the generated content
    return response.choices[0].message.content


# Function to process each DataFrame
def process_infer_text_distant(df):
    results = []

    # Loop through each row in the DataFrame and generate text
    for index, row in df.iterrows():
        topic = row['core_topic']  # Assuming 'core_topic' column exists
        risk = row['distant_terms']  # Assuming 'distant_terms' column exists
        
        # Generate the text using the infer_text function
        text = infer_text_distant(topic, risk)
        
        # Append the result to the list
        results.append(text)

    # Add the generated text to the DataFrame in a new column
    df['odd_term_interpretation'] = results

    # Return the first 3 rows for inspection
    return df['odd_term_interpretation'].tolist()[:3]

# Process each DataFrame (ws_df1, ws_df2, ws_df3)
print("ws_df1:", process_infer_text_distant(ws_df1))
print("ws_df2:", process_infer_text_distant(ws_df2))
print("ws_df3:", process_infer_text_distant(ws_df3))

ws_df1: ['No issue found.', 'No issue found.', 'No issue found.']
ws_df2: ['No issue found.', 'No issue found.', 'No issue found.']
ws_df3: ['No issue found.', 'no issue found.', 'no issue found.']


In [17]:
ws_df1.to_excel('/home/jupyter/WSM/data/100k_odd_term_interpretation1.xlsx', index=False)
ws_df2.to_excel('/home/jupyter/WSM/data/100k_odd_term_interpretation2.xlsx', index=False)
ws_df3.to_excel('/home/jupyter/WSM/data/100k_odd_term_interpretation3.xlsx', index=False)