In [1]:
import pandas as pd, numpy as np
import os
import numpy as np
import time
import math
from openai import OpenAI
import json
import pickle
from sklearn.preprocessing import OneHotEncoder

notebook_path = os.path.abspath("")
os.chdir(notebook_path)
current_dir = os.getcwd()

#Feedbacks need to be progressed
latest_file = os.path.join(current_dir, "Feedbacks", "feedback_latest.xlsx")
# We’ll collect our prior knowledge from this labeled dataset.
prior_file = os.path.join(current_dir, "Feedbacks", "feedback_prior.xlsx") 

df_latest = pd.read_excel(latest_file)  
df_prior = pd.read_excel(prior_file)

In [2]:
feedback_list = df_latest['Feedback'].dropna().tolist()
print(F for F in feedback_list)

<generator object <genexpr> at 0x000002886D094790>


# Section 1: USE YOUR OWN API 

If you're not going to use your api, jump to section2 where I'll provide outputs via my API and simulates a score update

In [None]:
USE_YOUR_API = False

##Your API key
API_KEY = "YOUR API"
MODEL = 'deepseek-chat'
BASE_URL = "https://api.deepseek.com"

In [4]:
#Be careful and don't touch this or edit as little as possible...
system_prompt = """
You are a feedback analysis tool for an online web product. For each given user feedback, 
analyze the content and output a JSON object with the following three keys:

1. `C`: A probability vector [UI, UX, Other] (sum = 1.00).  
   - UI: Feedback is mainly about visual design (e.g., colors, icons, layout).  
   - UX: Feedback is mainly about usability, experience, or interaction.  
   - Other: Feedback provides specific suggestions unrelated to UI/UX
   - If the feedback is irrelevant to the product, contains vague or unclear requirements, 
   or follows a generic positive/negative review template, distribute the probabilities more evenly across categories.
   
2. `S`: A probability vector (length 15, sum = 1.00), indicating the likelihood of each feedback category:  
   [Page layout, Image and illustration, Color scheme, Icon, 
   Text formatting, Content/Education, Lack of visual, No specific function, 
   Hard to find a function, Using a function, Slow loading, Responsive design issue, Login issue, Function overload, Other].
   - If the feedback is vague or unclear, distribute probabilities more evenly across categories.

3. `M`: A one-hot vector [Positive, Negative, Neutral] (only one element is 1, the rest are 0):  
   - Positive: Feedback expresses praise or satisfaction.  
   - Negative: Feedback expresses criticism or frustration.  
   - Neutral: Feedback is neither clearly positive nor negative.

**Instructions:**  
- For each feedback, output exactly one JSON object describing your analysis.  
- For `C` and `S`, values are floats (probabilities) between 0.00 and 1.00 and must sum to 1.00.  
- For `M`, values are integers (0 or 1) and must sum to 1.  
- All probability values should be rounded to two decimal places.
- Do not add any explanations or text before or after the JSON.  
- The variance in example vectors is for reference only - in practice, generate probability vectors 
based on the specific feedback content, as long as they sum to 1.00.

**Example output:**
{
  "C": [0.80, 0.10, 0.10],
  "S": [0.00, 0.10, 0.00, 0.00, 0.00, 0.20, 0.00, 0.00, 0.70, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00],
  "M": [1, 0, 0]
}
"""

In [5]:
client = OpenAI(
        api_key=API_KEY,
        base_url=BASE_URL,  
        )

def analyze_feedback(feedback_text):
    response = client.chat.completions.create(
        model=MODEL, 
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": feedback_text}
        ],
        response_format={"type": "json_object"},  
        temperature=0.0  
    )
    return json.loads(response.choices[0].message.content)

In [6]:
all_results = []
if USE_YOUR_API:
    for feedback in feedback_list:
        try:
            result = analyze_feedback(feedback)
            all_results.append(result)
        except Exception as e:
            print(f"Error analyzing: '{feedback}'\n{e}")

    C_matrix = np.array([r["C"] for r in all_results])  # shape: (n, 3)
    S_matrix = np.array([r["S"] for r in all_results])  # shape: (n, 15)
    M_matrix = np.array([r["M"] for r in all_results])  # shape: (n, 3)
    print(C_matrix,S_matrix,M_matrix)

    #SAVE
    save_path = os.path.join(current_dir, "all_results.pkl")

    with open(save_path, 'wb') as f:
        pickle.dump(all_results, f)

# Section 2: Use the provided probs via my API

The provided probs are stored in "all_results.pkl"

In [7]:
if not USE_YOUR_API: 
    with open('all_results.pkl', 'rb') as f:
        loaded_results = pickle.load(f)

    print(loaded_results) 

    C_matrix = np.array([item['C'] for item in loaded_results]) 
    S_matrix = np.array([item['S'] for item in loaded_results])  
    M_matrix = np.array([item['M'] for item in loaded_results])  

[{'C': [0.33, 0.33, 0.34], 'S': [0.07, 0.07, 0.07, 0.07, 0.07, 0.07, 0.07, 0.07, 0.07, 0.07, 0.07, 0.07, 0.07, 0.07, 0.07], 'M': [1, 0, 0]}, {'C': [0.7, 0.2, 0.1], 'S': [0.0, 0.0, 0.0, 0.9, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1], 'M': [0, 1, 0]}, {'C': [0.6, 0.2, 0.2], 'S': [0.1, 0.0, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2, 0.0, 0.0, 0.0, 0.2], 'M': [0, 1, 0]}, {'C': [0.0, 0.0, 1.0], 'S': [0.0, 0.0, 0.0, 0.0, 0.0, 0.8, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2], 'M': [0, 0, 1]}, {'C': [0.33, 0.33, 0.34], 'S': [0.07, 0.07, 0.07, 0.07, 0.07, 0.07, 0.07, 0.07, 0.07, 0.07, 0.07, 0.07, 0.07, 0.07, 0.07], 'M': [0, 1, 0]}]


In [8]:
Product = "X"

#hyperparameters for over weight on score 1-3, regarding origin, category and information amount
w = np.array([0.10, 0.45, 0.45])

#hyperparameters on Score1: Wo 
w_o = np.array([0.35,0.25,0.25,0.15])
#hyperparameters on Sentiment: Wl
w_l = np.array([0.2, 0.7, 0.1])

#The hyperparameter rou
rou_1 = 0.25
rou_2 = 1-rou_1

#The cost 
cost = np.array([22.00, 24.00, 42.00])
scores = cost/np.average(cost)
cost_scores = 1/scores 

# Priors on Category C - dirichlet parameters - We'll add 1 for all
#The order should be UI UX OTHER
feedback_M_counts = df_prior[df_prior["Product"] == Product]["Feedback_Type"].value_counts().to_dict()
print(f"initial prior for C:{feedback_M_counts}")
a_1, a_2, a_3 = feedback_M_counts['UI'] + 1, feedback_M_counts['UX'] + 1, feedback_M_counts['Other'] + 1
a_prior = np.array([a_1, a_2, a_3])

# Similarly, priors on Sentiment
#The order should be P, N, Neutral
feedback_S_counts = df_prior[df_prior["Product"] == Product]["Sentiment"].value_counts().to_dict()
print(f"initial prior for M:{feedback_S_counts}")
l_1, l_2, l_3 = feedback_S_counts['Positive'] + 1, feedback_S_counts['Negative'] + 1, feedback_S_counts['Neutral'] + 1
l_prior = np.array([l_1, l_2, l_3])

#Source matrix: Oc for ["Patient","Doctor","Care_Giver","Other"]
encoder = OneHotEncoder(categories=[["Patient", "Doctor", "Care_Giver", "Other"]],sparse_output=False)
O_c = encoder.fit_transform(df_latest[["Sources"]])

initial prior for C:{'UX': 16, 'UI': 4, 'Other': 2}
initial prior for M:{'Positive': 12, 'Neutral': 5, 'Negative': 5}


In [9]:
# The larger entropy, the sparse category we have, the LOWER score we ended in
def info_score(S):
    probs = S / np.sum(S)
    return np.sum([p * np.log2(p) for p in probs if p > 0])

# Dirichlet_expectation 
def dir_E(alpha):
    total = np.sum(alpha)
    return alpha / total

def LLMs_to_LLMc(LLM_s):
    LLM_c = np.array([
        np.sum(LLM_s[0:7]),    
        np.sum(LLM_s[1:14]),  
        LLM_s[14]           
    ])
    return LLM_c


In [10]:
#store the scroes
df_latest['score'] = 0.0

#calculate the scores
for i, id in enumerate(df_latest["ID"]):
    
    score_1 = w[0] * ( w_o @ O_c[i] )

    score_2_1 = rou_1 * (cost_scores @ dir_E(a_prior)) * (w_l @ dir_E(l_prior)) 
    score_2_2 = rou_2 * (cost_scores @ LLMs_to_LLMc(S_matrix[i])) * (w_l @ dir_E(M_matrix[i]))
    score_2 = w[1] * (score_2_1 + score_2_2)

    score_3 = w[2] * info_score(S_matrix[i])

    score = score_1 + score_2 + score_3

    df_latest.loc[i, 'score'] = score

    # Update the priors
    a_prior = a_prior + LLMs_to_LLMc(S_matrix[i])
    l_prior = l_prior + M_matrix[i]

df_latest["original_order"] = np.arange(len(df_latest))
df_latest = df_latest.sort_values(by='score', ascending=False).reset_index(drop=True) 

In [11]:
df_latest.head()
#Work for me, the most informative ones are sorted out:)

Unnamed: 0,ID,Sources,Feedback,Product,Date,Type_of_Cancer,score,original_order
0,948260,Patient,Icons look creepy without eyes:),X,2024-03-12,Multiple_Myeloma,0.422779,1
1,697696,Patient,Should talk more about male breast though,X,2024-12-12,Breast Cancer,-0.173154,3
2,948260,Doctor,"This site does what it’s supposed to, I guess....",X,2024-03-20,,-0.302199,2
3,697696,Patient,WTF is this,X,2024-02-08,Kidney Cancer,-1.252411,4
4,389023,Other,"I’ve been using this website for years, ever s...",X,2024-07-08,,-1.581279,0
