In [None]:
import sys
import  os

import numpy as np
import pandas as pd
from io import StringIO
import graphviz
import lingam
from lingam.utils import make_dot, make_prior_knowledge
from causallearn.search.ConstraintBased.FCI import fci
from causallearn.utils.GraphUtils import GraphUtils
from copy import  deepcopy
from scipy import stats
from causallearn.utils.cit import CIT
from sklearn.cluster import KMeans
import json 
from itertools import combinations

# load groundtruth
meta = pd.read_excel('./Apple_Gastronome_AG7_v20240513.xlsx')

# san check for groundtruth
values = meta.values[:,:-1].astype(float)
names = list(meta.columns[:-1])

meta.head()

In [2]:
import numpy as np
import pandas as pd

from copy import  deepcopy
from scipy import stats
from scipy.stats import entropy

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

y = values[:,0]
x = values[:,1:]

def get_entropy_from_samples(y):
    _, sk = np.unique(y, return_counts=True)
    pk = sk / sk.sum()
    return entropy(pk) 
    
def get_conditional_entropy(y, X=None, n_clusters=10):
    if X is None:
        return get_entropy_from_samples(y)

    clf = LogisticRegression().fit(X, y.astype(int))
    return log_loss(y.astype(int), clf.predict_proba(X))


def get_conditional_mutual_infomaion(y, x, h=None):
    if (h is None) or (h.shape[1]==0):
        return get_conditional_entropy(y) - get_conditional_entropy(y, x)

    return get_conditional_entropy(y, h) - get_conditional_entropy(y, x)

def avg10(y, x, h=None):
    result = np.array([get_conditional_mutual_infomaion(y, x, h) for _ in range(10)])
    return result.mean(), result.std()



In [4]:

# Causal Discovery (FCI)
g, edges = fci(values, alpha = 0.05, verbose=False, independence_test_method='gsq')

# visualization
pdy = GraphUtils.to_pydot(g, labels=names)
print(pdy.to_string())

Depth=0, working on node 6: 100%|██████████| 7/7 [00:00<00:00, 636.52it/s]

X1 --> X7
digraph  {
dpi=200;
fontsize=18;
0 [label=X1];
0 [label=score];
1 [label=X2];
1 [label=nutrition];
2 [label=X3];
2 [label=size];
3 [label=X4];
3 [label=smell];
4 [label=X5];
4 [label=taste];
5 [label=X6];
5 [label=juiciness];
6 [label=X7];
6 [label=recmd];
2 -> 0  [arrowhead=normal, arrowtail=odot, dir=both];
3 -> 0  [arrowhead=normal, arrowtail=odot, dir=both];
4 -> 0  [arrowhead=normal, arrowtail=odot, dir=both];
0 -> 6  [arrowhead=normal, arrowtail=none, dir=both];
1 -> 6  [arrowhead=normal, arrowtail=odot, dir=both];
4 -> 5  [arrowhead=odot, arrowtail=odot, dir=both];
}






# Utils

In [5]:
def GetMB(G, node_name, y_node = 0):
    mbset = set([y_node])
    d = G.shape[0]

    get_direct_set = lambda x: set([idx for idx in range(d) if np.abs(G[x, idx]) + np.abs(G[idx, x]) > 0])

    direct_set = get_direct_set(y_node)
    mbset = mbset.union(direct_set)

    for idx in direct_set:
        if G[idx, y_node ] == -1:
            continue

        for each_secondary in get_direct_set(idx):
            if G[idx, each_secondary] == -1:
                continue
            mbset.add(each_secondary)
    
    return set([node_name[i] for i in mbset])

In [6]:
def get_factor_name(this_factor):

    this_factor = this_factor.split('\n')[0].lower()

    for i in range(20):
        if f'{i}.' in this_factor:
            this_factor = this_factor.replace(f'{i}.', '').strip()
    
    for s in ":":
        if s in this_factor:
            this_factor = this_factor.replace(s, '').strip()

    return this_factor

def check_factor_list(factor_list):
    factor_list = [f for f in factor_list if len(f)>10]
    factor_list = [f for f in factor_list if (" 1:" in f)and((" -1:" in f)and(" 0:" in f))]

    for outer_idx, each_str in enumerate(factor_list):
        for idx, ch in enumerate(each_str):
            if ch.isalpha():
                break 
        each_str_list = each_str[idx:].split('\n')

        each_str = each_str_list[0].lower()
        each_str_2 = '\n'.join(each_str_list[1:])

        for i in range(100):
            if f'{i}' in each_str:
                each_str = each_str.replace(f'{i}', '').strip()
            
        
        for s in ['factor', "*", 'name', ":", '.']:
            if s in each_str:
                each_str = each_str.replace(s, '').strip()
                
        if len(each_str)>0:
            factor_list[outer_idx] = each_str+'\n'+each_str_2
        else:
            factor_list[outer_idx] = ''
    
    factor_list = [f for f in factor_list if len(f)>10]

    return factor_list


def propose_prompt_with_data_in_front(example, used_factors = None):
    
    used_factors_hint = ''
    if used_factors is not None:
        used_factors_hint = "\n - Avoid overlapping with **those existing factors:**\n\t- " + '\n\t- '.join(used_factors) + '\n'
    prompt = f'''
You are an excellently helpful AI assistant for analyzing, abstracting, and processing data. Now try your best.

# Data

{example}

# Tasks: Factor Abstraction.

**What are the high-level factors behind the text that contribute to the allocation of groups?**

- Propose your candidate factors based on your observation on given data.
- The proposed factors should be diverse and different senmatically. Their overlapping should be minimized.

# About Output

Your output should contain parts described as follows.

**Part 1**: Consideration.

In this part, feel free to write down the process of your considerations. 
Hint: You need to abstract, identify, and design suitable factors with coresponding criteria, and each factor is only allowed to take value from  [-1, 0, 1] .  

**Part 2**: Factor filtration. You shoud decide whether to use each of the proposed factors by following criteria:
- Each new factor should be helpful to distinguish the groups.
- Each Factor should focus on one concrete aspect and try to avoid overlape. {used_factors_hint}

**Part 3**: Final Output. 

 In this part, you are required to report the factors that you decided to use. 
- Each factors' value should from **[-1, 0, 1]** 
- **You should give specific criterion for each value of each factor.**
- Keep the criterion for value 0 fixed as "Otherwise; or not mentioned". 


Report the factors **in following template:**

```
**Factor Name**
- 1: [Positive Criterion]
- 0: [Otherwise; or not mentioned].
- -1: [Negative Criterion]
```

'''
    return prompt

In [7]:
def update(data_interface, file_name):
    annotation_pd = pd.read_excel(file_name)
    for f_name in annotation_pd.columns:
        data_interface[f_name] = annotation_pd[f_name]
    return data_interface

In [8]:
data = ''
for g in np.unique(values[:,0]):
    data += f'\n## Group with \'Score\' = {int(g)}\n\n'
    g_indeces = np.arange(len(values[:,0]))[values[:,0]==g]
    g_indeces = np.random.choice(g_indeces, min(len(g_indeces), 3), replace=False)
    for i in g_indeces:
        this_review = meta.values[i,-1].replace('\n','').strip()
        data += f"- {this_review}\n"
print(data)


## Group with 'Score' = -4

- Despite its rich nutrient content, featuring dietary fiber and vitamin C, this small apple fails to impress with its musty, potentially rotten aroma and an overpowering sourness that eclipses any sweetness. Its disappointing quality and taste profile suggest it might not be profitable for suppliers or satisfying for customers, posing a risk of financial loss.
- This apple variety, with its notably low nutrient profile and diminutive stature, falls short in meeting desired quality marks. A pervasive mustiness indicates potential spoilage, detracting from its culinary appeal. Its sour overtones overshadow any sweetness, and a disappointing dryness further saps the fruit of vitality. For suppliers and customers alike, investing in this apple could unfortunately equate to financial risk rather than reward.
- Despite the apple's hydrating quality, it disappointingly presents with a low nutrient profile, an underwhelming small size, and an off-putting musty odo

# ----- exp begin ------

In [8]:
target_name = 'score'
benchmark_name = 'AG7'
exp_uid = '2024_05_16_1422_hk_Mixtral'
annotation = {}

# GPT-3.5-Turbo

https://poe.com/GPT-3.5-Turbo

https://poe.com/s/tYvcQuIyRB54wVpyUZyx


zeroshot: https://poe.com/s/Suv0XqninMg9nQTGFEiy


In [9]:
this_bot_name = "GPT-3.5-Turbo" 
anno_model  = "Mixtral-8x22b" 
V = set([target_name])
annotation[this_bot_name] = {}
data_interface = pd.DataFrame({target_name:values[:,0]})
selected_factors = set()

In [10]:
avg10(y, x, h=None)

(0.9171624448484529, 1.1102230246251565e-16)

## iter 1

### prompt

In [12]:
print(propose_prompt_with_data_in_front(data))


You are an excellently helpful AI assistant for analyzing, abstracting, and processing data. Now try your best.

# Data


## Group with 'Score' = -4

- Despite its rich nutrient content, featuring dietary fiber and vitamin C, this small apple fails to impress with its musty, potentially rotten aroma and an overpowering sourness that eclipses any sweetness. Its disappointing quality and taste profile suggest it might not be profitable for suppliers or satisfying for customers, posing a risk of financial loss.
- This apple variety, with its notably low nutrient profile and diminutive stature, falls short in meeting desired quality marks. A pervasive mustiness indicates potential spoilage, detracting from its culinary appeal. Its sour overtones overshadow any sweetness, and a disappointing dryness further saps the fruit of vitality. For suppliers and customers alike, investing in this apple could unfortunately equate to financial risk rather than reward.
- Despite the apple's hydrating q

In [11]:
result = """ 
<Factor Begin>


**Factor 1: Nutrient Content**
- 1: The apple has a rich nutrient profile, including dietary fiber and vitamin C.
- 0: Otherwise; or not mentioned.
- -1: The apple has a low nutrient profile and falls short of expectations.

**Factor 2: Size**
- 1: The apple is large in size.
- 0: Otherwise; or not mentioned.
- -1: The apple is small in size.

**Factor 3: Aroma**
- 1: The apple has a robust and pleasant aroma.
- 0: Otherwise; or not mentioned.
- -1: The apple has a musty or rotten scent.

**Factor 4: Taste Profile**
- 1: The apple has a delightful sweetness and a balanced flavor.
- 0: Otherwise; or not mentioned.
- -1: The apple is overly sour or leans towards tartness, lacking sweetness.

**Factor 5: Juiciness**
- 1: The apple is juicy and refreshing.
- 0: Otherwise; or not mentioned.
- -1: The apple has a dry texture and lacks juiciness.



"""
factor_list = result.split('<Factor Begin>')[-1].split('\n\n')

factor_list = check_factor_list(factor_list)
factor_list

['nutrient content\n- 1: The apple has a rich nutrient profile, including dietary fiber and vitamin C.\n- 0: Otherwise; or not mentioned.\n- -1: The apple has a low nutrient profile and falls short of expectations.',
 'size\n- 1: The apple is large in size.\n- 0: Otherwise; or not mentioned.\n- -1: The apple is small in size.',
 'aroma\n- 1: The apple has a robust and pleasant aroma.\n- 0: Otherwise; or not mentioned.\n- -1: The apple has a musty or rotten scent.',
 'taste profile\n- 1: The apple has a delightful sweetness and a balanced flavor.\n- 0: Otherwise; or not mentioned.\n- -1: The apple is overly sour or leans towards tartness, lacking sweetness.',
 'juiciness\n- 1: The apple is juicy and refreshing.\n- 0: Otherwise; or not mentioned.\n- -1: The apple has a dry texture and lacks juiciness.']

### confirmation

In [12]:
iteration = 1
file_name = rf"annotations\Apple_Gastronome_AG7_v20240513_colab_annoation_{exp_uid}_{this_bot_name}_iter{iteration}.xlsx"
file_name

'annotations\\Apple_Gastronome_AG7_v20240513_colab_annoation_2024_05_16_1422_hk_Mixtral_GPT-3.5-Turbo_iter1.xlsx'

In [13]:
data_interface = update(data_interface, file_name)


all_factors = list(data_interface.columns)

for f in all_factors:
    annotation[this_bot_name][f] = deepcopy(data_interface[f].to_list())
    
data_interface.head()

Unnamed: 0,score,nutrient content,size,aroma,taste profile,juiciness
0,-3.0,-1,-1,0,-1,-1
1,0.0,1,-1,1,-1,1
2,3.0,-1,1,1,1,0
3,2.0,-1,1,-1,-1,1
4,1.0,1,1,-1,1,1


In [14]:
#annotated_name = [target_name]+list(V.difference([target_name])) + [get_factor_name(fstr) for fstr in factor_list]
annotated_name = [target_name]+list(V.difference([target_name])) + list(set(all_factors).difference(V).difference([target_name]))
annoted_values = data_interface[annotated_name].values

ci_test = CIT(annoted_values, 'kci')
alpha = 0.05

V_indeces = [i+1 for i in range(len(V)-1)]

new_factors = set()

for i in range(len(V), len(annotated_name)):
    p_value = ci_test(0, i, V_indeces)
    check = p_value < alpha 
    if check:
        new_factors.add(annotated_name[i])
        print(f"{'√' if check else '×'}\t{p_value}\t{annotated_name[i]}")

print('\n\n')

new_factors_from_pools = set()
V_indeces_pools = [i for i,f in enumerate(annotated_name) if f in V.union(new_factors).difference([target_name])]

for i in range(len(V), len(annotated_name)):
    if i in V_indeces_pools:
        continue
    
    p_value = ci_test(0, i, V_indeces_pools)
    check = p_value < alpha 
    if check:
        new_factors_from_pools.add(annotated_name[i])
    print(f"{'√' if check else '×'}\t{p_value}\t{annotated_name[i]}")

√	1.0236655967332808e-10	taste profile
√	0.0007950928723781114	juiciness
√	2.848621338813473e-11	aroma
√	2.2870594307278225e-14	size



×	0.23633539289391092	nutrient content


In [15]:
selected_factors = V.union(new_factors).union(new_factors_from_pools)
#annotated_name = [target_name]+ list(V.difference([target_name])) +list(new_factors.difference([target_name]))
annotated_name = [target_name]+ list(selected_factors.difference([target_name]))
annoted_values = data_interface[annotated_name].values

corr_matrix = data_interface[annotated_name].corr().abs().values

V_indeces = []
for node_index in range(1, len(annotated_name)):
    if corr_matrix[node_index, :node_index].max() > 0.9:
        continue
    V_indeces.append(node_index)

annotated_name = [target_name]+ [annotated_name[i] for i in V_indeces]
annoted_values = data_interface[annotated_name].values

g, edges = fci(annoted_values, alpha = 0.05, independence_test_method='kci', depth=3, verbose=False)
new_V = GetMB(g.graph, annotated_name, y_node = 0)

new_V


Depth=0, working on node 4: 100%|██████████| 5/5 [00:00<00:00, 277.78it/s]


{'aroma', 'juiciness', 'score', 'size', 'taste profile'}

In [16]:
old_v = deepcopy(V)
V = deepcopy(new_V)
factors = [ f for f in V if f!= target_name]
annotated_name = [target_name]+factors
annoted_values = data_interface[annotated_name].values

# Causal Discovery (FCI)
g, edges = fci(annoted_values, alpha = 0.05, independence_test_method='kci', verbose=False)

# visualization
pdy = GraphUtils.to_pydot(g, labels=annotated_name)
print(pdy.to_string())

Depth=0, working on node 4: 100%|██████████| 5/5 [00:00<00:00, 295.08it/s]


digraph  {
dpi=200;
fontsize=18;
0 [label=X1];
0 [label=score];
1 [label=X2];
1 [label=aroma];
2 [label=X3];
2 [label=size];
3 [label=X4];
3 [label="taste profile"];
4 [label=X5];
4 [label=juiciness];
1 -> 0  [arrowhead=normal, arrowtail=odot, dir=both];
2 -> 0  [arrowhead=normal, arrowtail=odot, dir=both];
3 -> 0  [arrowhead=normal, arrowtail=odot, dir=both];
3 -> 4  [arrowhead=odot, arrowtail=odot, dir=both];
}



In [17]:
con_mi_old, _ = avg10(y, x, h=data_interface[list(old_v.difference([target_name]))].values)

con_mi_add, _ = avg10(y, x, h=data_interface[list(old_v.union(new_factors).difference([target_name]))].values)

con_mi_current, _ = avg10(y, x, h=data_interface[list(V.difference([target_name]))].values)

n_new_factor = len(set([get_factor_name(fstr) for fstr in factor_list]).intersection(new_factors))

C = 1 - (con_mi_add/con_mi_old)**(1/n_new_factor) if n_new_factor>0 else '-'

P = n_new_factor/ len(factor_list)

print(P, C, con_mi_current, sep='\t')

0.8	0.20578886775739225	0.36491367942348685


## iter 2

### feedback

In [None]:
focus_factor = 'score'
focus_label = data_interface[focus_factor].values[:100]
background_factor = [f for f in V if f not in [focus_factor]]
cond_vectors = data_interface[background_factor].values[:100,:]

g = KMeans(n_clusters=len(background_factor)+1, random_state=0, n_init="auto").fit(cond_vectors).labels_

entropy_values = []
set_g = list(set(g))
for g_idx in set_g:
    qk = np.unique(focus_label[g==g_idx], return_counts=True)[1]
    n = qk.sum()
    qk = qk/ n
    this_entropy_values = stats.entropy(qk)
    if n < len(qk) * 3:
        this_entropy_values = -1
    entropy_values.append(this_entropy_values)
    print(g_idx, entropy_values[-1],n, qk)

g_chosen = set_g[np.argmax(entropy_values)]
print(f'choose g=={g_chosen}')


In [22]:
example = ''

chosen_idx = g==g_chosen
for f_v in np.sort(list(set(focus_label[chosen_idx]))):
    example +=  f'\n\n## group with \'{focus_factor}\' = {f_v}\n'
    these_indeces = np.arange(len(focus_label))[(focus_label == f_v)&(chosen_idx)]
    if len(these_indeces) > 3:
        these_indeces = np.random.choice(these_indeces, 3, replace=False)
    for each_idx in these_indeces:
        this_review = meta.values[each_idx,-1]
        pieces = this_review.split('\n\n')[:2]
        del_idx = []
        for idx, piece in enumerate(pieces):
            if 'pattern' in piece.lower():
                del_idx += [idx, idx+1]
        this_review = '\n\n'.join([pieces[idx] for idx in range(len(pieces)) if idx not in del_idx])
        example +=  f'\n- {this_review}'
print(propose_prompt_with_data_in_front(example, [f  for f in background_factor if f not in [target_name]]))



You are an excellently helpful AI assistant for analyzing, abstracting, and processing data. Now try your best.

# Data



## group with 'score' = -1.0

- This apple variety impresses with its robust nutritional profile, packing a hearty dose of dietary fiber and vitamin C. Health-conscious consumers will appreciate the natural boost to their daily diet, bolstering both digestive wellness and immune system strength. Its rich nutrient content makes this apple an excellent choice for suppliers to offer and customers to enjoy.

## group with 'score' = 0.0

-  The apple delights with an intoxicating, strong aroma that promises to captivate the senses. This olfactory pleasure hints at the exceptional quality and freshness, distinguishing it as a choice fruit for both suppliers and consumers seeking a truly aromatic indulgence. The unmistakable scent elevates the apple beyond mere taste to a full sensory experience.
-  Despite boasting a robust nutrient profile with ample dietary fiber and 

In [18]:
result = """ 
<Factor Begin>


**Factor 1: Nutritional Profile**
- 1: The apple variety has an impressive nutrient profile, packing a hearty dose of dietary fiber and vitamin C.
- 0: Otherwise; or not mentioned.
- -1: The apple variety has low nutrient content and lacks essential nutrients.

**Factor 2: Freshness**
- 1: The apple variety has exceptional quality and freshness, distinguishing it as a choice fruit.
- 0: Otherwise; or not mentioned.
- -1: The apple variety has quality issues, such as being musty, rotten, or past prime freshness.

**Factor 3: Market Potential**
- 1: The apple variety has notable market potential and holds promise for market expansion.
- 0: Otherwise; or not mentioned.
- -1: The apple variety has limited market potential and may result in disappointing sales figures.

**Factor 4: Indulgent Taste**
- 1: The apple variety offers a more indulgent taste, with delightful sweetness and abundant, refreshing moisture.
- 0: Otherwise; or not mentioned.
- -1: The apple variety has a relatively low nutrient content but offers a more indulgent taste.

**Factor 5: Moisture Content**
- 1: The apple variety has abundant and refreshing moisture, ensuring a juicy and satisfying eating experience.
- 0: Otherwise; or not mentioned.
- -1: The apple variety lacks moisture and may be dry.




"""
factor_list = result.split('<Factor Begin>')[-1].split('\n\n')

factor_list = check_factor_list(factor_list)
factor_list

['nutritional profile\n- 1: The apple variety has an impressive nutrient profile, packing a hearty dose of dietary fiber and vitamin C.\n- 0: Otherwise; or not mentioned.\n- -1: The apple variety has low nutrient content and lacks essential nutrients.',
 'freshness\n- 1: The apple variety has exceptional quality and freshness, distinguishing it as a choice fruit.\n- 0: Otherwise; or not mentioned.\n- -1: The apple variety has quality issues, such as being musty, rotten, or past prime freshness.',
 'market potential\n- 1: The apple variety has notable market potential and holds promise for market expansion.\n- 0: Otherwise; or not mentioned.\n- -1: The apple variety has limited market potential and may result in disappointing sales figures.',
 'indulgent taste\n- 1: The apple variety offers a more indulgent taste, with delightful sweetness and abundant, refreshing moisture.\n- 0: Otherwise; or not mentioned.\n- -1: The apple variety has a relatively low nutrient content but offers a mor

### confirmation

In [19]:
iteration = 2
file_name = rf"annotations\Apple_Gastronome_AG7_v20240513_colab_annoation_{exp_uid}_{this_bot_name}_iter{iteration}.xlsx"
file_name

'annotations\\Apple_Gastronome_AG7_v20240513_colab_annoation_2024_05_16_1422_hk_Mixtral_GPT-3.5-Turbo_iter2.xlsx'

In [20]:
data_interface = update(data_interface, file_name)


all_factors = list(data_interface.columns)

for f in all_factors:
    annotation[this_bot_name][f] = deepcopy(data_interface[f].to_list())
    
data_interface.head()

Unnamed: 0,score,nutrient content,size,aroma,taste profile,juiciness,nutritional profile,freshness,market potential,indulgent taste,moisture content
0,-3.0,-1,-1,0,-1,-1,-1,-1,-1,0,-1
1,0.0,1,-1,1,-1,1,-1,1,1,0,1
2,3.0,-1,1,1,1,0,0,0,1,0,0
3,2.0,-1,1,-1,-1,1,-1,-1,1,1,1
4,1.0,1,1,-1,1,1,1,-1,1,1,1


In [21]:
#annotated_name = [target_name]+list(V.difference([target_name])) + [get_factor_name(fstr) for fstr in factor_list]
annotated_name = [target_name]+list(V.difference([target_name])) + list(set(all_factors).difference(V).difference([target_name]))
annoted_values = data_interface[annotated_name].values

ci_test = CIT(annoted_values, 'kci')
alpha = 0.05

V_indeces = [i+1 for i in range(len(V)-1)]

new_factors = set()

for i in range(len(V), len(annotated_name)):
    p_value = ci_test(0, i, V_indeces)
    check = p_value < alpha 
    if check:
        new_factors.add(annotated_name[i])
        print(f"{'√' if check else '×'}\t{p_value}\t{annotated_name[i]}")

print('\n\n')

new_factors_from_pools = set()
V_indeces_pools = [i for i,f in enumerate(annotated_name) if f in V.union(new_factors).difference([target_name])]

for i in range(len(V), len(annotated_name)):
    if i in V_indeces_pools:
        continue
    
    p_value = ci_test(0, i, V_indeces_pools)
    check = p_value < alpha 
    if check:
        new_factors_from_pools.add(annotated_name[i])
    print(f"{'√' if check else '×'}\t{p_value}\t{annotated_name[i]}")

√	0.0018704621599650029	freshness
√	2.7859217743841924e-07	market potential



√	0.004018400677470946	indulgent taste
×	0.1538428560713233	nutritional profile
×	0.28332416389905957	nutrient content
×	0.5222711519140202	moisture content


In [22]:
selected_factors = V.union(new_factors).union(new_factors_from_pools)
#annotated_name = [target_name]+ list(V.difference([target_name])) +list(new_factors.difference([target_name]))
annotated_name = [target_name]+ list(selected_factors.difference([target_name]))
annoted_values = data_interface[annotated_name].values

corr_matrix = data_interface[annotated_name].corr().abs().values

V_indeces = []
for node_index in range(1, len(annotated_name)):
    if corr_matrix[node_index, :node_index].max() > 0.9:
        continue
    V_indeces.append(node_index)

annotated_name = [target_name]+ [annotated_name[i] for i in V_indeces]
annoted_values = data_interface[annotated_name].values

g, edges = fci(annoted_values, alpha = 0.05, independence_test_method='kci', depth=3, verbose=False)
new_V = GetMB(g.graph, annotated_name, y_node = 0)

new_V


Depth=0, working on node 7: 100%|██████████| 8/8 [00:00<00:00, 186.04it/s]


X1 --> X4


{'indulgent taste', 'market potential', 'score', 'size', 'taste profile'}

In [23]:
old_v = deepcopy(V)
V = deepcopy(new_V)
factors = [ f for f in V if f!= target_name]
annotated_name = [target_name]+factors
annoted_values = data_interface[annotated_name].values

# Causal Discovery (FCI)
g, edges = fci(annoted_values, alpha = 0.05, independence_test_method='kci', verbose=False)

# visualization
pdy = GraphUtils.to_pydot(g, labels=annotated_name)
print(pdy.to_string())

Depth=0, working on node 4: 100%|██████████| 5/5 [00:00<00:00, 289.97it/s]


X1 --> X5
digraph  {
dpi=200;
fontsize=18;
0 [label=X1];
0 [label=score];
1 [label=X2];
1 [label=size];
2 [label=X3];
2 [label="indulgent taste"];
3 [label=X4];
3 [label="taste profile"];
4 [label=X5];
4 [label="market potential"];
1 -> 0  [arrowhead=normal, arrowtail=odot, dir=both];
3 -> 0  [arrowhead=normal, arrowtail=odot, dir=both];
0 -> 4  [arrowhead=normal, arrowtail=none, dir=both];
2 -> 3  [arrowhead=odot, arrowtail=odot, dir=both];
}



In [24]:
con_mi_old, _ = avg10(y, x, h=data_interface[list(old_v.difference([target_name]))].values)

con_mi_add, _ = avg10(y, x, h=data_interface[list(old_v.union(new_factors).difference([target_name]))].values)

con_mi_current, _ = avg10(y, x, h=data_interface[list(V.difference([target_name]))].values)

n_new_factor = len(set([get_factor_name(fstr) for fstr in factor_list]).intersection(new_factors))

C = 1 - (con_mi_add/con_mi_old)**(1/n_new_factor) if n_new_factor>0 else '-'

P = n_new_factor/ len(factor_list)

print(P, C, con_mi_current, sep='\t')

0.4	0.1986471650332019	0.3605296428478349


## iter 3

### feedback

In [31]:
focus_factor = 'score'
focus_label = data_interface[focus_factor].values[:100]
background_factor = [f for f in V if f not in [focus_factor]]
cond_vectors = data_interface[background_factor].values[:100,:]

g = KMeans(n_clusters=len(background_factor)+1, random_state=0, n_init="auto").fit(cond_vectors).labels_

entropy_values = []
set_g = list(set(g))
for g_idx in set_g:
    qk = np.unique(focus_label[g==g_idx], return_counts=True)[1]
    n = qk.sum()
    qk = qk/ n
    this_entropy_values = stats.entropy(qk)
    if n < len(qk) * 3:
        this_entropy_values = -1
    entropy_values.append(this_entropy_values)
    print(g_idx, entropy_values[-1],n, qk)

g_chosen = set_g[np.argmax(entropy_values)]
print(f'choose g=={g_chosen}')


0 1.6932630222740752 22 [0.09090909 0.22727273 0.13636364 0.36363636 0.09090909 0.04545455
 0.04545455]
1 -1 8 [0.125 0.25  0.125 0.375 0.125]
2 1.3994886078590607 20 [0.05 0.15 0.35 0.35 0.1 ]
3 1.135900959294373 20 [0.1  0.25 0.1  0.55]
4 1.5501909511534504 30 [0.1        0.16666667 0.33333333 0.06666667 0.3        0.03333333]
choose g==0


In [32]:
example = ''

chosen_idx = g==g_chosen
for f_v in np.sort(list(set(focus_label[chosen_idx]))):
    example +=  f'\n\n## group with \'{focus_factor}\' = {f_v}\n'
    these_indeces = np.arange(len(focus_label))[(focus_label == f_v)&(chosen_idx)]
    if len(these_indeces) > 3:
        these_indeces = np.random.choice(these_indeces, 3, replace=False)
    for each_idx in these_indeces:
        this_review = meta.values[each_idx,-1]
        pieces = this_review.split('\n\n')[:2]
        del_idx = []
        for idx, piece in enumerate(pieces):
            if 'pattern' in piece.lower():
                del_idx += [idx, idx+1]
        this_review = '\n\n'.join([pieces[idx] for idx in range(len(pieces)) if idx not in del_idx])
        example +=  f'\n- {this_review}'
print(propose_prompt_with_data_in_front(example, [f  for f in background_factor if f not in [target_name]]))



You are an excellently helpful AI assistant for analyzing, abstracting, and processing data. Now try your best.

# Data



## group with 'score' = -3.0

-  The apple in question, regrettably, fails to meet satisfactory standards, offering minimal nutritional value and an unpleasant, musty flavor profile that leans excessively sour. Its dry texture further detracts from the eating experience. Given these shortcomings, it's unlikely to satisfy consumers, posing a significant risk of commercial underperformance and potential financial loss for suppliers.
-  Upon evaluation, this apple variety comes up short on essential nutrients and disappoints with a musty, near-rotten taste profile. Its flavor leans excessively sour, lacking the balance of sweetness one might anticipate. The fruit's dryness suggests a deficiency in moisture, which significantly undermines its culinary appeal. Consequently, it's a poor investment for suppliers and a regrettable choice for consumers, potentially yieldin

In [25]:
result = """ 
<Factor Begin>

**Nutritional Value**
- 1: Higher nutritional value, rich in essential nutrients.
- 0: Otherwise; or not mentioned.
- -1: Lower nutritional value, minimal nutritional content.

**Flavor Profile**
- 1: Balanced flavor profile with a mix of sweetness and sourness.
- 0: Otherwise; or not mentioned.
- -1: Unbalanced flavor profile leaning towards sourness.

**Texture**
- 1: Juicy and rich texture.
- 0: Otherwise; or not mentioned.
- -1: Dry texture.

**Moisture Content**
- 1: Higher moisture content.
- 0: Otherwise; or not mentioned.
- -1: Lower moisture content.

**Aroma**
- 1: Strong and enticing aroma.
- 0: Otherwise; or not mentioned.
- -1: Musty or rotten aroma.

**Freshness**
- 1: Higher freshness.
- 0: Otherwise; or not mentioned.
- -1: Lower freshness.
"""
factor_list = result.split('<Factor Begin>')[-1].split('\n\n')

factor_list = check_factor_list(factor_list)
factor_list

['nutritional value\n- 1: Higher nutritional value, rich in essential nutrients.\n- 0: Otherwise; or not mentioned.\n- -1: Lower nutritional value, minimal nutritional content.',
 'flavor profile\n- 1: Balanced flavor profile with a mix of sweetness and sourness.\n- 0: Otherwise; or not mentioned.\n- -1: Unbalanced flavor profile leaning towards sourness.',
 'texture\n- 1: Juicy and rich texture.\n- 0: Otherwise; or not mentioned.\n- -1: Dry texture.',
 'moisture content\n- 1: Higher moisture content.\n- 0: Otherwise; or not mentioned.\n- -1: Lower moisture content.',
 'aroma\n- 1: Strong and enticing aroma.\n- 0: Otherwise; or not mentioned.\n- -1: Musty or rotten aroma.',
 'freshness\n- 1: Higher freshness.\n- 0: Otherwise; or not mentioned.\n- -1: Lower freshness.\n']

### confirmation

In [26]:
iteration = 3
file_name = rf"annotations\Apple_Gastronome_AG7_v20240513_colab_annoation_{exp_uid}_{this_bot_name}_iter{iteration}.xlsx"
file_name

'annotations\\Apple_Gastronome_AG7_v20240513_colab_annoation_2024_05_16_1422_hk_Mixtral_GPT-3.5-Turbo_iter3.xlsx'

In [28]:
data_interface = update(data_interface, file_name)


all_factors = list(data_interface.columns)

for f in all_factors:
    annotation[this_bot_name][f] = deepcopy(data_interface[f].to_list())
    
data_interface.head()

Unnamed: 0,score,nutrient content,size,aroma,taste profile,juiciness,nutritional profile,freshness,market potential,indulgent taste,moisture content,nutritional value,flavor profile,texture
0,-3.0,-1,-1,0,-1,-1,-1,-1,-1,0,-1,-1,-1,-1
1,0.0,1,-1,1,-1,1,-1,1,1,0,1,1,-1,1
2,3.0,-1,1,1,1,0,0,0,1,0,0,0,0,0
3,2.0,-1,1,-1,-1,1,-1,-1,1,1,1,-1,1,1
4,1.0,1,1,-1,1,1,1,-1,1,1,1,1,0,1


In [29]:
#annotated_name = [target_name]+list(V.difference([target_name])) + [get_factor_name(fstr) for fstr in factor_list]
annotated_name = [target_name]+list(V.difference([target_name])) + list(set(all_factors).difference(V).difference([target_name]))
annoted_values = data_interface[annotated_name].values

ci_test = CIT(annoted_values, 'kci')
alpha = 0.05

V_indeces = [i+1 for i in range(len(V)-1)]

new_factors = set()

for i in range(len(V), len(annotated_name)):
    p_value = ci_test(0, i, V_indeces)
    check = p_value < alpha 
    if check:
        new_factors.add(annotated_name[i])
        print(f"{'√' if check else '×'}\t{p_value}\t{annotated_name[i]}")

print('\n\n')

new_factors_from_pools = set()
V_indeces_pools = [i for i,f in enumerate(annotated_name) if f in V.union(new_factors).difference([target_name])]

for i in range(len(V), len(annotated_name)):
    if i in V_indeces_pools:
        continue
    
    p_value = ci_test(0, i, V_indeces_pools)
    check = p_value < alpha 
    if check:
        new_factors_from_pools.add(annotated_name[i])
    print(f"{'√' if check else '×'}\t{p_value}\t{annotated_name[i]}")

√	3.701218185903077e-05	nutritional value
√	1.2800871473928055e-12	aroma
√	0.00017371623159445626	freshness
√	3.8587890410002323e-05	nutritional profile
√	1.9079969638902128e-05	nutrient content



×	0.7087925098871173	texture
×	0.5098746453758228	moisture content
×	0.6854193167136001	juiciness
√	0.036611117597464604	flavor profile


In [30]:
selected_factors = V.union(new_factors).union(new_factors_from_pools)
#annotated_name = [target_name]+ list(V.difference([target_name])) +list(new_factors.difference([target_name]))
annotated_name = [target_name]+ list(selected_factors.difference([target_name]))
annoted_values = data_interface[annotated_name].values

corr_matrix = data_interface[annotated_name].corr().abs().values

V_indeces = []
for node_index in range(1, len(annotated_name)):
    if corr_matrix[node_index, :node_index].max() > 0.9:
        continue
    V_indeces.append(node_index)

annotated_name = [target_name]+ [annotated_name[i] for i in V_indeces]
annoted_values = data_interface[annotated_name].values

g, edges = fci(annoted_values, alpha = 0.05, independence_test_method='kci', depth=3, verbose=False)
new_V = GetMB(g.graph, annotated_name, y_node = 0)

new_V


Depth=0, working on node 8: 100%|██████████| 9/9 [00:00<00:00, 160.52it/s]


X4 --> X3
X6 --> X4
X4 --> X9
X7 --> X6
X9 --> X7


{'market potential', 'nutritional value', 'score', 'size', 'taste profile'}

In [31]:
old_v = deepcopy(V)
V = deepcopy(new_V)
factors = [ f for f in V if f!= target_name]
annotated_name = [target_name]+factors
annoted_values = data_interface[annotated_name].values

# Causal Discovery (FCI)
g, edges = fci(annoted_values, alpha = 0.05, independence_test_method='kci', verbose=False)

# visualization
pdy = GraphUtils.to_pydot(g, labels=annotated_name)
print(pdy.to_string())

Depth=0, working on node 4: 100%|██████████| 5/5 [00:00<00:00, 285.19it/s]


X1 --> X5
digraph  {
dpi=200;
fontsize=18;
0 [label=X1];
0 [label=score];
1 [label=X2];
1 [label="nutritional value"];
2 [label=X3];
2 [label=size];
3 [label=X4];
3 [label="taste profile"];
4 [label=X5];
4 [label="market potential"];
2 -> 0  [arrowhead=normal, arrowtail=odot, dir=both];
3 -> 0  [arrowhead=normal, arrowtail=odot, dir=both];
0 -> 4  [arrowhead=normal, arrowtail=none, dir=both];
1 -> 4  [arrowhead=normal, arrowtail=odot, dir=both];
3 -> 4  [arrowhead=normal, arrowtail=none, dir=both];
}



In [32]:
con_mi_old, _ = avg10(y, x, h=data_interface[list(old_v.difference([target_name]))].values)

con_mi_add, _ = avg10(y, x, h=data_interface[list(old_v.union(new_factors).difference([target_name]))].values)

con_mi_current, _ = avg10(y, x, h=data_interface[list(V.difference([target_name]))].values)

n_new_factor = len(set([get_factor_name(fstr) for fstr in factor_list]).intersection(new_factors))

C = 1 - (con_mi_add/con_mi_old)**(1/n_new_factor) if n_new_factor>0 else '-'

P = n_new_factor/ len(factor_list)

print(P, C, con_mi_current, sep='\t')

0.5	0.24809703109881054	0.3327250039595729
