In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import networkx as nx
import matplotlib.pyplot as plt
from tqdm import tqdm

In [None]:
# Read the 'Indonesia' sheet of the expanded commonsense workbook.
commonsense_data = pd.read_excel(
    '/Users/juniorcedrictonga/RA_MBZUAI/gpt_4o_CCKG_expanded_prime_final_prompt.xlsx',
    sheet_name='Indonesia',
)
commonsense_data

In [None]:
# Cleaning: keep only the columns used downstream, drop exact duplicate rows,
# then discard rows whose 'event' is identical to their 'knowledge'.
kept_columns = ['event','knowledge','relation','llm_result','location','sub_topic']
clean_commonsense_data = (
    commonsense_data
    .loc[:, kept_columns]
    .drop_duplicates()
    .loc[lambda frame: frame['event'] != frame['knowledge']]
)
clean_commonsense_data

## Sub-graph extraction

In [None]:
# Disabled draft: DFS-based path extraction that also works when the sub-graph
# is cyclic. Kept for reference only; it was found too expensive to run.
#all_paths = []
#paths_with_relations = []
#for source in G_sub.nodes:
    #    stack = [(source, [source])]  # DFS stack: (current node, current path)
        
     #   while stack:
    #        current_node, path = stack.pop()
            
            # Record the path once it is valid (at least 3 nodes)
     #       if len(path) >= 3:
      #          relations = [
      #              G_sub.get_edge_data(path[i], path[i + 1])['relation']
      #              for i in range(len(path) - 1)
      #          ]
      #          paths_with_relations.append((path, relations))
      #          all_paths.append(path)
      #      
            # Extend the path by exploring the neighbours
      #      for neighbor in G_sub.neighbors(current_node):
       #         if len(path) + 1 > len(G_sub):  # Prevent infinite loops in cyclic graphs
      #              continue
        #        stack.append((neighbor, path + [neighbor]))

    # Structure the extracted path data
    #paths_data = [
       # {
    #        'path': " -> ".join(path),
      #      'relations': " - ".join(relations),
      #      'sub_topic': G_sub.nodes[path[0]].get('sub_topic', 'N/A'),  
     #       'path_len': len(path)
    #    }
    #    for path, relations in paths_with_relations
    #    ]


In [None]:
# Build one directed graph per sub-topic, collect per-sub-topic statistics,
# extract every simple path with at least 3 nodes from the acyclic sub-graphs,
# and plot each sub-graph.
df = clean_commonsense_data

# Global graph over every (event -> knowledge) assertion, all sub-topics mixed.
G = nx.DiGraph()
for _, row in df.iterrows():
    G.add_edge(row['event'], row['knowledge'], relation=row['relation'])

# NaN sub-topics are skipped: `df['sub_topic'] == NaN` never matches any row, so
# they would only yield an empty graph, an all-zero stats entry and a blank plot.
sub_topics = df['sub_topic'].dropna().unique()

subtopic_stats = {}

# Path records are accumulated in a plain list and converted to a DataFrame once
# after the loop; growing a DataFrame with pd.concat on every iteration is
# quadratic and concatenating onto an empty frame is deprecated in pandas.
path_columns = ['path', 'relations', 'sub_topic', 'path_len']
path_records = []

for sub_topic in sub_topics:
    df_sub = df[df['sub_topic'] == sub_topic]
    G_sub = nx.DiGraph()
    for _, row in df_sub.iterrows():
        G_sub.add_edge(row['event'], row['knowledge'], relation=row['relation'],
                       location=row['location'], sub_topic=row['sub_topic'])

    # Statistics for this sub-topic.
    num_nodes = G_sub.number_of_nodes()
    num_edges = G_sub.number_of_edges()
    degree_distribution = dict(G_sub.degree())
    average_degree = sum(degree_distribution.values()) / num_nodes if num_nodes > 0 else 0

    # The five nodes with the highest (in + out) degree.
    central_nodes = sorted(degree_distribution.items(), key=lambda x: x[1], reverse=True)[:5]
    subtopic_stats[sub_topic] = {
        'num_nodes': num_nodes,
        'num_edges': num_edges,
        'average_degree': average_degree,
        'central_nodes': central_nodes
    }

    # Path extraction (len >= 3 nodes); only attempted on acyclic sub-graphs.
    if nx.is_directed_acyclic_graph(G_sub):
        longest_path = nx.dag_longest_path(G_sub)
        print("\nLongest path : {}".format(longest_path))

        paths_with_relations = []
        # The (source, target) double loop is kept on purpose: it fixes the row
        # order of the resulting table, which the seeded sampling downstream
        # depends on.
        for source in G_sub.nodes:
            for target in G_sub.nodes:
                if source == target:
                    continue
                for path in nx.all_simple_paths(G_sub, source=source, target=target, cutoff=None):
                    if len(path) < 3:
                        continue
                    # Relation label of every consecutive edge along the path.
                    relations = [G_sub[u][v]['relation'] for u, v in zip(path, path[1:])]
                    paths_with_relations.append((path, relations))

        # str() guards the joins against non-string node or relation values
        # (e.g. numeric or missing Excel cells), which would otherwise raise.
        paths_data = [
            {
                'path': " -> ".join(map(str, path)),
                'relations': " - ".join(map(str, relations)),
                'sub_topic': sub_topic,
                'path_len': len(path),
            }
            for path, relations in paths_with_relations
        ]
        path_records.extend(paths_data)
    else:
        print("\n graph is cyclic")

    # Plot the sub-graph with its relation labels; the fixed seed keeps the
    # layout reproducible.
    fig = plt.figure(figsize=(12, 10))
    pos = nx.spring_layout(G_sub, seed=42)
    nx.draw(G_sub, pos, with_labels=True, node_size=50, edge_color="blue", alpha=0.7, font_size=8)
    edge_labels = nx.get_edge_attributes(G_sub, 'relation')
    nx.draw_networkx_edge_labels(G_sub, pos, edge_labels=edge_labels, font_size=8, font_color="red")

    plt.title(f"Graph per sub-topics : {sub_topic}")
    plt.show()
    plt.close(fig)  # release the figure so memory does not grow with the number of sub-topics

# One row per extracted path; the columns exist even when no path was found.
global_paths_df = pd.DataFrame(path_records, columns=path_columns)


# Text report of the statistics gathered for each sub-topic.
print("\nStatistics by sub-topics :")
for topic, summary in subtopic_stats.items():
    header_lines = (
        f"\nSub-topic : {topic}",
        f"- Number of nodes : {summary['num_nodes']}",
        f"- Number of edge: {summary['num_edges']}",
        f"- mean degree : {summary['average_degree']:.2f}",
        "- Most central node (per degre) :",
    )
    print("\n".join(header_lines))
    # One line per high-degree node recorded for this sub-topic.
    for hub, n_links in summary['central_nodes']:
        print(f"  - {hub}: {n_links} connexions")

In [None]:
# Display the extracted paths (columns: path, relations, sub_topic, path_len).
global_paths_df

In [None]:
# Export every extracted path to Excel; the DataFrame index is not written.
global_paths_df.to_excel(
    '/Users/juniorcedrictonga/RA_MBZUAI/Cultural_Commonsense_Knowledge_Graph/path_per_countries_subtopics_test.xlsx',
    sheet_name='Indonesia',
    index=False,
)

In [None]:
# Draw up to `total_samples` paths: first an equal quota from every sub-topic,
# then top up with random paths from whatever has not been picked yet.
total_samples = 250

# max(1, ...) on the divisor avoids a ZeroDivisionError when no sub-topic exists.
n_subtopics = global_paths_df['sub_topic'].nunique()
min_per_subtopic = max(1, total_samples // max(1, n_subtopics))

grouped = global_paths_df.groupby('sub_topic', group_keys=False)
# Sample each group explicitly instead of using groupby.apply: applying a
# function to groups that include the grouping column is deprecated (pandas
# 2.2) and would drop 'sub_topic' from the result once the deprecation is
# enforced. Group order and the per-group seed are unchanged, so the same rows
# are selected.
per_subtopic_samples = [
    group.sample(n=min(len(group), min_per_subtopic), random_state=42)
    for _, group in grouped
]
if per_subtopic_samples:
    samples = pd.concat(per_subtopic_samples)
else:
    samples = global_paths_df.iloc[0:0]  # empty frame, same columns

remaining = total_samples - len(samples)
if remaining > 0:
    # Never ask for more rows than are left, otherwise DataFrame.sample raises
    # when fewer than `total_samples` paths were extracted in total.
    unsampled = global_paths_df.drop(samples.index)
    n_additional = min(remaining, len(unsampled))
    if n_additional > 0:
        additional_samples = unsampled.sample(n=n_additional, random_state=42)
        samples = pd.concat([samples, additional_samples])

samples


In [None]:
# Export the sampled paths to Excel; the index is written too (to_excel default).
samples.to_excel(
    '/Users/juniorcedrictonga/RA_MBZUAI/Cultural_Commonsense_Knowledge_Graph/250_path_per_countries_subtopics.xlsx',
    sheet_name='Indonesia',
)

# Comparison with other CKGs: MANGO and CANDLE

In [None]:
# Settings shared by the overlap analyses.
threshold = 0.8  # a pair counts as a match when its similarity is strictly above this value
model = SentenceTransformer("all-MiniLM-L6-v2")  # sentence encoder used to embed the assertions

## 1. Mango

In [None]:
# Load the Mango dataset (JSON Lines file: one record per line).
file_path = '/Users/juniorcedrictonga/RA_MBZUAI/mango_dataset_v1.jsonl'
mango_data = pd.read_json(path_or_buf=file_path, lines=True)
mango_data

In [None]:
def extract_matching_assertinons_with_mango(mango_data, commonsense_data,model, threshold, use_just_indonesian_data_in_mango=True):
    """Find Mango assertions that are semantically close to our assertions.

    Every `llm_result` of `commonsense_data` is compared with every `assertion`
    of the selected Mango rows; a pair is kept when its similarity score is
    strictly greater than `threshold`.

    Parameters
    ----------
    mango_data : pandas.DataFrame
        Mango dataset; the columns 'assertion', 'concept' and 'culture' are read.
        The frame is not modified.
    commonsense_data : pandas.DataFrame
        Our dataset; the columns 'llm_result', 'relation', 'sub_topic', 'event'
        and 'knowledge' are read.
    model : object
        Encoder exposing `encode(list_of_str)` and `similarity(a, b)`; the
        latter must return a 2-D score matrix of shape (rows of ours, Mango rows).
    threshold : float
        Minimum (exclusive) similarity for a pair to be reported.
    use_just_indonesian_data_in_mango : bool, default True
        When True, only Mango rows whose 'culture' contains one of the
        Indonesia-related terms (case-insensitive) are compared.

    Returns
    -------
    relevant_rows : pandas.DataFrame
        The Mango rows that were compared. When filtering is enabled it carries
        an extra all-True 'contains_relevant_terms' column.
    matches_df : pandas.DataFrame
        One row per matching pair, ordered by our row then by Mango row. The
        columns are always present, even when nothing matched.
    """
    match_columns = [
        'mango_assertion', 'llm_result', 'relation', 'similarity', 'mango_concept',
        'mango_culture', 'sub_topic', 'event', 'knowledge', 'commonsense_data_index',
    ]

    if use_just_indonesian_data_in_mango:
        # Extract Indonesian data in Mango
        terms = ["Indonesia", "Southeast Asia", "Asia-Pacific","Global South","Indonesian"]
        regex_pattern = '|'.join(terms)
        is_relevant = mango_data['culture'].str.contains(regex_pattern, case=False, na=False)
        # Build the flag column on the filtered copy so that the caller's frame
        # is left untouched, while the returned frame keeps the same columns.
        relevant_rows = mango_data[is_relevant].assign(contains_relevant_terms=True)
    else:
        relevant_rows = mango_data

    # Nothing to compare: skip encoding and return an empty, well-formed table.
    if len(relevant_rows) == 0 or len(commonsense_data) == 0:
        return relevant_rows, pd.DataFrame(columns=match_columns)

    # find overlapping with our dataset and relevant rows of mango
    relevant_rows_embedd = model.encode(relevant_rows['assertion'].tolist())
    commonsense_data_embedd = model.encode(commonsense_data['llm_result'].tolist())
    similarities = model.similarity(commonsense_data_embedd, relevant_rows_embedd)

    # Bring the score matrix to NumPy (it may be a torch tensor) so that the
    # threshold test runs once on the whole matrix instead of cell by cell.
    if hasattr(similarities, 'detach'):
        similarities = similarities.detach().cpu()
    similarity_matrix = np.asarray(similarities)

    # Positions of the matching pairs in row-major order, i.e. the same order a
    # nested "for each of our rows / for each Mango row" loop would produce.
    ours_pos, mango_pos = np.nonzero(similarity_matrix > threshold)
    matched_ours = commonsense_data.iloc[ours_pos]
    matched_mango = relevant_rows.iloc[mango_pos]

    matches_df = pd.DataFrame(
        {
            'mango_assertion': matched_mango['assertion'].to_numpy(),
            'llm_result': matched_ours['llm_result'].to_numpy(),
            'relation': matched_ours['relation'].to_numpy(),
            'similarity': similarity_matrix[ours_pos, mango_pos].astype(float),
            'mango_concept': matched_mango['concept'].to_numpy(),
            'mango_culture': matched_mango['culture'].to_numpy(),
            'sub_topic': matched_ours['sub_topic'].to_numpy(),
            'event': matched_ours['event'].to_numpy(),
            'knowledge': matched_ours['knowledge'].to_numpy(),
            'commonsense_data_index': matched_ours.index.to_numpy(),
        },
        columns=match_columns,
    )
    return relevant_rows, matches_df



In [None]:
# Compare our cleaned assertions with the Indonesia-related rows of Mango.
idonesian_rows_in_mango, mango_matches_data = extract_matching_assertinons_with_mango(
    mango_data=mango_data,
    commonsense_data=clean_commonsense_data,
    model=model,
    threshold=threshold,
    use_just_indonesian_data_in_mango=True,
)

In [None]:
# Mango rows that were selected for the comparison.
idonesian_rows_in_mango

In [None]:
# Matching (Mango assertion, our assertion) pairs returned by the comparison.
mango_matches_data

In [None]:
# Number of distinct Mango assertions that overlap with our data
# (shown as the shape of the array of unique values).
unique_mango_assertions = mango_matches_data['mango_assertion'].unique()
unique_mango_assertions.shape

## 2. Candle

In [None]:
# Load the CANDLE dataset (JSON Lines file: one record per line).
file_path = '/Users/juniorcedrictonga/RA_MBZUAI/candle_dataset_v1.jsonl'
candle_data = pd.read_json(path_or_buf=file_path, lines=True)
candle_data

In [None]:
# Find the assertions of our dataset that overlap with CANDLE assertions: a
# pair is kept when its similarity is strictly greater than `threshold`.
commonsense_data = clean_commonsense_data  # NOTE: rebinds the name that held the raw Excel frame

# Declared up front so the result always has these columns, even with no match
# (otherwise a later candle_matches_df['candle_assertion'] raises KeyError).
candle_match_columns = [
    'candle_assertion', 'llm_result', 'relation', 'similarity', 'candle_facet',
    'candle_subject', 'candle_domain', 'candle_concepts', 'sub_topic', 'event',
    'knowledge', 'commonsense_data_index',
]

if len(commonsense_data) == 0 or len(candle_data) == 0:
    # Nothing to compare: skip encoding altogether.
    candle_matches_df = pd.DataFrame(columns=candle_match_columns)
else:
    candle_rows_embedd = model.encode(candle_data['assertion'].tolist())
    commonsense_data_embedd = model.encode(commonsense_data['llm_result'].tolist())
    similarities = model.similarity(commonsense_data_embedd, candle_rows_embedd)

    # Bring the score matrix to NumPy (it may be a torch tensor) so that the
    # threshold test runs once on the whole matrix instead of cell by cell.
    similarity_matrix = similarities
    if hasattr(similarity_matrix, 'detach'):
        similarity_matrix = similarity_matrix.detach().cpu()
    similarity_matrix = np.asarray(similarity_matrix)

    # Positions of the matching pairs in row-major order, i.e. the same order a
    # nested "for each of our rows / for each CANDLE row" loop would produce.
    ours_pos, candle_pos = np.nonzero(similarity_matrix > threshold)
    matched_ours = commonsense_data.iloc[ours_pos]
    matched_candle = candle_data.iloc[candle_pos]

    candle_matches_df = pd.DataFrame(
        {
            'candle_assertion': matched_candle['assertion'].to_numpy(),
            'llm_result': matched_ours['llm_result'].to_numpy(),
            'relation': matched_ours['relation'].to_numpy(),
            'similarity': similarity_matrix[ours_pos, candle_pos].astype(float),
            'candle_facet': matched_candle['facet'].to_numpy(),
            'candle_subject': matched_candle['subject'].to_numpy(),
            'candle_domain': matched_candle['domain'].to_numpy(),
            'candle_concepts': matched_candle['concepts'].to_numpy(),
            'sub_topic': matched_ours['sub_topic'].to_numpy(),
            'event': matched_ours['event'].to_numpy(),
            'knowledge': matched_ours['knowledge'].to_numpy(),
            'commonsense_data_index': matched_ours.index.to_numpy(),
        },
        columns=candle_match_columns,
    )

In [None]:
# Matching (CANDLE assertion, our assertion) pairs found by the comparison.
candle_matches_df

In [None]:
# Number of distinct CANDLE assertions that overlap with our data
# (shown as the shape of the array of unique values).
unique_candle_assertions = candle_matches_df['candle_assertion'].unique()
unique_candle_assertions.shape