In [None]:
import os
import pickle
import shutil
import pandas as pd
from pathlib import Path
from tqdm import tqdm

In [None]:
# Never commit a real key here. A key that is already exported in the
# environment is kept; 'xxx' is only a placeholder fallback and must be replaced
# (or OPENAI_API_KEY exported) before the OpenAI-backed chains are invoked.
os.environ.setdefault("OPENAI_API_KEY", 'xxx')

# All data

## Load data

In [None]:
# data_dir: directory the input CSV tables are read from in the cells below.
# save_dir: directory the per-run result CSVs are written to.
data_dir = Path('data/kg_dr/processed')
save_dir = Path('saved/kuzu_rag')

In [None]:
# Preview the KG edge table that load_data() reads when use_multimodal=True.
kg_df = pd.read_csv(data_dir / 'kg_processed-suitable_category.csv')
print('The shape of the KG dataframe:', kg_df.shape)
kg_df.head()

In [None]:
# Preview the node table (the same nodes.csv is read by load_data()).
node_df = pd.read_csv(data_dir / 'nodes.csv')
print('The shape of the node dataframe:', node_df.shape)
node_df.head()

In [None]:
# Preview the label table that load_data() reads when use_multimodal=True.
label_df = pd.read_csv(data_dir / 'labels-suitable_category.csv')
print('The shape of the label dataframe:', label_df.shape)
label_df.head()

## Utility function

In [None]:
def load_data(use_multimodal=False, data_dir=Path('data/kg_dr/processed')):
    """Load the KG edge table, the label table and the matching node table.

    Args:
        use_multimodal: When True, read ``kg_processed-suitable_category.csv``
            and ``labels-suitable_category.csv``; otherwise read the
            ``*_no_multimodal-suitable_category.csv`` variants.
        data_dir: Directory that holds those CSV files and ``nodes.csv``.
            Defaults to the directory that was previously hard-coded.

    Returns:
        Tuple ``(kg_df, node_df, label_df)``. ``node_df`` keeps only the rows
        of ``nodes.csv`` whose ``name`` occurs as a ``source`` or ``target`` in
        ``kg_df``, with every ``'username'`` attribute rewritten to
        ``'influencer'``.
    """
    data_dir = Path(data_dir)
    variant = '' if use_multimodal else '_no_multimodal'
    kg_df = pd.read_csv(data_dir.joinpath(f'kg_processed{variant}-suitable_category.csv'))
    label_df = pd.read_csv(data_dir.joinpath(f'labels{variant}-suitable_category.csv'))

    kg_node_names = set(kg_df['source']) | set(kg_df['target'])
    print('The shape of the KG dataframe:', kg_df.shape)
    print('The shape of the label dataframe:', label_df.shape)

    node_df = pd.read_csv(data_dir.joinpath('nodes.csv'))
    print('The shape of the node dataframe:', node_df.shape)
    # .copy() so the assignment below writes to an independent frame instead of
    # a slice of the full node table (avoids SettingWithCopyWarning).
    node_df = node_df[node_df['name'].isin(kg_node_names)].copy()
    print('The shape of the node dataframe after filtering:', node_df.shape)
    # for nodes whose attribute is username, change the attribute to influencer
    node_df.loc[node_df['attribute'] == 'username', 'attribute'] = 'influencer'

    return kg_df, node_df, label_df

def create_node_table(nodes_df):
    """Split the node table into one dataframe per node attribute.

    Args:
        nodes_df: Dataframe with at least the columns ``name`` and
            ``attribute``.

    Returns:
        Dict mapping the capitalised attribute (e.g. ``'Influencer'``) to a
        dataframe with the columns ``name`` and ``attribute``, de-duplicated on
        ``name`` and without missing names. Rows whose attribute is missing are
        ignored. Keys follow the order in which attributes first appear.
    """
    node_df_dict = {}
    # dropna(): a missing attribute can never match the equality filter below
    # and has no .capitalize(), so it used to raise AttributeError.
    for attribute in nodes_df['attribute'].dropna().unique():
        attribute_df = (
            nodes_df.loc[nodes_df['attribute'] == attribute, ['name', 'attribute']]
            .drop_duplicates(subset=['name'])
            .dropna(subset=['name'])
        )
        # first letter uppercase
        table_name = attribute.capitalize()
        print('The shape of the', table_name, 'dataframe is:', attribute_df.shape)
        node_df_dict[table_name] = attribute_df
    return node_df_dict

def explore_edge_table(edges_df, nodes_df):
    """Print a per-relationship summary of the edge table.

    Only edges whose ``source`` and ``target`` both appear as a named node in
    ``nodes_df`` are considered. For each relationship the function prints the
    shape of its de-duplicated edge frame and the counts of every
    (source attribute, target attribute) combination. Nothing is returned.
    """
    named_nodes = nodes_df.dropna(subset=['name'])
    both_known = edges_df['source'].isin(named_nodes['name']) & edges_df['target'].isin(named_nodes['name'])
    edges_df = edges_df[both_known]
    attribute_of = dict(zip(named_nodes['name'], named_nodes['attribute']))

    for relationship in set(edges_df['relationship'].unique()):
        subset = (
            edges_df.loc[edges_df['relationship'] == relationship, ['source', 'target', 'relationship']]
            .drop_duplicates()
            .dropna(subset=['source', 'target', 'relationship'])
        )
        subset = subset.assign(
            source_attribute=subset['source'].map(attribute_of),
            target_attribute=subset['target'].map(attribute_of),
        )
        print('The shape of the', relationship, 'dataframe is:', subset.shape)
        print('The combination of source attribute and target attribute:')
        print(subset[['source_attribute', 'target_attribute']].value_counts())
        print('\n')

def create_edge_table(edges_df, nodes_df, node_df_dict):
    """Build one typed edge dataframe per relationship table.

    Args:
        edges_df: Edge list with the columns ``source``, ``target`` and
            ``relationship``.
        nodes_df: Node table with the columns ``name`` and ``attribute``; used
            to look up the attribute of each edge endpoint.
        node_df_dict: Output of ``create_node_table`` (capitalised attribute ->
            node dataframe). Edges touching a node that is in none of these
            tables are dropped.

    Returns:
        Dict mapping the relationship table name to a dataframe with the
        columns ``source``, ``target``, ``relationship``, ``source_attribute``
        and ``target_attribute``. A relationship whose type filter leaves no
        rows is kept as an empty dataframe.
    """
    # Relationships that are not turned into edge tables at all.
    skipped = {'Occupation_or_industry', 'Username', 'Self_description', 'Location'}
    # capitalised relationship -> (table name, required source attr, required target attr)
    typed_rules = {
        'Recommend_product': ('Recommend', 'influencer', 'product_category'),
        'Target_audience': ('Has_target_audience', 'influencer', 'target_audience'),
        'Product_category': ('Has_product_category', 'product_name', 'product_category'),
        'Its_brand': ('Has_brand', 'product_name', 'brand'),
        'Interests': ('Has_interest', 'influencer', 'product_category'),
        'Brand_collaboration': ('Brand_collaboration', 'influencer', 'brand'),
        'Has_interest': ('Has_interest', 'influencer', 'product_category'),
    }
    # 'Partner' edges are split into one table per target attribute.
    partner_targets = ('brand', 'partner', 'product_category', 'influencer')

    def _report(name, frame):
        # Same two summary lines for every table that gets registered.
        print('The shape of the', name, 'dataframe is:', frame.shape)
        print('Attributes of the source and target:',
              frame['source_attribute'].unique(), frame['target_attribute'].unique())

    known_names = set()
    for df in node_df_dict.values():
        known_names.update(df['name'])
    edges_df = edges_df[edges_df['source'].isin(known_names) & edges_df['target'].isin(known_names)]

    named_nodes = nodes_df.dropna(subset=['name'])
    node2attribute_dict = dict(zip(named_nodes['name'], named_nodes['attribute']))

    edge_df_dict = {}
    # dropna(): a missing relationship matches no row and has no .capitalize().
    for raw_relationship in edges_df['relationship'].dropna().unique():
        relationship_df = (
            edges_df.loc[edges_df['relationship'] == raw_relationship, ['source', 'target', 'relationship']]
            .drop_duplicates()
            .dropna(subset=['source', 'target', 'relationship'])
        )
        relationship_df = relationship_df.assign(
            source_attribute=relationship_df['source'].map(node2attribute_dict),
            target_attribute=relationship_df['target'].map(node2attribute_dict),
        )
        relationship = raw_relationship.capitalize()

        if relationship in skipped:
            continue

        if relationship == 'Partner':
            from_influencer = relationship_df[relationship_df['source_attribute'] == 'influencer']
            for target_attribute in partner_targets:
                partner_name = f'Partner_{target_attribute}'
                partner_df = from_influencer[from_influencer['target_attribute'] == target_attribute]
                _report(partner_name, partner_df)
                edge_df_dict[partner_name] = partner_df
            continue

        if relationship in typed_rules:
            relationship, source_attribute, target_attribute = typed_rules[relationship]
            relationship_df = relationship_df[
                (relationship_df['source_attribute'] == source_attribute)
                & (relationship_df['target_attribute'] == target_attribute)
            ]

        _report(relationship, relationship_df)
        edge_df_dict[relationship] = relationship_df

    # Keep only edges whose endpoints exist in the node tables named by the
    # first row's attributes. Only existing keys are reassigned, so mutating the
    # dict while iterating over it is safe.
    for relationship, df in edge_df_dict.items():
        # An empty frame has no first row to read the endpoint types from
        # (.iloc[0] would raise IndexError), so it is kept as is.
        if not df.empty:
            source_table = df['source_attribute'].iloc[0].capitalize()
            target_table = df['target_attribute'].iloc[0].capitalize()
            df = df[df['source'].isin(node_df_dict[source_table]['name'])
                    & df['target'].isin(node_df_dict[target_table]['name'])]
        print('The shape of the', relationship, 'dataframe is:', df.shape)
        edge_df_dict[relationship] = df

    return edge_df_dict

In [None]:
# Multimodal setting: load the tables, print the edge summary, then build the
# per-attribute node tables and per-relationship edge tables.
kg_df, nodes_df, label_df = load_data(use_multimodal=True)
multimodal_label_df = label_df
print()
explore_edge_table(edges_df=kg_df, nodes_df=nodes_df)
multimodal_node_df_dict = create_node_table(nodes_df=nodes_df)
multimodal_edge_df_dict = create_edge_table(
    edges_df=kg_df,
    nodes_df=nodes_df,
    node_df_dict=multimodal_node_df_dict,
)

In [None]:
# No-multimodal setting: same steps as the multimodal cell, on the
# *_no_multimodal input files.
kg_df, nodes_df, label_df = load_data(use_multimodal=False)
nomultimodal_label_df = label_df
print()
explore_edge_table(edges_df=kg_df, nodes_df=nodes_df)
nomultimodal_node_df_dict = create_node_table(nodes_df=nodes_df)
nomultimodal_edge_df_dict = create_edge_table(
    edges_df=kg_df,
    nodes_df=nodes_df,
    node_df_dict=nomultimodal_node_df_dict,
)

## Binary classification

In [None]:
import kuzu
from langchain.chains import KuzuQAChain
from langchain_community.graphs import KuzuGraph
from langchain_openai import ChatOpenAI

In [None]:
def create_kuzu_graph(node_df_dict, edge_df_dict, db_name="influencer_kg.db"):
    """Create a fresh Kuzu database and load the node and edge tables into it.

    Args:
        node_df_dict: Node table name -> dataframe with a ``name`` column, which
            becomes the primary key. Every column is created as STRING, except
            ``follower_count`` on the ``Influencer`` table (INT64).
        edge_df_dict: Relationship table name -> dataframe with the columns
            ``source``, ``target``, ``source_attribute`` and
            ``target_attribute``. Empty dataframes are skipped.
        db_name: Path of the database on disk. Anything already stored at that
            path is deleted first. Pass distinct names to keep several graphs
            alive at once.

    Returns:
        Tuple ``(db, conn)`` with the ``kuzu.Database`` and an open
        ``kuzu.Connection``.
    """
    # Clear previous database if it exists. The path may be a directory or a
    # single file depending on the Kuzu version, so both cases are handled.
    # NOTE(review): the ".wal" sidecar name is an assumption - confirm it
    # against the installed Kuzu release.
    for stale_path in (db_name, f"{db_name}.wal"):
        if os.path.isdir(stale_path):
            print(f"Removing existing database: {stale_path}")
            shutil.rmtree(stale_path)
        elif os.path.exists(stale_path):
            print(f"Removing existing database: {stale_path}")
            os.remove(stale_path)

    # Create a new Kuzu database
    db = kuzu.Database(db_name)
    conn = kuzu.Connection(db)

    print('----------Creating node tables----------')
    nodename2node_dict = dict()
    # Create node tables
    for node_type, df in node_df_dict.items():
        column_defs = []
        for col in df.columns.tolist():
            col_type = 'INT64' if (node_type == 'Influencer' and col == 'follower_count') else 'STRING'
            column_defs.append(f"{col} {col_type}")
        create_query = f"CREATE NODE TABLE {node_type} (" + ", ".join(column_defs) + ", PRIMARY KEY (name))"
        print(f"Executing query: {create_query}")  # Debug print
        conn.execute(create_query)

        # Import nodes directly from dataframe. The query names the local
        # variable `df`, so that variable must not be renamed.
        print(f"Importing nodes for {node_type}")
        conn.execute(f"COPY {node_type} FROM df")

        nodename2node_dict[node_type] = df['name'].tolist()
    print('\n')

    print('----------Creating edge tables----------')
    # Create edge tables
    for edge_type, df in edge_df_dict.items():
        if len(df) == 0:
            continue
        # The endpoint node tables are taken from the first row's attributes.
        source_type = df['source_attribute'].iloc[0].capitalize()
        target_type = df['target_attribute'].iloc[0].capitalize()

        create_query = f"CREATE REL TABLE {edge_type} (FROM {source_type} TO {target_type})"
        print(f"Executing query: {create_query}")  # Debug print
        conn.execute(create_query)

        # Import relationships directly from dataframe: keep only the endpoint
        # columns, and only rows whose endpoints were loaded into the two node
        # tables above.
        print(f"Importing relationships for {edge_type}")
        df = df[['source', 'target']]
        df = df[df['source'].isin(nodename2node_dict[source_type]) & df['target'].isin(nodename2node_dict[target_type])]
        conn.execute(f"COPY {edge_type} FROM df")

    return db, conn

### Multimodal

In [None]:
# Build the on-disk Kuzu graph for the multimodal tables.
multimodal_db, multimodal_conn = create_kuzu_graph(
    node_df_dict=multimodal_node_df_dict,
    edge_df_dict=multimodal_edge_df_dict,
)

In [None]:
# Wrap the Kuzu database for LangChain and build the QA chain on top of it.
# One GPT-4o client writes the Cypher query, a second one phrases the answer.
multimodal_graph = KuzuGraph(multimodal_db)

multimodal_chain = KuzuQAChain.from_llm(
    cypher_llm=ChatOpenAI(model="gpt-4o", temperature=0),
    qa_llm=ChatOpenAI(model="gpt-4o", temperature=0),
    graph=multimodal_graph,
    verbose=True,
)

# Show the schema the chain sees.
print(multimodal_graph.get_schema)

In [None]:
# Question template sent to the QA chain. The {influencer} and
# {product_category} placeholders are filled with str.format for each label row.
prompt_template = """
You are an expert in influencer marketing. You have been asked to analyze influencers and product categories.
You have been given a Knowledge Graph (KG) that contains information about influencers, product categories, and their relationships.
You need to answer the following question based on the KG provided:
For the influencer '{influencer}', will this influencer be suitable for the product category '{product_category}'? 
You can only answer with 'Yes' or 'No'. Please show your reasoning step by step.
Please use JSON format as output and only for output. The output contains two keys: prediction and reason.
"""

In [None]:
# Ask the multimodal chain about every labelled (influencer, product category)
# pair and store the raw chain output next to the true label.
# The output directory must exist before the first checkpoint is written.
save_dir.mkdir(parents=True, exist_ok=True)

result_df_dict = {'user': [], 'target_entity': [], 'true_label': [], 'GPT-4o_output': []}
for i, row in tqdm(multimodal_label_df.iterrows(), total=len(multimodal_label_df)):
    influencer = row['user']
    product_category = row['target_entity']
    true_label = row['label']
    prompt = prompt_template.format(influencer=influencer, product_category=product_category)
    try:
        result = multimodal_chain.invoke(prompt)
    except Exception as exc:
        # Best effort: keep going and record 'Error', but report what failed.
        # Catching Exception (not a bare except) lets Ctrl-C stop the run.
        tqdm.write(f"Chain failed for ({influencer!r}, {product_category!r}): {exc!r}")
        result = 'Error'
    result_df_dict['user'].append(influencer)
    result_df_dict['target_entity'].append(product_category)
    result_df_dict['true_label'].append(true_label)
    result_df_dict['GPT-4o_output'].append(result)

    # Rewritten after every row, so the CSV always holds all answers so far.
    result_df = pd.DataFrame(result_df_dict)
    result_df.to_csv(save_dir.joinpath('GPT-4o_multimodal_output.csv'), index=False)

### NoMultimodal

In [None]:
# Build the on-disk Kuzu graph for the no-multimodal tables.
nomultimodal_db, nomultimodal_conn = create_kuzu_graph(
    node_df_dict=nomultimodal_node_df_dict,
    edge_df_dict=nomultimodal_edge_df_dict,
)

In [None]:
# Wrap the Kuzu database for LangChain and build the QA chain on top of it.
# One GPT-4o client writes the Cypher query, a second one phrases the answer.
nomultimodal_graph = KuzuGraph(nomultimodal_db)

nomultimodal_chain = KuzuQAChain.from_llm(
    cypher_llm=ChatOpenAI(model="gpt-4o", temperature=0),
    qa_llm=ChatOpenAI(model="gpt-4o", temperature=0),
    graph=nomultimodal_graph,
    verbose=True,
)

# Show the schema the chain sees.
print(nomultimodal_graph.get_schema)

In [None]:
# Redefinition of the same question text used in the multimodal section. The
# {influencer} and {product_category} placeholders are filled with str.format
# for each label row.
prompt_template = """
You are an expert in influencer marketing. You have been asked to analyze influencers and product categories.
You have been given a Knowledge Graph (KG) that contains information about influencers, product categories, and their relationships.
You need to answer the following question based on the KG provided:
For the influencer '{influencer}', will this influencer be suitable for the product category '{product_category}'? 
You can only answer with 'Yes' or 'No'. Please show your reasoning step by step.
Please use JSON format as output and only for output. The output contains two keys: prediction and reason.
"""

In [None]:
# Ask the no-multimodal chain about every labelled (influencer, product
# category) pair and store the raw chain output next to the true label.
# The output directory must exist before the first checkpoint is written.
save_dir.mkdir(parents=True, exist_ok=True)

result_df_dict = {'user': [], 'target_entity': [], 'true_label': [], 'GPT-4o_output': []}
for i, row in tqdm(nomultimodal_label_df.iterrows(), total=len(nomultimodal_label_df)):
    influencer = row['user']
    product_category = row['target_entity']
    true_label = row['label']
    prompt = prompt_template.format(influencer=influencer, product_category=product_category)
    try:
        result = nomultimodal_chain.invoke(prompt)
    except Exception as exc:
        # Best effort: keep going and record 'Error', but report what failed.
        # Catching Exception (not a bare except) lets Ctrl-C stop the run.
        tqdm.write(f"Chain failed for ({influencer!r}, {product_category!r}): {exc!r}")
        result = 'Error'
    result_df_dict['user'].append(influencer)
    result_df_dict['target_entity'].append(product_category)
    result_df_dict['true_label'].append(true_label)
    result_df_dict['GPT-4o_output'].append(result)

    # Rewritten after every row, so the CSV always holds all answers so far.
    result_df = pd.DataFrame(result_df_dict)
    result_df.to_csv(save_dir.joinpath('GPT-4o_nomultimodal_output.csv'), index=False)

# Zero-shot problem

## Load data

In [None]:
# Rebind data_dir to the cold-start split directory for this section.
# save_dir keeps the same value as in the first section.
data_dir = Path('data/kg_dr/coldstart_processed')
save_dir = Path('saved/kuzu_rag')

In [None]:
def create_kg_graph(label_df, userid2entityid, userid2relationship, 
                    target_entityid2entityid, target_entityid2relationship, 
                    id2node_dict, id2relationship_dict):
    """Turn the id-based adjacency dicts of one split into a named edge list.

    ``id2node_dict`` is updated in place with the user and target-entity names
    found in ``label_df``. Every (head id -> list of tail ids) entry of the two
    adjacency dicts becomes rows of ``source``/``target``/``relationship``
    names, and each row of ``label_df`` with ``label == 1`` is appended as a
    ``'suitable_category'`` edge. Duplicate rows are removed.

    Returns:
        Dataframe with the columns ``source``, ``target`` and ``relationship``.
    """
    # Register the names of the users / target entities of this split
    # (mutates the caller's mapping).
    id2node_dict.update(zip(label_df['user_id'], label_df['user']))
    id2node_dict.update(zip(label_df['target_entity_id'], label_df['target_entity']))

    positive_df = (
        label_df.loc[label_df['label'] == 1, ['user', 'target_entity']]
        .rename(columns={'user': 'source', 'target_entity': 'target'})
        .assign(relationship='suitable_category')
    )

    sources, targets, relationships = [], [], []
    adjacency_pairs = (
        (userid2entityid, userid2relationship),
        (target_entityid2entityid, target_entityid2relationship),
    )
    for head2tails, head2relations in adjacency_pairs:
        for head_id, tail_ids in head2tails.items():
            sources.extend([id2node_dict[head_id]] * len(tail_ids))
            targets.extend([id2node_dict[tail_id] for tail_id in tail_ids])
            relationships.extend([id2relationship_dict[rel_id] for rel_id in head2relations[head_id]])

    kg_df = pd.DataFrame({'source': sources, 'target': targets, 'relationship': relationships})
    return pd.concat([kg_df, positive_df]).drop_duplicates()

def load_data(use_multimodal=False, data_dir=Path('data/kg_dr/coldstart_processed'), seed=0, kfold=0):
    """Load the cold-start train/valid/test splits and build their edge lists.

    Args:
        use_multimodal: Selects the ``multimodal-...`` pickles when True and
            the ``no_multimodal-...`` pickles otherwise.
        data_dir: Directory holding the split pickles. ``nodes.csv`` is read
            from the sibling directory ``processed``.
        seed: Seed number in the pickle file names (``Seed{seed}``).
        kfold: Fold number in the pickle file names (``kfold{kfold}``).

    Returns:
        Tuple ``(train_kg_df, valid_kg_df, test_kg_df, test_label_df, nodes_df)``.
        ``test_label_df`` holds the ``'suitable_category'`` rows of the test
        graph and ``test_kg_df`` the remaining rows. ``nodes_df`` is restricted
        to nodes occurring in any of these frames, with ``'username'``
        attributes rewritten to ``'influencer'``.
    """
    data_dir = Path(data_dir)
    prefix = 'multimodal' if use_multimodal else 'no_multimodal'

    def _load_split(split):
        # NOTE: pickle.load can execute arbitrary code - only point this at
        # trusted, locally produced files.
        file_name = f'{prefix}-Seed{seed}-suitable_category-data_{split}_dict-kfold{kfold}.pkl'
        with open(data_dir.joinpath(file_name), 'rb') as f:
            return pickle.load(f)

    train_dict = _load_split('train')
    valid_dict = _load_split('valid')
    test_dict = _load_split('test')

    # id2node_dict is extended in place by create_kg_graph with the users and
    # target entities of the valid split and then of the test split.
    id2node_dict = train_dict['id2node']
    id2relationship_dict = {v: k for k, v in train_dict['relationship2id'].items()}

    print('Get the KG dataframe from the train dictionary')
    train_edge_df = train_dict['edges'][['source', 'target', 'relationship']]
    train_labels = train_dict['label'][['user', 'target_entity', 'label']]
    train_label_df = (
        train_labels[train_labels['label'] == 1]
        .rename(columns={'user': 'source', 'target_entity': 'target'})
        .assign(relationship='suitable_category')
    )[['source', 'target', 'relationship']]
    train_kg_df = pd.concat([train_edge_df, train_label_df])

    print('Get the KG dataframe from the valid dictionary')
    valid_kg_df = create_kg_graph(valid_dict['label'], valid_dict['userid2entityid'], valid_dict['userid2relationship'],
                                  valid_dict['target_entityid2entityid'], valid_dict['target_entityid2relationship'],
                                  id2node_dict, id2relationship_dict)

    print('Get the KG dataframe from the test dictionary')
    test_kg_df = create_kg_graph(test_dict['label'], test_dict['userid2entityid'], test_dict['userid2relationship'],
                                 test_dict['target_entityid2entityid'], test_dict['target_entityid2relationship'],
                                 id2node_dict, id2relationship_dict)
    # Split the test graph into the edges to predict and the remaining edges.
    is_label_edge = test_kg_df['relationship'] == 'suitable_category'
    test_label_df = test_kg_df[is_label_edge]
    test_kg_df = test_kg_df[~is_label_edge]

    print('Get the node dataframe')
    all_node_names = set()
    for frame in (train_kg_df, valid_kg_df, test_kg_df, test_label_df):
        all_node_names.update(frame['source'])
        all_node_names.update(frame['target'])
    nodes_df = pd.read_csv(data_dir.parent.joinpath('processed/nodes.csv'))
    # .copy() so the assignment below writes to an independent frame.
    nodes_df = nodes_df[nodes_df['name'].isin(all_node_names)].copy()
    print('The shape of the node dataframe after filtering', nodes_df.shape)
    # for nodes whose attribute is username, change the attribute to influencer
    nodes_df.loc[nodes_df['attribute'] == 'username', 'attribute'] = 'influencer'

    return train_kg_df, valid_kg_df, test_kg_df, test_label_df, nodes_df

def create_node_table(nodes_df):
    """Split the node table into one dataframe per node attribute.

    Args:
        nodes_df: Dataframe with at least the columns ``name`` and
            ``attribute``.

    Returns:
        Dict mapping the capitalised attribute (e.g. ``'Influencer'``) to a
        dataframe with the columns ``name`` and ``attribute``, de-duplicated on
        ``name`` and without missing names. Rows whose attribute is missing are
        ignored. Keys follow the order in which attributes first appear.
    """
    node_df_dict = {}
    # dropna(): a missing attribute can never match the equality filter below
    # and has no .capitalize(), so it used to raise AttributeError.
    for attribute in nodes_df['attribute'].dropna().unique():
        attribute_df = (
            nodes_df.loc[nodes_df['attribute'] == attribute, ['name', 'attribute']]
            .drop_duplicates(subset=['name'])
            .dropna(subset=['name'])
        )
        # first letter uppercase
        table_name = attribute.capitalize()
        print('The shape of the', table_name, 'dataframe is:', attribute_df.shape)
        node_df_dict[table_name] = attribute_df
    return node_df_dict

def explore_edge_table(edges_df, nodes_df):
    """Print a per-relationship summary of the edge table.

    Only edges whose ``source`` and ``target`` both appear as a named node in
    ``nodes_df`` are considered. For each relationship the function prints the
    shape of its de-duplicated edge frame and the counts of every
    (source attribute, target attribute) combination. Nothing is returned.
    """
    named_nodes = nodes_df.dropna(subset=['name'])
    both_known = edges_df['source'].isin(named_nodes['name']) & edges_df['target'].isin(named_nodes['name'])
    edges_df = edges_df[both_known]
    attribute_of = dict(zip(named_nodes['name'], named_nodes['attribute']))

    for relationship in set(edges_df['relationship'].unique()):
        subset = (
            edges_df.loc[edges_df['relationship'] == relationship, ['source', 'target', 'relationship']]
            .drop_duplicates()
            .dropna(subset=['source', 'target', 'relationship'])
        )
        subset = subset.assign(
            source_attribute=subset['source'].map(attribute_of),
            target_attribute=subset['target'].map(attribute_of),
        )
        print('The shape of the', relationship, 'dataframe is:', subset.shape)
        print('The combination of source attribute and target attribute:')
        print(subset[['source_attribute', 'target_attribute']].value_counts())
        print('\n')

def create_edge_table(edges_df, nodes_df, node_df_dict):
    """Build one typed edge dataframe per relationship table.

    Args:
        edges_df: Edge list with the columns ``source``, ``target`` and
            ``relationship``.
        nodes_df: Node table with the columns ``name`` and ``attribute``; used
            to look up the attribute of each edge endpoint.
        node_df_dict: Output of ``create_node_table`` (capitalised attribute ->
            node dataframe). Edges touching a node that is in none of these
            tables are dropped.

    Returns:
        Dict mapping the relationship table name to a dataframe with the
        columns ``source``, ``target``, ``relationship``, ``source_attribute``
        and ``target_attribute``. A relationship whose type filter leaves no
        rows is kept as an empty dataframe.
    """
    # Relationships that are not turned into edge tables at all.
    skipped = {'Occupation_or_industry', 'Username', 'Self_description', 'Location'}
    # capitalised relationship -> (table name, required source attr, required target attr)
    typed_rules = {
        'Recommend_product': ('Recommend', 'influencer', 'product_category'),
        'Target_audience': ('Has_target_audience', 'influencer', 'target_audience'),
        'Product_category': ('Has_product_category', 'product_name', 'product_category'),
        'Its_brand': ('Has_brand', 'product_name', 'brand'),
        'Interests': ('Has_interest', 'influencer', 'product_category'),
        'Brand_collaboration': ('Brand_collaboration', 'influencer', 'brand'),
        'Has_interest': ('Has_interest', 'influencer', 'product_category'),
    }
    # 'Partner' edges are split into one table per target attribute.
    partner_targets = ('brand', 'partner', 'product_category', 'influencer')

    def _report(name, frame):
        # Same two summary lines for every table that gets registered.
        print('The shape of the', name, 'dataframe is:', frame.shape)
        print('Attributes of the source and target:',
              frame['source_attribute'].unique(), frame['target_attribute'].unique())

    known_names = set()
    for df in node_df_dict.values():
        known_names.update(df['name'])
    edges_df = edges_df[edges_df['source'].isin(known_names) & edges_df['target'].isin(known_names)]

    named_nodes = nodes_df.dropna(subset=['name'])
    node2attribute_dict = dict(zip(named_nodes['name'], named_nodes['attribute']))

    edge_df_dict = {}
    # dropna(): a missing relationship matches no row and has no .capitalize().
    for raw_relationship in edges_df['relationship'].dropna().unique():
        relationship_df = (
            edges_df.loc[edges_df['relationship'] == raw_relationship, ['source', 'target', 'relationship']]
            .drop_duplicates()
            .dropna(subset=['source', 'target', 'relationship'])
        )
        relationship_df = relationship_df.assign(
            source_attribute=relationship_df['source'].map(node2attribute_dict),
            target_attribute=relationship_df['target'].map(node2attribute_dict),
        )
        relationship = raw_relationship.capitalize()

        if relationship in skipped:
            continue

        if relationship == 'Partner':
            from_influencer = relationship_df[relationship_df['source_attribute'] == 'influencer']
            for target_attribute in partner_targets:
                partner_name = f'Partner_{target_attribute}'
                partner_df = from_influencer[from_influencer['target_attribute'] == target_attribute]
                _report(partner_name, partner_df)
                edge_df_dict[partner_name] = partner_df
            continue

        if relationship in typed_rules:
            relationship, source_attribute, target_attribute = typed_rules[relationship]
            relationship_df = relationship_df[
                (relationship_df['source_attribute'] == source_attribute)
                & (relationship_df['target_attribute'] == target_attribute)
            ]

        _report(relationship, relationship_df)
        edge_df_dict[relationship] = relationship_df

    # Keep only edges whose endpoints exist in the node tables named by the
    # first row's attributes. Only existing keys are reassigned, so mutating the
    # dict while iterating over it is safe.
    for relationship, df in edge_df_dict.items():
        # An empty frame has no first row to read the endpoint types from
        # (.iloc[0] would raise IndexError), so it is kept as is.
        if not df.empty:
            source_table = df['source_attribute'].iloc[0].capitalize()
            target_table = df['target_attribute'].iloc[0].capitalize()
            df = df[df['source'].isin(node_df_dict[source_table]['name'])
                    & df['target'].isin(node_df_dict[target_table]['name'])]
        print('The shape of the', relationship, 'dataframe is:', df.shape)
        edge_df_dict[relationship] = df

    return edge_df_dict

def zero_shot_process(use_multimodal=False):
    """Prepare the node tables, edge tables and test labels for the zero-shot run.

    The graph is built from the train and valid edge lists only. The test
    edges and the test label rows are returned separately.

    Returns:
        Tuple ``(node_df_dict, edge_df_dict, label_df, test_kg_df)``.
    """
    (train_kg_df, valid_kg_df, test_kg_df,
     test_label_df, nodes_df) = load_data(use_multimodal=use_multimodal)

    # Graph used for retrieval: train + valid edges.
    kg_df = pd.concat([train_kg_df, valid_kg_df])
    print('The shape of the KG dataframe:', kg_df.shape)
    explore_edge_table(kg_df, nodes_df)

    node_tables = create_node_table(nodes_df)
    edge_tables = create_edge_table(kg_df, nodes_df, node_tables)
    return node_tables, edge_tables, test_label_df, test_kg_df

In [None]:
# Zero-shot tables for the multimodal setting.
(
    multimodal_node_df_dict,
    multimodal_edge_df_dict,
    multimodal_label_df,
    multimodal_test_kg_df,
) = zero_shot_process(use_multimodal=True)

In [None]:
# Zero-shot tables for the no-multimodal setting.
(
    nomultimodal_node_df_dict,
    nomultimodal_edge_df_dict,
    nomultimodal_label_df,
    nomultimodal_test_kg_df,
) = zero_shot_process(use_multimodal=False)

## Binary classification

In [None]:
import kuzu
from langchain.chains import KuzuQAChain
from langchain_community.graphs import KuzuGraph
from langchain_openai import ChatOpenAI

def create_kuzu_graph(node_df_dict, edge_df_dict, db_name="influencer_kg.db"):
    """Build a fresh Kuzu database from per-type node and edge DataFrames.

    Anything already stored at ``db_name`` is deleted first. Calling this twice
    with the same path therefore removes the on-disk store of the database
    returned by the earlier call; pass distinct ``db_name`` values to keep
    several graphs usable at the same time.

    Args:
        node_df_dict: Maps a node table name to the DataFrame holding its rows.
            Each frame must have a ``name`` column, which is declared as the
            primary key. ``follower_count`` of the ``Influencer`` table is
            declared INT64; every other column is declared STRING.
        edge_df_dict: Maps a relationship table name to a DataFrame with
            ``source``, ``target``, ``source_attribute`` and
            ``target_attribute`` columns. Empty frames are skipped. The
            capitalised attribute of the first row selects the FROM/TO node
            tables.
        db_name: Filesystem path of the database. The default is the path that
            used to be hard-coded, so existing callers are unaffected.

    Returns:
        Tuple ``(db, conn)``: the ``kuzu.Database`` and a ``kuzu.Connection``
        opened on it.
    """
    # Clear previous database if it exists. shutil.rmtree only works on
    # directories, so a database stored as a single file is removed with
    # os.remove instead of crashing here.
    if os.path.exists(db_name):
        print(f"Removing existing database: {db_name}")
        if os.path.isdir(db_name):
            shutil.rmtree(db_name)
        else:
            os.remove(db_name)

    # Create a new Kuzu database
    db = kuzu.Database(db_name)
    conn = kuzu.Connection(db)

    print('----------Creating node tables----------')
    # Names loaded per node table; used below to drop edges with unknown endpoints.
    nodename2node_dict = dict()
    # NOTE: the loop variable must stay named `df` -- the COPY statements below
    # refer to the in-scope DataFrame by that exact name.
    for node_type, df in node_df_dict.items():
        column_defs = [
            f"{col} INT64" if node_type == 'Influencer' and col == 'follower_count' else f"{col} STRING"
            for col in df.columns.tolist()
        ]
        create_query = f"CREATE NODE TABLE {node_type} ("
        create_query += ", ".join(column_defs)
        create_query += ", PRIMARY KEY (name))"
        print(f"Executing query: {create_query}")  # Debug print
        conn.execute(create_query)

        # Import nodes directly from dataframe
        print(f"Importing nodes for {node_type}")
        conn.execute(f"COPY {node_type} FROM df")

        nodename2node_dict[node_type] = df['name'].tolist()
    print('\n')

    print('----------Creating edge tables----------')
    for edge_type, df in edge_df_dict.items():
        # Without rows there is no first row to read the endpoint types from.
        if len(df) == 0:
            continue
        source_type = df['source_attribute'].iloc[0].capitalize()
        target_type = df['target_attribute'].iloc[0].capitalize()

        create_query = f"CREATE REL TABLE {edge_type} (FROM {source_type} TO {target_type})"
        print(f"Executing query: {create_query}")  # Debug print
        conn.execute(create_query)

        # Import relationships directly from dataframe, keeping only the
        # endpoint columns and only edges whose endpoints were loaded above.
        print(f"Importing relationships for {edge_type}")
        df = df[['source', 'target']]
        df = df[df['source'].isin(nodename2node_dict[source_type]) & df['target'].isin(nodename2node_dict[target_type])]
        conn.execute(f"COPY {edge_type} FROM df")

    return db, conn

def create_triplet(source, kg_df):
    """Render the distinct edges that start at ``source`` as text lines.

    Args:
        source: Value matched against the ``source`` column of ``kg_df``.
        kg_df: DataFrame with ``source``, ``target`` and ``relationship`` columns.

    Returns:
        A string with one ``"<source>, <relationship>, <target>.\\n"`` line per
        distinct matching row, or an empty string when nothing matches.
    """
    outgoing = kg_df.loc[
        kg_df['source'] == source, ['source', 'target', 'relationship']
    ].drop_duplicates()
    return ''.join(
        f"{edge['source']}, {edge['relationship']}, {edge['target']}.\n"
        for _, edge in outgoing.iterrows()
    )

# Prompt sent to the QA chain for one (influencer, product category) pair.
# It is filled with str.format and expects four fields: influencer,
# influencer_info, product_category and product_category_info. The model is
# asked to reply in JSON with the keys `prediction` ('Yes' or 'No') and `reason`.
prompt_template = """
You are an expert in influencer marketing. You have been asked to analyze influencers and product categories.
You have been given a Knowledge Graph (KG) that contains information about influencers, product categories, and their relationships.
You need to answer the following question based on the given KG and external information of an influencer and a product category provided.
The external information is in triplet format: (head, relation, tail).
For the influencer '{influencer}', we have the following information: {influencer_info}.
For the product category '{product_category}', we have the following information: {product_category_info}.
Now, based on the KG and the external information, will this influencer be suitable for the product category '{product_category}'?
You can only answer with 'Yes' or 'No'. Please show your reasoning step by step.
Please use JSON format as output and only for output. The output contains two keys: prediction and reason.
"""

### Multimodal

In [None]:
# Load the multimodal node/edge tables into a Kuzu database.
multimodal_db, multimodal_conn = create_kuzu_graph(
    node_df_dict=multimodal_node_df_dict,
    edge_df_dict=multimodal_edge_df_dict,
)

In [None]:
# Wrap the multimodal database for LangChain and build the QA chain on top of
# it; separate GPT-4o clients (temperature 0) are passed as cypher_llm and qa_llm.
multimodal_graph = KuzuGraph(multimodal_db)
multimodal_chain = KuzuQAChain.from_llm(
    graph=multimodal_graph,
    cypher_llm=ChatOpenAI(model="gpt-4o", temperature=0),
    qa_llm=ChatOpenAI(model="gpt-4o", temperature=0),
    verbose=True,
)

# Print the schema exposed by the graph wrapper.
print(multimodal_graph.get_schema)

In [None]:
# Try the chain on one hand-picked (influencer, product category) pair.
influencer, product_category = 'in2itcosmetics.my', 'Beauty & Personal Care'
influencer_info = create_triplet(influencer, multimodal_test_kg_df)
product_category_info = create_triplet(product_category, multimodal_test_kg_df)
prompt = prompt_template.format(
    influencer=influencer,
    influencer_info=influencer_info,
    product_category=product_category,
    product_category_info=product_category_info,
)
multimodal_chain.invoke(prompt)

In [None]:
# Display the multimodal test label frame; its 'source'/'target' columns are
# the pairs queried in the run below.
multimodal_label_df

#### Run

In [None]:
# Query the multimodal chain for every (influencer, product category) pair in
# the label frame and store the raw chain outputs as CSV.

# to_csv does not create missing parent directories, so make sure the output
# folder exists before any API call is spent.
save_dir.mkdir(parents=True, exist_ok=True)

result_df_dict = {'user': [], 'target_entity': [], 'true_label': [], 'GPT-4o_output': []}
for _idx, row in tqdm(multimodal_label_df.iterrows(), total=len(multimodal_label_df)):
    influencer = row['source']
    product_category = row['target']
    # NOTE(review): every pair is recorded as a positive -- confirm that the
    # label frame only contains positive pairs.
    true_label = 1
    influencer_info = create_triplet(influencer, multimodal_test_kg_df)
    product_category_info = create_triplet(product_category, multimodal_test_kg_df)
    prompt = prompt_template.format(influencer=influencer, product_category=product_category,
                                    influencer_info=influencer_info, product_category_info=product_category_info)
    try:
        result = multimodal_chain.invoke(prompt)
    except Exception as exc:
        # Catch Exception (not a bare except) so an interrupt still stops the
        # run. The stored value stays the 'Error' sentinel; the cause is logged
        # through tqdm.write so the progress bar is not broken.
        tqdm.write(f"Chain failed for ({influencer!r}, {product_category!r}): {exc!r}")
        result = 'Error'
    result_df_dict['user'].append(influencer)
    result_df_dict['target_entity'].append(product_category)
    result_df_dict['true_label'].append(true_label)
    result_df_dict['GPT-4o_output'].append(result)

    # Rewrite the CSV after every row so partial results survive an interrupted run.
    result_df = pd.DataFrame(result_df_dict)
    result_df.to_csv(save_dir.joinpath('zero-shot_GPT-4o_multimodal_output.csv'), index=False)

### NoMultimodal

In [None]:
# Load the no-multimodal node/edge tables into a Kuzu database.
nomultimodal_db, nomultimodal_conn = create_kuzu_graph(
    node_df_dict=nomultimodal_node_df_dict,
    edge_df_dict=nomultimodal_edge_df_dict,
)

In [None]:
# Wrap the no-multimodal database for LangChain and build the QA chain on top
# of it; separate GPT-4o clients (temperature 0) are passed as cypher_llm and qa_llm.
nomultimodal_graph = KuzuGraph(nomultimodal_db)
nomultimodal_chain = KuzuQAChain.from_llm(
    graph=nomultimodal_graph,
    cypher_llm=ChatOpenAI(model="gpt-4o", temperature=0),
    qa_llm=ChatOpenAI(model="gpt-4o", temperature=0),
    verbose=True,
)

# Print the schema exposed by the graph wrapper.
print(nomultimodal_graph.get_schema)

In [None]:
# Display the no-multimodal test label frame; its 'source'/'target' columns
# are the pairs queried in the run below.
nomultimodal_label_df

In [None]:
# Query the no-multimodal chain for every (influencer, product category) pair
# in the label frame and store the raw chain outputs as CSV.

# to_csv does not create missing parent directories, so make sure the output
# folder exists before any API call is spent.
save_dir.mkdir(parents=True, exist_ok=True)

result_df_dict = {'user': [], 'target_entity': [], 'true_label': [], 'GPT-4o_output': []}
for _idx, row in tqdm(nomultimodal_label_df.iterrows(), total=len(nomultimodal_label_df)):
    influencer = row['source']
    product_category = row['target']
    # NOTE(review): every pair is recorded as a positive -- confirm that the
    # label frame only contains positive pairs.
    true_label = 1
    influencer_info = create_triplet(influencer, nomultimodal_test_kg_df)
    product_category_info = create_triplet(product_category, nomultimodal_test_kg_df)
    prompt = prompt_template.format(influencer=influencer, product_category=product_category,
                                    influencer_info=influencer_info, product_category_info=product_category_info)
    try:
        result = nomultimodal_chain.invoke(prompt)
    except Exception as exc:
        # Catch Exception (not a bare except) so an interrupt still stops the
        # run. The stored value stays the 'Error' sentinel; the cause is logged
        # through tqdm.write so the progress bar is not broken.
        tqdm.write(f"Chain failed for ({influencer!r}, {product_category!r}): {exc!r}")
        result = 'Error'
    result_df_dict['user'].append(influencer)
    result_df_dict['target_entity'].append(product_category)
    result_df_dict['true_label'].append(true_label)
    result_df_dict['GPT-4o_output'].append(result)

    # Rewrite the CSV after every row so partial results survive an interrupted run.
    result_df = pd.DataFrame(result_df_dict)
    result_df.to_csv(save_dir.joinpath('zero-shot_GPT-4o_nomultimodal_output.csv'), index=False)