In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

import logging
logging.basicConfig(level=logging.ERROR)

from tqdm.auto import tqdm
# register tqdm with pandas
tqdm.pandas()

import numpy as np

from sqlalchemy import create_engine, URL, text

from helper.keyword_helper import neo4j_fetch_data, clean_abstracts
from helper.wordtrie_builder import WordTrie

In [2]:
SAMPLE_LIMIT = 500000
NEO4J_CREDENTIALS = {"url": "bolt://localhost:37687", "user": "neo4j", "password": "neo4jpassword"}
AHO_LENGTH = 3 # Paper Abstract has to contain AHO_LENGTH or more core keywords to be labeled as ai and deleted from neg sample

url_object = URL.create(
    drivername='postgresql+psycopg2', 
    username='tie',
    password='TIE%2023!tuhh',
    host='134.28.58.100',
    # host='tie-workstation.tail6716.ts.net',
    # host='localhost',
    port=45432,
    database='openalex_db',
)
engine = create_engine(url_object)
engine.connect()

<sqlalchemy.engine.base.Connection at 0x7f437cbaad50>

In [3]:
# Load 100.000 random papers from neo4j
# query = f"""
# MATCH (p:Paper)
# WHERE p.abstract IS NOT NULL
# RETURN p.id AS id, p.abstract AS abstract, p.title AS title
# ORDER BY rand()
# LIMIT {SAMPLE_LIMIT}
# """

query = f"""
MATCH (p:Paper)
WHERE p.abstract IS NOT NULL
RETURN p.id AS id, p.abstract AS abstract, p.title AS title
"""
print("Fetching data...")
papers = neo4j_fetch_data(query, NEO4J_CREDENTIALS)
print("Done.")
print(f"Got {len(papers)} papers.")
print("")

query = """
MATCH (t:Task)
RETURN t.id as task_id, t.name as task_name, t.description as task_description
"""
print("Fetching data...")
tasks = neo4j_fetch_data(query, NEO4J_CREDENTIALS)
print("Done.")
print(f"Got {len(tasks)} tasks.")

query = """
MATCH (m:Method)
RETURN m.id as method_id, m.name as method_name, m.description as method_description
"""
print("Fetching data...")
methods = neo4j_fetch_data(query, NEO4J_CREDENTIALS)
print("Done.")
print(f"Got {len(methods)} methods.")

query = """
MATCH (d:Dataset)
RETURN d.id as dataset_id, d.name as dataset_name, d.description as dataset_description
"""
print("Fetching data...")
datasets = neo4j_fetch_data(query, NEO4J_CREDENTIALS)
print("Done.")
print(f"Got {len(datasets)} datasets.")

query2 = f"""
MATCH (p:Paper)
WHERE p.id_openalex IS NOT NULL
RETURN p.id_openalex AS openalex_id
"""
print("Fetching data...")
pwc_oalex = neo4j_fetch_data(query2, NEO4J_CREDENTIALS)
print("Done.")
print(f"Got {len(pwc_oalex)} OpenAlex IDs from PwC.")

Fetching data...
Done.
Got 398892 papers.

Fetching data...
Done.
Got 3123 tasks.
Fetching data...
Done.
Got 2162 methods.
Fetching data...
Done.
Got 8890 datasets.
Fetching data...
Done.
Got 255908 OpenAlex IDs from PwC.


In [4]:
sql_query = f'''
    SELECT id, title, abstract, abstract_inverted_index
    FROM openalex.works TABLESAMPLE BERNOULLI(10)
    WHERE abstract_inverted_index IS NOT NULL
    LIMIT {SAMPLE_LIMIT + (SAMPLE_LIMIT * 0.5)}
'''

with engine.connect() as conn:
    print("Fetching data from Postgres...")
    openalex = pd.read_sql(sql=text(sql_query), con=conn)
    print(f"Got {len(openalex)} OpenAlex Works")

Fetching data from Postgres...
Got 750000 OpenAlex Works


In [5]:
sql_query2 = f'''
    SELECT work_id
    FROM openalex.works_concepts
    WHERE concept_id = (SELECT id FROM openalex.concepts WHERE display_name = 'Artificial intelligence') AND score > 0.5
'''
sql_query3 = f'''
    SELECT wc.work_id
    FROM openalex.works_concepts wc
    JOIN openalex.concepts c ON wc.concept_id = c.id
    WHERE c.display_name = 'Artificial intelligence' OR c.display_name = 'Machine learning' OR c.display_name = 'Natural language processing' AND wc.score > 0.2
'''

with engine.connect() as conn:
    print("Fetching data from Postgres...")
    openalex_ids_with_AI = pd.read_sql(sql=text(sql_query3), con=conn)
    print(f"Got {len(openalex_ids_with_AI)} OpenAlex Work IDs with AI")

Fetching data from Postgres...
Got 18116999 OpenAlex Work IDs with AI


In [6]:
def text_lower(text):
    return text.lower()

# Drop na on name and description columns
tasks = tasks.dropna(subset=["task_name", "task_description"])
methods = methods.dropna(subset=["method_name", "method_description"])
datasets = datasets.dropna(subset=["dataset_name", "dataset_description"])

# Concat papers with tasks, methods and datasets
tasks["text"] = tasks["task_name"] + " " + tasks["task_description"]
tasks["text"] = tasks["text"].apply(text_lower)
methods["text"] = methods["method_name"] + " " + methods["method_description"]
methods["text"] = methods["text"].apply(text_lower)
datasets["text"] = datasets["dataset_name"] + " " + datasets["dataset_description"]
datasets["text"] = datasets["text"].apply(text_lower)

# Add tasks, methods and datasets to papers df. Use their ids as papers id column and their text as abstract column
papers = pd.concat([papers, tasks[["task_id", "text"]].rename(columns={"task_id": "id", "text": "abstract"})])
papers = pd.concat([papers, methods[["method_id", "text"]].rename(columns={"method_id": "id", "text": "abstract"})])
papers = pd.concat([papers, datasets[["dataset_id", "text"]].rename(columns={"dataset_id": "id", "text": "abstract"})])

In [7]:
# Remove all openalex rows where the id is in the pwc_oalex df
openalex_id_filtered = openalex[~openalex['id'].isin(pwc_oalex['openalex_id'])]
print(f"Got {len(openalex_id_filtered)} OpenAlex Works after removing PwC papers")

# Remove all openalex rows where the id is in the openalex_ids_with_AI df
openalex_id_filtered = openalex_id_filtered[~openalex_id_filtered['id'].isin(openalex_ids_with_AI['work_id'])]
print(f"Got {len(openalex_id_filtered)} OpenAlex Works after removing papers with AI")

# Clean both DataFrames
print("Cleaning abstracts from OpenAlex...")
openalex_sample = clean_abstracts(openalex_id_filtered.copy(), "abstract", openalex_inv_index=True, lang='en')
print("Cleaning abstracts from Neo4j...")
papers = clean_abstracts(papers.copy(), "abstract", openalex_inv_index=False, lang='en')

# Reset index and apply sample limit if necessary
openalex_sample.reset_index(drop=True, inplace=True)
# if len(openalex_sample) > SAMPLE_LIMIT:
#     openalex_sample = openalex_sample.sample(n=SAMPLE_LIMIT, random_state=42)

# Drop unnecessary columns
openalex_sample.drop(columns=['abstract_inverted_index', 'abstract_lang'], inplace=True)
papers.drop(columns=['abstract_lang'], inplace=True)

# Prints for confirmation
print(f"Got {len(openalex_id_filtered)} OpenAlex Works after removing PwC papers")
print(f"Number of rows after cleaning: {len(openalex_sample)} ({round((len(openalex_sample))/len(openalex_id_filtered), 2) * 100}% remain)")
# if len(openalex_sample) > SAMPLE_LIMIT:
#     print(f"Cut down to {len(openalex_sample)} OpenAlex Works")

Got 749470 OpenAlex Works after removing PwC papers
Got 672129 OpenAlex Works after removing papers with AI
Cleaning abstracts from OpenAlex...
Extracting abstract from abstract_inverted_index...


  0%|          | 0/672129 [00:00<?, ?it/s]

Detecting language en...


  0%|          | 0/620808 [00:01<?, ?it/s]

Cleaning abstracts from Neo4j...
Detecting language en...


  0%|          | 0/410742 [00:01<?, ?it/s]

Got 672129 OpenAlex Works after removing PwC papers
Number of rows after cleaning: 500000 (74.0% remain)


In [8]:
# Import the dictionary.csv
dict_df = pd.read_csv("data/dictionaries/dictionary.csv")
print(f"Got {len(dict_df)} rows from dictionary.csv")

dict_df = dict_df[dict_df["source"] != "dataset"]
dict_df.dropna(subset=["keyword"], inplace=True)
dict_df.reset_index(drop=True, inplace=True)
print(f"Got {len(dict_df)} rows from dictionary.csv after filtering out source == 'dataset'")
dict_df["trie_id"] = dict_df.index

Got 453777 rows from dictionary.csv
Got 315245 rows from dictionary.csv after filtering out source == 'dataset'


In [9]:
# Make a wordtrie from the dict_df
# wordtrie = WordTrie(word_filter=True, text_filter=True, show_progress_bar=True, weights=False)
# wordtrie.add_bulk(dict_df['keyword'].tolist(), dict_df["trie_id"].tolist())
wordtrie = WordTrie(word_filter=True, text_filter=True, show_progress_bar=True, weights=False)
wordtrie.from_json("data/dictionaries/tries/all_trie.json")

In [10]:
openalex_sample[f"trie_words"] = openalex_sample['abstract'].progress_apply(lambda x: wordtrie.search(x, only_return_words=True))
openalex_metadata = openalex_sample['abstract'].progress_apply(lambda x: wordtrie.aggregate_search_info(x))
openalex_sample[f"trie_abs"] = openalex_metadata.apply(lambda x: x[0])
openalex_sample[f"trie_ratio"] = openalex_metadata.apply(lambda x: x[1])

papers[f"trie_words"] = papers['abstract'].progress_apply(lambda x: wordtrie.search(x, only_return_words=True))
papers_metadata = papers['abstract'].progress_apply(lambda x: wordtrie.aggregate_search_info(x))
papers[f"trie_abs"] = papers_metadata.apply(lambda x: x[0])
papers[f"trie_ratio"] = papers_metadata.apply(lambda x: x[1])

  0%|          | 0/500000 [00:00<?, ?it/s]

  0%|          | 0/500000 [00:00<?, ?it/s]

  0%|          | 0/409642 [00:00<?, ?it/s]

  0%|          | 0/409642 [00:00<?, ?it/s]

In [11]:
# Extract the trie_words from each dataframe and write them into a new dataframe
print("Extracting trie_words from openalex_sample and papers...")
openalex_trie_words = openalex_sample[['id', 'trie_words']].explode('trie_words').dropna()
display(openalex_trie_words.head(20))
print(f"Got {len(openalex_trie_words)} rows from openalex_trie_words")

paper_trie_words = papers[['id', 'trie_words']].explode('trie_words').dropna()
display(paper_trie_words.head(20))
print(f"Got {len(paper_trie_words)} rows from paper_trie_words")

openalex_total_abstract_length = openalex_sample['abstract_length'].sum()
print(f"Total abstract length of openalex_sample: {openalex_total_abstract_length}")
paper_total_abstract_length = papers['abstract_length'].sum()
print(f"Total abstract length of papers: {paper_total_abstract_length}")

# Make the df only with unique trie_words and count their occurrences as well as their ids in a list
print("Grouping trie_words and counting their occurrences...")
openalex_trie_words_unique = openalex_trie_words.groupby('trie_words').agg({'id': list, 'trie_words': 'count'}).rename(columns={'trie_words': 'count'}).reset_index()
# Calculate the ratio of the trie_words in the openalex_sample
openalex_trie_words_unique['ratio_openalex'] = openalex_trie_words_unique['count'] / openalex_total_abstract_length
openalex_trie_words_unique.sort_values(by='ratio_openalex', ascending=False, inplace=True)
display(openalex_trie_words_unique.head(20))

paper_trie_words_unique = paper_trie_words.groupby('trie_words').agg({'id': list, 'trie_words': 'count'}).rename(columns={'trie_words': 'count'}).reset_index().sort_values(by='count', ascending=False)
# Calculate the ratio of the trie_words in the papers
paper_trie_words_unique['ratio_pwc'] = paper_trie_words_unique['count'] / paper_total_abstract_length
paper_trie_words_unique.sort_values(by='ratio_pwc', ascending=False, inplace=True)
display(paper_trie_words_unique.head(20))

Extracting trie_words from openalex_sample and papers...


Unnamed: 0,id,trie_words
213225,https://openalex.org/W2228141297,proposed
213225,https://openalex.org/W2228141297,paper
213225,https://openalex.org/W2228141297,center
213225,https://openalex.org/W2228141297,half
213225,https://openalex.org/W2228141297,challenge
213225,https://openalex.org/W2228141297,extreme
213225,https://openalex.org/W2228141297,considered
213225,https://openalex.org/W2228141297,chemical
213225,https://openalex.org/W2228141297,result
213225,https://openalex.org/W2228141297,order


Got 13259342 rows from openalex_trie_words


Unnamed: 0,id,trie_words
0,96244316557537553621242627677337244909,lengthy and expensive
0,96244316557537553621242627677337244909,process
0,96244316557537553621242627677337244909,space
0,96244316557537553621242627677337244909,potential
0,96244316557537553621242627677337244909,molecule
0,96244316557537553621242627677337244909,large
0,96244316557537553621242627677337244909,common technique
0,96244316557537553621242627677337244909,drug discovery
0,96244316557537553621242627677337244909,start
0,96244316557537553621242627677337244909,molecule


Got 18289757 rows from paper_trie_words
Total abstract length of openalex_sample: 54623909
Total abstract length of papers: 70151135
Grouping trie_words and counting their occurrences...


Unnamed: 0,trie_words,id,count,ratio_openalex
47916,result,"[https://openalex.org/W2228141297, https://ope...",157094,0.002876
55766,study,"[https://openalex.org/W2050129006, https://ope...",140240,0.002567
34248,method,"[https://openalex.org/W2228141297, https://ope...",107920,0.001976
23522,group,"[https://openalex.org/W1492999398, https://ope...",102945,0.001885
40033,patient,"[https://openalex.org/W1528549543, https://ope...",97290,0.001781
14947,effect,"[https://openalex.org/W2023672664, https://ope...",92043,0.001685
57020,system,"[https://openalex.org/W2156224029, https://ope...",84236,0.001542
1935,analysis,"[https://openalex.org/W2156224029, https://ope...",71567,0.00131
3660,based,"[https://openalex.org/W2023672664, https://ope...",68512,0.001254
5771,cell,"[https://openalex.org/W1994539623, https://ope...",68479,0.001254


Unnamed: 0,trie_words,id,count,ratio_pwc
148469,model,"[242780927947307754204161452189440138426, 2613...",254123,0.003623
145989,method,"[70071698200539603131760949540610170908, 24278...",218245,0.003111
188985,propose,"[242780927947307754204161452189440138426, 2427...",168210,0.002398
218670,show,"[96244316557537553621242627677337244909, 24633...",162504,0.002316
172671,paper,"[31966318194299414183392933111234958600, 70071...",142659,0.002034
9806,approach,"[59950857112407915654174502172742604926, 24633...",127762,0.001821
187294,problem,"[70071698200539603131760949540610170908, 24633...",118782,0.001693
207310,result,"[70071698200539603131760949540610170908, 24278...",118336,0.001687
49509,data,"[137074317915818717844637189787402663326, 3274...",118123,0.001684
7295,algorithm,"[246332513827943080783912584084585409747, 2463...",110808,0.00158


In [12]:
def kl_divergence_with_dynamic_penalty(p, q, base_penalty_factor=1.5):
    """
    Calculate the Kullback-Leibler divergence with a dynamic penalty when P is greater than Q.
    The penalty increases linearly with the difference P-Q for a more linear gradient in the divergence.

    :param p: Probability of the event in distribution P.
    :param q: Probability of the event in distribution Q.
    :param base_penalty_factor: The base factor by which P is penalized.
    :return: KL divergence with dynamic penalty for P.
    """
    # Replace zeros with a very small number to avoid division by zero and log of zero
    p = max(p, 1e-10)
    q = max(q, 1e-10)
    
    # Calculate dynamic penalty based on the difference P-Q
    penalty_factor = base_penalty_factor if p <= q else base_penalty_factor + (p - q)
    
    p_adjusted = p ** penalty_factor

    return p_adjusted * np.log2(p_adjusted / q)

def hellingerdistance(p, q):
    return np.sqrt(np.sum((np.sqrt(p) - np.sqrt(q)) ** 2)) / np.sqrt(2)

# Make a new dataframe by merging on the trie_words column
print("Merging openalex_trie_words_unique and paper_trie_words_unique...")
merged = openalex_trie_words_unique.merge(paper_trie_words_unique, on='trie_words', how='outer').fillna(0)
merged['ratio_pwc'] = merged['ratio_pwc'].replace(0, 0.0000001)
merged['ratio_openalex'] = merged['ratio_openalex'].replace(0, 0.0000001)

# Calculate the KL Divergence
print("Calculating KL Divergence...")
merged['kl_divergence_positive'] = merged.progress_apply(lambda x: kl_divergence_with_dynamic_penalty(x['ratio_pwc'], x['ratio_openalex']), axis=1)
# # Calculate the Hellinger Distance
# print("Calculating Hellinger Distance...")
# merged['hellinger_distance'] = merged.progress_apply(lambda x: hellingerdistance(x['ratio_pwc'], x['ratio_openalex']), axis=1)
# # Combine the KL Divergence and Hellinger Distance into one score
# print("Combining KL Divergence and Hellinger Distance into one score...")
# merged['kl_hd_score'] = merged['kl_divergence_positive'] * merged['hellinger_distance']

Merging openalex_trie_words_unique and paper_trie_words_unique...
Calculating KL Divergence...


  0%|          | 0/279234 [00:00<?, ?it/s]

In [13]:
# Normalize the KL Divergence between 0 and 1
print("Normalizing KL Divergence between 0 and 1...")
merged['kl_divergence_normalized'] = (merged['kl_divergence_positive'] - merged['kl_divergence_positive'].min()) / (merged['kl_divergence_positive'].max() - merged['kl_divergence_positive'].min())
merged.sort_values(by='kl_divergence_normalized', ascending=False, inplace=True)
display(merged.head(30))

Normalizing KL Divergence between 0 and 1...


Unnamed: 0,trie_words,id_x,count_x,ratio_openalex,id_y,count_y,ratio_pwc,kl_divergence_positive,kl_divergence_normalized
17251,deep neural network,"[https://openalex.org/W4206512790, https://ope...",10.0,1.8307e-07,"[75642846615310274254135045340790875869, 21138...",17063.0,0.000243,1.7e-05,1.0
1453,dataset,"[https://openalex.org/W2090497959, https://ope...",994.0,1.819716e-05,"[31966318194299414183392933111234958600, 11739...",63597.0,0.000907,1.6e-05,0.998525
3225,neural network,"[https://openalex.org/W2473511803, https://ope...",164.0,3.002348e-06,"[11739083166608620618492938660983958288, 89739...",29852.0,0.000426,1.4e-05,0.995117
1536,datasets,"[https://openalex.org/W3043164622, https://ope...",883.0,1.616508e-05,"[332016691809592502767510544244005600176, 3379...",56510.0,0.000806,1.1e-05,0.991401
5515,extensive experiment,"[https://openalex.org/W2947493533, https://ope...",67.0,1.226569e-06,"[73897822424382435106402992514726513771, 27143...",21095.0,0.000301,1.1e-05,0.99085
7055,deep learning,"[https://openalex.org/W3130271948, https://ope...",45.0,8.238151e-07,"[28709436390441121994857486349023068774, 10614...",16884.0,0.000241,8e-06,0.986471
9698,training data,"[https://openalex.org/W4281683383, https://ope...",27.0,4.94289e-07,"[165469899874014190454671833381971900642, 8735...",14386.0,0.000205,8e-06,0.985532
4192,classifier,"[https://openalex.org/W2490634696, https://ope...",105.0,1.922235e-06,"[26845509869662012637019735410549572324, 26845...",19019.0,0.000271,5e-06,0.982116
17863,convolutional neural network,"[https://openalex.org/W2008839571, https://ope...",10.0,1.8307e-07,"[11739083166608620618492938660983958288, 24278...",9870.0,0.000141,5e-06,0.981973
20526,cnns,"[https://openalex.org/W4362715709, https://ope...",7.0,1.28149e-07,"[261398715338588505394450378291024309282, 1021...",9131.0,0.00013,5e-06,0.981859


In [14]:
merged.sort_values(by='kl_divergence_normalized', ascending=True, inplace=True)
display(merged.head(30))

Unnamed: 0,trie_words,id_x,count_x,ratio_openalex,id_y,count_y,ratio_pwc,kl_divergence_positive,kl_divergence_normalized
2,method,"[https://openalex.org/W2228141297, https://ope...",107920.0,0.001976,"[70071698200539603131760949540610170908, 24278...",218245.0,0.003111,-0.000607,0.0
17,model,"[https://openalex.org/W2023672664, https://ope...",58650.0,0.001074,"[242780927947307754204161452189440138426, 2613...",254123.0,0.003623,-0.000499,0.173001
0,result,"[https://openalex.org/W2228141297, https://ope...",157094.0,0.002876,"[70071698200539603131760949540610170908, 24278...",118336.0,0.001687,-0.000372,0.375781
13,show,"[https://openalex.org/W2228141297, https://ope...",59854.0,0.001096,"[96244316557537553621242627677337244909, 24633...",162504.0,0.002316,-0.000366,0.386007
11,paper,"[https://openalex.org/W2228141297, https://ope...",61498.0,0.001126,"[31966318194299414183392933111234958600, 70071...",142659.0,0.002034,-0.000331,0.442805
8,based,"[https://openalex.org/W2023672664, https://ope...",68512.0,0.001254,"[246332513827943080783912584084585409747, 2870...",108767.0,0.00155,-0.000266,0.546756
23,data,"[https://openalex.org/W2156224029, https://ope...",52833.0,0.000967,"[137074317915818717844637189787402663326, 3274...",118123.0,0.001684,-0.000262,0.552487
6,system,"[https://openalex.org/W2156224029, https://ope...",84236.0,0.001542,"[96244316557537553621242627677337244909, 96244...",96182.0,0.001371,-0.00025,0.572206
40,problem,"[https://openalex.org/W2023672664, https://ope...",37970.0,0.000695,"[70071698200539603131760949540610170908, 24633...",118782.0,0.001693,-0.00023,0.60372
53,approach,"[https://openalex.org/W1492999398, https://ope...",33013.0,0.000604,"[59950857112407915654174502172742604926, 24633...",127762.0,0.001821,-0.000229,0.605818


In [15]:
# Merge the kl_divergence_normalized column into the dict_df on keyword (left)
print("Merging kl_divergence_normalized into dict_df...")
new_dict_df = dict_df.merge(merged[['trie_words', 'kl_divergence_normalized']], left_on='keyword', right_on='trie_words', how='left').fillna(1)
new_dict_df.drop(columns=['trie_words'], inplace=True)


Merging kl_divergence_normalized into dict_df...


In [16]:
# Sort for the kl_divergence_normalized column
new_dict_df.sort_values(by='kl_divergence_normalized', ascending=False, inplace=True)
# Print me the count of all rows where kl_divergence_normalized is 1
print(f"Number of rows where kl_divergence_normalized is 1: {len(new_dict_df[new_dict_df['kl_divergence_normalized'] == 1])}")
display(new_dict_df.head(30))

Number of rows where kl_divergence_normalized is 1: 61360


Unnamed: 0,keyword,source,embedding,dict,trie_id,kl_divergence_normalized
155030,non-convex energy function,method,"[-0.31189224123954773, 0.2624879777431488, 0.2...",extended,155030,1.0
56979,small-sample trial,cso,"[-0.08517652750015259, 0.45226967334747314, -1...",extended,56979,1.0
56975,sub-sample size,cso,"[-0.29055795073509216, 0.7271391749382019, -1....",extended,56975,1.0
56973,non-dominated sorting rule,cso,"[0.6270943284034729, -0.4033014476299286, -0.4...",extended,56973,1.0
56972,non-dominated sorting approach,cso,"[0.766659677028656, -0.5265238881111145, -0.32...",extended,56972,1.0
233868,related recent scheme,method,"[0.6913132667541504, 0.04486856609582901, -0.5...",extended,233868,1.0
172406,complex dialogue task,method,"[-0.10094913095235825, 0.8068733215332031, 0.9...",extended,172406,1.0
56967,widely-used hyperspectral datasets,cso,"[0.9675602316856384, -0.896249532699585, 0.313...",extended,56967,1.0
200418,much-simplified version,method,"[-0.20040249824523926, 0.46404099464416504, -0...",extended,200418,1.0
172405,multi-turn dialogue scenario,method,"[-0.32410046458244324, 0.781856894493103, 0.54...",extended,172405,1.0


In [17]:
# Save the dict_df as csv
print("Saving dict_df as csv...")
new_dict_df.to_csv("data/dictionaries/dictionary_kl.csv", index=False)

Saving dict_df as csv...
