# Pre-processing Pipeline

### [Pre-processing - 1] Add embeddings from each paperId

In [None]:
import pandas as pd
import json

# NOTE: this path is the only thing to edit (point it at the latest papers JSON)
# before running pre-processing steps 1 and 2.
papers_json_path = r'C:\Users\1kevi\Desktop\projects\Research\autoscious-carbon-capture\knowledge_base\papers\23-07-25_11935_database_update.json'

# Parse the raw paper records from disk.
with open(papers_json_path, 'r') as papers_file:
    data = json.loads(papers_file.read())

# One DataFrame row per paper record.
df = pd.DataFrame(data)

# Show the first rows as a quick sanity check.
preview = df.head()
print(preview)

In [None]:
# Report how many rows have a 'text' value and how many are still missing one.
text_present_count = df['text'].notna().sum()
text_missing_count = df['text'].isna().sum()
print("# text not None: ", text_present_count, "# None: ", text_missing_count)

In [None]:
import numpy as np

# Only rows whose 'text' is missing receive a fallback value.
condition = df['text'].isna()
subset_df = df.loc[condition]

# Shared "Title: <title>" prefix for both fallback variants.
title_prefix = 'Title: ' + subset_df['title'].astype(str)

# Fallback when the abstract is missing as well: title only.
value_when_abstract_na = title_prefix + '.'

# Fallback when an abstract exists: title followed by the abstract.
value_when_true = title_prefix + '. Abstract: ' + subset_df['abstract'].astype(str)

# Choose the fallback row by row and write it into the missing rows only.
abstract_missing = subset_df['abstract'].isna()
df.loc[condition, 'text'] = np.where(abstract_missing, value_when_abstract_na, value_when_true)

In [None]:
# Re-check the 'text' coverage after filling in the fallbacks.
text_present_count = df['text'].notna().sum()
text_missing_count = df['text'].isna().sum()
print("# text not None: ", text_present_count, "# None: ", text_missing_count)

In [None]:
df[df['embedding'].notna()]

In [None]:
# Report how many rows already carry an embedding and how many do not.
embedding_present_count = df['embedding'].notna().sum()
embedding_missing_count = df['embedding'].isna().sum()
print("# embedding not None: ", embedding_present_count, "# None: ", embedding_missing_count)

In [None]:
# Add embeddings based on text column for new rows
import torch
from transformers import AutoTokenizer, AutoModel

# Load the pretrained tokenizer and model from the same SPECTER checkpoint.
specter_checkpoint = "allenai/specter"
tokenizer = AutoTokenizer.from_pretrained(specter_checkpoint)
model = AutoModel.from_pretrained(specter_checkpoint)

In [None]:
# Function to get SPECTER embedding
def get_specter_embedding(text):
    """Return the SPECTER embedding of `text` as the string form of a list.

    The text is tokenized (truncated to at most 512 tokens) with the
    module-level `tokenizer`, passed through the module-level `model` without
    gradient tracking, and the first model output is averaged over dimension 1
    and squeezed.

    Args:
        text: The text to embed.

    Returns:
        `str(list)` of the embedding values.
    """
    encoded = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors='pt')

    # Inference only: no gradients are needed.
    with torch.no_grad():
        model_outputs = model(**encoded)
        pooled = model_outputs[0].mean(dim=1).squeeze()

    # tensor -> numpy -> list -> str
    return str(pooled.numpy().tolist())

# Index labels of the rows that still have no embedding.
embedding_isna_indices = df.index[df['embedding'].isna()]
max_index = embedding_isna_indices.max()

# Embed each of those rows from its 'text' and store the result in 'embedding'.
for idx in embedding_isna_indices:
    print(f"idx {idx} / {max_index}")
    row_text = df.loc[idx, 'text']
    df.loc[idx, 'embedding'] = get_specter_embedding(row_text)

In [None]:
# Re-check the embedding coverage after computing the missing ones.
embedding_present_count = df['embedding'].notna().sum()
embedding_missing_count = df['embedding'].isna().sum()
print("# embedding not None: ", embedding_present_count, "# None: ", embedding_missing_count)

### [Pre-processing - 2] Generating t-SNE x, y coordinates from embeddings

In [None]:
# Report how many rows already have an 'x' coordinate and how many do not.
x_present_count = df['x'].notna().sum()
x_missing_count = df['x'].isna().sum()
print("# x not None: ", x_present_count, "# None: ", x_missing_count)

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import ast

# Work on an explicit copy of the rows that have an embedding, so the parsing
# below does not assign into a slice of df (avoids SettingWithCopyWarning).
df_temp = df[df['embedding'].notna()].copy()

# Embeddings are stored as the string form of a list; parse them back into
# numpy arrays. Values that are already lists/arrays are converted directly
# instead of being passed to ast.literal_eval (which only accepts strings).
# TODO: parsing every string with ast.literal_eval is probably slow
print("converting all embeddings to lists")
df_temp['embedding'] = df_temp['embedding'].apply(
    lambda x: np.array(ast.literal_eval(x) if isinstance(x, str) else x)
)

# Stack all embeddings into one 2-D array (one row per paper).
embeddings = np.vstack(df_temp['embedding'].tolist())

# Compute t-SNE (fixed random_state for a repeatable layout).
print("computing tsne")
tsne = TSNE(n_components=2, random_state=0)
embeddings_2d = tsne.fit_transform(embeddings)

# Write the 2-D coordinates back to the original DataFrame, aligned by index
# label, for exactly the rows that had an embedding.
print("updating df")
df.loc[df_temp.index, 'x'] = embeddings_2d[:, 0]
df.loc[df_temp.index, 'y'] = embeddings_2d[:, 1]

# Scatter plot of all papers in the 2-D t-SNE space (uniform marker size).
plt.figure(figsize=(10, 10))
plt.scatter(df['x'], df['y'], alpha=0.5, label='All papers')
plt.show()


In [None]:
from datetime import datetime

# `os` is only imported in a later cell, so import it here to keep this cell
# runnable on its own.
import os

# Timestamped output location:
#   papers/<yy-mm-dd>/<HH-MM-SS>_tsne_output_<number of rows>.json
now = datetime.now()
date_str = now.strftime('%y-%m-%d')
time_str = now.strftime('%H-%M-%S')
folder_path = f'papers/{date_str}'

# Create the dated folder if needed (no error when it already exists).
os.makedirs(folder_path, exist_ok=True)

df.to_json(f'{folder_path}/{time_str}_tsne_output_{df.shape[0]}.json', orient='records')

In [None]:
# Re-check the 'x' coordinate coverage after running t-SNE.
x_present_count = df['x'].notna().sum()
x_missing_count = df['x'].isna().sum()
print("# x not None: ", x_present_count, "# None: ", x_missing_count)

### (Skip for now) Generating edges from embedding similarity & direct citations

In [None]:
import pandas as pd
import json

# Read the extracted results (with embeddings) from disk.
with open(r'C:\Users\1kevi\Desktop\projects\Research\autoscious-carbon-capture\data_collection\openalex\extracted_results_with_embeddings.json', 'r') as results_file:
    data = json.loads(results_file.read())

# Build the DataFrame and mirror the 'id' column under the name 'paperId'.
df = pd.DataFrame(data)
df['paperId'] = df['id']

# Keep only the rows whose embedding is non-empty.
has_embedding = df['embedding'].apply(len) != 0
df = df[has_embedding]

# Show the first rows as a quick sanity check.
print(df.head())

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_distances
from sklearn.preprocessing import MinMaxScaler
import json


In [None]:
# Pairwise embedding distances, rescaled to the range [0, 100].

# Expand each embedding list into one row of a numeric table and discard rows
# that contain missing values.
embedding_df = pd.DataFrame(df['embedding'].tolist()).dropna()

# Pairwise cosine distances between all remaining rows.
distances = cosine_distances(embedding_df)

# Scale all distances together: flatten the matrix into a single column for
# the scaler, then restore the original matrix shape.
scaler = MinMaxScaler(feature_range=(0, 100))
flat_distances = distances.reshape(-1, 1)
distances_scaled = scaler.fit_transform(flat_distances).reshape(distances.shape)


In [None]:
# Build one undirected edge per unordered pair of papers (i < j, so no
# duplicates and no self-edges), weighted by the scaled cosine distance.

# embedding_df was built from a positional list of embeddings, so its index
# labels are row positions in df. Rows removed by dropna() are absent from the
# distance matrix; going through those positions pairs matrix row i with the
# paper it was computed from instead of with df.iloc[i].
kept_positions = embedding_df.index.to_numpy()
paper_ids = df['paperId'].to_numpy()[kept_positions]

# Upper-triangle index pairs in row-major order (k=1 excludes the diagonal),
# i.e. the same (i, j) order as a nested "for i / for j in range(i+1, n)" loop.
pair_i, pair_j = np.triu_indices(distances.shape[0], k=1)

source = paper_ids[pair_i].tolist()
target = paper_ids[pair_j].tolist()
weights = distances_scaled[pair_i, pair_j].tolist()

# Create the edges DataFrame
edges_df = pd.DataFrame({'id': range(len(source)), 'weight': weights, 'source': source, 'target': target})

# The index-array lookups above replace the per-pair Python loop
# (which took about 14 minutes for 2000 nodes).

In [None]:
# Serialize the edge table as pretty-printed JSON records.
edges_json = edges_df.to_json(orient='records', indent=4)

# Persist the serialized edges to disk.
with open('edges/edges_2000.json', 'w') as edges_file:
    edges_file.write(edges_json)

# Parse the JSON text back into Python objects (one record per edge).
edges_dict = json.loads(edges_json)


In [None]:
# Code for creating edges based on direct citations
df["citations"]

### Using LLMs for topic-based clustering

In [None]:
# [GPT Topic Labeling - 3]

import pandas as pd
import json

# Path of the latest papers file (output of the t-SNE step).
papers_json_path = r'C:\Users\1kevi\Desktop\projects\Research\autoscious-carbon-capture\knowledge_base\papers\23-07-25_11935_tsne_output.json'

# Parse the raw paper records from disk.
with open(papers_json_path, 'r') as papers_file:
    data = json.loads(papers_file.read())

# One DataFrame row per paper record.
df = pd.DataFrame(data)

# Show the first rows as a quick sanity check.
preview = df.head()
print(preview)

In [1]:
# [GPT Topic Labeling - 2] Using GPT3.5 to generate topics
# Improvement: add rate limiting error handling so you don't hardcode the wait time

# imports
%reload_ext dotenv
%dotenv
import os
import openai
# openai.api_key = os.getenv("OPENAI_GPT4_API_KEY")
openai.api_key = os.getenv("OPENAI_API_KEY")


# Model identifiers used for OpenAI API calls.
# The ada embedding model is published as "text-embedding-ada-002";
# there is no "-003" variant.
EMBEDDING_MODEL = "text-embedding-ada-002"
GPT_MODEL = "gpt-3.5-turbo"
# GPT_MODEL = "gpt-4"

# for bulk openai message, no stream
def chat_openai(prompt="Tell me to ask you a prompt", model=GPT_MODEL, chat_history=None):
    """Send `prompt` to the OpenAI chat completion API and return the reply.

    Args:
        prompt: User message appended to the conversation.
        model: Name of the chat model to call.
        chat_history: Optional list of earlier message dicts. When it is None
            or empty, a fresh conversation starting with the default system
            message is used. The list passed in is copied, never modified.

    Returns:
        A `(text_answer, messages)` tuple: the reply text and the full
        conversation, including the new user and assistant messages.
    """
    # define message conversation for model
    if chat_history:
        # Copy so the caller's list is not mutated. An empty history was never
        # mutated either, so callers already have to use the returned list.
        messages = list(chat_history)
    else:
        messages = [
            {"role": "system", "content": "You are an educated carbon capture research consultant and a generally educated and helpful researcher and programmer. Answer as correctly, clearly, and concisely as possible."},
        ]
    messages.append({"role": "user", "content": prompt})

    # create the chat completion (temperature=0 as in every call in this notebook)
    print("Prompt: ", prompt)
    completion = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=0
    )
    print("Completion info: ", completion)
    text_answer = completion["choices"][0]["message"]["content"]

    # updated conversation history
    messages.append({"role": "assistant", "content": text_answer})

    return text_answer, messages

#### 1. Seeding – Ask GPT4 to be a domain expert and use “expert knowledge” to seed an initial taxonomy definition of main classes, subclasses, and divisions

In [None]:
seed_initial_taxonomy_prompt = '''
Create a taxonomy of all carbon capture research areas. Be as mutually exclusive, completely exhaustive (MECE) and concise as possible. Be sure to also include a "General" category for information like literature reviews and updates, a "Miscellaneous" category for concepts that have yet to be covered by an appropriate category and non-carbon capture related concepts, and use multilevel numbering. Create as many levels breadth-wise and depth-wise as appropriate.
'''

In [None]:
res = chat_openai(seed_initial_taxonomy_prompt)
res[0]

In [None]:
initial_taxonomy = res[0]
print(initial_taxonomy)

In [None]:
# jsonify_taxonomy_prompt = f'''
# {initial_taxonomy}

# Put the above hierarchy of categories and sub-categories into a JSON format of 1. unique id (integer), 2. name of the category, 3. layer in the hierarchy (integer), and 4. content (a list of subcategories) 

# The output should be of this format: [
#     {{
#         "cluster_id": 0,
#         "name": "General"
#         "layer": 0,
#         "content": [...],
#     }}
# ]
# '''

#### [Pre-processing - 3] 2. Initial categorization – Use GPT3.5 to look at 1) the seeded taxonomy and 2) paper titles and abstracts, then 3) categorize each paper by its title and abstract, adapting the seeded taxonomy as necessary.

In [None]:
# Pre-processing step 1 and df should've already updated the text column and there should already be a classification_ids column
# # [GPT Topic Labeling - 3] Create a new text column
# df['text'] = df.apply(lambda row: 'Title: ' + row['title'] + '.' if pd.isna(row['abstract']) else 'Title: ' + row['title'] + '. Abstract: ' + row['abstract'], axis=1)

# # [GPT Topic Labeling - 4]
df['classification_ids'] = pd.Series(dtype='object')
# # df.head()

In [None]:
# [GPT Topic Labeling - 4.5, see everything for debugging]
pd.set_option('display.max_rows', None)

In [None]:
def load_latest_taxonomy_papers():
    """Load the latest papers table and taxonomy text from their fixed paths.

    Returns:
        A `(numbered_taxonomy, df)` tuple: the taxonomy file contents as a
        string and a DataFrame built from the papers JSON file.
    """
    papers_path = r'C:\Users\1kevi\Desktop\projects\Research\autoscious-carbon-capture\knowledge_base\papers\latest_papers.json'
    taxonomy_path = r'C:\Users\1kevi\Desktop\projects\Research\autoscious-carbon-capture\knowledge_base\clusters\latest_taxonomy.txt'

    # Papers first, then the taxonomy text.
    with open(papers_path, 'r') as papers_file:
        records = json.load(papers_file)
    df = pd.DataFrame(records)

    with open(taxonomy_path, 'r') as taxonomy_file:
        numbered_taxonomy = taxonomy_file.read()

    return numbered_taxonomy, df

In [None]:
from datetime import datetime
import os

def save_taxonomy_papers_note(numbered_taxonomy, df, note):
    """Write a timestamped snapshot of the taxonomy and papers, then refresh the "latest" files.

    Args:
        numbered_taxonomy: Taxonomy text written to the .txt files.
        df: Papers DataFrame written as JSON records; it must contain the
            'title' and 'classification_ids' columns used for the
            manual-inspection file.
        note: Free-form suffix included in the snapshot file names.
    """
    print("Saving taxonomy and df to papers with note")

    timestamp = datetime.now()
    date_str = timestamp.strftime('%y-%m-%d')
    time_str = timestamp.strftime('%H-%M-%S')

    # Make sure both dated snapshot folders exist.
    for dated_dir in (f'clusters/{date_str}', f'papers/{date_str}'):
        if not os.path.exists(dated_dir):
            os.makedirs(dated_dir)

    # Snapshot: taxonomy as .txt, papers as .json, plus a compact inspection file.
    stem = f'{time_str}_{df.shape[0]}_{note}'
    with open(f'clusters/{date_str}/{stem}.txt', 'w') as snapshot_file:
        snapshot_file.write(numbered_taxonomy)
    df.to_json(f'papers/{date_str}/{stem}.json', orient='records')
    inspection_view = df[['title', 'classification_ids']]
    inspection_view.to_json(f'papers/{date_str}/{stem}_manual_inspection.json', orient='records', indent=2)

    # Overwrite the "latest" copies.
    with open('clusters/latest_taxonomy.txt', 'w') as latest_file:
        latest_file.write(numbered_taxonomy)
    df.to_json('papers/latest_papers.json', orient='records')

    print("Finished saving taxonomy and df to papers with note")

In [None]:
def retrieve_update_taxonomy_extract_keywords_prompt(taxonomy, papers):
    """Build the prompt asking the model to update the taxonomy and classify papers.

    Args:
        taxonomy: Current taxonomy text, inserted under "Input Taxonomy".
        papers: Paper listing text, inserted under "Papers (id : text)".

    Returns:
        The formatted prompt string.
    """
    return f'''
Task:
The task is to 1) update the taxonomy of all carbon capture research by re-arranging or adding new categories and levels as appropriate and 2) correctly extract the most relevant and important paper text keywords to use them to classify each paper below into its top 3 matching categories in the updated taxonomy. 

Rules and Instructions:
1. For the taxonomy, be as mutually exclusive, completely exhaustive (MECE) and concise as possible. Try to avoid repetition and overlap. 
2. Ensure that the taxonomy is readable and not overwhelming. Try to model the taxonomy to reflect the usabilty and usefulness of great classification systems like Dewey Decimal System and Library of Congress Classification.
3. Use a hierarchical structure to manage the breadth and depth of the categories effectively, with the broadest categories at top and these categories becoming more specific as you go down the hierarchy. A general rule of thumb is to have around 10 top-level categories and 10 sub-categories for every parent category.
4. For paper keyword extraction and classification, be as accurate and grounded in the extracted paper text keywords as possible. 

Papers (id : text): 
{papers}

Input Taxonomy (category id : category name):
{taxonomy}

The output should be in the format of: 
1. "UPDATED TAXONOMY: " -- a readable and updated MECE multilevel taxonomy with any re-arrangement of categories or new categories and levels added.

2. "PAPER CLASSIFICATION: 
    [
        paper id : [[paper text keywords, corresponding category id], [paper text keywords, corresponding category id], etc.], 
        paper id : [[paper text keywords, corresponding category id], etc.], 
        etc.
    ]" 
    -- a JSON of each paper id with a relevance ranked list of paper text keywords and corresponding category id, with everything being strings. Rank by most to least relevant category to so that anyone looking for all papers about a category can find the most relevant papers to the category.
'''

In [None]:
def retrieve_taxonomy_mapping_prompt(old_taxonomy, new_taxonomy):
    """Build the prompt asking the model to map old category ids to new ones.

    Args:
        old_taxonomy: Taxonomy text inserted under "Input Taxonomy".
        new_taxonomy: Taxonomy text inserted under "Updated Taxonomy".

    Returns:
        The formatted prompt string.
    """
    mapping_prompt = f'''
Task:
The task is to match each Input Taxonomy category id to its closest Updated Taxonomy category id based on category names.

Rules:
1. Be as clear and correct as possible.

Input Taxonomy (id : name):
{old_taxonomy}

Updated Taxonomy (id : name):
{new_taxonomy}

The output should be in the following JSON format: 
"UPDATED CATEGORY IDS: [
{{ "Input Taxonomy category id" : "Updated Taxonomy category id"}},
{{ "Input Taxonomy category id" : "Updated Taxonomy category id"}},
etc.]" 
-- List every category id in Input Taxonomy and its closest category id in Updated Taxonomy based on category name. Use double quotes around each id.
'''
    return mapping_prompt

In [None]:
# Trying token-optimizing keyword classification instead of paper
def retrieve_classify_keywords_prompt(taxonomy, keywords):
    """Build the prompt asking the model to update the taxonomy and classify paper keywords.

    Args:
        taxonomy: Current taxonomy text, inserted under "Input Taxonomy".
        keywords: Listing text, inserted under "Papers (id : keywords)".

    Returns:
        The formatted prompt string.
    """
    # The formatting example at the end must itself be a well-formed list with
    # string ids: the prompt asks for "everything being strings", and the
    # classification text is later parsed with ast.literal_eval, which a reply
    # imitating a malformed example would break.
    classify_keywords_prompt = f'''
Task:
The task is to 1) update the taxonomy of all carbon capture research by re-arranging or adding new categories and levels as appropriate and 2) use the list of paper keywords to classify each paper id and keywords below into its keywords matching category ids in the updated taxonomy. 

Rules and Instructions:
1. For the taxonomy, be as mutually exclusive, completely exhaustive (MECE) and concise as possible. Try to avoid repetition and overlap. 
2. Ensure that the taxonomy is readable and not overwhelming. Try to model the taxonomy to reflect the usability and usefulness of great classification systems like Dewey Decimal System and Library of Congress Classification.
3. Use a hierarchical structure to manage the breadth and depth of the categories effectively, with the broadest categories at top and these categories becoming more specific as you go down the hierarchy. A general rule of thumb is to have around 10 top-level categories and 10 sub-categories for every parent category. Feel free to use as many depth levels as appropriate.
4. For paper keyword classification, be as accurate and grounded in the paper keywords as possible. 

Papers (id : keywords): 
{keywords}

Input Taxonomy (category id : category name):
{taxonomy}

The output should be in the format of: 
1. "UPDATED TAXONOMY: " -- a readable and updated MECE multilevel taxonomy with any re-arrangement of categories or new categories and levels added.

2. "PAPER CLASSIFICATION: 
[
    paper id : [[paper keywords, corresponding category id], [paper keywords, corresponding category id], etc.], 
    paper id : [[paper keywords, corresponding category id], etc.], 
    etc.
]" 
-- a JSON of each paper id with a list of its paper keywords and corresponding Updated Taxonomy category id, with everything being strings.

List formatting example (content and assignments are arbitrary):
Input: 80 : ['carbon capture', 'biology', 'sand']
Output: 80 : [['carbon capture', '8'], ['biology', '10'], ['sand', '9.3']]
'''

    return classify_keywords_prompt

In [None]:
# [GPT Topic Labeling - 6] Putting everything together to iterate through all papers, update and save taxonomy, and add category ids to each paper
import pandas as pd
import json
import re
import ast  # The Abstract Syntax Trees module

def extract_valid_json_string(json_str):
    """Salvage the longest JSON-parsable prefix of a (possibly truncated) JSON list.

    Candidate prefixes end at each '}' of `json_str`, tried from the last one
    backwards; a closing ']' is appended to each candidate before parsing.

    Args:
        json_str: Text expected to contain a JSON list of objects.

    Returns:
        The first candidate (prefix + ']') that parses as JSON, or None when
        no candidate parses.
    """
    print("EXTRACTING VALID JSONS FROM ", json_str)
    cut = len(json_str)
    while True:
        # Next '}' strictly before the previous cut position.
        cut = json_str.rfind("}", 0, cut)
        if cut == -1:
            return None
        candidate = json_str[:cut + 1] + "]"
        try:
            json.loads(candidate)
        except json.JSONDecodeError:
            continue
        return candidate

def extract_taxonomy_and_classification(chat_output):
    """Split a model reply into the updated taxonomy text and per-paper classifications.

    The reply is expected to contain an 'UPDATED TAXONOMY:' section followed
    by a 'PAPER CLASSIFICATION:' section whose lines look like
    `paper id : [[keywords, category id], ...]`.

    Args:
        chat_output: Raw reply text.

    Returns:
        A `(updated_taxonomy, classification_dict)` tuple. `classification_dict`
        maps each paper id (string, surrounding double quotes removed) to the
        raw, unparsed text of its classification list. It is empty when the
        classification section is missing.
    """
    taxonomy_marker = 'UPDATED TAXONOMY:'
    classification_marker = 'PAPER CLASSIFICATION:'

    # str.find returns -1 for a missing marker; using that value directly in
    # slice arithmetic would silently cut the text at the wrong positions.
    marker_pos = chat_output.find(taxonomy_marker)
    taxonomy_start = 0 if marker_pos == -1 else marker_pos + len(taxonomy_marker)

    classification_pos = chat_output.find(classification_marker)
    if classification_pos == -1:
        # No classification section: everything after the taxonomy marker is taxonomy.
        updated_taxonomy = chat_output[taxonomy_start:].strip()
        print("classification_dict", {})
        return updated_taxonomy, {}

    updated_taxonomy = chat_output[taxonomy_start:classification_pos].strip()

    # The classification section runs up to the last ']' of the reply.
    end_index = chat_output.rfind(']')
    classification_str = chat_output[classification_pos + len(classification_marker):end_index + 1].strip()

    # Keep only lines that hold a complete "id : [[...]]" entry.
    classification_dict = {}
    for line in classification_str.splitlines():
        print("Line: ", line)
        stripped = line.strip()
        key, separator, value = line.partition(":")
        if not separator:
            # e.g. a lone "]]" closing a multi-line entry: nothing to record
            continue
        if stripped.endswith(']],'):
            # drop the trailing comma that separates entries
            value = value[:value.rfind(',')]
        elif stripped.endswith(']]'):
            value = value[:value.rfind(']') + 1]
        else:
            continue
        classification_dict[key.strip().strip('"')] = value.strip()

    print("classification_dict", classification_dict)
    return updated_taxonomy, classification_dict

def extract_taxonomy_mapping(chat_output):
    """Extract the old-id -> new-id category mapping from a model reply.

    The text between the first '[' and the last ']' of the reply is parsed as
    a JSON list of objects, and all key/value pairs of those objects are
    merged into one dict.

    Args:
        chat_output: Raw reply text.

    Returns:
        A dict mapping input-taxonomy category ids to updated-taxonomy
        category ids; empty when the reply contains no bracketed list.

    Raises:
        json.JSONDecodeError: If the bracketed text is not valid JSON.
    """
    print("THIS IS CHAT OUTPUT EXTRACT TAXONOMY MAPPING: ", chat_output)

    # Extracting changed category IDs
    changed_category_start = chat_output.find('[')
    changed_category_end = chat_output.rfind(']')
    print("changed_category_start", changed_category_start, "changed_category_end", changed_category_end)
    changed_category_ids_str = chat_output[changed_category_start:changed_category_end+1].strip()
    print("changed_category_ids_str", changed_category_ids_str)

    changed_category_ids_dict = {}
    if changed_category_ids_str.startswith('[') and changed_category_ids_str.endswith(']'):
        for entry in json.loads(changed_category_ids_str):
            # Merge every pair of every object: the model may put several ids
            # into one object or emit an empty object, and neither case should
            # lose mappings or fail.
            if isinstance(entry, dict):
                changed_category_ids_dict.update(entry)

    print("\nchanged changed_category_ids_dict: ", changed_category_ids_dict)
    return changed_category_ids_dict

In [None]:
# debugging the extract taxonomy and classification for reordering

chat_output = '''
UPDATED TAXONOMY:


'''

extract_taxonomy_and_classification(chat_output)

In [None]:
# [GPT Topic Labeling - 7] read and update BOTH df and numbered_taxonomy from last checkpoint or initial taxonomy
import pandas as pd
# df = pd.read_csv('checkpoints/gpt4_papers_100.csv')
with open('checkpoints/gpt4_2000/taxonomy_0_64.txt', 'r') as f:
    numbered_taxonomy = f.read()

In [None]:
import numpy as np
import ast

def update_classification_ids(classification_ids, changed_category_ids):
    """Remap the category id of each `[keywords, category id]` pair.

    Args:
        classification_ids: A list of `[keywords, category id]` pairs, the
            string form of such a list (parsed with ast.literal_eval), or a
            missing value (None / NaN / empty).
        changed_category_ids: Dict mapping old category ids to new ones.

    Returns:
        A new list in which every pair whose category id appears in
        `changed_category_ids` has that id replaced. Entries with fewer than
        two elements are dropped. Missing or empty input is returned unchanged.
    """
    print("classification_ids", classification_ids)

    # Parse string into actual list if necessary
    if isinstance(classification_ids, str):
        classification_ids = ast.literal_eval(classification_ids)

    # Missing values pass through untouched. NaN is detected by value rather
    # than by identity with np.nan, so any NaN float is recognised.
    if isinstance(classification_ids, float) and np.isnan(classification_ids):
        return classification_ids
    if not classification_ids:
        return classification_ids

    res = []
    for item in classification_ids:
        if len(item) <= 1:
            # no category id to remap; such entries are dropped
            continue
        category_id = item[1]
        if category_id in changed_category_ids:
            res.append([item[0], changed_category_ids[category_id]])
        elif str(category_id) in changed_category_ids:
            # The mapping parsed from the model reply is keyed by strings, but
            # ids inside a parsed classification list may be numbers (8, 9.3).
            res.append([item[0], changed_category_ids[str(category_id)]])
        else:
            res.append(item)

    return res

In [None]:
# Report how many papers already have classification ids and how many do not.
classified_count = df['classification_ids'].notna().sum()
unclassified_count = df['classification_ids'].isna().sum()
print("# classification not None: ", classified_count, "# None: ", unclassified_count)

In [None]:
# [GPT Topic Labeling - 8]
import time
from datetime import datetime

def process_papers(df, numbered_taxonomy):
    """Classify unclassified papers in batches while the model updates the taxonomy.

    Each iteration takes the rows whose 'classification_ids' is missing, packs
    as many truncated paper texts as fit into the prompt budget, asks the model
    for an updated taxonomy plus classifications, asks for an old-id -> new-id
    mapping, applies both to `df` (in place) and writes a checkpoint.

    Args:
        df: Papers DataFrame with 'paperId', 'title', 'text' and
            'classification_ids' columns ('paperId' must come before 'text').
        numbered_taxonomy: Current taxonomy text.

    Returns:
        A `(df, numbered_taxonomy)` tuple with the latest taxonomy, including
        when there is nothing left to classify.
    """
    # Typically 16000 is good for 8K max tokens
    # NOTE: despite the name, this budget is compared against character counts.
    TOTAL_PROMPT_TOKENS = 2500
    CHARS_PER_TEXT = 250
    NUM_BATCHES = TOTAL_PROMPT_TOKENS / CHARS_PER_TEXT # should be more than enough

    now = datetime.now()
    date_str = now.strftime('%y-%m-%d')
    time_str = now.strftime('%H-%M-%S')
    os.makedirs(f'clusters/{date_str}', exist_ok=True)
    os.makedirs(f'papers/{date_str}', exist_ok=True)

    for batch_number in range(0, int(NUM_BATCHES)):
        print(f"--- ITERATION {batch_number} ---")
        subset = df.loc[df['classification_ids'].isna(), 'paperId':'text']
        if subset.empty:
            print("subset was all classified!")
            # Same tuple shape as the normal exit, so callers can always unpack.
            return df, numbered_taxonomy
        min_idx = subset.index.min()
        char_budget = TOTAL_PROMPT_TOKENS - len(numbered_taxonomy)
        print("Checking rows starting from", min_idx, " num paper tokens to use: ", char_budget)
        # Clamp the window start at 0: a negative start would count from the end.
        print("df", df['classification_ids'][max(min_idx - 100, 0):min_idx + 100], "numbered_taxonomy", numbered_taxonomy)

        # Number the papers of this batch 0..n-1 for the prompt and remember
        # which paperId each number stands for; stop when the budget is used up.
        index_to_paperId = {}
        papers = {}
        total_length = 0
        for position, (_, row) in enumerate(subset.iterrows()):
            text = row['text'][:CHARS_PER_TEXT]
            if total_length + len(text) > char_budget:
                break
            papers[position] = text
            index_to_paperId[position] = row['paperId']
            total_length += len(text)
        if not papers:
            # The taxonomy alone exhausts the budget; calling the API without
            # any paper would achieve nothing.
            print("no paper fits into the remaining prompt budget; stopping")
            break
        papers_processed = "".join(f"{index} : {text}\n" for index, text in papers.items())

        # Call OpenAI API to update taxonomy and classify papers
        update_taxonomy_classify_papers_prompt = retrieve_update_taxonomy_extract_keywords_prompt(numbered_taxonomy, papers_processed)
        res = chat_openai(update_taxonomy_classify_papers_prompt)
        updated_taxonomy, paper_classification = extract_taxonomy_and_classification(res[0])
        print("updated taxonomy: ", updated_taxonomy, "paper classification: ", paper_classification)

        # Ensure that you update all previously classified papers' classification ids with the new taxonomy
        taxonomy_mapping_prompt = retrieve_taxonomy_mapping_prompt(numbered_taxonomy, updated_taxonomy)
        res = chat_openai(taxonomy_mapping_prompt)  # call to OpenAI API
        print("Map taxonomies result: ", res[0])
        changed_category_ids = extract_taxonomy_mapping(res[0])
        print("changed category ids: ", changed_category_ids)

        # update classification_ids from paper_classification using index_to_paperId
        for idx, class_ids in paper_classification.items():
            try:
                paper_id = index_to_paperId[int(idx)]
            except (KeyError, ValueError):
                # The model returned an id that was not part of this batch.
                print("skipping classification for unknown paper index: ", idx)
                continue
            # class_ids is the raw classification text (a string), assigned to
            # every row with this paperId.
            df.loc[df['paperId'] == paper_id, 'classification_ids'] = class_ids

        # check and update for any changed paper classification ids because of updated taxonomy
        df['classification_ids'] = df['classification_ids'].apply(update_classification_ids, args=(changed_category_ids,))

        # save the taxonomy (txt) and the papers (json) as a checkpoint
        n = len(papers)

        with open(f'clusters/{date_str}/{time_str}_{df.shape[0]}_{min_idx}_{n}.txt', 'w') as f:
            f.write(updated_taxonomy)
        df.to_json(f'papers/{date_str}/{time_str}_{df.shape[0]}_{min_idx}_{n}.json', orient='records')
        df[['title', 'classification_ids']].to_json(f'papers/{date_str}/{time_str}_{df.shape[0]}_{min_idx}_{n}_manual_analysis.json', orient='records', indent=2)

        numbered_taxonomy = updated_taxonomy

    return df, numbered_taxonomy

In [None]:
# [GPT Topic Labeling - 9.5] read and update BOTH df and numbered_taxonomy from last checkpoint or initial taxonomy
import pandas as pd

# Restore the papers DataFrame from the latest saved papers file.
with open(r'C:\Users\1kevi\Desktop\projects\Research\autoscious-carbon-capture\knowledge_base\papers\latest_papers.json', 'r') as papers_file:
    data = json.loads(papers_file.read())
df = pd.DataFrame(data)

# Restore the taxonomy text from the latest saved taxonomy file.
with open(r'C:\Users\1kevi\Desktop\projects\Research\autoscious-carbon-capture\knowledge_base\clusters\latest_taxonomy.txt', 'r') as taxonomy_file:
    numbered_taxonomy = taxonomy_file.read()

In [None]:
df['classification_ids'].head()

In [None]:
# [GPT Topic Labeling - 9]
df, numbered_taxonomy = process_papers(df, numbered_taxonomy)

# 159 min for 2000 papers, batch size of 40

In [None]:
print(df['classification_ids'].isnull())

In [None]:
# Test: keyword classification
# [GPT Topic Labeling - 8]
import time
import ast

def process_keywords(df, numbered_taxonomy):
    """Re-classify already extracted paper keywords in batches while the model updates the taxonomy.

    Each iteration takes the rows whose 'classification_ids' is a list of
    `[keywords, category id]` pairs, packs as many keyword lists as fit into
    the prompt budget, asks the model for an updated taxonomy plus keyword
    classifications, asks for an old-id -> new-id mapping, applies both to
    `df` (in place) and writes a checkpoint.

    Any exception is printed and swallowed so that the progress made so far is
    still returned.

    Args:
        df: Papers DataFrame with 'paperId', 'title' and 'classification_ids'.
        numbered_taxonomy: Current taxonomy text.

    Returns:
        A `(df, numbered_taxonomy)` tuple with the latest taxonomy, including
        when no row qualifies or an error occurred.
    """
    try:
        # Typically 16000 is good for 8K max tokens
        # NOTE: despite the name, this budget is compared against character counts.
        TOTAL_PROMPT_TOKENS = 5000
        CHARS_PER_TEXT = 250
        NUM_BATCHES = TOTAL_PROMPT_TOKENS / CHARS_PER_TEXT # should be more than enough

        now = datetime.now()
        date_str = now.strftime('%y-%m-%d')
        time_str = now.strftime('%H-%M-%S')
        os.makedirs(f'clusters/{date_str}', exist_ok=True)
        os.makedirs(f'papers/{date_str}', exist_ok=True)

        for batch_number in range(0, int(NUM_BATCHES)):
            print(f"--- ITERATION {batch_number} ---")
            # only select rows whose classification_ids is a list (keywords have been extracted)
            # NOTE(review): the results are written back into 'classification_ids'
            # and update_classification_ids turns strings back into lists, so the
            # same rows look selectable again on the next iteration. Comments here
            # mention 'keyword_classification_ids' - confirm the intended column.
            subset_cols = df[['paperId', 'classification_ids']]
            subset = subset_cols[subset_cols['classification_ids'].apply(lambda x: isinstance(x, list))]
            if subset.empty:
                print("subset was all classified!")
                # Same tuple shape as the normal exit, so callers can always unpack.
                return df, numbered_taxonomy
            min_idx = subset.index.min()
            char_budget = TOTAL_PROMPT_TOKENS - len(numbered_taxonomy)

            print("Checking rows starting from", min_idx, " num paper tokens to use: ", char_budget)
            # Clamp the window start at 0: a negative start would count from the end.
            print("df", df['classification_ids'][max(min_idx - 50, 0):min_idx + 50], "numbered_taxonomy", numbered_taxonomy)

            # Number the papers of this batch 0..n-1 for the prompt and remember
            # which paperId each number stands for; stop when the budget is used up.
            index_to_paperId = {}
            papers = {}
            total_length = 0
            for position, (_, row) in enumerate(subset.iterrows()):
                classification_ids = row['classification_ids']
                keywords = str([item[0] for item in classification_ids])
                # 10K for GPT3.5, 15K for GPT4
                if total_length + len(keywords) > char_budget:
                    break
                papers[position] = keywords
                index_to_paperId[position] = row['paperId']
                total_length += len(keywords)
            if not papers:
                # The taxonomy alone exhausts the budget; calling the API
                # without any keywords would achieve nothing.
                print("no keyword list fits into the remaining prompt budget; stopping")
                break
            papers_processed = "".join(f"{index} : {kw}\n" for index, kw in papers.items())

            # Call OpenAI API to update taxonomy and classify papers
            update_taxonomy_prompt = retrieve_classify_keywords_prompt(numbered_taxonomy, papers_processed)
            res = chat_openai(update_taxonomy_prompt)
            updated_taxonomy, paper_classification = extract_taxonomy_and_classification(res[0])
            print("updated taxonomy: ", updated_taxonomy)
            print("paper classification: ", paper_classification)

            # Ensure that you update all previously classified papers' classification ids with the new taxonomy
            taxonomy_mapping_prompt = retrieve_taxonomy_mapping_prompt(numbered_taxonomy, updated_taxonomy)
            res = chat_openai(taxonomy_mapping_prompt)
            print("Map taxonomies result: ", res[0])
            changed_category_ids = extract_taxonomy_mapping(res[0])
            print("changed category ids: ", changed_category_ids)

            # update classification_ids using index_to_paperId
            for idx, class_ids in paper_classification.items():
                try:
                    paper_id = index_to_paperId[int(idx)]  # map index back to paperId
                except (KeyError, ValueError):
                    # The model returned an id that was not part of this batch.
                    print("skipping classification for unknown paper index: ", idx)
                    continue
                # class_ids is the raw classification text (a string), assigned
                # to every row with this paperId.
                df.loc[df['paperId'] == paper_id, 'classification_ids'] = class_ids

            # check and update for any changed paper classification ids because of updated taxonomy
            df['classification_ids'] = df['classification_ids'].apply(update_classification_ids, args=(changed_category_ids,))

            # save the taxonomy (txt) and the papers (json) as a checkpoint
            n = len(papers)

            with open(f'clusters/{date_str}/{time_str}_{df.shape[0]}_{min_idx}_{n}_reclassify_keywords.txt', 'w') as f:
                f.write(updated_taxonomy)
            df.to_json(f'papers/{date_str}/{time_str}_{df.shape[0]}_{min_idx}_{n}_reclassify_keywords.json', orient='records')
            df[['title', 'classification_ids']].to_json(f'papers/{date_str}/{time_str}_{df.shape[0]}_{min_idx}_{n}_manual_analysis_reclassify_keywords.json', orient='records', indent=2)

            numbered_taxonomy = updated_taxonomy

    except Exception as e:
        # Best effort: report the problem and fall through to return the progress so far.
        print("An error occurred: ", e)

    return df, numbered_taxonomy

In [None]:
# Test: keyword classification
process_keywords(df, numbered_taxonomy)

In [None]:
print(df.loc[df['classification_ids'].notnull(), 'classification_ids'])

In [None]:
# Reorganize taxonomy - 1
def retrieve_organize_taxonomy(taxonomy):
    """Build the prompt asking the model to re-arrange the taxonomy's categories and levels.

    Args:
        taxonomy: Current taxonomy text, inserted under "Initial Taxonomy".

    Returns:
        The formatted prompt string.
    """
    return f'''
Initial Taxonomy (id : name)
{taxonomy}

Task:
There are already papers classified under each category, but the taxonomy is potentially all over the place. Imagine that the taxonomy is going to be transformed into a map, and that the top level categories would represent a zoomed out view and the lower level categories would appear as a user zooms in.

You are trying to create a useful taxonomy for carbon capture researchers. Re-arrange the categories and their levels so that the more relevant categories are  at the top level, with non-relevant categories categorized as lower levels under Miscellaneous. Feel free to use as many depth levels as necessary. Do not change category names to make them more or less relevant.
'''

In [None]:
# Reorganize taxonomy - 2
import os
from datetime import datetime

def reorganize_taxonomy(df, numbered_taxonomy):
    """Ask the LLM to re-arrange the taxonomy and remap the papers onto it.

    Steps: build the reorganize prompt, parse the answer into taxonomy lines,
    ask for an old-id -> new-id mapping, apply that mapping to
    ``df['classification_ids']`` and save dated checkpoints plus the
    ``latest_*`` files.

    Args:
        df: papers DataFrame; its 'classification_ids' column is rewritten.
        numbered_taxonomy: current taxonomy text, one category per line.

    Returns:
        ``(df, taxonomy)`` where ``taxonomy`` is the reorganized taxonomy on
        success and the unchanged ``numbered_taxonomy`` if any step failed
        (the error is printed, not raised).
    """
    now = datetime.now()
    date_str = now.strftime('%y-%m-%d')
    time_str = now.strftime('%H-%M-%S')
    os.makedirs(f'clusters/{date_str}', exist_ok=True)
    os.makedirs(f'papers/{date_str}', exist_ok=True)

    try:
        update_taxonomy_prompt = retrieve_organize_taxonomy(numbered_taxonomy)
        print("update_taxonomy_prompt", update_taxonomy_prompt)

        res = chat_openai(update_taxonomy_prompt)  # call to OpenAI API
        print("Reorganized taxonomy result: ", res[0])

        # Keep only lines whose second character is "." (e.g. "1. Name", "1.2 Name");
        # everything else in the answer is treated as chatter.
        # NOTE(review): ids with two leading digits ("10. Name") do not match this check.
        kept_lines = []
        for line in res[0].splitlines():
            stripped = line.strip()
            if len(stripped) > 2 and stripped[1] == ".":
                kept_lines.append(stripped)
        updated_taxonomy = "".join(f"{kept}\n" for kept in kept_lines)
        print("updated taxonomy: ", updated_taxonomy)

        # Refuse to remap every paper onto an empty taxonomy
        if not updated_taxonomy:
            raise ValueError("no taxonomy lines could be parsed from the model response")

        # Ensure that you update all previously classified papers' classification ids with the new taxonomy
        print("MAPPING TAXONOMIES")
        taxonomy_mapping_prompt = retrieve_taxonomy_mapping_prompt(numbered_taxonomy, updated_taxonomy)
        res = chat_openai(taxonomy_mapping_prompt)  # call to OpenAI API

        print("Map taxonomies result: ", res[0])
        changed_category_ids = extract_taxonomy_mapping(res[0])
        print("changed category ids: ", changed_category_ids)

        # Remap the same column the rest of the pipeline reads and exports
        df['classification_ids'] = df['classification_ids'].apply(update_classification_ids, args=(changed_category_ids,))

        # save the taxonomy (txt) and the papers (json) as dated checkpoints
        with open(f'clusters/{date_str}/{time_str}_{df.shape[0]}_reorganize_taxonomy.txt', 'w') as f:
            f.write(updated_taxonomy)
        df.to_json(f'papers/{date_str}/{time_str}_{df.shape[0]}_reorganize_taxonomy.json', orient='records')
        df[['title', 'classification_ids']].to_json(f'papers/{date_str}/{time_str}_{df.shape[0]}_reorganize_taxonomy_manual_inspection.json', orient='records', indent=2)

        # save to main: the papers now carry ids of the reorganized taxonomy,
        # so that is the taxonomy that must be stored next to them
        with open('clusters/latest_taxonomy.txt', 'w') as f:
            f.write(updated_taxonomy)
        df.to_json('papers/latest_papers.json', orient='records')

        # Hand the reorganized taxonomy back to the caller
        numbered_taxonomy = updated_taxonomy
    except Exception as e:
        print("An error occurred: ", e)

    return df, numbered_taxonomy

In [None]:
df, numbered_taxonomy = reorganize_taxonomy(df, numbered_taxonomy)

In [None]:
df.head()

In [None]:
# Add keyword classification confidence scores - 1
import re

# Map every taxonomy id (without a trailing ".") to its category name
class_id_to_name = {}
for taxonomy_line in numbered_taxonomy.split("\n"):
    id_and_name = taxonomy_line.strip().split(maxsplit=1)
    if len(id_and_name) != 2:
        # blank line or an id without a name
        continue
    raw_id, category_name = id_and_name
    lookup_key = raw_id[:-1] if raw_id.endswith(".") else raw_id
    class_id_to_name[lookup_key] = category_name

print("class_id_to_name:", class_id_to_name)

In [None]:
# Add keyword classification confidence scores - 2
import ast
import torch
from scipy.spatial.distance import cosine
from transformers import AutoTokenizer, AutoModel

# Load pretrained model/tokenizer
tokenizer = AutoTokenizer.from_pretrained("allenai/specter")
model = AutoModel.from_pretrained("allenai/specter")

def get_cosine_similarity(text1, text2):
    """Return the cosine similarity of the model embeddings of two texts.

    Each text is tokenized (truncated to 512 tokens) and embedded by averaging
    the first model output over dim 1 (presumably the token axis - confirm
    against the model's output layout). Uses the module-level ``tokenizer``
    and ``model``.
    """
    def _embed(text):
        encoded = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors='pt')
        # Inference only: no gradients needed
        with torch.no_grad():
            return model(**encoded)[0].mean(dim=1).squeeze()

    first_vector = _embed(text1)
    second_vector = _embed(text2)

    # scipy's `cosine` is a distance, so similarity is its complement
    return 1 - cosine(first_vector.numpy(), second_vector.numpy())

In [None]:
# Add keyword classification confidence scores - 3
for row_label, row in df.iterrows():
    raw_ids = row['classification_ids']

    # Reset for every row: otherwise a row without classifications would be
    # written with the previous row's result (or fail on the very first row).
    updated_classification_ids = []

    # Normalise the cell to a list of [keywords, class_id(, score)] items
    if isinstance(raw_ids, str) and raw_ids and raw_ids != 'nan':
        # Using ast.literal_eval to convert the string to a list of lists
        classification_ids = ast.literal_eval(raw_ids)
    elif isinstance(raw_ids, list):
        classification_ids = raw_ids
    else:
        # None / NaN / unexpected type: nothing to score
        classification_ids = []

    for item in classification_ids:
        # Accept unscored [keywords, id] items and already scored
        # [keywords, id, score] items, so re-running this cell re-scores
        # instead of silently dropping every classification.
        if item and len(item) in (2, 3):
            keywords = item[0]
            class_id = item[1]
            if class_id in class_id_to_name:
                classification = class_id_to_name[class_id]

                # Get cosine similarity score using HuggingFace Semantic Scholar Spectre API embeddings
                score = round(get_cosine_similarity(keywords, classification), 2)
                updated_classification_ids.append([keywords, class_id, str(score)])
            else:
                print("ROW ", row_label, " CLASS ID: ", class_id, " WAS NOT FOUND IN CLASS_ID_TO_NAME")

    # Write back by index label (not by loop position) so a non-default index
    # cannot create new rows or touch the wrong one.
    df.at[row_label, 'classification_ids'] = str(updated_classification_ids)

# Timestamp used in the checkpoint folder and file names
now = datetime.now()
date_str = now.strftime('%y-%m-%d')
time_str = now.strftime('%H-%M-%S')
for checkpoint_dir in (f'clusters/{date_str}', f'papers/{date_str}'):
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)

# Dated checkpoint: taxonomy as txt, papers as json (full and a small
# title/classification view for manual inspection)
checkpoint_stem = f'{time_str}_{df.shape[0]}_add_keyword_class_scores'
with open(f'clusters/{date_str}/{checkpoint_stem}.txt', 'w') as f:
    f.write(numbered_taxonomy)
df.to_json(f'papers/{date_str}/{checkpoint_stem}.json', orient='records')
inspection_view = df[['title', 'classification_ids']]
inspection_view.to_json(f'papers/{date_str}/{checkpoint_stem}_manual_inspection.json', orient='records', indent=2)

# save to main
with open('clusters/latest_taxonomy.txt', 'w') as f:
    f.write(numbered_taxonomy)
df.to_json('papers/latest_papers.json', orient='records')

In [None]:
df.head()

In [None]:
from collections import defaultdict
import ast
import json
import os
from datetime import datetime

def generate_taxonomy_nested():
    """Build the nested taxonomy JSON (categories with their papers) and save it.

    Loads the latest taxonomy text and papers via
    ``load_latest_taxonomy_papers()``, attaches every scored
    ``[keywords, class_id, score]`` entry of a paper to its category, nests the
    categories by the number of dots in their id, and writes the result to a
    dated file under ``clusters/`` and to ``clusters/latest_taxonomy.json``.
    Finally stores a checkpoint through ``save_taxonomy_papers_note``.

    Returns:
        None
    """
    taxonomy_str, df = load_latest_taxonomy_papers()

    def _none_if_missing(value):
        """Return *value*, or None when pandas treats it as missing."""
        return value if pd.notna(value) else None

    # Create a dictionary where each key is a classification_id and
    # each value is a list of dictionaries containing paper data.
    class_children = defaultdict(list)

    # Iterate over the DataFrame once
    for _, row in df.iterrows():
        classification_ids = row['classification_ids']

        # Ensure classification ids are a list; skip rows that have none
        if isinstance(classification_ids, str):
            if not classification_ids:
                continue
            try:
                classification_ids_list = ast.literal_eval(classification_ids)
            except (ValueError, SyntaxError):
                print("Skipping unparsable classification_ids for paper", row['paperId'])
                continue
        elif isinstance(classification_ids, (list, tuple)):
            classification_ids_list = classification_ids
        else:
            # None, NaN or another non-iterable placeholder
            continue
        if not isinstance(classification_ids_list, (list, tuple)):
            continue

        # A missing author list may be None or a scalar NaN
        authors = row['authors']
        if authors is None or (isinstance(authors, float) and pd.isna(authors)):
            cleaned_authors = None
        else:
            cleaned_authors = [[_none_if_missing(item) for item in sublist] for sublist in authors]

        for keyword_idx, id_info in enumerate(classification_ids_list):
            # Skip malformed id_info
            if len(id_info) != 3:
                continue

            # Extract data from id_info
            keywords, paper_classification_id, confidence_score = id_info

            # Add the paper data to the corresponding classification_id in class_children
            paper_data = {
                "name": str(row['paperId']) + "-" + str(keyword_idx),
                "value": [{
                    "paperId": _none_if_missing(row['paperId']),
                    "title": _none_if_missing(row['title']),
                    "abstract": _none_if_missing(row['abstract']),
                    "authors": cleaned_authors,
                    "citationCount": _none_if_missing(row['citationCount']),
                    "doi": _none_if_missing(row['doi']),
                    "isOpenAccess": _none_if_missing(row['isOpenAccess']),
                    "language": _none_if_missing(row['language']),
                    "publicationDate": _none_if_missing(row['publication_date']),
                    "relevance_score": _none_if_missing(row["relevance_score"]),
                    "url": _none_if_missing(row["url"]),
                    "year": _none_if_missing(row["year"]),
                    "tsne_x": _none_if_missing(row["x"]),
                    "tsne_y": _none_if_missing(row["y"]),
                    "keywords": keywords if keywords else None,
                    "score": confidence_score if confidence_score else None
                }]
            }
            class_children[paper_classification_id].append(paper_data)

    print("class_children", class_children.keys())

    # Nest the categories: `stack` holds the chain of open ancestors, the
    # layer (number of dots in the id) decides where a category is attached.
    stack = []
    taxonomy_json = []
    id_counter = 0  # Keep track of unique id

    for line in taxonomy_str.split('\n'):
        stripped = line.strip()
        if not stripped:
            continue
        id_and_name = stripped.split(' ', 1)
        if len(id_and_name) != 2:
            print("Skipping taxonomy line without a category name:", stripped)
            continue

        category_id, category_name = id_and_name
        category_id = category_id.rstrip('.')
        layer = category_id.count('.')

        category_obj = {
            'id': id_counter,
            'classification_id': category_id,
            'name': category_name,
            'layer': layer,
            # Same list object as in class_children: sub-categories are
            # appended next to the category's own papers.
            'children': class_children[category_id]
        }
        id_counter += 1

        # Close every open category that is not an ancestor of this one
        while stack and stack[-1]['layer'] >= layer:
            stack.pop()
        if stack:
            stack[-1]['children'].append(category_obj)
        else:
            taxonomy_json.append(category_obj)
        stack.append(category_obj)

    # Wrap everything under a single root node
    preprocessed_taxonomy_json = [{
        "name": "Carbon capture",
        "children": taxonomy_json
    }]

    now = datetime.now()
    date_str = now.strftime('%y-%m-%d')
    time_str = now.strftime('%H-%M-%S')

    # The dated folder may not exist yet when this is the first step run today
    os.makedirs(f'clusters/{date_str}', exist_ok=True)

    with open(f'clusters/{date_str}/{time_str}_taxonomy_gen_taxonomy_json.json', 'w') as f:
        json.dump(preprocessed_taxonomy_json, f, indent=4)

    # save to main
    with open('clusters/latest_taxonomy.json', 'w') as f:
        json.dump(preprocessed_taxonomy_json, f, indent=4)

    # save checkpoint
    save_taxonomy_papers_note(taxonomy_str, df, "gen_taxonomy_json")

    return

In [None]:
# [GPT Topic Labeling - 11]
generate_taxonomy_nested()

In [4]:
# Load in parsed taxonomy from file
import json

# Open the file and load the JSON
with open('clusters/latest_taxonomy.json', 'r') as f:
    parsed_taxonomy = json.load(f)

In [5]:
# [GPT Topic Labeling - 11]
# EDGES
# Selecting edges within each classification id group: keeping the paperId's most similar paper in their cluster, and also constructing the MST for each cluster to ensure it's a connected graph

import numpy as np
from scipy.spatial.distance import cosine
from itertools import combinations
from scipy.spatial.distance import euclidean
from collections import defaultdict
import networkx as nx


# 1. Create a dictionary that maps each classification_id to a list of papers in that cluster.
def traverse_clusters(cluster, classification_to_papers):
    """Fill *classification_to_papers* with the papers attached directly to each category.

    A child carrying a 'value' key is a paper and is recorded as
    ``[name, first value entry]``; a child with 'children' is a sub-category
    and is visited recursively. A cluster without 'classification_id' (such as
    a wrapping root node) is traversed but gets no entry of its own.
    """
    direct_papers = []

    for node in cluster['children']:
        if 'value' in node:
            direct_papers.append([node['name'], node['value'][0]])
            continue
        if 'children' in node:
            traverse_clusters(node, classification_to_papers)

    # Sub-categories are stored first, then the category itself
    if 'classification_id' in cluster:
        classification_to_papers[cluster['classification_id']] = direct_papers

# Collect, for every classification id, the papers attached directly to it
classification_to_papers = {}
for category in parsed_taxonomy:
    traverse_clusters(category, classification_to_papers)


# 2. For every category, weight each pair of its papers by the inverse of
#    their euclidean distance in t-SNE space (closer papers -> heavier edge).
# 3. Keep, per paper, only its `num_most_similar` heaviest edges.

# Number of strongest edges remembered per paper
num_most_similar = 1


def _offer_edge(kept_edges, candidate, limit):
    """Add *candidate* to *kept_edges* if there is room or it beats the weakest kept edge."""
    if len(kept_edges) < limit or candidate['weight'] > min(e['weight'] for e in kept_edges):
        if len(kept_edges) == limit:
            # Full: evict the weakest edge to make room
            kept_edges.remove(min(kept_edges, key=lambda e: e['weight']))
        kept_edges.append(candidate)


# A dictionary mapping each paper_id to its list of highest weight edge
max_edges = defaultdict(list)

# A dictionary mapping each classification_id to its graph
classification_to_graph = defaultdict(nx.Graph)

for classification_id, paper_ids in classification_to_papers.items():
    print("On classification id: ", classification_id)

    for (paper_id1, paper_info1), (paper_id2, paper_info2) in combinations(paper_ids, 2):
        coordinates = (
            paper_info1["tsne_x"], paper_info1["tsne_y"],
            paper_info2["tsne_x"], paper_info2["tsne_y"],
        )
        # A pair can only be weighted when both papers have coordinates
        if any(value is None for value in coordinates):
            continue

        x1, y1, x2, y2 = coordinates
        distance = euclidean([x1, y1], [x2, y2])
        weight = 1 / (1 + distance)

        # Add the edge to the graph for this classification_id
        classification_to_graph[classification_id].add_edge(paper_id1, paper_id2, weight=weight)

        # Offer the edge to both endpoints, each seen from its own side
        _offer_edge(
            max_edges[paper_id1],
            {"source": paper_id1, "target": paper_id2, "weight": weight},
            num_most_similar,
        )
        _offer_edge(
            max_edges[paper_id2],
            {"source": paper_id2, "target": paper_id1, "weight": weight},
            num_most_similar,
        )

# Combine the max weight edges for each node with the MSTs for each classification_id
final_edges = []

# Directed (source, target) pairs that are already part of final_edges
added_edges = set()

# Flatten max_edges.values()
flattened_max_edges = [edge for edges in max_edges.values() for edge in edges]

# The position in the flattened list becomes the edge id (listed first)
for idx, edge in enumerate(flattened_max_edges):
    edge_tuple = (edge["source"], edge["target"])
    if edge_tuple in added_edges:
        continue
    final_edges.append({"id": idx, **edge})
    added_edges.add(edge_tuple)

# Now add the MST edges, continuing the ids from where they left off
# for graph in classification_to_graph.values():
#     mst_edges = nx.algorithms.tree.maximum_spanning_edges(graph, data=False)
#     for source, target in mst_edges:
#         edge_tuple = (source, target)
#         if edge_tuple not in added_edges:
#             weight = graph[source][target]['weight']
#             final_edges.append({
#                 "id": len(final_edges),  # The next id is the current length of final_edges
#                 "weight": weight,
#                 "source": source,
#                 "target": target
#             })
#             added_edges.add(edge_tuple)

On classification id:  1.1
On classification id:  1.2
On classification id:  1.3
On classification id:  1
On classification id:  2.1.1
On classification id:  2.1.2
On classification id:  2.1.3
On classification id:  2.1.4
On classification id:  2.1
On classification id:  2.2.1
On classification id:  2.2.2
On classification id:  2.2.3
On classification id:  2.2
On classification id:  2.3.1
On classification id:  2.3.2
On classification id:  2.3
On classification id:  2.4.1


On classification id:  2.4.2
On classification id:  2.4.3
On classification id:  2.4
On classification id:  2.5.1
On classification id:  2.5.2
On classification id:  2.5
On classification id:  2.6.1
On classification id:  2.6.2
On classification id:  2.6
On classification id:  2.7
On classification id:  2.8
On classification id:  2.9.1
On classification id:  2.9.2
On classification id:  2.9
On classification id:  2
On classification id:  3.1.1
On classification id:  3.1.2
On classification id:  3.1.3
On classification id:  3.1
On classification id:  3.2
On classification id:  3.3
On classification id:  3.4.1
On classification id:  3.4.2
On classification id:  3.4.3
On classification id:  3.4
On classification id:  3.5
On classification id:  3
On classification id:  4.1
On classification id:  4.2
On classification id:  4.3
On classification id:  4.4
On classification id:  4.5
On classification id:  4
On classification id:  5.1
On classification id:  5.2
On classification id:  5.3
On cla

In [7]:
# [GPT Topic Labeling - 11.1]
import json
from datetime import datetime
import os

now = datetime.now()
date_str = now.strftime('%y-%m-%d')
time_str = now.strftime('%H-%M-%S')
edges_dir = f'edges/{date_str}'
if not os.path.exists(edges_dir):
    os.makedirs(edges_dir)

# Serialize once, then write the dated checkpoint and the "latest" copy
edges_json = json.dumps(final_edges, indent=4)
for edges_path in (f'edges/{date_str}/{time_str}_edges_gen_edges.json', 'edges/latest_edges.json'):
    with open(edges_path, 'w') as f:
        f.write(edges_json)

#### 3. Iteration – it’s likely that the fields will get too big, so if there are sections with too many papers, have GPT3.5 break them down even further

In [None]:
# existing_themes = 'Renewable integration, carbon capture, Clean energy transition, Operational flexibility, Cost-effective CO2 reduction, carbon storage, lost decade'

# paper = "titled '" + df.iloc[0]["title"] + "'"
# if df.iloc[0]["abstract"]:
#     paper += "with the following abstract: " + df.iloc[0]["abstract"]

# prompt = f'''
# Paper: {paper} \n Task: Given the paper title and abstract above, determine at most 5 themes for a researcher whose goal is to eventually make impactful discoveries and experiments. \n Rules: Do not output any theme that is beyond what is given in the paper. Be as concise (less than 5 words), clear, and correct as possible. Do not make up anything not apparent from the paper. \n Use themes from other papers only if the paper mentions them: {existing_themes}. 

# Your output should be of the following format: Theme1, Theme2, Theme3, Theme4, Theme5
# '''

In [None]:
# Test 2 following some guy's topic modeling article: https://medium.com/@stephensonebinezer/transform-your-topic-modeling-with-chatgpt-cutting-edge-nlp-f4654b4eac99

existing_themes = set()


def _title_with_abstract(row):
    """Return "Title: <title>", followed by "; Abstract: <abstract>" when an abstract exists."""
    text = "Title: " + row["title"]
    if pd.notnull(row["abstract"]):
        text += "; Abstract: " + row["abstract"]
    return text


df['title_abstract'] = df.apply(_title_with_abstract, axis=1)

In [None]:
df_sorted = df.sort_values(by='citationCount', ascending=False)
df_sorted.head()

In [None]:
# One-off experiment: asks for themes of the first paper only (df.iloc[0]);
# nothing here loops over df.
# The paper's title/abstract is interpolated into the prompt itself.
prompt = f'''
I am giving you the title and abstract (if provided) of a paper in the format [Title ; Abstract]. Give me at most 5 broader categories or themes like carbon capture, membranes, algae sinking, enhanced rock weathering, etc. in the format [Theme1, Theme2, Theme3, Theme4, Theme5] for the paper '{df.iloc[0]['title_abstract']}'. Be as concise, clear, and correct as possible. Do not make up anything not apparent in the paper. Use as minimal number of categories and themes as possible, and rank the most relevant and specific ones to the paper first'
'''

# chat_openai is defined elsewhere in the notebook; only the first element of its result is used
res = chat_openai(prompt)
# Prints prompt, paper text and answer; the paper text therefore appears twice (it is already inside the prompt)
print(prompt + " -- " + df.iloc[0]['title_abstract'] + " -- " + res[0])

# Tried to label topics in a way dependent on previous topics, but I'll just sample topics independently instead
# if not existing_themes:
#     existing_themes = res[0][1:-1]
# else:
#     existing_themes += ", " + res[0][1:-1]

In [None]:
# Loop to get up to 5 topics from GPT, or Unknown as a list for each paper
import time

def get_topics(title_abstract, delay_seconds=20):
    """Ask the LLM for up to five themes of one paper.

    Args:
        title_abstract: paper text in the "Title ; Abstract" form used by the prompt.
        delay_seconds: pause before the request (throttling between calls);
            defaults to the previously hard-coded 20 seconds.

    Returns:
        The answer with surrounding whitespace removed when it is a bracketed
        list ("[Theme1, Theme2]"), otherwise the placeholder "[Unknown]".
    """
    prompt = f'''
    I am giving you the title and abstract (if provided) of a paper in the format [Title ; Abstract]. Give me at most 5 broader categories or themes like carbon capture, membranes, algae sinking, enhanced rock weathering, etc. in the format [Theme1, Theme2, Theme3, Theme4, Theme5] for the paper '{title_abstract}'. Be as concise, clear, and correct as possible. Do not make up anything not apparent in the paper.'
    '''

    time.sleep(delay_seconds)
    res = chat_openai(prompt)
    answer = res[0].strip()
    print(title_abstract + " -- " + answer)

    # Both brackets are required; an answer missing either one (or an empty
    # answer) is not in the requested format.
    if not (answer.startswith('[') and answer.endswith(']')):
        return "[Unknown]"

    return answer

# Add the GPT_topics column if it doesn't exist
if 'GPT_topics' not in df.columns:
    df['GPT_topics'] = None

for i in df.index:
    print(i)

    # A row counts as labelled only when it holds a bracketed string; missing
    # values, empty strings and malformed answers are (re)requested.
    current_topics = df.at[i, 'GPT_topics']
    already_labelled = (
        isinstance(current_topics, str)
        and current_topics.startswith('[')
        and current_topics.endswith(']')
    )
    if already_labelled:
        continue

    df.at[i, 'GPT_topics'] = get_topics(df.at[i, 'title_abstract'])

    # Checkpoint after every newly labelled row so an interrupted run loses
    # at most one answer; skipped rows do not change the file, so they do
    # not trigger a rewrite.
    df.to_json('df_with_topics.json', orient='records', indent=4)

# Save the final DataFrame
df.to_json('df_with_topics.json', orient='records', indent=4)

In [None]:
# Assuming pruned_trees already exists

# Loading trees back in
import json

# Open the file in read mode
with open('pruned_tree.json', 'r') as f:
    pruned_trees = json.load(f)  # Load the JSON data from the file

# `pruned_trees` now holds the parsed JSON; later cells iterate it as a list of cluster dictionaries
# print(pruned_trees)


In [None]:
# Preparing main topic helper function
def get_main_topic(topics_list, delay_seconds=25):
    """Ask the LLM for the single topic that best characterises a list of topics.

    Args:
        topics_list: iterable of topic strings.
        delay_seconds: pause before the request (throttling between calls);
            defaults to the previously hard-coded 25 seconds.

    Returns:
        The answer including its brackets (e.g. "[Public policy and
        governance]") with surrounding whitespace removed, or "[Unknown]" when
        the answer is not a bracketed string.
    """
    # Comma-separated topics; the prompt wraps them in quotes, no brackets are added here
    topics_str = ', '.join(topics_list)

    prompt = f'''
    I am giving you a list of topics in the format [Topic1, Topic2, ...]. Give me the main topic that most differentiates this list from other carbon capture topic lists: '{topics_str}'. Be as concise, clear, and accurate as possible. Only use carbon capture when all the topics are essentially the entire field of carbon capture, otherwise, try to output a broader theme that is differentiated from the other topics. The output should be of this format: '[<insert main topic>]'

    Example:
    Topics: [Public opinion, carbon capture, policy, regulatory framework, technological advancements, CCUS, CCS, storage, enhanced rock weathering]
    Output: [Public policy and governance]
    '''
    time.sleep(delay_seconds)
    res = chat_openai(prompt)
    answer = res[0].strip()
    print(topics_str + " -- " + answer)

    # Both brackets are required; an answer missing either one (or an empty
    # answer) is not in the requested format.
    if not (answer.startswith('[') and answer.endswith(']')):
        return "[Unknown]"

    # The brackets are kept on purpose: they mark the topic as assigned
    return answer

In [None]:
print(get_main_topic([
                    "Carbon capture",
                    "Membrane design",
                    "Gas separation",
                    "Mechanical strength",
                    "Plant root mimicry",
                    "Carbon capture"]))

In [None]:
# Add a list of GPT topics and main topics to each cluster
def update_cluster_topics(cluster, df):
    """Attach the collected GPT topics and a main topic to *cluster* and its sub-clusters.

    A cluster whose 'content' holds dictionaries is a parent: its topics are
    the concatenation of its children's topics. Otherwise 'content' is a list
    of paper ids whose 'GPT_topics' strings are looked up in *df*.

    After every newly assigned main topic the module-level ``pruned_trees`` is
    written to 'pruned_tree_w_main_topic.json' as a checkpoint.

    Args:
        cluster: cluster dictionary with 'content' and 'cluster_id'; modified in place.
        df: papers DataFrame with 'paperId' and 'GPT_topics' columns.
    """
    content = cluster['content']
    all_topics = []

    if content and isinstance(content[0], dict):  # it's a parent cluster
        for sub_cluster in content:
            update_cluster_topics(sub_cluster, df)
            all_topics.extend(sub_cluster['GPT_topics'])
    else:  # it's a leaf cluster (possibly without papers)
        for paperId in content:
            paper_topics = df[df['paperId'] == paperId]['GPT_topics'].values
            # Papers that were never labelled hold None/NaN instead of a string
            if len(paper_topics) > 0 and isinstance(paper_topics[0], str):
                # "[A, B, C]" -> ["A", "B", "C"]
                all_topics.extend(paper_topics[0].strip('[]').split(', '))
    cluster['GPT_topics'] = all_topics

    # After assigning GPT_topics, we get the main topic
    print(cluster['cluster_id'])

    # Only ask for a main topic when none has been assigned yet (saves tokens);
    # an assigned topic is a bracketed string such as "[Membranes]".
    current_topic = cluster.get('main_topic')
    already_assigned = (
        isinstance(current_topic, str)
        and current_topic.startswith('[')
        and current_topic.endswith(']')
    )
    if not already_assigned:
        cluster['main_topic'] = get_main_topic(cluster['GPT_topics'])

        with open('pruned_tree_w_main_topic.json', 'w') as f:
            json.dump(pruned_trees, f, indent=4)

# Then call the function for each tree in the list like this
for tree in pruned_trees:
    update_cluster_topics(tree, df)


In [None]:
pruned_trees

In [None]:
print(json.dumps(pruned_trees, indent=4))

with open('pruned_tree_w_topics.json', 'w') as f:
    json.dump(pruned_trees, f, indent=4)

## Generating clusters from 1. x, y coordinates, 2. edges, and 3. topics

In [None]:
# Using HDBSCAN for hierarchical clustering on TSNE data. Worst case just use K-means or DBSCAN

import pandas as pd
import json

# Load the JSON file from a hard-coded, machine-specific absolute path
# (adjust before running elsewhere)
with open(r'C:\Users\1kevi\Desktop\projects\Research\autoscious-carbon-capture\knowledge_base\t-sne\output_2000.json', 'r') as f:
    data = json.load(f)

# convert the data into a pandas DataFrame (this replaces the `df` used by earlier cells)
df = pd.DataFrame(data)

# print the DataFrame's shape (rows, columns) to verify the load
print(df.shape)

In [None]:
df.head()

In [None]:
import numpy as np
np.any(np.isnan(df[['x', 'y']].values))

In [None]:
from hdbscan import HDBSCAN
from collections import defaultdict
import pandas as pd
import numpy as np

# Extract 'x' and 'y' columns from df and convert them into a 2D array
# (this rebinds the global `data` that previously held the loaded JSON)
data = df[['x', 'y']].values

# Create an HDBSCAN instance
clusterer = HDBSCAN(min_cluster_size=100, min_samples=1, gen_min_span_tree=True, cluster_selection_epsilon=0.5, cluster_selection_method='eom')

# Fit the model to your data
clusterer.fit(data)

# Add the labels to the DataFrame
df['hdbscan_labels'] = clusterer.labels_

# Get the condensed tree as a DataFrame; only its 'parent' and 'child' columns are used below
tree = clusterer.condensed_tree_.to_pandas()

# Create a dictionary to map every cluster label to its paper IDs
# NOTE(review): keys are values of `labels_`, while the hierarchy below uses
# condensed-tree node ids - confirm that both share one id space before
# relying on lookups of tree ids in this dictionary.
cluster_to_paper = df.groupby('hdbscan_labels')['paperId'].apply(list).to_dict()

# the parent-child relationship of the clusters
relationships = tree[['parent', 'child']].values

# create a dictionary to store cluster hierarchy (parent id -> list of child ids)
cluster_tree = defaultdict(list)

# iterate through hierarchy df and fill children
for parent, child in relationships:
    cluster_tree[parent].append(child)

# Memo of descendant lists that were already computed, keyed by cluster id only
descendants_cache = {}
def get_all_descendants(cluster_tree, cluster_id):
    """Return every descendant id of *cluster_id*: direct children first, then each child's subtree.

    Results are memoized in ``descendants_cache`` by cluster id, so a cached
    id is answered without consulting *cluster_tree*. The cached list object
    itself is returned.
    """
    try:
        return descendants_cache[cluster_id]
    except KeyError:
        pass

    direct_children = cluster_tree[cluster_id]
    collected = list(direct_children)
    for child_id in direct_children:
        collected += get_all_descendants(cluster_tree, child_id)

    descendants_cache[cluster_id] = collected
    return collected

# Centroid memo consulted by traverse_tree. This function no longer writes to
# it: it receives a list of ids, so there is no single cluster id under which
# a result could be stored correctly.
centroid_cache = {}
def calculate_cluster_centroid(cluster_ids):
    """Return the (x, y) centroid of the given clusters and all of their descendants.

    For every cluster id that has papers in ``cluster_to_paper`` the mean of
    its papers' valid (non-NaN) coordinates is taken; the result is the mean
    of those per-cluster means.

    Args:
        cluster_ids: list of cluster ids; it is not modified.

    Returns:
        ``(x, y)``, or ``(0, 0)`` when none of the clusters has a paper with
        valid coordinates.
    """
    # Work on a copy. Extending the caller's list would modify the hierarchy
    # (traverse_tree passes the child list stored in cluster_tree), and
    # growing the list while iterating it re-expands descendants repeatedly.
    expanded_ids = list(cluster_ids)
    for cluster_id in cluster_ids:
        expanded_ids.extend(get_all_descendants(cluster_tree, cluster_id))

    # Each cluster contributes once (order preserved)
    unique_ids = list(dict.fromkeys(expanded_ids))

    centroid_x = []
    centroid_y = []
    for cluster_id in unique_ids:
        if cluster_id not in cluster_to_paper:
            continue
        paper_ids = cluster_to_paper[cluster_id]
        cluster_points = df[df['paperId'].isin(paper_ids)]
        x_values = cluster_points['x'].values
        y_values = cluster_points['y'].values

        # Only papers with both coordinates present take part in the mean
        valid_values = np.logical_and(~np.isnan(x_values), ~np.isnan(y_values))
        if np.any(valid_values):
            centroid_x.append(np.mean(x_values[valid_values]))
            centroid_y.append(np.mean(y_values[valid_values]))

    if centroid_x and centroid_y:
        return np.mean(centroid_x), np.mean(centroid_y)

    return 0, 0


def traverse_tree(cluster_tree, cluster_id, layer):
    """Build the nested dictionary for *cluster_id* and everything below it.

    A node without children becomes a leaf whose 'content' is the list of
    paper ids labelled with this id; a node with children gets the non-empty
    child subtrees as 'content'. Every node carries its layer and a centroid.

    Returns:
        The node dictionary, or None when the node (leaf or parent) ends up
        without content.
    """
    def _centroid_for(ids):
        # Prefer a memoized centroid for this node, otherwise compute it
        if cluster_id in centroid_cache:
            return centroid_cache.get(cluster_id)
        return calculate_cluster_centroid(ids)

    child_cluster_ids = cluster_tree[cluster_id]

    # Leaf: no children recorded for this id
    if not child_cluster_ids:
        paper_points = df[df['hdbscan_labels'] == cluster_id]['paperId'].tolist()
        centroid_x, centroid_y = _centroid_for([cluster_id])
        if not paper_points:
            return None
        return {
            "cluster_id": cluster_id,
            "layer": layer,
            "content": paper_points,
            "centroid_x": centroid_x,
            "centroid_y": centroid_y
        }

    # Parent: recurse first, then compute the centroid over the children
    node = {"cluster_id": cluster_id, "layer": layer, "content": [], "centroid_x": None, "centroid_y": None}
    for child_id in child_cluster_ids:
        subtree = traverse_tree(cluster_tree, child_id, layer + 1)
        if subtree is not None:  # empty subtrees are dropped
            node["content"].append(subtree)

    node["centroid_x"], node["centroid_y"] = _centroid_for(child_cluster_ids)

    if not node["content"]:
        return None
    return node

# Roots are the parents that never show up as somebody's child
ids_with_parent = {child for children in cluster_tree.values() for child in children}
roots = set(cluster_tree.keys()) - ids_with_parent

# One nested tree per root; roots without any content are left out
trees = []
for root in roots:
    tree = traverse_tree(cluster_tree, root, 0)
    if tree is None:
        continue
    trees.append(tree)


# took 2 min to complete for around 60 samples.
print("COMPLETE!")

In [None]:
import json
import numpy as np

# Custom encoder class to convert NumPy values to plain Python values
class Int64Encoder(json.JSONEncoder):
    """JSON encoder that also accepts NumPy scalars and arrays.

    The stock encoder rejects NumPy integers, NumPy floats that are not Python
    float subclasses (e.g. float32) and ndarrays. This encoder converts them to
    int, float and (nested) lists respectively. Anything else is delegated to
    json.JSONEncoder, which raises TypeError for unsupported objects.
    """

    def default(self, obj):
        # np.integer covers every integer width, not just int64.
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)

# Convert trees to JSON format with indentation for readability.
# The tree is serialized once; the same text is printed and written to disk
# instead of encoding the whole structure a second time.
tree_json = json.dumps(trees, indent=4, cls=Int64Encoder)

# Print the formatted tree
print(tree_json)

# Specify the file path to save the JSON data
output_file = "clusters/tree_2000.json"

# Write the JSON data to the file
with open(output_file, "w") as file:
    file.write(tree_json)

print("JSON file saved successfully.")

# JUMP TO PRUNING BELOW IF NECESSARY

In [None]:
# For now, it seems like d3 for computing infinite voronoi diagrams is probably the best bet because python doesn't have a library for that for some reason
# Pre-processing cluster trees to form voronoi diagram
from shapely.geometry import Polygon
from scipy.spatial import Voronoi, convex_hull_plot_2d, Delaunay
import numpy as np

# This function creates Voronoi polygons
def create_voronoi_polygons(cluster):
    """Attach a Voronoi cell to every leaf cluster below `cluster`.

    Walks the tree, collects the (centroid_x, centroid_y) of every leaf, builds a
    SciPy Voronoi diagram over those centroids and stores each leaf's region
    vertices on the leaf under the 'polygonPoints' key as a list of [x, y] pairs.
    The diagram is plotted twice and debug information is printed along the way.

    Args:
        cluster: Tree node dict with a non-empty 'content' list. A node is
            treated as a leaf when the first item of its 'content' is not a dict.

    Returns:
        None. The leaf dicts are modified in place.
    """
    # Plotting helpers are only needed here, so import them locally.
    # voronoi_plot_2d used to be called without ever being imported (NameError).
    import matplotlib.pyplot as plt
    from scipy.spatial import voronoi_plot_2d

    leaf_clusters = []
    all_centroids = []

    def recurse(node):
        if isinstance(node['content'][0], dict):
            for subcluster in node['content']:
                recurse(subcluster)
        else:
            leaf_clusters.append(node)
            all_centroids.append([node['centroid_x'], node['centroid_y']])

    recurse(cluster)

    all_centroids = np.array(all_centroids)
    vor = Voronoi(all_centroids)

    # testing if voronoi works properly
    voronoi_plot_2d(vor)
    plt.show()
    print("leaf_clusters", len(leaf_clusters))
    print("len(vor.point_region)", len(vor.point_region))
    print("vor.point_region", vor.point_region)

    # vor.point_region[i] is the index of the region belonging to input point i,
    # so polygons[i] lines up with leaf_clusters[i].
    polygons = []
    for indices in vor.point_region:
        print("indices", indices)
        region = vor.regions[indices]
        print("vor.vertices[region].tolist()", vor.vertices[region].tolist())

        # TODO: unbounded regions are not handled. SciPy marks a vertex outside the
        # diagram with index -1, and indexing vor.vertices with -1 picks the last
        # vertex, so the polygon of an unbounded cell is not geometrically
        # meaningful. Empty regions yield an empty polygon.
        polygon = vor.vertices[region].tolist()
        polygons.append(polygon)

    for leaf_cluster, polygon in zip(leaf_clusters, polygons):
        leaf_cluster['polygonPoints'] = polygon

    # visualize leaf_cluster polygons
    voronoi_plot_2d(vor)
    plt.show()

# Main function to add polygon points to clusters
def add_polygon_points_to_clusters(cluster):
    """Give every node of the tree rooted at `cluster` a 'polygonPoints' list.

    Leaves receive their polygon from create_voronoi_polygons. Each inner node then
    gets the concatenation of its children's 'polygonPoints', in child order.
    The tree is modified in place and nothing is returned.
    """
    # Leaves first: the parents below only aggregate what the leaves carry.
    create_voronoi_polygons(cluster)

    def gather(node):
        children = node['content']
        if not isinstance(children[0], dict):
            # Leaf cluster: its polygon was already assigned above.
            return
        collected = []
        node['polygonPoints'] = collected
        for child in children:
            gather(child)
            collected.extend(child.get('polygonPoints', []))

    gather(cluster)

# Let's use the functions
add_polygon_points_to_clusters(trees[0])

In [None]:
trees[0]

In [None]:
# Convert trees to JSON format with indentation for readability.
# The tree is serialized once; the same text is printed and written to disk
# instead of encoding the whole structure a second time.
tree_json = json.dumps(trees, indent=4, cls=Int64Encoder)

# Print the formatted tree
print(tree_json)

# Specify the file path to save the JSON data
output_file = "tree.json"

# Write the JSON data to the file
with open(output_file, "w") as file:
    file.write(tree_json)

print("JSON file saved successfully.")

In [None]:
# PRUNING STARTS HERE
# Loading trees back in
import json

# Open the file in read mode
with open('clusters/tree_2000.json', 'r') as f:
    trees = json.load(f)  # Load the JSON data from the file

# Now, 'trees' holds the parsed content of clusters/tree_2000.json
# print(trees)

In [None]:
# Pruning the trees
def _has_single_sub_cluster(cluster):
    """Return True when `cluster` contains exactly one item and that item is a cluster dict."""
    content = cluster.get('content')
    return bool(content) and len(content) == 1 and isinstance(content[0], dict)


def prune_single_child_clusters(cluster, depth=0):
    """Collapse chains of single-child clusters and renumber the layers.

    Every sub-cluster that merely wraps one other cluster is replaced by the
    cluster it wraps (repeatedly, so whole chains disappear). 'layer' is then set
    to each node's depth in the pruned tree, so a promoted cluster no longer keeps
    the layer of its old, deeper position.

    Args:
        cluster: Tree node dict. 'content' holds either sub-cluster dicts or, for a
            leaf, the paper ids. A missing or empty 'content' is treated as a leaf.
        depth: Layer assigned to `cluster` itself.

    Returns:
        The same `cluster` object, modified in place. The node passed in is never
        replaced itself, even if it has a single sub-cluster.
    """
    cluster['layer'] = depth  # Set the layer to the current depth

    content = cluster.get('content')
    # Leaves hold paper ids (or nothing) rather than dicts: nothing to prune.
    if not content or not isinstance(content[0], dict):
        return cluster

    new_content = []
    for sub_cluster in content:
        # Skip wrappers before recursing so the survivor is numbered with the
        # depth it actually ends up at.
        while _has_single_sub_cluster(sub_cluster):
            sub_cluster = sub_cluster['content'][0]
        new_content.append(prune_single_child_clusters(sub_cluster, depth + 1))
    cluster['content'] = new_content
    return cluster

# prune_single_child_clusters modifies each tree in place and returns the same
# object, so pruned_trees and trees refer to the same (now pruned) dicts.
pruned_trees = [prune_single_child_clusters(cluster) for cluster in trees]

print(json.dumps(pruned_trees, indent=4))

# Save the pruned hierarchy as its own file.
with open('clusters/pruned_tree_2000.json', 'w') as f:
    json.dump(pruned_trees, f, indent=4)

In [None]:
# Manually inspect simple tree
def simplify_cluster(cluster):
    """Return an id-only skeleton of `cluster`.

    The result always carries 'cluster_id'. When the cluster's 'content' holds
    sub-cluster dicts, the result also carries 'content' with the simplified
    sub-clusters. Leaf content and all other fields are dropped.
    """
    skeleton = {'cluster_id': cluster['cluster_id']}

    # No 'content' key at all: only the id is kept.
    if 'content' not in cluster:
        return skeleton

    children = cluster['content']
    # Only nested clusters are carried over; a leaf's content is left out.
    if isinstance(children[0], dict):
        skeleton['content'] = list(map(simplify_cluster, children))

    return skeleton

# Simplify each cluster in the data
simple_data = [simplify_cluster(cluster) for cluster in pruned_trees]

# Print the simplified data as a formatted JSON string
print(json.dumps(simple_data, indent=4))

# Save the simplified trees so they can be inspected outside the notebook
with open('simple_tree.json', 'w') as f:
    json.dump(simple_data, f, indent=4)

In [None]:
# Manually inspect both papers
def get_papers_details(paper_ids, cluster_id, df):
    """Write the details of the given papers to a per-cluster JSON file.

    Args:
        paper_ids: paperId values to select from `df`.
        cluster_id: Used only to build the output file name.
        df: DataFrame with at least the columns 'paperId', 'title', 'abstract'
            and 's2FieldsOfStudy'.

    Returns:
        None. The file leaf_cluster_paper_inspection/papers_details_<cluster_id>.json
        is written as a list of records.
    """
    import os

    output_dir = 'leaf_cluster_paper_inspection'
    # Writing fails if the folder is missing, so create it on first use.
    os.makedirs(output_dir, exist_ok=True)

    columns = ['paperId', 'title', 'abstract', 's2FieldsOfStudy']
    papers_df = df[df['paperId'].isin(paper_ids)][columns]
    papers_df.to_json(f'{output_dir}/papers_details_{cluster_id}.json', orient='records', indent=4)

def print_cluster_details(cluster):
    """Print the id of every leaf cluster below `cluster` and export its papers.

    Nodes whose 'content' holds dicts are descended into. Any other node is
    treated as a leaf: its cluster id is printed and its content (paper ids) is
    passed to get_papers_details together with the global `df`. Nodes without a
    'content' key are ignored.
    """
    if 'content' not in cluster:
        return

    members = cluster['content']
    if isinstance(members[0], dict):
        # Inner node: keep walking down into each sub-cluster.
        for child in members:
            print_cluster_details(child)
        return

    # Leaf cluster: report it and dump its papers for manual inspection.
    leaf_id = cluster['cluster_id']
    print(leaf_id)
    get_papers_details(members, leaf_id, df)
            
# Export the paper details of every leaf cluster, one root tree of `trees` at a time
for cluster in trees:
    print(f"Cluster Id: {cluster['cluster_id']}")
    print_cluster_details(cluster)

# Archive

## Generate a literature review using concepts (TLDRs)

## Use knowledge base from Semantic Scholar

In [None]:
import pandas as pd
import json

# load the data from your JSON file
# NOTE: absolute, machine-specific path; adjust before running elsewhere.
with open(r'C:\Users\1kevi\Desktop\projects\Research\autoscious-carbon-capture\data_collection\output_100.json', 'r') as f:
    data = json.load(f)

# convert the data into a pandas DataFrame (rebinds the notebook-wide `df`)
df = pd.DataFrame(data)

# remove rows with null abstracts
df = df[df['abstract'].notna()]

# print out the DataFrame to verify
print(df.head())

In [None]:
import pandas as pd
import json

# load the data from your JSON file
# NOTE: absolute, machine-specific path; adjust before running elsewhere.
with open(r'C:\Users\1kevi\Desktop\projects\Research\autoscious-carbon-capture\data_collection\output_50.json', 'r') as f:
    data = json.load(f)

# convert the data into a pandas DataFrame (rebinds the notebook-wide `df`)
df = pd.DataFrame(data)

# row filtering is disabled here: rows with null abstracts / tldr are kept
# df = df[df['abstract'].notna()]
# df = df[df['tldr'].notna()]

# print out the DataFrame to verify
print(df.head())

In [None]:
# Get the first row of the DataFrame
first_row = df.iloc[0]
print(first_row)

In [None]:
# Get the shape of the DataFrame
num_rows, num_cols = df.shape
print(f"The dataframe has {num_rows} rows and {num_cols} columns.")


In [None]:
# Have GPT3.5 generate a literature review of carbon capture using the TLDRs (significance)
# IPython magics: load the dotenv extension and run it
# (presumably this loads variables from a .env file into the environment; confirm).
%load_ext dotenv
%dotenv
import os
# NOTE(review): openai_api_key is read here but is not handed to the openai module
# anywhere in this cell.
openai_api_key = os.getenv('OPENAI_API_KEY')
import openai

In [None]:
# for bulk openai message, no stream
def chat_openai(prompt="Tell me to ask you a prompt", chat_history=None):
    """Send `prompt` to gpt-3.5-turbo and return the answer with the conversation.

    Args:
        prompt: User message to append to the conversation.
        chat_history: Optional list of message dicts to continue from. When it is
            non-empty it is extended in place with the new user and assistant
            messages. When it is None or empty, a fresh conversation starting
            with a system message is created.

    Returns:
        Tuple (text_answer, messages): the assistant's reply text and the
        conversation list including that reply.
    """
    # define message conversation for model
    if chat_history:
        messages = chat_history
    else:
        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
        ]
    messages.append({"role": "user", "content": prompt})

    # create the chat completion
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages,
    )
    text_answer = completion["choices"][0]["message"]["content"]

    # updated conversation history
    messages.append({"role": "assistant", "content": text_answer})

    return text_answer, messages

## Generating Topic Metadata from S2FieldsOfStudy

In [None]:
# Print the top 10 words for each topic
# NOTE(review): `lda` is not defined in the nearby cells; it must already exist in
# the notebook session (presumably a fitted gensim LDA model; confirm).
for i in range(lda.num_topics):
    print(f"Topic {i}:")
    for word, prob in lda.show_topic(i, topn=10):
        print(f"  {word}: {prob}")


In [None]:
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import textwrap
from adjustText import adjust_text

In [None]:
df.head()

In [None]:
# Trying to

In [None]:
# Collect list of all s2FieldsOfStudy topics, corresponding PaperIds, and citation counts, and the centroid x and y values
import pandas as pd
from collections import defaultdict

# Assuming df is your DataFrame
# NOTE: df must already carry 'x' and 'y' columns when this cell runs.
# Each new category starts with empty accumulators.
topic_data = defaultdict(lambda: {"paperIds": [], "citationCount": 0, "xSum": 0, "ySum": 0, "count": 0})

for index, row in df.iterrows():
    for category_info in row['s2FieldsOfStudy']:
        # Get category
        category = category_info['category']
        
        # Prevent duplicates: only entries whose source is 's2-fos-model' are counted
        if category_info['source'] == 's2-fos-model':
            # Check if paperId already counted for this category
            if row['paperId'] not in topic_data[category]["paperIds"]:
                # Add paper id to category
                topic_data[category]["paperIds"].append(row['paperId'])
            
                # Cumulative citation count
                topic_data[category]["citationCount"] += row['citationCount']
                
                # Sum x and y for the average
                topic_data[category]["xSum"] += row['x']
                topic_data[category]["ySum"] += row['y']
                topic_data[category]["count"] += 1

# Compute the average x and y for each category
# (a category only exists once a paper was added to it, so count is at least 1)
for category in topic_data:
    topic_data[category]["x"] = topic_data[category]["xSum"] / topic_data[category]["count"]
    topic_data[category]["y"] = topic_data[category]["ySum"] / topic_data[category]["count"]
    
    # Remove unnecessary keys
    del topic_data[category]["xSum"]
    del topic_data[category]["ySum"]
    del topic_data[category]["count"]

# Convert topic_data to DataFrame
topic_df = pd.DataFrame.from_dict(topic_data, orient='index').reset_index()

# Rename 'index' column to 'topic'
topic_df.rename(columns={'index': 'topic'}, inplace=True)

In [None]:
topic_df.head()

# Exporting dataframe with T-SNE coordinates
topic_df.to_json('topic_100_tsne.json', orient='records')
topic_df


In [None]:
# Preprocessing for SPECTER API
SAMPLE_TOPICS = []

# Each topic name is used as paper_id, title and abstract, giving every topic the
# same three fields as the SAMPLE_PAPERS entries. The counter `i` is not used.
for i, topic in enumerate(topic_data.keys(), start=1):
    topic_dict = {
        "paper_id": topic,
        "title": topic,
        "abstract": topic
    }
    SAMPLE_TOPICS.append(topic_dict)

SAMPLE_TOPICS

In [None]:
# Need to create SPECTER embeddings for all the s2FieldsOfStudy
from typing import Dict, List
import json

import requests


URL = "https://model-apis.semanticscholar.org/specter/v1/invoke"
MAX_BATCH_SIZE = 16


def chunks(lst, chunk_size=MAX_BATCH_SIZE):
    """Yield consecutive slices of `lst`, each at most `chunk_size` items long."""
    offsets = range(0, len(lst), chunk_size)
    yield from (lst[offset:offset + chunk_size] for offset in offsets)


SAMPLE_PAPERS = [
    {
        "paper_id": "A",
        "title": "Angiotensin-converting enzyme 2 is a functional receptor for the SARS coronavirus",
        "abstract": "Spike (S) proteins of coronaviruses ...",
    },
    {
        "paper_id": "B",
        "title": "Hospital outbreak of Middle East respiratory syndrome coronavirus",
        "abstract": "Between April 1 and May 23, 2013, a total of 23 cases of MERS-CoV ...",
    },
]


def embed(papers):
    """Fetch an embedding for every paper dict from the service at URL.

    The papers are sent in batches produced by chunks(). Each response is
    expected to carry a "preds" list whose items have "paper_id" and "embedding".

    Returns:
        Dict mapping paper_id to its embedding.

    Raises:
        RuntimeError: if any batch is answered with a status code other than 200.
    """
    embeddings_by_paper_id: Dict[str, List[float]] = {}

    for batch in chunks(papers):
        # json= lets requests serialise the batch for us
        reply = requests.post(URL, json=batch)

        if reply.status_code != 200:
            raise RuntimeError("Sorry, something went wrong, please try later!")

        embeddings_by_paper_id.update(
            (pred["paper_id"], pred["embedding"]) for pred in reply.json()["preds"]
        )

    return embeddings_by_paper_id


# if __name__ == "__main__":
# Embed the topic pseudo-papers; the result maps each topic name to its embedding
all_embeddings = embed(SAMPLE_TOPICS)

# Prints the number of topic embeddings, then the number of entries in df['embedding']
print(len(all_embeddings))
print(len(df['embedding']))

In [None]:
# Add all_embeddings to new df and then run T-SNE
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import textwrap
from adjustText import adjust_text

In [None]:
# Convert all_embeddings values to a list of lists
all_embeddings_list = list(all_embeddings.values())

# Convert df['embedding'] to a list of lists
# (every entry of df['embedding'] is indexed with ['vector'], so none may be missing)
df_embeddings = df['embedding'].apply(lambda x: x['vector']).tolist()

# Stack them together: paper rows first, topic rows after them.
# The later slicing by len(df) relies on this order.
combined_embeddings = np.vstack([df_embeddings, all_embeddings_list])

In [None]:
tsne = TSNE(n_components=2, random_state=0)
embeddings_2d = tsne.fit_transform(combined_embeddings)


In [None]:
len(df)

In [None]:
# Add the 2D t-SNE coordinates to the papers DataFrame and normalize citationCount for node sizes
# The first len(df) rows of embeddings_2d belong to the papers.
df['x'] = embeddings_2d[:len(df), 0]
df['y'] = embeddings_2d[:len(df), 1]
# Min-max normalization. NOTE(review): if every paper has the same citationCount
# the denominator is 0; confirm this cannot happen with the data used.
df['citationCount_normalized'] = (df['citationCount'] - df['citationCount'].min()) / (df['citationCount'].max() - df['citationCount'].min())
df['citationCount_normalized'] = df['citationCount_normalized'] * 100  # Scale to a suitable range for scatter plot node sizes

# Sort the DataFrame based on citationCount and select the top 20
df_top20 = df.nlargest(20, 'citationCount')

# Create a scatter plot of all the points with node sizes based on normalized citationCount
plt.figure(figsize=(10, 10))
plt.scatter(df['x'], df['y'], s=df['citationCount_normalized'], alpha=0.5, label='All papers')

# Highlight the top 20 papers in the plot with node sizes based on normalized citationCount
plt.scatter(df_top20['x'], df_top20['y'], s=df_top20['citationCount_normalized'], color='red', label='Top 20 papers')

# Prepare to add titles of the top 20 papers to the plot with text wrapping
# (the annotations are collected in `texts`; nothing in this cell repositions them)
texts = []
for i, row in df_top20.iterrows():
    title_wrapped = textwrap.fill(row['title'], width=100)  # Wrap text after 100 characters
    plt.scatter(row['x'], row['y'], color='red')  # This will ensure the dot is above the line
    texts.append(plt.annotate(title_wrapped, (row['x'], row['y']), textcoords="offset points", xytext=(0,10), ha='center', arrowprops=dict(arrowstyle="->")))

# Exporting dataframe with T-SNE coordinates
df.to_json('output_100_tsne.json', orient='records')

In [None]:
# Add x and y coordinates for topics
# The rows of embeddings_2d after the first len(df) belong to the topic embeddings.
# NOTE(review): this assumes the topic embeddings are in the same order as the rows
# of topic_df; confirm that the embedding service preserves the request order.

topic_df['x'] = embeddings_2d[len(df):, 0]
topic_df['y'] = embeddings_2d[len(df):, 1]
# topic_df['citationCount_normalized'] = (df['citationCount'] - df['citationCount'].min()) / (df['citationCount'].max() - df['citationCount'].min())
# topic_df['citationCount_normalized'] = df['citationCount_normalized'] * 100  # Scale to a suitable range for scatter plot node sizes

# Exporting dataframe with T-SNE coordinates
topic_df.to_json('topic_100_tsne.json', orient='records')
topic_df

In [None]:
print(tldr_texts)

In [None]:
prompt = "I am a college student who is a beginner in carbon capture. Write a literature review of carbon capture using the TLDRs (significance) of papers: " + str(tldr_texts)

In [None]:
res = chat_openai(prompt)

In [None]:
res

In [None]:
# Get 'tldr' column as a Series
tldr_series = df['tldr']

# Extract the 'text' from each 'tldr' dictionary in the Series; None entries stay None
# NOTE(review): the cells above that print tldr_texts and build the prompt need
# this cell to have been run first.
tldr_texts = [item['text'] if item is not None else None for item in tldr_series]


## Manual Inspection of Output for Manual Hierarchical Clustering

In [None]:
import json

# Load the input data
with open('t-sne/output_100_tsne.json', 'r') as infile:
    data = json.load(infile)

# Extract desired fields
new_data = []
for entry in data:
    paperId = entry.get('paperId', None)
    title = entry.get('title', None)
    abstract = entry.get('abstract', None)
    
    # Build new data entry
    # An entry is kept only when id, title and abstract are all present, even
    # though the abstract itself is currently not written to the output.
    if paperId is not None and title is not None and abstract is not None:
        new_data.append({
            'paperId': paperId,
            'title': title,
            # 'abstract': abstract
        })

# Write to output file
with open('t-sne/output_100_tsne_manual_inspection_id_titles_only.json', 'w') as outfile:
    json.dump(new_data, outfile, indent=4)


## Generating paper topics from title and abstract

In [None]:
import pandas as pd
import json

# load the data from your JSON file
with open(r'C:\Users\1kevi\Desktop\projects\Research\autoscious-carbon-capture\knowledge_base\t-sne\output_100_tsne.json', 'r') as f:
    data = json.load(f)

# convert the data into a pandas DataFrame
df = pd.DataFrame(data)

# print out the DataFrame to verify
print(df.head())

In [None]:
# # Testing top 3 n-grams from title, embedding each n-gram, and then clustering each n-gram
# from sklearn.feature_extraction.text import TfidfVectorizer
# from nltk.corpus import stopwords
# import numpy as np

# # List of English stop words
# stop_words = list(stopwords.words('english'))

# # Create the transform
# vectorizer = TfidfVectorizer(ngram_range=(2,3), stop_words=stop_words)

# # Tokenize, build vocab and calculate TF-IDF
# tfidf_matrix = vectorizer.fit_transform(df['text'])

In [None]:
# # Define a function to extract top 3 words with the highest TF-IDF score in a given document
# def top_words(doc_index):
#     feature_index = tfidf_matrix[doc_index,:].nonzero()[1]
#     tfidf_scores = zip(feature_index, [tfidf_matrix[doc_index, x] for x in feature_index])
    
#     # Corresponding feature names and scores
#     words_scores = [(vectorizer.get_feature_names_out() [i], s) for (i, s) in tfidf_scores]
#     words_scores = sorted(words_scores, key = lambda x: x[1], reverse=True)
    
#     return words_scores

In [None]:
# # Apply the function to each row index in the dataframe
# df['top_words'] = [top_words(i) for i in range(tfidf_matrix.shape[0])]

In [None]:
# df[['title', 'text', 'top_words']]
# df[['title', 'text', 'top_words']].to_json('topics/top_words.json', orient='records', lines=True, indent=2)

In [None]:
# # Testing reranking of words based on similarity to document embedding via specter & ada

# from transformers import AutoTokenizer, AutoModel
# import torch

# # models
# EMBEDDING_MODEL = "text-embedding-ada-002"

# tokenizer = AutoTokenizer.from_pretrained('allenai/specter')
# model = AutoModel.from_pretrained('allenai/specter')

# def calculate_embedding(text):
#     inputs = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt")
#     with torch.no_grad():
#         return model(**inputs)[0].mean(dim=1).numpy()

# # Calculate document embeddings
# # df['document_embedding'] = df['text'].apply(calculate_embedding)


In [None]:
# # Exploring LDA2Vec
# import pandas as pd
# from sklearn.feature_extraction.text import CountVectorizer
# from lda2vec import LDA2Vec
# from transformers import AutoTokenizer, AutoModel
# import torch
# import numpy as np

# # Load your data
# df = pd.read_csv('your_data.csv')

# # Preprocess your text data
# # This step will depend on the specifics of your data
# # For example, you might need to remove stop words, perform lemmatization, etc.

# # Set up SPECTER model for embeddings
# tokenizer = AutoTokenizer.from_pretrained('allenai/specter')
# model = AutoModel.from_pretrained('allenai/specter')

# def calculate_embedding(text):
#     inputs = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt")
#     with torch.no_grad():
#         return model(**inputs)[0].mean(dim=1).numpy()

# # Generate word embeddings using SPECTER
# df['embeddings'] = df['text'].apply(calculate_embedding)

# # Prepare data for LDA2Vec
# # This involves creating a count matrix of your text data
# vectorizer = CountVectorizer()
# counts = vectorizer.fit_transform(df['text'])

# # Create an LDA2Vec model
# lda2vec = LDA2Vec(n_topics=20, n_words=10000)

# # Fit the LDA2Vec model
# lda2vec.fit(counts, df['embeddings'].tolist())

# # Now you can use the lda2vec model to explore topics in your text
# # For example, you can look at the most common words in each topic


In [None]:
# # Trying Guided LDA
# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.decomposition import LatentDirichletAllocation
# from sklearn.cluster import KMeans
# from transformers import AutoTokenizer, AutoModel
# import torch
# import pandas as pd
# import numpy as np
# from nltk.util import ngrams
# import nltk
# nltk.download('stopwords')

# # Initialize SPECTER model
# tokenizer = AutoTokenizer.from_pretrained('allenai/specter')
# model = AutoModel.from_pretrained('allenai/specter')

# # Function to calculate embeddings
# def calculate_embedding(text):
#     inputs = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt")
#     with torch.no_grad():
#         return model(**inputs)[0].mean(dim=1).numpy()

# # Calculate document embeddings
# # df['document_embedding'] = df['text'].apply(calculate_embedding)

# # Tokenize text and create bigrams

# # Load the set of English stop words
# stop_words = set(stopwords.words('english'))

# df['tokens'] = df['text'].apply(lambda x: [word for word in x.split() if word not in stop_words])
# df['bigrams'] = df['tokens'].apply(lambda x: [' '.join(gram) for gram in ngrams(x, 2) if all(word not in stop_words for word in gram)])

# unique_tokens = pd.Series([item for sublist in df['tokens'].tolist() for item in sublist]).unique()
# unique_bigrams = pd.Series([item for sublist in df['bigrams'].tolist() for item in sublist]).unique()

# word_embeddings = {word: calculate_embedding(word) for word in unique_tokens}
# bigram_embeddings = {bigram: calculate_embedding(bigram) for bigram in unique_bigrams}


In [None]:
# word_embeddings = {word: word_embeddings[word].reshape(1, -1) for word in word_embeddings}
# bigram_embeddings = {bigram: bigram_embeddings[bigram].reshape(1, -1) for bigram in bigram_embeddings}

# print(word_embeddings['carbon'].shape)
# print(bigram_embeddings['carbon capture'].shape)

In [None]:
# # Cluster words and bigrams based on embeddings
# # Cluster words and bigrams based on embeddings
# word_clusters = KMeans(n_clusters=20).fit(np.array(list(word_embeddings.values())).reshape(-1, 768))
# bigram_clusters = KMeans(n_clusters=20).fit(np.array(list(bigram_embeddings.values())).reshape(-1, 768))


In [None]:

# # Create seed topics based on clusters
# seed_topics = {word: word_clusters.labels_[i] for i, word in enumerate(word_embeddings.keys())}
# seed_topics.update({bigram: bigram_clusters.labels_[i] for i, bigram in enumerate(bigram_embeddings.keys())})

In [None]:
# print(seed_topics)
# # Need to verify if the topics and words under them are even good.

In [None]:
# seed_topics_grouped = {}

# for word, topic in seed_topics.items():
#     if topic in seed_topics_grouped:
#         seed_topics_grouped[topic].append(word)
#     else:
#         seed_topics_grouped[topic] = [word]


In [None]:
# seed_topics_grouped

In [None]:
# from numpy import dot
# from numpy.linalg import norm

# def cosine_similarity(a, b):
#     a = a.flatten()  # Add this line
#     b = b.flatten()  # Add this line
#     return dot(a, b) / (norm(a) * norm(b))


In [None]:
# # Running GuidedLDA
# from guidedlda import GuidedLDA

# # Vectorize your text data
# vectorizer = CountVectorizer(vocabulary=seed_topics.keys())
# X = vectorizer.fit_transform(df['text'])

# # Create a seed topics matrix
# seed_topics_matrix = np.zeros((40, len(seed_topics)))

# for word, topic in seed_topics.items():
#     seed_topics_matrix[topic, vectorizer.vocabulary_[word]] = 1

# # Run Guided LDA
# model = GuidedLDA(n_topics=40, n_iter=100, random_state=7, refresh=20)
# model.fit(X, seed_topics=seed_topics_matrix)

# # Get the topic-word and document-topic distributions
# topic_word_distributions = model.topic_word_
# document_topic_distributions = model.transform(X)

In [None]:
# def get_top_words_spectre(row):
#     # Get the document embedding and flatten it
#     document_embedding = row['document_embedding_ada'].flatten()
    
#     top_words_spectre = []
#     for word, _ in row['top_words']:
#         word_embedding = calculate_embedding(word)
#         word_embedding = word_embedding.flatten()  # Flatten the word embedding
#         similarity = cosine_similarity(word_embedding, document_embedding)
#         top_words_spectre.append((word, similarity))
    
#     # Sort the words by their similarity to the document embedding
#     top_words_spectre.sort(key=lambda x: x[1], reverse=True)
    
#     return top_words_spectre

In [None]:
# # 21 min on 100 papers and all their n grams
# df['top_words_spectre'] = df.apply(get_top_words_spectre, axis=1)

In [None]:

# df[['title', 'text', 'top_words_spectre']].to_json('topics/top_words_spectre.json', orient='records', lines=True, indent=2)

In [None]:
# # Use an embedding to transform each of these words? 
# from sentence_transformers import SentenceTransformer
# import numpy as np

# model = SentenceTransformer('sentence-transformers/miniLM-L6-H384-uncased')

# # Assume 'top_words' is a list of your top words for each paper
# top_words = ['carbon', 'capture', 'storage', 'renewable', 'energy']

# word_embeddings = {}
# for word in top_words:
#     word_embeddings[word] = model.encode([word])[0]

# # Now 'word_embeddings' is a dictionary that maps words to their corresponding embeddings

In [None]:
df.head()

In [None]:
# Using GPT3.5 to generate topics

# imports
# IPython magics: load the dotenv extension and run it
# (presumably this loads variables from a .env file into the environment; confirm).
%load_ext dotenv
%dotenv
import os
# openai_api_key = os.getenv('OPENAI_API_KEY')
# NOTE(review): the key is read from OPENAI_GPT4_API_KEY but is not handed to the
# openai module anywhere in this cell; confirm which key the client actually uses.
openai_api_key = os.getenv('OPENAI_GPT4_API_KEY')
import openai

# models
EMBEDDING_MODEL = "text-embedding-ada-002"
GPT_MODEL = "gpt-3.5-turbo"
# GPT_MODEL = "gpt-4"

# for bulk openai message, no stream
def chat_openai(prompt="Tell me to ask you a prompt", chat_history=None, model=None):
    """Send one user prompt to the OpenAI chat completion API (no streaming).

    Args:
        prompt: Text of the user message appended to the conversation.
        chat_history: Optional list of prior message dicts (each with "role"
            and "content"). When it is None or empty, a new conversation is
            started with a default system message. The list is copied, so the
            caller's object is never modified; use the returned history.
        model: Name of the chat model to call. Defaults to the module-level
            ``GPT_MODEL`` constant when omitted.

    Returns:
        A ``(text_answer, messages)`` tuple: the assistant's reply text and
        the full conversation including the new user and assistant messages.
    """
    # define message conversation for model
    if chat_history:
        # Shallow copy: the original aliased the caller's list and mutated it
        # only when it was non-empty, which was an inconsistent side effect.
        messages = list(chat_history)
    else:
        messages = [
            {"role": "system", "content": "You are ChatGPT, a large language model trained by OpenAI. Answer as concisely as possible."},
        ]
    messages.append({"role": "user", "content": prompt})

    # create the chat completion; temperature=0 is passed as in the original
    completion = openai.ChatCompletion.create(
        model=model or GPT_MODEL,
        messages=messages,
        temperature=0,
    )
    text_answer = completion["choices"][0]["message"]["content"]

    # updated conversation history
    messages.append({"role": "assistant", "content": text_answer})

    return text_answer, messages

In [None]:
# existing_themes = 'Renewable integration, carbon capture, Clean energy transition, Operational flexibility, Cost-effective CO2 reduction, carbon storage, lost decade'

# paper = "titled '" + df.iloc[0]["title"] + "'"
# if df.iloc[0]["abstract"]:
#     paper += " with the following abstract: " + df.iloc[0]["abstract"]

# prompt = f'''
# Paper: {paper} \n Task: Given the paper title and abstract above, determine at most 5 themes for a researcher whose goal is to eventually make impactful discoveries and experiments. \n Rules: Do not output any theme that is beyond what is given in the paper. Be as concise (less than 5 words), clear, and correct as possible. Do not make up anything not apparent from the paper. \n Use themes from other papers only if the paper mentions them: {existing_themes}. 

# Your output should be of the following format: Theme1, Theme2, Theme3, Theme4, Theme5
# '''

## Archive: t-SNE & LDA testing

In [None]:
# # Use adjust_text to automatically adjust the positions of the text labels
# adjust_text(texts)

# plt.title('t-SNE visualization of embedding vectors')
# plt.legend()
# plt.show()

In [None]:
# # Working implementation of t-SNE?

# # Add the 2D t-SNE coordinates to the DataFrame and normalize citationCount for node sizes
# df['x'] = embeddings_2d[:, 0]
# df['y'] = embeddings_2d[:, 1]
# df['citationCount_normalized'] = (df['citationCount'] - df['citationCount'].min()) / (df['citationCount'].max() - df['citationCount'].min())
# df['citationCount_normalized'] = df['citationCount_normalized'] * len(df)  # Scale to a suitable range for scatter plot node sizes

# # Sort the DataFrame based on citationCount and select the top 20
# df_top20 = df.nlargest(20, 'citationCount')

# # Create a scatter plot of all the points with node sizes based on normalized citationCount
# plt.figure(figsize=(10, 10))
# plt.scatter(df['x'], df['y'], s=df['citationCount_normalized'], alpha=0.5, label='All papers')

# # Highlight the top 20 papers in the plot with node sizes based on normalized citationCount
# top20_scatter = plt.scatter(df_top20['x'], df_top20['y'], s=df_top20['citationCount_normalized'], color='red')

# # Prepare to add titles of the top 20 papers to the plot with text wrapping
# texts = []
# for i, row in df_top20.iterrows():
#     title_wrapped = textwrap.fill(row['title'], width=20)  # Wrap text after 20 characters
#     plt.scatter(row['x'], row['y'], color='red')  # This will ensure the dot is above the line
#     texts.append(plt.annotate(title_wrapped, (row['x'], row['y']), textcoords="offset points", xytext=(0,10), ha='center', arrowprops=dict(arrowstyle="->")))

# # Use adjust_text to automatically adjust the positions of the text labels
# adjust_text(texts)

# plt.title('t-SNE visualization of embedding vectors')
# plt.legend(handles=[top20_scatter], labels=['Top 20 papers'])
# plt.show()

In [None]:
# # attempting hierarchical clustering with LDA
# import pandas as pd
# from gensim.corpora import Dictionary
# from gensim.models import LdaModel
# from gensim.utils import simple_preprocess
# from sklearn.cluster import AgglomerativeClustering
# from sklearn.manifold import TSNE
# import matplotlib.pyplot as plt
# from nltk.corpus import stopwords
# from nltk.tokenize import word_tokenize

# # Download the set of stop words the first time
# import nltk
# nltk.download('stopwords')

# # Load the set of English stop words
# stop_words = set(stopwords.words('english'))

# # Assume that 'documents' is your list of documents. 
# # Each document is a string of words.

# # Preprocess your documents
# documents = df['abstract'].apply(simple_preprocess)

# # Tokenize the documents into words, convert to lower case, 
# # and remove stop words
# filtered_documents = []
# for doc in documents:
#     word_tokens = doc
#     filtered_document = [word for word in word_tokens if word not in stop_words]
#     filtered_documents.append(filtered_document)

# # Create a gensim dictionary from the documents
# dictionary = Dictionary(filtered_documents)

# # Create a corpus for LDA
# corpus = [dictionary.doc2bow(doc) for doc in documents]

# # Fit an LDA model
# lda = LdaModel(corpus, num_topics=20, id2word=dictionary, alpha='auto', eta='auto')

# # Print the top words for each topic
# for i in range(10):
#     print(f"Topic {i+1}:")
#     print([dictionary[word_id] for word_id, prob in lda.get_topic_terms(i, topn=10)])

In [None]:
# # Get the topic-document distribution from the LDA model
# topic_dist = [lda.get_document_topics(bow, minimum_probability=0) for bow in corpus]

# # Convert the topic distributions to a 2D array
# topic_dist_array = np.zeros((len(corpus), lda.num_topics))
# for i in range(len(corpus)):
#     for topic, prob in topic_dist[i]:  # topic_dist[i] is a list of tuples
#         topic_dist_array[i, topic] = prob  # topic is an integer (the topic ID)

# # Perform hierarchical clustering on the topic-document distribution
# cluster = AgglomerativeClustering(n_clusters=5)
# cluster_labels = cluster.fit_predict(topic_dist_array)