# Merging existing knowledge base with new papers

In [11]:
# This cell should run as-is once the two file paths below have been double-checked

import pandas as pd
import json

def _load_records(path):
    """Parse the JSON file at ``path`` and return the decoded object.

    The file is opened explicitly as UTF-8 (the encoding JSON is specified
    to use for interchange) instead of the platform default, so non-ASCII
    characters in the text fields decode the same way on every machine.
    """
    with open(path, 'r', encoding='utf-8') as fh:
        return json.load(fh)


# Existing knowledge base, stored in the papers folder
papers_json_path = 'papers/latest_papers.json'

# Potentially new data coming from the data_collection folder
new_data_json_path = '../data_collection/openalex/latest_unique_papers_downloaded.json'

# Existing knowledge base -> DataFrame, then print the first rows to verify the load
data = _load_records(papers_json_path)
existing_df = pd.DataFrame(data)
print(existing_df.head())

# Newly downloaded records -> DataFrame, then print the first rows to verify the load
new_data = _load_records(new_data_json_path)
new_df = pd.DataFrame(new_data)
print(new_df.head())

                                 id  \
0  https://openalex.org/W1002897494   
1  https://openalex.org/W1010274275   
2  https://openalex.org/W1012054217   
3   https://openalex.org/W101897731   
4  https://openalex.org/W1023696101   

                                            abstract  \
0  Abstract Carbon Capture and Storage is develop...   
1  Searching for efficient solid sorbents for CO2...   
2  The gaseous carbon dioxide incorporation react...   
3                                               None   
4  This paper presents a model to perform energy ...   

                                             authors  citationCount  \
0  [[https://openalex.org/A5054450376, Yasin Ghol...           10.0   
1  [[https://openalex.org/A5005055697, Guoping Ga...           38.0   
2  [[https://openalex.org/A5044055674, Chika Haya...           21.0   
3  [[https://openalex.org/A5030437606, Teng Ben],...            1.0   
4  [[https://openalex.org/A5067189491, Arun Verma...           20.0   

 

In [12]:
# List the column names of the existing knowledge base, for comparison
# with the columns of the newly downloaded data.
print(existing_df.columns)

Index(['id', 'abstract', 'authors', 'citationCount',
       'citationCount_normalized', 'citations', 'classification_ids',
       'concepts', 'doi', 'embedding', 'isOpenAccess',
       'keyword_classification_ids', 'language', 'paperId', 'publication_date',
       'relevance_score', 'search_score', 'text', 'title', 'url', 'x', 'y',
       'year'],
      dtype='object')


In [13]:
# List the column names of the newly downloaded data.
print(new_df.columns)
# Debugging aid, left disabled: inspect a single record by position.
# new_df.iloc[124]

Index(['id', 'doi', 'title', 'isOpenAccess', 'abstract', 'paperId', 'url',
       'citations', 'citationCount', 'concepts', 'publication_date',
       'relevance_score', 'language', 'year', 'search_score', 'authors'],
      dtype='object')


In [14]:
def _indexed_by_id(frame):
    """Return ``frame`` indexed by 'id', leaving it untouched if it already is.

    The guard makes this cell safe to re-run: an unconditional
    ``set_index('id')`` raises ``KeyError`` the second time, because the
    'id' column has already been moved into the index.
    """
    if frame.index.name == 'id':
        return frame
    return frame.set_index('id')


# Key both dataframes by 'id' so rows are aligned by paper rather than by position
existing_df = _indexed_by_id(existing_df)
new_df = _indexed_by_id(new_df)

# new_df takes priority: its non-NA values are kept, gaps (and columns/rows that
# only exist in existing_df) are filled from existing_df, and rows that only
# exist in new_df are added. 'id' is then turned back into a regular column.
combined_df = new_df.combine_first(existing_df).reset_index()

In [15]:
# Preview the first ten merged rows whose 'embedding' value is not null.
combined_df.loc[combined_df["embedding"].notna()].head(10)

Unnamed: 0,id,abstract,authors,citationCount,citationCount_normalized,citations,classification_ids,concepts,doi,embedding,...,paperId,publication_date,relevance_score,search_score,text,title,url,x,y,year
0,https://openalex.org/W1002897494,Abstract Carbon Capture and Storage is develop...,"[[https://openalex.org/A5054450376, Yasin Ghol...",10.0,,"[https://openalex.org/W1753239825, https://ope...","[[numerical pressure-decay method, 4.1, 0.77],...","[{'id': 'https://openalex.org/C10899652', 'wik...",https://doi.org/10.1016/j.molliq.2015.06.060,"[-0.4225695729255676, 0.062300171703100204, 0....",...,https://openalex.org/W1002897494,2015-11-01,21.742895,"[[carbon capture, 21.742895]]",Title: Suggesting a numerical pressure-decay m...,Suggesting a numerical pressure-decay method f...,https://doi.org/10.1016/j.molliq.2015.06.060,-42.265751,4.114934,2015.0
1,https://openalex.org/W1010274275,Searching for efficient solid sorbents for CO2...,"[[https://openalex.org/A5005055697, Guoping Ga...",38.0,,"[https://openalex.org/W1966750682, https://ope...","[[CO2 adsorption, 2.9, 0.81], [separation, 4.2...","[{'id': 'https://openalex.org/C162862793', 'wi...",https://doi.org/10.1016/j.commatsci.2015.06.005,"[-0.8119574189186096, -0.12326264381408691, 0....",...,https://openalex.org/W1010274275,2015-10-01,21.940535,"[[carbon capture, 21.940535]]",Title: Modelling CO 2 adsorption and separatio...,Modelling CO 2 adsorption and separation on ex...,https://doi.org/10.1016/j.commatsci.2015.06.005,-15.913016,-9.927729,2015.0
2,https://openalex.org/W1012054217,The gaseous carbon dioxide incorporation react...,"[[https://openalex.org/A5044055674, Chika Haya...",21.0,,"[https://openalex.org/W1973556473, https://ope...","[[Cobalt-Catalyzed Reductive Carboxylation, 10...","[{'id': 'https://openalex.org/C185592680', 'wi...",https://doi.org/10.1246/bcsj.20150043,"[-0.905866265296936, -0.24191510677337646, 1.2...",...,https://openalex.org/W1012054217,2015-06-15,21.46716,"[[carbon capture, 21.46716]]",Title: Cobalt-Catalyzed Reductive Carboxylatio...,"Cobalt-Catalyzed Reductive Carboxylation of α,...",https://doi.org/10.1246/bcsj.20150043,-12.920057,-1.328741,2015.0
3,https://openalex.org/W101897731,,"[[https://openalex.org/A5030437606, Teng Ben],...",1.0,,"[https://openalex.org/W656376090, https://open...","[[Carbon Dioxide Capture, 2.1, 0.79], [Porous ...","[{'id': 'https://openalex.org/C150394285', 'wi...",https://doi.org/10.1007/978-3-642-54646-4_4,"[-0.048764150589704514, -0.2366003841161728, 0...",...,https://openalex.org/W101897731,2014-01-01,22.707438,"[[carbon capture, 22.707438]]",Title: Carbon Dioxide Capture in Porous Aromat...,Carbon Dioxide Capture in Porous Aromatic Fram...,https://doi.org/10.1007/978-3-642-54646-4_4,-9.399205,-15.376349,2014.0
4,https://openalex.org/W1023696101,This paper presents a model to perform energy ...,"[[https://openalex.org/A5067189491, Arun Verma...",20.0,,"[https://openalex.org/W1965513514, https://ope...","[[process simulation model, 10.1, 0.63], [ener...","[{'id': 'https://openalex.org/C194439259', 'wi...",https://doi.org/10.1016/j.ijhydene.2015.06.149,"[-0.13096152245998383, 0.1287851482629776, 1.0...",...,https://openalex.org/W1023696101,2015-09-01,17.861244,"[[carbon capture, 17.861244]]",Title: Development of a process simulation mod...,Development of a process simulation model for ...,https://doi.org/10.1016/j.ijhydene.2015.06.149,-38.810551,-24.716375,2015.0
5,https://openalex.org/W1028462910,In a pulp mill the energy conversion mainly ta...,"[[https://openalex.org/A5003756798, Erik Hekto...",1.0,,"[https://openalex.org/W1965871315, https://ope...","[[Carbon Dioxide Capture, 2.1, 0.79], [Pulp an...","[{'id': 'https://openalex.org/C2781110116', 'w...",,"[-0.0863686352968216, 0.4754101037979126, 1.08...",...,https://openalex.org/W1028462910,2005-01-01,23.888971,"[[carbon capture, 23.888971]]",Title: Carbon Dioxide Capture in the Pulp and ...,Carbon Dioxide Capture in the Pulp and Paper I...,https://research.chalmers.se/en/publication/9920,-39.621006,-26.748983,2005.0
6,https://openalex.org/W103632297,Continuing support for combustion research is ...,"[[https://openalex.org/A2072231077, Robert W. ...",7.0,,"[https://openalex.org/W1592130436, https://ope...","[[Combustion Technology, 2.1, 0.83], [fossil f...","[{'id': 'https://openalex.org/C105923489', 'wi...",https://doi.org/10.1007/978-94-007-0412-1_1,"[-0.531643807888031, 0.23009800910949707, 0.61...",...,https://openalex.org/W103632297,2011-01-01,22.008118,"[[carbon capture, 22.008118]]",Title: The Role of Combustion Technology in th...,The Role of Combustion Technology in the 21st ...,https://doi.org/10.1007/978-94-007-0412-1_1,-59.983696,-6.735415,2011.0
7,https://openalex.org/W105069333,The Role of Law in Responding to Climate Chang...,"[[https://openalex.org/A5080990487, Nicola Dur...",7.0,,[],"[[Legal Responses, 7.1, 0.81], [Climate Change...","[{'id': 'https://openalex.org/C132651083', 'wi...",,"[-0.7133026719093323, -0.11468589305877686, 0....",...,https://openalex.org/W105069333,2011-01-04,18.517447,"[[carbon capture, 18.517447]]",Title: Legal Responses to Climate Change. Abst...,Legal Responses to Climate Change,http://ci.nii.ac.jp/ncid/BB07622906,-62.529289,-10.624932,2011.0
8,https://openalex.org/W1062343535,Existing monitoring protocols for the storage ...,"[[https://openalex.org/A5042943033, Tim Dixon]...",38.0,,"[https://openalex.org/W1501420375, https://ope...","[[monitoring protocols, 8.1, 0.94], [CO2 geolo...","[{'id': 'https://openalex.org/C47737302', 'wik...",https://doi.org/10.1016/j.ijggc.2015.05.029,"[-0.49119389057159424, 0.06770742684602737, 0....",...,https://openalex.org/W1062343535,2015-10-01,17.48964,"[[carbon capture, 17.48964]]",Title: Improving monitoring protocols for CO2 ...,Improving monitoring protocols for CO2 geologi...,https://doi.org/10.1016/j.ijggc.2015.05.029,-54.125225,-4.230964,2015.0
9,https://openalex.org/W106711868,"This article outlines the ongoing research, de...",,1948.0,7.246559,https://api.openalex.org/works?filter=cites:W1...,"[[Microgrids, 6.4, 0.84], [distributed generat...","[{'id': 'https://openalex.org/C2776784348', 'w...",https://doi.org/10.1109/mpae.2007.376583,"[0.014594187028706074, 0.030436156317591667, 0...",...,https://openalex.org/W106711868,,,,Title: Microgrids. Abstract: This article outl...,Microgrids,https://doi.org/10.1109/mpae.2007.376583,-53.935482,-20.382832,2007.0


In [16]:
from datetime import datetime
import os

# Timestamp used to name this snapshot: one folder per day, one file prefix per run
now = datetime.now()
date_str = now.strftime('%y-%m-%d')
time_str = now.strftime('%H-%M-%S')

# exist_ok=True creates the day's folder when missing and is a no-op when it is
# already there, without a separate check-then-create step.
snapshot_dir = f'papers/{date_str}'
os.makedirs(snapshot_dir, exist_ok=True)

# Shared filename stem: <time>_<number of rows>_database_update
snapshot_stem = f'{snapshot_dir}/{time_str}_{combined_df.shape[0]}_database_update'

# Compact copy of the merged records
combined_df.to_json(f'{snapshot_stem}.json', orient='records')

# Same records, indented for reading by eye
combined_df.to_json(f'{snapshot_stem}_readable.json', orient='records', indent=2)

In [None]:
# Update the main knowledge base file that will be saved over Github.
# This overwrites the file in place with the merged records. The path is a
# plain string literal: it has no placeholders, so no f-prefix is needed.
combined_df.to_json('papers/latest_papers.json', orient='records')