# Extraction and insertion

This notebook contains only the **final, necessary** code for data extraction (filtering, cleaning and merging) and for data import into the graph database Neo4j.

In [1]:
import pandas as pd
import os 
import numpy as np
import re
from zipfile import ZipFile

## Extraction

### 1. Filtering 

In [2]:
# read all data files, stratified by publication type

# Parse every CSV member of the export. Both the archive and each member
# stream are opened in a `with` block so their file handles are released as
# soon as parsing is done instead of staying open for the kernel's lifetime.
with ZipFile('RC export 2020-10-12.zip') as zip_file:
    files = {}
    for text_file in zip_file.infolist():
        if text_file.filename.endswith('.csv'):
            with zip_file.open(text_file.filename) as member:
                files[text_file.filename] = pd.read_csv(member)

# one DataFrame per publication type, keyed by the member name inside the archive
ed = files["educational-2020-10-12.csv"]
books = files["books-2020-10-12.csv"]
conf = files["conference-2020-10-12.csv"]
journ = files["journal-2020-10-12.csv"]
oth = files["other-2020-10-12.csv"]
pap = files["papers-2020-10-12.csv"]
pat = files["patents-2020-10-12.csv"]
pres = files["presentations-2020-10-12.csv"]

  if (await self.run_code(code, result,  async_=asy)):
  if (await self.run_code(code, result,  async_=asy)):
  if (await self.run_code(code, result,  async_=asy)):
  if (await self.run_code(code, result,  async_=asy)):
  if (await self.run_code(code, result,  async_=asy)):


In [3]:
# we restrict ourselves to only book chapters, conference papers, journal papers and other papers

selected_frames = [books, conf, journ, pap]
research_data_df = pd.concat(selected_frames, ignore_index=True)

# headline numbers: total documents and how many of them carry an abstract
n_documents = research_data_df.shape[0]
n_with_abstract = research_data_df["dc.description.abstract"].notna().sum()

print(research_data_df.shape)
print("There are ", n_documents, "documents, ",
      n_with_abstract, " of which have abstracts")
research_data_df.head()

(176057, 114)
There are  176057 documents,  21421  of which have abstracts


Unnamed: 0,id,collection,dc.contributor.author,dc.contributor.editor,dc.contributor.other,dc.contributor.supervisor,dc.date.accessioned,dc.date.available,dc.date.issued,dc.date.published,...,ethz.relation.isCompiledBy,ethz.relation.isDocumentedBy,ethz.relation.references,dc.language,ethz.identifier.pubmed,ethz.relation.isReviewedBy,ethz.relation.reviews,ethz.version,ethz.date.retentionend,ethz.date.retentionendDate
0,108069,,"Schützeichel, Rainer","Gerber, Andri||Kurath, Stefan",,,2017-06-11T21:44:58Z,2017-06-11T21:44:58Z,2016,,...,,,,,,,,,,
1,188444,,"Milman, Vitali||Wagner, Roy","Ball, Keith M.||Milman, Vitali",,,2017-10-23T13:08:25Z,2017-06-12T14:30:31Z||2017-08-21T11:33:24Z||20...,1999-04,,...,,,,,,,,,,
2,125569,,"Folkers, Gerd","Abel, Günter||Plümacher, Martina",,,2017-06-12T18:11:46Z,2017-06-12T18:11:46Z,2016,,...,,,,,,,,,,
3,157378,,"Mack, Rüdiger||Schleich, Wolfgang P.||Haase, D...","Arendt, Wolfgang||Schleich, Wolfgang P.",,,2017-06-14T12:24:19Z,2017-06-14T12:24:19Z,2008,,...,,,,,,,,,,
4,19843,,"Burkhard, Remo||Schmitt, Gerhard","Wang, Xiangyu||Schnabel, Marc Aurel",,,2017-06-08T23:42:55Z,2017-06-08T23:42:55Z,2008,,...,,,,,,,,,,


In [4]:
# show the distinct column names available in the combined frame
set(research_data_df.columns)

{'collection',
 'dc.contributor',
 'dc.contributor.author',
 'dc.contributor.editor',
 'dc.contributor.other',
 'dc.contributor.supervisor',
 'dc.date.accessioned',
 'dc.date.available',
 'dc.date.issued',
 'dc.date.published',
 'dc.description.abstract',
 'dc.format',
 'dc.identifier.doi',
 'dc.identifier.isbn',
 'dc.identifier.issn',
 'dc.identifier.olduri',
 'dc.identifier.other',
 'dc.identifier.uri',
 'dc.identifier.wos',
 'dc.language',
 'dc.language.iso',
 'dc.publisher',
 'dc.relation.isnodouble',
 'dc.relation.ispartof',
 'dc.relation.isreplacedbydouble',
 'dc.rights.license',
 'dc.rights.uri',
 'dc.subject',
 'dc.title',
 'dc.title.alternative',
 'dc.type',
 'ethz.availability',
 'ethz.book.title',
 'ethz.code.ddc',
 'ethz.code.jel',
 'ethz.date.deposited',
 'ethz.date.embargoend',
 'ethz.date.retentionend',
 'ethz.date.retentionendDate',
 'ethz.doipreview',
 'ethz.ecitpid',
 'ethz.ecolpid',
 'ethz.edit.source',
 'ethz.edit.status',
 'ethz.eth',
 'ethz.event',
 'ethz.event.da

In [5]:
# look at one raw "ethz.leitzahl.certified" value (row label 0) to see its format
research_data_df.loc[0, "ethz.leitzahl.certified"]

'ETH Zürich::00002 - ETH Zürich, direkt::00012 - Lehre und Forschung, direkt::00007 - Departemente, direkt::02100 - Departement Architektur / Department of Architecture::02601 - Institut für Geschichte und Theorie der Architektur (gta) / Institute for the History and Theory of Architecture (gta)::03414 - Magnago Lampugnani, Vittorio (emeritus)'

In [6]:
# let's remove all irrelevant columns

# keep only the metadata columns used downstream and give them shorter names
rd_f = (
    research_data_df[[
        "id",
        "dc.contributor.author",
        "dc.title",
        "dc.date.issued",
        "dc.type",
        "dc.description.abstract",
        "dc.language.iso",
        "ethz.journal.title",
    ]]
    .rename(columns={
        "dc.contributor.author": "author",
        "dc.date.issued": "publication date",
        "dc.title": "title",
        "dc.type": "publication type",
        "dc.description.abstract": "abstract",
        "dc.language.iso": "language",
        "ethz.journal.title": "journal",
    })
)
rd_f.head()

Unnamed: 0,id,author,title,publication date,publication type,abstract,language,journal
0,108069,"Schützeichel, Rainer",Die Perspektive des Städtebauers. Ein Blick au...,2016,Book Chapter,,de,Grundlagen
1,188444,"Milman, Vitali||Wagner, Roy",Asymptotic versions for operators and operator...,1999-04,Book Chapter,The goal of this note is to introduce new clas...,en,Mathematical Sciences Research Institute Publi...
2,125569,"Folkers, Gerd",On Re-Positioning,2016,Book Chapter,,en,Berlin Studies in Knowledge Research
3,157378,"Mack, Rüdiger||Schleich, Wolfgang P.||Haase, D...",Factorization,2008,Book Chapter,,en,
4,19843,"Burkhard, Remo||Schmitt, Gerhard",Visualising future cities in the ETH Value Lab,2008,Book Chapter,,en,


### 2. Cleaning

In [7]:
# define helper functions for cleaning

def separate_names(names):
    """Separate a string of the form name1||name2||name3||... into a list of names.

    A value that is already a list is returned unchanged, so applying the
    function a second time (e.g. when the cleaning cell is re-run) does not
    wipe the column. Any other non-string value (such as NaN for a missing
    author) yields None.
    """
    if isinstance(names, list):
        # already split on an earlier run: keep as is
        return names
    if isinstance(names, str):
        return names.split("||")
    return None

def date_to_year(date):
    """Get the dates into a single format (YYYY).

    Returns the first run of four digits found in `date` as a string, or None
    when `date` is not a string or contains no four-digit run.
    """
    if not isinstance(date, str):
        return None
    # Raw string: "\d" inside a plain literal is an invalid escape sequence.
    # A single search replaces the two identical findall scans.
    match = re.search(r"\d{4}", date)
    return match.group(0) if match else None

def date_to_year_and_month(date):
    """Get dates into format YYYY-MM.

    Returns the first YYYY-MM occurrence in `date`. A string without such an
    occurrence (for example a bare year) is returned unchanged; a non-string
    value yields None.
    """
    # if needed later
    if not isinstance(date, str):
        return None
    # Raw string: "\d" inside a plain literal is an invalid escape sequence.
    match = re.search(r"\d{4}-\d{2}", date)
    return match.group(0) if match else date

In [8]:
# split the "||"-joined author strings into lists of names
rd_f["author"] = rd_f["author"].apply(separate_names)

# Date format
date_format = date_to_year
rd_f["publication date"] = rd_f["publication date"].apply(date_format)

rd_f.head()

Unnamed: 0,id,author,title,publication date,publication type,abstract,language,journal
0,108069,"[Schützeichel, Rainer]",Die Perspektive des Städtebauers. Ein Blick au...,2016,Book Chapter,,de,Grundlagen
1,188444,"[Milman, Vitali, Wagner, Roy]",Asymptotic versions for operators and operator...,1999,Book Chapter,The goal of this note is to introduce new clas...,en,Mathematical Sciences Research Institute Publi...
2,125569,"[Folkers, Gerd]",On Re-Positioning,2016,Book Chapter,,en,Berlin Studies in Knowledge Research
3,157378,"[Mack, Rüdiger, Schleich, Wolfgang P., Haase, ...",Factorization,2008,Book Chapter,,en,
4,19843,"[Burkhard, Remo, Schmitt, Gerhard]",Visualising future cities in the ETH Value Lab,2008,Book Chapter,,en,


In [9]:
# a little detour: check how many papers were published each year

(
    rd_f[["id", "publication date"]]
    .groupby("publication date")
    .count()
    .sort_values("publication date", ascending=False)
    .rename(columns={"id": "count"})
    .head(20)
)

Unnamed: 0_level_0,count
publication date,Unnamed: 1_level_1
2021,54
2020,7002
2019,11291
2018,11293
2017,11169
2016,11605
2015,11495
2014,11822
2013,11644
2012,10611


In [10]:
# now explode on author field

# one row per (publication, author) pair instead of one row per publication
rd_fe = rd_f.explode(column="author")
shape_before, shape_after = rd_f.shape, rd_fe.shape
print("Shape changed from ", shape_before, " to ", shape_after)
rd_fe.head()

Shape changed from  (176057, 8)  to  (867718, 8)


Unnamed: 0,id,author,title,publication date,publication type,abstract,language,journal
0,108069,"Schützeichel, Rainer",Die Perspektive des Städtebauers. Ein Blick au...,2016,Book Chapter,,de,Grundlagen
1,188444,"Milman, Vitali",Asymptotic versions for operators and operator...,1999,Book Chapter,The goal of this note is to introduce new clas...,en,Mathematical Sciences Research Institute Publi...
1,188444,"Wagner, Roy",Asymptotic versions for operators and operator...,1999,Book Chapter,The goal of this note is to introduce new clas...,en,Mathematical Sciences Research Institute Publi...
2,125569,"Folkers, Gerd",On Re-Positioning,2016,Book Chapter,,en,Berlin Studies in Knowledge Research
3,157378,"Mack, Rüdiger",Factorization,2008,Book Chapter,,en,


### 3. Merge with department and organisation data

#### 3.1 Organisation data

In [11]:
# load the ETH professor list spreadsheet
pl = pd.read_excel("ETH Professor list.xlsx")

In [12]:
# create professor column and keep only the relevant columns

# "Name, First name" mirrors the author format of the research collection
professor_name = pl["Name"] + ", " + pl["First name"]
pl = (
    pl.assign(Professor=professor_name)
    [["Professor", "Org. unit code", "Organisation"]]
    .rename(columns={
        "Professor": "professor",
        "Org. unit code": "organisation unit code",
        "Organisation": "organisation",
    })
)
print(pl.shape)
pl.head()

(732, 3)


Unnamed: 0,professor,organisation unit code,organisation
0,"Abhari, Reza S.",2627,Institute of Energy Technology (former)
1,"Acciaio, Beatrice",9727,"Acciaio, Beatrice"
2,"Ackermann, Martin",3743,"Ackermann, Martin"
3,"Ackermann, Martin",2721,Inst. Biogeochem. and Pollutant Dynamics
4,"Adey, Bryan T.",2604,Inst. Construction&Infrastructure Manag.


In [13]:
# merge with exploded research collection

# outer join on the author / professor name: rows from both sides are kept
rd_m = pd.merge(rd_fe, pl, how="outer", left_on="author", right_on="professor")
added_rows = rd_m.shape[0] - rd_fe.shape[0]
print("Shape of research collection changed from ", rd_fe.shape, " to ", rd_m.shape, ". Diff: ",
      added_rows)
rd_m.head(3)

Shape of research collection changed from  (867718, 8)  to  (877074, 11) . Diff:  9356


Unnamed: 0,id,author,title,publication date,publication type,abstract,language,journal,professor,organisation unit code,organisation
0,108069.0,"Schützeichel, Rainer",Die Perspektive des Städtebauers. Ein Blick au...,2016,Book Chapter,,de,Grundlagen,,,
1,127413.0,"Schützeichel, Rainer",Einleitung. Wegmarken einer Theorie des archit...,2016,Book Chapter,,de,,,,
2,127410.0,"Schützeichel, Rainer",Tradition as a Means of Modernisation: The Cru...,2016,Book Chapter,,en,,,,


#### 3.2 Leitzahl data

In [15]:
# load the cost-centre ("Leitzahl") to department mapping; the file is read as ISO-8859-1
leitzahl_mapping = pd.read_csv("cost_centre_dept_mapping.csv", encoding = "ISO-8859-1")
leitzahl_mapping.head()

Unnamed: 0,LTZL,NAME,TYPNEU,TYPNAME,DEPT,"TO_CHAR(T.VONDAT,'DD/MM/YYYY')","TO_CHAR(T.BISDAT,'DD/MM/YYYY')"
0,T1057,T-SR Architektur,98,Totalisierungs-Leitzahlen,ARCH,01/01/2006,31/12/9999
1,T1058,Total NDS/NDK ARCH,98,Totalisierungs-Leitzahlen,ARCH,01/01/2000,31/12/2005
2,T1059,T-Lehre ARCH,98,Totalisierungs-Leitzahlen,ARCH,01/01/2000,31/12/9999
3,T1104,T-SR Bauing.wiss.,98,Totalisierungs-Leitzahlen,BAUG,01/01/2006,31/12/9999
4,T1105,T-SR Umwelting.,98,Totalisierungs-Leitzahlen,BAUG,01/01/2006,31/12/9999


In [16]:
# inspect the distinct NAME values to see how the names are formatted
set(leitzahl_mapping["NAME"])

{'Ris, R. (em.)',
 'Rehsteiner, F. (em.)',
 'Bosshard, H. H.',
 'CAS TPP: PP',
 'Knörrer, H. (em.)',
 'Tatbul (ehem.)',
 'CAS ETH RM',
 'T-Anorganische Chem.',
 'DS Lebensmittelwiss.',
 'Smith, Roy (Tit.)',
 'T-Mikrobiologie',
 'Wegener, Konrad',
 'T-Design, Mat., Fab.',
 'T-Atmosphärenphysik',
 'Eisner, M. (ehem.)',
 'Imboden, D. (em.)',
 'DA Informatik',
 'Vorburger (SNF)(eh.)',
 'Total NDS/NDK INFK',
 'SCCER-SoE',
 'MAS ETH STP',
 'T-Merkt, F.',
 'GESS, nicht zuteilb.',
 'Degen, Christian',
 'Van Gunsteren (em.)',
 'Waldmanagement',
 'T-SR Rechnergest.Wi.',
 'DZ Lebensmittelwiss.',
 'Schwab, Martin (em.)',
 'Ed. J. Org. Letters',
 'Faber, Michael (eh.)',
 'Bibliothek D-PHYS',
 'Vaterlaus, Andreas',
 'Mil. Sicherheitst.',
 'DZ Rechnerg. Wiss.',
 'Quantenelektronik',
 'DA ARCH',
 'Geschichte',
 'SR Physik',
 'Total DS BAUG',
 'Kamgarpour, Maryam',
 'Bsc Raumbez. Ing.',
 'Davies, H. C. (em.)',
 'Heinrich, Ch. (em.)',
 'T-LSZ',
 'Geering, H. P. (em.)',
 'Lehre FOWI, direkt',
 'Studienbe

In [24]:
# note: we have no department-specific code/leitzahl
# Take an explicit copy: the following cells assign into `ln`, and assigning
# into a bare column selection raises pandas' SettingWithCopyWarning.
ln = leitzahl_mapping[["NAME","DEPT"]].copy()

In [25]:
# map department code to department names
dep_map = {
    "AGRL": "Agricultural Sciences",
    "ANBI": "Applied Biosciences",
    "ARCH": "Architecture",
    "BAUG": "Civil, Environmental and Geomatic Engineering",
    "BIOL": "Biology",
    "BSSE": "Biosystems Science and Engineering",
    "CHAB": "Chemistry and Applied Biosciences",
    "ERDW": "Earth Sciences",
    "GESS": "Humanities, Social and Political Sciences",
    "HEST": "Health Sciences and Technology",
    "INFK": "Computer Science",
    "ITET": "Information Technology and Electrical Engineering",
    "MATH": "Mathematics",
    "MATL": "Materials",
    "MAVT": "Mechanical and Process Engineering",
    "MTEC": "Management, Technology and Economics",
    "PHYS": "Physics",
    "USYS": "Environmental Systems Science",
}


def transform_dep_code(code):
    """Return the department name for a department code (KeyError if the code is unknown)."""
    department_name = dep_map[code]
    return department_name

In [26]:
# quick sanity check of the lookup; `transform_dep_code` is defined once,
# next to `dep_map`, so it is not redefined here
transform_dep_code('MATH')

'Mathematics'

In [27]:
# Translate department codes into department names. A new frame is built with
# `assign` rather than writing into `ln` in place, because item assignment on
# a frame obtained by column selection triggers SettingWithCopyWarning.
ln = ln.assign(DEPT=ln["DEPT"].apply(transform_dep_code))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [28]:
# check the translated department names
ln.head()

Unnamed: 0,NAME,DEPT
0,T-SR Architektur,Architecture
1,Total NDS/NDK ARCH,Architecture
2,T-Lehre ARCH,Architecture
3,T-SR Bauing.wiss.,"Civil, Environmental and Geomatic Engineering"
4,T-SR Umwelting.,"Civil, Environmental and Geomatic Engineering"


In [31]:
# now try with deleting parentheses after the names

def delete_parenthesis(name):
    """Return `name` cut off before its first parenthesised suffix such as " (em.)".

    The suffix must be a whitespace character followed by letters and/or dots
    in parentheses. Names without such a suffix are returned unchanged;
    non-string values yield None.
    """
    if not isinstance(name, str):
        return None
    # Raw string: "\s" and "\(" inside a plain literal are invalid escape
    # sequences. Only the text before the first match is kept, so one split
    # is enough.
    return re.split(r'\s\([a-zA-Z.]+\)', name, maxsplit=1)[0]

# spot-check the helper on a few representative Leitzahl names
for example_name in ('Schlunegger (ehem.)',
                     'Fontana, M. (em.)',
                     'Smith, Roy (Tit.)',
                     'Baccini, Peter (em.)'):
    print(delete_parenthesis(example_name))

Schlunegger
Fontana, M.
Smith, Roy
Baccini, Peter


In [32]:
# apply to leitzahl names, merge and check coverage

# Build a new frame with `assign` instead of writing into `ln` in place: item
# assignment on a frame obtained by column selection triggers pandas'
# SettingWithCopyWarning.
ln = ln.assign(NAME=ln["NAME"].apply(delete_parenthesis))

# left join: every row of rd_m is kept; DEPT is filled where the author name
# equals a cleaned Leitzahl NAME
rd_m2 = rd_m.merge(ln, how = "left", left_on = "author", right_on = "NAME")
print("Number of entries with complete author and department: ",
      (rd_m2["author"].notna() & rd_m2["DEPT"].notna()).sum())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Number of entries with complete author and department:  68999


Unnamed: 0,id,author,title,publication date,publication type,abstract,language,journal,professor,organisation unit code,organisation,NAME,DEPT
0,108069.0,"Schützeichel, Rainer",Die Perspektive des Städtebauers. Ein Blick au...,2016,Book Chapter,,de,Grundlagen,,,,,
1,127413.0,"Schützeichel, Rainer",Einleitung. Wegmarken einer Theorie des archit...,2016,Book Chapter,,de,,,,,,
2,127410.0,"Schützeichel, Rainer",Tradition as a Means of Modernisation: The Cru...,2016,Book Chapter,,en,,,,,,
3,28599.0,"Schützeichel, Rainer",Architettura contestuale. Difesa di una proget...,2010,Book Chapter,,it,,,,,,
4,319574.0,"Schützeichel, Rainer",Die Stadt und das Haus,2010,Book Chapter,,en||de,,,,,,


### 4. Final touches

In [18]:
# add professor indicator
# problem: only professors that can be matched to the organisations data will be indicated as professors
#rd_m2["is_professor"] = rd_m2["author"] == rd_m2["professor"]
#
#rd_m2.head()

In [33]:
# drop professor and name columns, then rename for ease of neo4j import

neo4j_column_names = {
    "publication date": "publication_date",
    "publication type": "publication_type",
    "organisation unit code": "organisation_unit_code",
    "DEPT": "department",
}
rd_final = (
    rd_m2
    .drop(columns=["professor", "NAME"])
    .rename(columns=neo4j_column_names)
)
rd_final.head()

Unnamed: 0,id,author,title,publication_date,publication_type,abstract,language,journal,organisation_unit_code,organisation,department
0,108069.0,"Schützeichel, Rainer",Die Perspektive des Städtebauers. Ein Blick au...,2016,Book Chapter,,de,Grundlagen,,,
1,127413.0,"Schützeichel, Rainer",Einleitung. Wegmarken einer Theorie des archit...,2016,Book Chapter,,de,,,,
2,127410.0,"Schützeichel, Rainer",Tradition as a Means of Modernisation: The Cru...,2016,Book Chapter,,en,,,,
3,28599.0,"Schützeichel, Rainer",Architettura contestuale. Difesa di una proget...,2010,Book Chapter,,it,,,,
4,319574.0,"Schützeichel, Rainer",Die Stadt und das Haus,2010,Book Chapter,,en||de,,,,


In [34]:
# issue: quotes need to be escaped in neo4j
# the following function helps us avoid errors when importing

def add_quote(abstract):
    if isinstance(abstract, str):
        abstract = abstract.replace('\\"', '""')
        return abstract.replace('"', '""')

# Escape the quotes of every abstract.
# NOTE: not idempotent - add_quote doubles each quote, so running this cell a
# second time doubles them again.
rd_final["abstract"] = rd_final["abstract"].apply(add_quote)

## Import/export :):

In [35]:
# write the final table without the index column; this is the file name the
# LOAD CSV statements below read
rd_final.to_csv("graph_data_final.csv", index=False)
# NOTE(review): `rc_m` is not defined anywhere in the visible notebook
#rc_m.to_json("metadata_final.json", orient = "records") 

### Neo4j import details

The following nodes will be created: 
- **person** [name, gender]
- **publication** [id, title, date, type, abstract, journal]
- **organisation** [name, code] ## NOTE: No code with leitzahl mapping data
- **department** [name, code]


--- 


**The commands**

    
To load the csv you first have to <u>copy it into your Neo4j base directory</u>. More info [here](https://neo4j.com/docs/cypher-manual/current/clauses/load-csv/#load-csv-import-data-from-a-csv-file). I did the following: 

```cp .\graph_data_final.csv C:/Users/Giulia/.Neo4jDesktop/neo4jDatabases/database-befe90d3-7991-457e-9671-62c55c830654/installation-3.5.12/import```

<u>Constraints first</u>

The constraints are here to make sure we don't create duplicate nodes.

    CREATE CONSTRAINT ON (c:Person) ASSERT c.name IS UNIQUE;
    CREATE CONSTRAINT ON (c:Organisation) ASSERT c.code IS UNIQUE;
    CREATE CONSTRAINT ON (c:Publication) ASSERT c.title IS UNIQUE;
    CREATE CONSTRAINT ON (c:Department) ASSERT c.name IS UNIQUE; // or c.code
    CREATE INDEX ON :Publication(id)

 
Now we'll <u>load the data</u> in a very lightweight manner: 

1) person nodes <br>
```
    LOAD CSV WITH HEADERS FROM "file:///graph_data_final.csv" AS line
    WITH line WHERE line.author IS NOT NULL
    MERGE (person:Person {name: line.author})
```
        > Added 267877 labels, created 267877 nodes, set 316963 properties, completed after 11841 ms.

2) publication nodes (this might take a while) <br>
       
        LOAD CSV WITH HEADERS FROM "file:///graph_data_final.csv" AS line
        WITH line where line.id IS NOT NULL
        MERGE (publication: Publication {title: line.title})
        SET publication.id=line.id,             
            publication.type=line.publication_type, 
            publication.date=date(line.publication_date),
            publication.abstract=line.abstract,
            publication.journal=line.journal;

        > Added 170284 labels, created 170284 nodes, set 3704438 properties, completed after 13718 ms.
        
3) organisation nodes <br> 
    
        LOAD CSV WITH HEADERS FROM "file:///graph_data_final.csv" AS line
        WITH line where line.organisation_unit_code IS NOT NULL
        MERGE (organisation:Organisation {code:line.organisation_unit_code})
        SET organisation.name=line.organisation;
                                          
        > Added 383 labels, created 383 nodes, set 80749(****) properties, completed after 5205 ms.
            
4) department nodes <br> 
    
        LOAD CSV WITH HEADERS FROM "file:///graph_data_final.csv" AS line
        WITH line where line.department IS NOT NULL
        MERGE (department:Department {name:line.department})
        SET department.name=line.department;
                                          
        > Added 16 labels, created 16 nodes, set 49319 properties, completed after 5271 ms.                       
            
        
5) finally all the edges <br> 
        
        LOAD CSV WITH HEADERS FROM "file:///graph_data_final.csv" AS line
        MATCH (person:Person {name:line.author}), 
               (publication:Publication {id:line.id})
        MERGE (person)-[:PUBLISHED]->(publication)
        
        > 
        
        LOAD CSV WITH HEADERS FROM "file:///graph_data_final.csv" AS line
        MATCH (person:Person {name:line.author}),
               (organisation:Organisation {code:line.organisation_unit_code})
        MERGE (person)-[:BELONGS_TO]->(organisation)
        
        > Created 664 relationships, completed after 6556 ms.
        
        LOAD CSV WITH HEADERS FROM "file:///graph_data_final.csv" AS line
        MATCH (person:Person {name:line.author}),
               (department:Department {name:line.department})
        MERGE (person)-[:WORKS_IN]->(department)
        
        Alternative without leitzahl:
        LOAD CSV WITH HEADERS FROM "file:///graph_data_final.csv" AS line
        MATCH (person:Person {name:line.author}),
               (department:Department {name:line.department})
        MERGE (person)-[:WORKS_IN]->(department)
        
        > 
        
Note: in case you did something wrong and you want to erase the network here's the query: 

        MATCH (n)
        DETACH DELETE n;

    


## Export for topic modeling

In [30]:
# select only english publications

# exact match on "en": multi-language values such as "en||de" are not selected
is_english = rd_m2["language"] == "en"
rd_full_en = rd_m2[is_english]

In [31]:
# drop duplicates (keeping only the first)

# rd_m2 has one row per (publication, author); reduce it to one row per publication id
abstracts_data = rd_full_en.drop_duplicates(subset="id", keep="first")
print(abstracts_data.shape)

(158522, 15)


In [32]:
# keep only the entries that have an abstract

has_abstract = abstracts_data["abstract"].notna()
print("Number of abstracts: ", sum(has_abstract))
abstracts_only = abstracts_data[has_abstract]

Number of abstracts:  20494


In [33]:
# export abstract, id and title of the English publications that have an abstract

abstracts_only[["abstract","id","title"]].to_csv("abstracts_eng.csv", index = False)

## Function for estimating storage requirements

In [13]:
def compute_storage_upper_bound(nrpubs, nrdeps, nrorgs, nrpeople):
    """Gives a rough upper bound of the storage required for a graph (GB) with the given input parameter values.

    Parameters
    ----------
    nrpubs : number of publication nodes
    nrdeps : number of department nodes
    nrorgs : number of organisation nodes
    nrpeople : number of person nodes

    Returns
    -------
    Estimated storage in GB (bytes / 10**9) for nodes, edges and indices.
    """
    # Record sizes in bytes, see
    # https://neo4j.com/developer/kb/understanding-data-on-disk/
    node_bytes = 15         # per node
    edge_bytes = 34         # per edge
    attr_bytes = 41         # per attribute
    str_attr_bytes = 128    # per string or array attribute

    # assume that the number of topics grows logarithmically with the number of publications;
    # log(0) would be -inf, so without publications there are simply no topics
    nrtopics = np.log(nrpubs) if nrpubs >= 1 else 0.0

    # for each type of node, multiply the number of nodes with the storage required for the node and its attributes
    pub_node_bytes = node_bytes + 2*attr_bytes + 4*str_attr_bytes
    small_node_bytes = node_bytes + attr_bytes + str_attr_bytes
    node_storage = (nrpubs*pub_node_bytes + nrdeps*small_node_bytes + nrorgs*small_node_bytes
                    + nrpeople*small_node_bytes + nrtopics*small_node_bytes)

    dep_people_edges = nrdeps*40 # assume max 40 professors per department on average
    org_people_edges = nrorgs*5 # assume max 5 professors per organisation on average
    pub_people_edges = nrpubs*10 # assume max 10 authors per publication on average
    pub_topic_edges = nrpubs*20 # assume max 20 topics per publication on average

    # for each type of edge, multiply the number of edges with the storage required for the edge and its attributes
    edge_storage = (dep_people_edges*edge_bytes + org_people_edges*edge_bytes
                    + pub_people_edges*edge_bytes + pub_topic_edges*(edge_bytes + str_attr_bytes))

    # storage required for indices
    # following neo4j heuristics: average property value size * (1/3)
    # we have four indices, one for each node type
    avg_prop_size = (6*attr_bytes + 9*str_attr_bytes)/15
    index_storage = avg_prop_size*(nrpubs + nrdeps + nrorgs + nrpeople)*(1/3)

    # add and return in GB
    return (node_storage + edge_storage + index_storage)/10**9

In [14]:
# example estimate: 170000 publications, 16 departments, 400 organisations, 10000 people
compute_storage_upper_bound(170000, 16, 400, 10000)

0.7197434437472171