# Extraction and insertion

This notebook contains only the **final, necessary** code for the data extraction (cleaning/filtering/merging) and the data import into the graph database Neo4j.

In [1]:
import pandas as pd
import os 
import numpy as np
import re
from zipfile import ZipFile

## Extraction

### 1. Filtering 

In [2]:
# Read every CSV in the export archive into a DataFrame keyed by its file name
# (the export is stratified by publication type, one CSV each).
# Context managers make sure the archive and each member handle are closed
# once the data has been parsed.

with ZipFile('RC export 2020-10-12.zip') as zip_file:
    files = {}
    for text_file in zip_file.infolist():
        if not text_file.filename.endswith('.csv'):
            continue
        with zip_file.open(text_file.filename) as csv_handle:
            files[text_file.filename] = pd.read_csv(csv_handle)

ed = files["educational-2020-10-12.csv"]
books = files["books-2020-10-12.csv"]
conf = files["conference-2020-10-12.csv"]
journ = files["journal-2020-10-12.csv"]
oth = files["other-2020-10-12.csv"]
pap = files["papers-2020-10-12.csv"]
pat = files["patents-2020-10-12.csv"]
pres = files["presentations-2020-10-12.csv"]

  if (await self.run_code(code, result,  async_=asy)):
  if (await self.run_code(code, result,  async_=asy)):
  if (await self.run_code(code, result,  async_=asy)):
  if (await self.run_code(code, result,  async_=asy)):
  if (await self.run_code(code, result,  async_=asy)):


In [3]:
# Restrict the corpus to book chapters, conference papers, journal papers and other papers;
# the remaining publication types are left out of the combined frame.

selected_frames = [books, conf, journ, pap]
research_data_df = pd.concat(selected_frames, ignore_index=True)
abstract_count = research_data_df["dc.description.abstract"].notna().sum()
print(research_data_df.shape)
print("There are ", research_data_df.shape[0], "documents, ",
      abstract_count, " of which have abstracts")
research_data_df.head()

(176057, 114)
There are  176057 documents,  21421  of which have abstracts


Unnamed: 0,id,collection,dc.contributor.author,dc.contributor.editor,dc.contributor.other,dc.contributor.supervisor,dc.date.accessioned,dc.date.available,dc.date.issued,dc.date.published,...,ethz.relation.isCompiledBy,ethz.relation.isDocumentedBy,ethz.relation.references,dc.language,ethz.identifier.pubmed,ethz.relation.isReviewedBy,ethz.relation.reviews,ethz.version,ethz.date.retentionend,ethz.date.retentionendDate
0,108069,,"Schützeichel, Rainer","Gerber, Andri||Kurath, Stefan",,,2017-06-11T21:44:58Z,2017-06-11T21:44:58Z,2016,,...,,,,,,,,,,
1,188444,,"Milman, Vitali||Wagner, Roy","Ball, Keith M.||Milman, Vitali",,,2017-10-23T13:08:25Z,2017-06-12T14:30:31Z||2017-08-21T11:33:24Z||20...,1999-04,,...,,,,,,,,,,
2,125569,,"Folkers, Gerd","Abel, Günter||Plümacher, Martina",,,2017-06-12T18:11:46Z,2017-06-12T18:11:46Z,2016,,...,,,,,,,,,,
3,157378,,"Mack, Rüdiger||Schleich, Wolfgang P.||Haase, D...","Arendt, Wolfgang||Schleich, Wolfgang P.",,,2017-06-14T12:24:19Z,2017-06-14T12:24:19Z,2008,,...,,,,,,,,,,
4,19843,,"Burkhard, Remo||Schmitt, Gerhard","Wang, Xiangyu||Schnabel, Marc Aurel",,,2017-06-08T23:42:55Z,2017-06-08T23:42:55Z,2008,,...,,,,,,,,,,


In [4]:
# Show the distinct column names of the combined frame.
set(research_data_df.columns)

{'collection',
 'dc.contributor',
 'dc.contributor.author',
 'dc.contributor.editor',
 'dc.contributor.other',
 'dc.contributor.supervisor',
 'dc.date.accessioned',
 'dc.date.available',
 'dc.date.issued',
 'dc.date.published',
 'dc.description.abstract',
 'dc.format',
 'dc.identifier.doi',
 'dc.identifier.isbn',
 'dc.identifier.issn',
 'dc.identifier.olduri',
 'dc.identifier.other',
 'dc.identifier.uri',
 'dc.identifier.wos',
 'dc.language',
 'dc.language.iso',
 'dc.publisher',
 'dc.relation.isnodouble',
 'dc.relation.ispartof',
 'dc.relation.isreplacedbydouble',
 'dc.rights.license',
 'dc.rights.uri',
 'dc.subject',
 'dc.title',
 'dc.title.alternative',
 'dc.type',
 'ethz.availability',
 'ethz.book.title',
 'ethz.code.ddc',
 'ethz.code.jel',
 'ethz.date.deposited',
 'ethz.date.embargoend',
 'ethz.date.retentionend',
 'ethz.date.retentionendDate',
 'ethz.doipreview',
 'ethz.ecitpid',
 'ethz.ecolpid',
 'ethz.edit.source',
 'ethz.edit.status',
 'ethz.eth',
 'ethz.event',
 'ethz.event.da

In [5]:
# Look at the certified Leitzahl string of the record with row label 0.
research_data_df.loc[0, "ethz.leitzahl.certified"]

'ETH Zürich::00002 - ETH Zürich, direkt::00012 - Lehre und Forschung, direkt::00007 - Departemente, direkt::02100 - Departement Architektur / Department of Architecture::02601 - Institut für Geschichte und Theorie der Architektur (gta) / Institute for the History and Theory of Architecture (gta)::03414 - Magnago Lampugnani, Vittorio (emeritus)'

In [6]:
# Keep only the columns needed downstream and give them short, readable names.
# The mapping below drives both the column selection and the renaming.

column_renames = {
    "dc.contributor.author": "author",
    "dc.title": "title",
    "dc.date.issued": "publication date",
    "dc.type": "publication type",
    "dc.description.abstract": "abstract",
    "dc.language.iso": "language",
    "ethz.journal.title": "journal",
}
rd_f = research_data_df[["id", *column_renames]].rename(columns=column_renames)
rd_f.head()

Unnamed: 0,id,author,title,publication date,publication type,abstract,language,journal
0,108069,"Schützeichel, Rainer",Die Perspektive des Städtebauers. Ein Blick au...,2016,Book Chapter,,de,Grundlagen
1,188444,"Milman, Vitali||Wagner, Roy",Asymptotic versions for operators and operator...,1999-04,Book Chapter,The goal of this note is to introduce new clas...,en,Mathematical Sciences Research Institute Publi...
2,125569,"Folkers, Gerd",On Re-Positioning,2016,Book Chapter,,en,Berlin Studies in Knowledge Research
3,157378,"Mack, Rüdiger||Schleich, Wolfgang P.||Haase, D...",Factorization,2008,Book Chapter,,en,
4,19843,"Burkhard, Remo||Schmitt, Gerhard",Visualising future cities in the ETH Value Lab,2008,Book Chapter,,en,


### 2. Cleaning

In [7]:
# define helper functions for cleaning

def separate_names(names):
    """Separate a string of the form name1||name2||name3||... into a list of names.

    A value that is already a list is returned unchanged, so applying the
    function a second time (e.g. when the cleaning cell is re-run) does not
    replace the names with None. Any other non-string value (such as NaN)
    yields None.
    """
    if isinstance(names, str):
        return names.split("||")
    if isinstance(names, list):
        # Already split: keep as is instead of falling through to None.
        return names
    return None

def date_to_year(date):
    """Get a date into a single format (YYYY).

    Returns the first run of four digits found in ``date`` as a string.
    Returns None when ``date`` is not a string or contains no four-digit run
    (instead of raising IndexError).
    """
    if not isinstance(date, str):
        return None
    # Raw string: "\d" in a normal string literal is an invalid escape sequence.
    match = re.search(r"\d{4}", date)
    return match.group(0) if match else None

def date_to_year_and_month(date):
    """Get a date into the format YYYY-MM.

    Returns the first ``YYYY-MM`` fragment found in ``date``. A string without
    such a fragment (e.g. a bare year) is returned unchanged. Non-string
    input yields None.
    """
    if not isinstance(date, str):
        return None
    # Raw string avoids the invalid "\d" escape; the pattern is evaluated only once.
    match = re.search(r"\d{4}-\d{2}", date)
    return match.group(0) if match else date

In [8]:
# Split the author strings into lists and normalise the publication dates,
# then swap both cleaned columns into the frame in a single step.
author_array = rd_f["author"].apply(separate_names)
date_array = rd_f["publication date"].apply(date_to_year_and_month)
rd_f = rd_f.assign(**{"author": author_array, "publication date": date_array})

rd_f.head()

Unnamed: 0,id,author,title,publication date,publication type,abstract,language,journal
0,108069,"[Schützeichel, Rainer]",Die Perspektive des Städtebauers. Ein Blick au...,2016,Book Chapter,,de,Grundlagen
1,188444,"[Milman, Vitali, Wagner, Roy]",Asymptotic versions for operators and operator...,1999-04,Book Chapter,The goal of this note is to introduce new clas...,en,Mathematical Sciences Research Institute Publi...
2,125569,"[Folkers, Gerd]",On Re-Positioning,2016,Book Chapter,,en,Berlin Studies in Knowledge Research
3,157378,"[Mack, Rüdiger, Schleich, Wolfgang P., Haase, ...",Factorization,2008,Book Chapter,,en,
4,19843,"[Burkhard, Remo, Schmitt, Gerhard]",Visualising future cities in the ETH Value Lab,2008,Book Chapter,,en,


In [9]:
# Explode the author lists so that every (publication, author) pair gets its own row.
# The shapes before and after are printed to show how many rows this adds.

rd_fe = rd_f.explode("author")
print("Shape changed from ", rd_f.shape, " to ", rd_fe.shape)
rd_fe.head()

Shape changed from  (176057, 8)  to  (867718, 8)


Unnamed: 0,id,author,title,publication date,publication type,abstract,language,journal
0,108069,"Schützeichel, Rainer",Die Perspektive des Städtebauers. Ein Blick au...,2016,Book Chapter,,de,Grundlagen
1,188444,"Milman, Vitali",Asymptotic versions for operators and operator...,1999-04,Book Chapter,The goal of this note is to introduce new clas...,en,Mathematical Sciences Research Institute Publi...
1,188444,"Wagner, Roy",Asymptotic versions for operators and operator...,1999-04,Book Chapter,The goal of this note is to introduce new clas...,en,Mathematical Sciences Research Institute Publi...
2,125569,"Folkers, Gerd",On Re-Positioning,2016,Book Chapter,,en,Berlin Studies in Knowledge Research
3,157378,"Mack, Rüdiger",Factorization,2008,Book Chapter,,en,


### 3. Merge with department and organisation data

#### 3.1 Organisation data

In [10]:
# Load the professor list spreadsheet.
pl = pd.read_excel("ETH Professor list.xlsx")

In [11]:
# Build a "family name, first name" professor column, keep only the relevant
# columns and switch to lower-case column names.

pl = (
    pl.assign(Professor=pl["Name"] + ", " + pl["First name"])
    .loc[:, ["Professor", "Org. unit code", "Organisation"]]
    .rename(columns={
        "Professor": "professor",
        "Org. unit code": "organisation unit code",
        "Organisation": "organisation",
    })
)
print(pl.shape)
pl.head()

(732, 3)


Unnamed: 0,professor,organisation unit code,organisation
0,"Abhari, Reza S.",2627,Institute of Energy Technology (former)
1,"Acciaio, Beatrice",9727,"Acciaio, Beatrice"
2,"Ackermann, Martin",3743,"Ackermann, Martin"
3,"Ackermann, Martin",2721,Inst. Biogeochem. and Pollutant Dynamics
4,"Adey, Bryan T.",2604,Inst. Construction&Infrastructure Manag.


In [12]:
# Merge the exploded research collection with the professor list, matching
# `author` (left) against `professor` (right), and report the change in row count.
# NOTE(review): how="outer" keeps unmatched rows from both sides, so professors
# without a matching author appear as rows with empty publication fields —
# confirm this is intended rather than a left join.

rd_m = rd_fe.merge(pl,how="outer",right_on="professor",left_on="author")
print("Shape of research collection changed from ", rd_fe.shape, " to ", rd_m.shape, ". Diff: ", 
      rd_m.shape[0]-rd_fe.shape[0])
rd_m.head(3)

Shape of research collection changed from  (867718, 8)  to  (877074, 11) . Diff:  9356


Unnamed: 0,id,author,title,publication date,publication type,abstract,language,journal,professor,organisation unit code,organisation
0,108069.0,"Schützeichel, Rainer",Die Perspektive des Städtebauers. Ein Blick au...,2016,Book Chapter,,de,Grundlagen,,,
1,127413.0,"Schützeichel, Rainer",Einleitung. Wegmarken einer Theorie des archit...,2016,Book Chapter,,de,,,,
2,127410.0,"Schützeichel, Rainer",Tradition as a Means of Modernisation: The Cru...,2016,Book Chapter,,en,,,,


#### 3.2 Leitzahl data

In [14]:
# Load the cost-centre to department mapping; the file is decoded as ISO-8859-1.
leitzahl_mapping = pd.read_csv("cost_centre_dept_mapping.csv", encoding = "ISO-8859-1")
leitzahl_mapping.head()

Unnamed: 0,LTZL,NAME,TYPNEU,TYPNAME,DEPT,"TO_CHAR(T.VONDAT,'DD/MM/YYYY')","TO_CHAR(T.BISDAT,'DD/MM/YYYY')"
0,T1057,T-SR Architektur,98,Totalisierungs-Leitzahlen,ARCH,01/01/2006,31/12/9999
1,T1058,Total NDS/NDK ARCH,98,Totalisierungs-Leitzahlen,ARCH,01/01/2000,31/12/2005
2,T1059,T-Lehre ARCH,98,Totalisierungs-Leitzahlen,ARCH,01/01/2000,31/12/9999
3,T1104,T-SR Bauing.wiss.,98,Totalisierungs-Leitzahlen,BAUG,01/01/2006,31/12/9999
4,T1105,T-SR Umwelting.,98,Totalisierungs-Leitzahlen,BAUG,01/01/2006,31/12/9999


In [15]:
# Show the distinct NAME values to see how names are written in the mapping.
set(leitzahl_mapping["NAME"])

{'Tönnesmann (ehem.)',
 'Signorell, Ruth',
 'T-Entomologisches I.',
 'T-Molekularbiologie',
 'Meier, Beat H.',
 'Hugi, H. R.',
 'Kissling, E. (Tit.)',
 'Nievergelt, J. (em.)',
 'Schär, Christoph',
 'Meyer, M. R. (ehem.)',
 'Experimente',
 'T-SR Umweltnat.wiss.',
 'T-Informatik',
 'Bugmann, H.',
 'Domeisen, D. (SNF)',
 'Magnago Lampug.(em.)',
 'Baumgartner,R.(Tit.)',
 'Kovalenko, Maksym',
 'Jessberger (ehem.)',
 'IVUK-Lehre',
 'Meboldt, Mirko',
 'T-CTM',
 'Sander, Karin',
 'Abell, D. F. (em.)',
 'Baustatik u. Konst',
 'Total Lehre ANBI',
 'T-Organische Chemie',
 'Total Schwab, M.',
 'T-Betriebswissensch.',
 'Stemmler, H. (em.)',
 'Gutknecht, J. (em.)',
 'Total DS AGRL',
 'Vasella, A. T. (em.)',
 'MSc Interdisz.Nat.w.',
 'Maeder, R, E.',
 'Total Krek, W.',
 'T-Stark, W.',
 'DZ Informatik',
 'Hoffmann, Volker',
 'T-Lehre ARCH',
 'Douglas (em.)',
 'Alonso, Gustavo',
 'Carminati, A (vor A)',
 'Pratsinis, Sotiris',
 'T-Agrarwiss.',
 'Würgler, F. E. (em.)',
 'Total NDS/NDK ITET',
 'Camartin, 

In [51]:
# note: we have no department-specific code/leitzahl
# Take an explicit copy so that later edits to `ln` act on an independent frame
# instead of a slice of `leitzahl_mapping` (avoids pandas' SettingWithCopyWarning).
ln = leitzahl_mapping[["NAME","DEPT"]].copy()

In [52]:
# First attempt: left-join the department mapping as is,
# matching `author` against the mapping's NAME column.

rc_temp = rd_m.merge(ln, how = "left", left_on = "author", right_on = "NAME")
rc_temp.head()

Unnamed: 0,id,author,title,publication date,publication type,abstract,language,journal,professor,organisation unit code,organisation,NAME,DEPT
0,108069.0,"Schützeichel, Rainer",Die Perspektive des Städtebauers. Ein Blick au...,2016,Book Chapter,,de,Grundlagen,,,,,
1,127413.0,"Schützeichel, Rainer",Einleitung. Wegmarken einer Theorie des archit...,2016,Book Chapter,,de,,,,,,
2,127410.0,"Schützeichel, Rainer",Tradition as a Means of Modernisation: The Cru...,2016,Book Chapter,,en,,,,,,
3,28599.0,"Schützeichel, Rainer",Architettura contestuale. Difesa di una proget...,2010,Book Chapter,,it,,,,,,
4,319574.0,"Schützeichel, Rainer",Die Stadt und das Haus,2010,Book Chapter,,en||de,,,,,,


In [53]:
# Coverage check: count the rows that have both an author and a department.

has_author_and_dept = rc_temp["author"].notna() & rc_temp["DEPT"].notna()
print("Number of entries with complete author and department: ",
      has_author_and_dept.sum())

# better than research areas data!

Number of entries with complete author and department:  68999


In [54]:
# now try with deleting parentheses after the names

def delete_parenthesis(name):
    """Drop a parenthesised suffix such as ``(em.)`` or ``(ehem.)`` from a name.

    Everything from the first parenthesised group onwards is removed, together
    with any whitespace directly in front of it. The group may follow the name
    without a space (``'Magnago Lampug.(em.)'``) and may itself contain spaces
    (``'Carminati, A (vor A)'``). A string without parentheses is returned
    unchanged; non-string input yields None.
    """
    if isinstance(name, str):
        # Raw string: "\s" and "\(" are invalid escapes in a normal string literal.
        return re.split(r'\s*\([^()]*\)', name, maxsplit=1)[0]
    return None

# Spot-check the helper on a few sample names.
for sample_name in ('Schlunegger (ehem.)', 'Fontana, M. (em.)',
                    'Smith, Roy (Tit.)', 'Baccini, Peter (em.)'):
    print(delete_parenthesis(sample_name))

Schlunegger
Fontana, M.
Smith, Roy
Baccini, Peter


In [55]:
# Strip the parenthesised suffixes from the leitzahl names, merge and check coverage.
# `assign` builds a new frame rather than writing into `ln` in place, which avoids
# the SettingWithCopyWarning pandas raises when `ln` is a slice of another DataFrame.

ln = ln.assign(NAME=ln["NAME"].apply(delete_parenthesis))
rd_m2 = rd_m.merge(ln, how = "left", left_on = "author", right_on = "NAME")
print("Number of entries with complete author and department: ",
      (rd_m2["author"].notna() & rd_m2["DEPT"].notna()).sum())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Number of entries with complete author and department:  68999


#### (Research areas data)

In [20]:
# Load the research areas spreadsheet and take a first look at its size and rows.
ar = pd.read_excel("Research areas from researcher profile page.xls")
print(ar.shape)
ar.head()

(564, 7)


Unnamed: 0,ANREDE,FAMNAME,VORNAME,DEPARTEMENT_NAME,DEPARTEMENT_LEITZAHL,FORSCHUNGSGEBIET_E,HOMEPAGE
0,Herr,Avermaete,Tom,Architektur,2100,,
1,Herr,Block,Philippe Camille Vincent,Architektur,2100,<p>Philippe Block is a structural engineer and...,http://block.arch.ethz.ch
2,Herr,Brandlhuber,Arno Hans,Architektur,2100,,
3,Herr,Caminada,Gion Antoni,Architektur,2100,,
4,Herr,Caruso,Adam,Architektur,2100,,


In [21]:
# Recode the salutation column: "Herr" becomes "M" and "Frau" becomes "F".
for salutation, gender_code in (("Herr", "M"), ("Frau", "F")):
    ar.loc[ar["ANREDE"] == salutation, "ANREDE"] = gender_code
# Build a single "family name, first name" column.
ar["name"] = ar["FAMNAME"] + ", " + ar["VORNAME"]

In [22]:
# Should we translate the department names? 
# (Check how they're represented in other files)

In [23]:
# Project onto the columns of interest.
# Open question: should "FORSCHUNGSGEBIET_E" be kept? 396 of its values are NaN (circa 70%);
# the same goes for the homepage column (between 60% and 70% NaN).
ar_p = ar[["name","ANREDE","DEPARTEMENT_NAME","DEPARTEMENT_LEITZAHL"]]

In [24]:
# Rename the columns to English names; the new names are assigned by position.
ar_p.columns = ["name","gender","department","department leitzahl"]
ar_p.head(3)

Unnamed: 0,name,gender,department,department leitzahl
0,"Avermaete, Tom",M,Architektur,2100
1,"Block, Philippe Camille Vincent",M,Architektur,2100
2,"Brandlhuber, Arno Hans",M,Architektur,2100


In [25]:
# finally merging with research collection

#rd_m2 = rd_m.merge(ar_p,how="outer",right_on="name",left_on="author")
#print("Shape of research collection changed from ", rd_m.shape, " to ", rd_m2.shape, ". Diff: ", 
#      rd_m2.shape[0]-rd_m.shape[0])
#rd_m2.head(3)

Shape of research collection changed from  (877074, 11)  to  (877291, 15) . Diff:  217


Unnamed: 0,id,author,title,publication date,publication type,abstract,language,journal,professor,organisation unit code,organisation,name,gender,department,department leitzahl
0,108069.0,"Schützeichel, Rainer",Die Perspektive des Städtebauers. Ein Blick au...,2016,Book Chapter,,de,Grundlagen,,,,,,,
1,127413.0,"Schützeichel, Rainer",Einleitung. Wegmarken einer Theorie des archit...,2016,Book Chapter,,de,,,,,,,,
2,127410.0,"Schützeichel, Rainer",Tradition as a Means of Modernisation: The Cru...,2016,Book Chapter,,en,,,,,,,,


In [29]:
# check department + author coverage

#print("Number of entries with complete author and department: ", 
#      sum((rd_m2["author"].notna() & rd_m2["department"].notna())))

Number of entries with complete author and department:  49086


### 4. Final touches

In [18]:
# add professor indicator
# problem: only professors that can be matched to the organisations data will be indicated as professors
#rd_m2["is_professor"] = rd_m2["author"] == rd_m2["professor"]
#
#rd_m2.head()

In [57]:
# Remove the helper join columns (professor and NAME), then switch to
# underscore-style column names for ease of neo4j import.

neo4j_column_names = {
    "publication date": "publication_date",
    "publication type": "publication_type",
    "organisation unit code": "organisation_unit_code",
    "DEPT": "department",
}
rd_final = (
    rd_m2
    .drop(columns=["professor", "NAME"])
    .rename(columns=neo4j_column_names)
)
rd_final.head()

Unnamed: 0,id,author,title,publication_date,publication_type,abstract,language,journal,organisation_unit_code,organisation,department
0,108069.0,"Schützeichel, Rainer",Die Perspektive des Städtebauers. Ein Blick au...,2016,Book Chapter,,de,Grundlagen,,,
1,127413.0,"Schützeichel, Rainer",Einleitung. Wegmarken einer Theorie des archit...,2016,Book Chapter,,de,,,,
2,127410.0,"Schützeichel, Rainer",Tradition as a Means of Modernisation: The Cru...,2016,Book Chapter,,en,,,,
3,28599.0,"Schützeichel, Rainer",Architettura contestuale. Difesa di una proget...,2010,Book Chapter,,it,,,,
4,319574.0,"Schützeichel, Rainer",Die Stadt und das Haus,2010,Book Chapter,,en||de,,,,


In [28]:
# issue: quotes need to be escaped in neo4j
# the following function helps us avoid errors when importing

def add_quote(abstract):
    """Escape double quotes in an abstract by doubling them.

    Both a plain double quote and a double quote preceded by a backslash end
    up as exactly two double quotes. Non-string input yields None.
    """
    if not isinstance(abstract, str):
        return None
    # Drop the backslash first so the escaped form is doubled only once; the
    # previous two-pass replacement turned a backslash-quote into four quotes.
    # NOTE(review): DataFrame.to_csv doubles quotes itself when writing —
    # confirm this pre-escaping is still required for the import.
    return abstract.replace('\\"', '"').replace('"', '""')

# Escape the quotes in every abstract.
# NOTE(review): this overwrites the column in place, so running the cell again
# escapes the already-escaped text a second time.
rd_final["abstract"] = rd_final["abstract"].apply(add_quote)

## Import/export :):

In [29]:
# Write the final table to CSV without the row index; the load commands below read this file.
rd_final.to_csv("graph_data_final.csv", index=False)
#rc_m.to_json("metadata_final.json", orient = "records") 

### Neo4j import details

The following nodes will be created: 
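- **person** [name, gender] (NOTE: `gender` only exists in the research areas data; the exported `graph_data_final.csv` has no such column)
- **publication** [id, title, date, type, abstract, journal]
- **organisation** [name, code]
- **department** [name, code] (NOTE: the leitzahl mapping data provides no department code, only the name in the `department` column, so the queries below that use `line.department_leitzahl` need to be adapted)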


--- 


**The commands**

    
To load the csv you first have to <u>copy it into your Neo4j base directory</u>. More info [here](https://neo4j.com/docs/cypher-manual/current/clauses/load-csv/#load-csv-import-data-from-a-csv-file). I did the following: 

```cp .\graph_data_final.csv C:/Users/Giulia/.Neo4jDesktop/neo4jDatabases/database-befe90d3-7991-457e-9671-62c55c830654/installation-3.5.12/import```

<u>Constraints first</u>

The constraints are here to make sure we don't create duplicate nodes.

    CREATE CONSTRAINT ON (c:Person) ASSERT c.name IS UNIQUE;
    CREATE CONSTRAINT ON (c:Organisation) ASSERT c.code IS UNIQUE;
    CREATE CONSTRAINT ON (c:Publication) ASSERT c.title IS UNIQUE;
    CREATE CONSTRAINT ON (c:Department) ASSERT c.code IS UNIQUE;
    CREATE INDEX ON :Publication(id)

 
Now we'll <u>load the data</u> in a very lightweight manner: 

1) person nodes <br>
```
    LOAD CSV WITH HEADERS FROM "file:///graph_data_final.csv" AS line
    WITH line WHERE line.author IS NOT NULL
    MERGE (person:Person {name: line.author})
    SET person.gender=line.gender;
```
        > Added 267877 labels, created 267877 nodes, set 316963 properties, completed after 11841 ms.

2) publication nodes (this might take a while) <br>
       
        LOAD CSV WITH HEADERS FROM "file:///graph_data_final.csv" AS line
        WITH line where line.id IS NOT NULL
        MERGE (publication: Publication {title: line.title})
        SET publication.id=line.id,             
            publication.type=line.publication_type, 
            publication.date=line.publication_date,
            publication.abstract=line.abstract,
            publication.journal=line.journal;

        > Added 170284 labels, created 170284 nodes, set 3704438 properties, completed after 13718 ms.
        
3) organisation nodes <br> 
    
        LOAD CSV WITH HEADERS FROM "file:///graph_data_final.csv" AS line
        WITH line where line.organisation_unit_code IS NOT NULL
        MERGE (organisation:Organisation {code:line.organisation_unit_code})
        SET organisation.name=line.organisation;
                                          
        > Added 383 labels, created 383 nodes, set 80749(****) properties, completed after 5205 ms.
            
4) department nodes <br> 
    
        LOAD CSV WITH HEADERS FROM "file:///graph_data_final.csv" AS line
        WITH line where line.department_leitzahl IS NOT NULL
        MERGE (department:Department {code:line.department_leitzahl})
        SET department.name=line.department;
                                          
        > Added 16 labels, created 16 nodes, set 49319 properties, completed after 5271 ms.                       
            
        
5) finally all the edges <br> 
        
        LOAD CSV WITH HEADERS FROM "file:///graph_data_final.csv" AS line
        MATCH (person:Person {name:line.author}), 
               (publication:Publication {id:line.id})
        MERGE (person)-[:PUBLISHED]->(publication)
        
        > 
        
        LOAD CSV WITH HEADERS FROM "file:///graph_data_final.csv" AS line
        MATCH (person:Person {name:line.author}),
               (organisation:Organisation {code:line.organisation_unit_code})
        MERGE (person)-[:BELONGS_TO]->(organisation)
        
        > Created 664 relationships, completed after 6556 ms.
        
        LOAD CSV WITH HEADERS FROM "file:///graph_data_final.csv" AS line
        MATCH (person:Person {name:line.author}),
               (department:Department {code:line.department_leitzahl})
        MERGE (person)-[:WORKS_IN]->(department)
        
        > 
        
Note: in case you did something wrong and you want to erase the network here's the query: 

        MATCH (n)
        DETACH DELETE n;

    


## Export for topic modeling

In [30]:
# Select only English publications: rows whose language field is exactly "en"
# (multi-valued entries such as "en||de" do not pass this equality test).

rd_full_en = rd_m2[rd_m2["language"] == "en"]

In [31]:
# Keep one row per publication id (the first occurrence) and print the resulting shape.

abstracts_data = rd_full_en.drop_duplicates(subset = ["id"], keep = "first")
print(abstracts_data.shape)

(158522, 15)


In [32]:
# Keep only the entries that have an abstract, reporting how many there are.

has_abstract = abstracts_data["abstract"].notna()
print("Number of abstracts: ", has_abstract.sum())
abstracts_only = abstracts_data[has_abstract]

Number of abstracts:  20494


In [33]:
# Export only the abstract, id and title columns (without the row index).

abstracts_only[["abstract","id","title"]].to_csv("abstracts_eng.csv", index = False)

In [16]:
def compute_upper_bytes(n):
    """Evaluate (138*n + 225*n**2) / 10**9 for the given ``n``.

    NOTE(review): presumably a size estimate scaled down by 1e9; the origin of
    the coefficients is not documented here — confirm.
    """
    linear_term = 138 * n
    quadratic_term = 225 * n ** 2
    return (linear_term + quadratic_term) / 10 ** 9

In [17]:
# Evaluate the estimate for n = 100000.
compute_upper_bytes(100000)

2250.0138

In [20]:
# NOTE(review): ad-hoc arithmetic; the meaning of the constants is not documented
# in this notebook — confirm what is being estimated before relying on it.
(438560*(15+3*128)+844453*(34+128))*1.33/10**9

0.41467647858000006