# Extraction and insertion

In this notebook there will be only the **final, necessary** code for the data extraction (cleaning/filtering/merging) and data import.

In [2]:
import pandas as pd
import os 
import numpy as np
import re

## Extraction

In [3]:
rcp = pd.read_csv("ResearchCollectionPublications2008_2018.tsv", sep="\t", header=0, encoding="latin-1") #utf8 not working...

  interactivity=interactivity, compiler=compiler, result=result)


### 1. Filtering 

In [4]:
# 1. type of publication only limited to: Journal Article, Conference Paper, Other Conference Item, Book Chapter
rcp_tf = rcp.loc[(rcp["DC_TYPE"] == "Journal Article")|
                (rcp["DC_TYPE"] == "Conference Paper")|
                (rcp["DC_TYPE"] =="Other Conference Item")|
                (rcp["DC_TYPE"] =="Book Chapter"),:]

In [5]:
rcp_tf.columns

Index(['RC_ID', 'COLLECTION', 'DC_CONTRIBUTOR_AUTHOR', 'DC_CONTRIBUTOR_EDITOR',
       'DC_DATE_ISSUED', 'DC_DATE_PUBLISHED', 'DC_IDENTIFIER_DOI',
       'DC_IDENTIFIER_ISBN', 'DC_IDENTIFIER_ISSN', 'DC_IDENTIFIER_OTHER',
       'DC_IDENTIFIER_URI', 'DC_TITLE', 'DC_TITLE_ALTERNATIVE', 'DC_TYPE',
       'ETHZ_AVAILABILITY', 'ETHZ_BOOK_TITLE', 'ETHZ_ETH', 'ETHZ_EVENT',
       'ETHZ_EVENT_DATE', 'ETHZ_EVENT_LOCATION', 'ETHZ_IDENTIFIER_SCOPUS',
       'ETHZ_IDENTIFIER_URL', 'ETHZ_IDENTIFIER_WOS',
       'ETHZ_JOURNAL_ABBREVIATED', 'ETHZ_JOURNAL_ISSUE', 'ETHZ_JOURNAL_TITLE',
       'ETHZ_JOURNAL_VOLUME', 'ETHZ_LEITZAHLIDENTIFIERS_CERT',
       'ETHZ_PAGES_END', 'ETHZ_PAGES_START', 'ETHZ_PUBLICATION_PLACE',
       'ETHZ_TITLE_SUBTITLE', 'ETHZ_GRANT', 'ETHZ_GRANT_FUNDERDOI',
       'ETHZ_GRANT_FUNDERNAME', 'ETHZ_GRANT_PROGRAM', 'ETHZ_IDENTIFIER_DISS',
       'ETHZ_IDENTIFIER_PUBMED', 'DC_IDENTIFIER_WOS',
       'ETHZ_IDENTIFIER_ORCIDWORKCODE', 'DC_CONTRIBUTOR_OTHER',
       'ETHZ_IDENTIFIER_AR

In [6]:
#2. projecting into [id, author, date, title, type, journal title]
rcp_ff = rcp_tf[["RC_ID","DC_CONTRIBUTOR_AUTHOR","DC_DATE_ISSUED","DC_TITLE","DC_TYPE","ETHZ_JOURNAL_TITLE"]]

In [7]:
rcp_ff.head()

Unnamed: 0,RC_ID,DC_CONTRIBUTOR_AUTHOR,DC_DATE_ISSUED,DC_TITLE,DC_TYPE,ETHZ_JOURNAL_TITLE
97,3379,"Gonzalez-Nicolini, Valeria||Fussenegger, Martin",2008,Adenovirus-mediated transduction of auto- and ...,Book Chapter,Methods in Molecular Biology
98,15103,"Vorrath, Judith",2008,From refugee to reintegration crisis?,Book Chapter,L'Afrique des grands lacs
99,158533,"Burri, Regula Valérie||Dumit, Joseph",2008,Social Studies of Scientific Imaging and Visua...,Book Chapter,
100,14698,"Glaser, Marie A.",2008,Die Baustelle,Book Chapter,
101,8255,"Knubel, Denis||Greenwood, Greg||Wiegandt, Ellen",2008,Research and development in mountain glaciers,Book Chapter,


In [8]:
#3. changing column names ro more readable ones
rcp_ff = rcp_ff.rename({"RC_ID": "id", "DC_CONTRIBUTOR_AUTHOR": "author", "DC_DATE_ISSUED": "publication date",
                       "DC_TITLE": "title", "DC_TYPE": "publication type", "ETHZ_JOURNAL_TITLE": "journal"}, 
              axis = 1)
rcp_ff.head()


Unnamed: 0,id,author,publication date,title,publication type,journal
97,3379,"Gonzalez-Nicolini, Valeria||Fussenegger, Martin",2008,Adenovirus-mediated transduction of auto- and ...,Book Chapter,Methods in Molecular Biology
98,15103,"Vorrath, Judith",2008,From refugee to reintegration crisis?,Book Chapter,L'Afrique des grands lacs
99,158533,"Burri, Regula Valérie||Dumit, Joseph",2008,Social Studies of Scientific Imaging and Visua...,Book Chapter,
100,14698,"Glaser, Marie A.",2008,Die Baustelle,Book Chapter,
101,8255,"Knubel, Denis||Greenwood, Greg||Wiegandt, Ellen",2008,Research and development in mountain glaciers,Book Chapter,


### 2. Cleaning

In [9]:
# 1. Cleaning the names 

In [10]:
def separate_names(names):
    """ Separes a string of names of the form name1||name2||name3||... into a list of names."""
    # Also: does pandas DataFrame support list of strings as fields? 
    # Andreas: Yes
    if isinstance(names, str):
        return names.split("||")

In [11]:
author_array = rcp_ff["author"].apply(separate_names)
rcp_ff["author"] = author_array
print("Check that we don't have more missing values: ", author_array.shape[0] - np.count_nonzero(author_array))
rcp_ff.head()

Check that we don't have more missing values:  8


Unnamed: 0,id,author,publication date,title,publication type,journal
97,3379,"[Gonzalez-Nicolini, Valeria, Fussenegger, Martin]",2008,Adenovirus-mediated transduction of auto- and ...,Book Chapter,Methods in Molecular Biology
98,15103,"[Vorrath, Judith]",2008,From refugee to reintegration crisis?,Book Chapter,L'Afrique des grands lacs
99,158533,"[Burri, Regula Valérie, Dumit, Joseph]",2008,Social Studies of Scientific Imaging and Visua...,Book Chapter,
100,14698,"[Glaser, Marie A.]",2008,Die Baustelle,Book Chapter,
101,8255,"[Knubel, Denis, Greenwood, Greg, Wiegandt, Ellen]",2008,Research and development in mountain glaciers,Book Chapter,


In [12]:
rcp_ff_e = rcp_ff.explode("author")
print(rcp_ff.shape, rcp_ff_e.shape)

(98367, 6) (521431, 6)


In [13]:
rcp_ff_e.head()

Unnamed: 0,id,author,publication date,title,publication type,journal
97,3379,"Gonzalez-Nicolini, Valeria",2008,Adenovirus-mediated transduction of auto- and ...,Book Chapter,Methods in Molecular Biology
97,3379,"Fussenegger, Martin",2008,Adenovirus-mediated transduction of auto- and ...,Book Chapter,Methods in Molecular Biology
98,15103,"Vorrath, Judith",2008,From refugee to reintegration crisis?,Book Chapter,L'Afrique des grands lacs
99,158533,"Burri, Regula Valérie",2008,Social Studies of Scientific Imaging and Visua...,Book Chapter,
99,158533,"Dumit, Joseph",2008,Social Studies of Scientific Imaging and Visua...,Book Chapter,


In [14]:
# 2. Cleaning the dates (same formatting)

In [15]:
def date_to_year(date):
    "Get the dates into a single format (YYYY)"
    if isinstance(date, str):
        return re.findall("[\d]{4}",date)[0]

def date_to_year_and_month(date):
    "Get dates into format YYYY-MM"
    # if needed later
    if isinstance(date, str):
        if len(re.findall("[\d]{4}[-][\d]{2}",date)) > 0:
            return re.findall("[\d]{4}[-][\d]{2}",date)[0]
        else:
            return str(date)

In [16]:
print("Missing values before: ", rcp_ff_e["publication date"].shape[0] - np.count_nonzero(rcp_ff_e["publication date"]))
date_array = rcp_ff_e["publication date"].apply(date_to_year)
rcp_ff_e["publication date"] = date_array
print("Missing values after: ", author_array.shape[0] - np.count_nonzero(author_array))
rcp_ff_e.head()

Missing values before:  0
Missing values after:  8


Unnamed: 0,id,author,publication date,title,publication type,journal
97,3379,"Gonzalez-Nicolini, Valeria",2008,Adenovirus-mediated transduction of auto- and ...,Book Chapter,Methods in Molecular Biology
97,3379,"Fussenegger, Martin",2008,Adenovirus-mediated transduction of auto- and ...,Book Chapter,Methods in Molecular Biology
98,15103,"Vorrath, Judith",2008,From refugee to reintegration crisis?,Book Chapter,L'Afrique des grands lacs
99,158533,"Burri, Regula Valérie",2008,Social Studies of Scientific Imaging and Visua...,Book Chapter,
99,158533,"Dumit, Joseph",2008,Social Studies of Scientific Imaging and Visua...,Book Chapter,


### 3. Merging

#### 3.1 Organisations data

In [17]:
pl = pd.read_excel("ETH Professor list.xlsx")

In [18]:
# create professor column and keep only the relevant columns
pl["Professor"] = pl["Name"] + ", " + pl["First name"]
pl = pl[["Professor", "Org. unit code", "Organisation"]]
pl.head()

Unnamed: 0,Professor,Org. unit code,Organisation
0,"Abhari, Reza S.",2627,Institute of Energy Technology (former)
1,"Acciaio, Beatrice",9727,"Acciaio, Beatrice"
2,"Ackermann, Martin",3743,"Ackermann, Martin"
3,"Ackermann, Martin",2721,Inst. Biogeochem. and Pollutant Dynamics
4,"Adey, Bryan T.",2604,Inst. Construction&Infrastructure Manag.


In [19]:
rc_m = pl.merge(rcp_ff_e,how="outer",left_on="Professor",right_on="author")
rc_m.head(3)

Unnamed: 0,Professor,Org. unit code,Organisation,id,author,publication date,title,publication type,journal
0,"Abhari, Reza S.",2627.0,Institute of Energy Technology (former),253873.0,"Abhari, Reza S.",2008,The Dynamics of the Vorticity Field in a Low S...,Other Conference Item,
1,"Abhari, Reza S.",2627.0,Institute of Energy Technology (former),13706.0,"Abhari, Reza S.",2008,Aerothermal Performance of Streamwise and Comp...,Conference Paper,
2,"Abhari, Reza S.",2627.0,Institute of Energy Technology (former),16637.0,"Abhari, Reza S.",2008,Unsteady CFD Investigation on Inlet Distortion...,Conference Paper,Proceedings of the ASME Turbo Expo


In [20]:
rc_m.shape

(527256, 9)

In [21]:
rcp_ff_e.shape

(521431, 6)

#### 3.2 Research areas data

In [22]:
ar = pd.read_excel("areas.xls")

In [23]:
ar.head(3)

Unnamed: 0,ANREDE,FAMNAME,VORNAME,DEPARTEMENT_NAME,DEPARTEMENT_LEITZAHL,FORSCHUNGSGEBIET_E,HOMEPAGE
0,Herr,Avermaete,Tom,Architektur,2100,,
1,Herr,Block,Philippe Camille Vincent,Architektur,2100,<p>Philippe Block is a structural engineer and...,http://block.arch.ethz.ch
2,Herr,Brandlhuber,Arno Hans,Architektur,2100,,


In [24]:
ar.shape
# Only 564 researchers ... 
# Hypothesis of work: select the 'most complete' subset

(564, 7)

In [25]:
# small adjustment 
ar.loc[ar["ANREDE"]=="Herr",["ANREDE"]] = "M"
ar.loc[ar["ANREDE"]=="Frau",["ANREDE"]] = "F"

In [26]:
# producing single name column 
ar["Name"] = ar["FAMNAME"] + ", " + ar["VORNAME"]

In [27]:
# Should we translate the department names? 
# (Check how they're represented in other files)

In [28]:
# projecting into interesting columns 
# Should we keep the "FORSCHUNGSGEBIET_E" column? 396 values are NaN (circa 70%)
# same goes for the Homepage (btw 60-70% NaN)
ar_p = ar[["ANREDE","DEPARTEMENT_NAME","DEPARTEMENT_LEITZAHL","Name"]]

In [29]:
# Renaming the columns 
ar_p.columns = ["gender","department_name","department_code","name"]
ar_p.head(3)

Unnamed: 0,gender,department_name,department_code,name
0,M,Architektur,2100,"Avermaete, Tom"
1,M,Architektur,2100,"Block, Philippe Camille Vincent"
2,M,Architektur,2100,"Brandlhuber, Arno Hans"


In [30]:
# finally merging
rc_m = ar_p.merge(rc_m,how="outer",left_on="name",right_on="author")
rc_m.head(3)

Unnamed: 0,gender,department_name,department_code,name,Professor,Org. unit code,Organisation,id,author,publication date,title,publication type,journal
0,M,Architektur,2100.0,"Avermaete, Tom","Avermaete, Tom",9643.0,"Avermaete, Tom",284605.0,"Avermaete, Tom",2017,"Death of the Author, Center and Meta-Theory: E...",Book Chapter,
1,M,Architektur,2100.0,"Avermaete, Tom","Avermaete, Tom",9643.0,"Avermaete, Tom",288639.0,"Avermaete, Tom",2018,Balcony,Book Chapter,
2,M,Architektur,2100.0,"Avermaete, Tom","Avermaete, Tom",9643.0,"Avermaete, Tom",284909.0,"Avermaete, Tom",2018,The View from the Grid,Book Chapter,


In [31]:
rc_m.shape
# we gained about 250 researchers from the outer join ... 

(527498, 13)

In [32]:
rc_m = rc_m.drop(['id'], axis = 1)

In [33]:
# TODO: add pre-processing step where we remove duplicates of author/title
# ex Ursula Keller has 8 entries for the same publication

#### 3.3 Merging Abstracts


In [34]:
papers = pd.read_csv("research-data-2020-10-12.csv")

In [35]:
papers.shape

(767, 111)

In [36]:
set(papers.columns)

{'collection',
 'dc.contributor.author',
 'dc.contributor.contactPerson',
 'dc.contributor.dataCollector',
 'dc.contributor.other',
 'dc.contributor.producer',
 'dc.contributor.projectLeader',
 'dc.contributor.projectManager',
 'dc.contributor.projectMember',
 'dc.contributor.relatedPerson',
 'dc.contributor.researchGroup',
 'dc.contributor.researcher',
 'dc.contributor.rightsHolder',
 'dc.date.accessioned',
 'dc.date.available',
 'dc.date.created',
 'dc.date.issued',
 'dc.date.published',
 'dc.date.updated',
 'dc.description.abstract',
 'dc.format',
 'dc.identifier.doi',
 'dc.identifier.issn',
 'dc.identifier.olduri',
 'dc.identifier.other',
 'dc.identifier.uri',
 'dc.language.iso',
 'dc.publisher',
 'dc.relation.isnodouble',
 'dc.rights.license',
 'dc.rights.uri',
 'dc.subject',
 'dc.title',
 'dc.title.alternative',
 'dc.type',
 'ethz.availability',
 'ethz.code.ddc',
 'ethz.code.jel',
 'ethz.date.collected',
 'ethz.date.deposited',
 'ethz.date.embargoend',
 'ethz.date.retentionend',


In [37]:
abstracts = pd.DataFrame(papers["dc.description.abstract"])

In [38]:
abstracts.reset_index(level=0, inplace=True)

In [39]:
abstracts = abstracts.drop("index", axis=1)
abstracts.columns = ["text"]
abstracts.head(3)

Unnamed: 0,text
0,Due to their large dynamical mass-to-light rat...
1,Outflows are an important part of the star for...
2,R Aqr is a symbiotic binary system consisting ...


In [40]:
ed = pd.read_csv("educational-2020-10-12.csv")
books = pd.read_csv("books-2020-10-12.csv")
conf = pd.read_csv("conference-2020-10-12.csv")
journ = pd.read_csv("journal-2020-10-12.csv")
oth = pd.read_csv("other-2020-10-12.csv")
pap = pd.read_csv("papers-2020-10-12.csv")
pat = pd.read_csv("patents-2020-10-12.csv")
pres = pd.read_csv("presentations-2020-10-12.csv")

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


In [41]:
research_data = [ed,books,conf,journ,oth,pap,pat,pres]

In [42]:
research_data_df = pd.concat([books, conf, journ, pap], ignore_index=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [43]:
research_data_df.shape

(176057, 114)

In [44]:
research_data_df[research_data_df['id'].notna()].shape

(176057, 114)

In [45]:
author_array = research_data_df.loc[:,"dc.contributor.author"].apply(separate_names)
print("Check that we don't have more missing values: ", author_array.shape[0] - np.count_nonzero(author_array))
editor_array = research_data_df.loc[:,"dc.contributor.editor"].apply(separate_names)
print("Check that we don't have more missing values: ", editor_array.shape[0] - np.count_nonzero(editor_array))
research_data_df["dc.contributor.author"] = author_array
research_data_df["dc.contributor.editor"] = editor_array
research_data_df.head()

Check that we don't have more missing values:  1437
Check that we don't have more missing values:  154694


Unnamed: 0,collection,dc.contributor,dc.contributor.author,dc.contributor.editor,dc.contributor.other,dc.contributor.supervisor,dc.date.accessioned,dc.date.available,dc.date.issued,dc.date.published,...,ethz.rosetta.lastUpdated,ethz.rosetta.versionExported,ethz.size,ethz.source,ethz.tag,ethz.title.subtitle,ethz.version,ethz.version.deposit,ethz.version.edition,id
0,,,"[Schützeichel, Rainer]","[Gerber, Andri, Kurath, Stefan]",,,2017-06-11T21:44:58Z,2017-06-11T21:44:58Z,2016,,...,2018-11-02T20:54:25Z,True,,ECIT,,,,,,108069
1,,,"[Milman, Vitali, Wagner, Roy]","[Ball, Keith M., Milman, Vitali]",,,2017-10-23T13:08:25Z,2017-06-12T14:30:31Z||2017-08-21T11:33:24Z||20...,1999-04,,...,2020-02-15T08:15:29Z,True,,ECIT,,,,publishedVersion,,188444
2,,,"[Folkers, Gerd]","[Abel, Günter, Plümacher, Martina]",,,2017-06-12T18:11:46Z,2017-06-12T18:11:46Z,2016,,...,2018-12-02T06:58:19Z,True,,ECIT,,,,,,125569
3,,,"[Mack, Rüdiger, Schleich, Wolfgang P., Haase, ...","[Arendt, Wolfgang, Schleich, Wolfgang P.]",,,2017-06-14T12:24:19Z,2017-06-14T12:24:19Z,2008,,...,2017-07-13T00:14:23Z,True,,ECIT,,,,,,157378
4,,,"[Burkhard, Remo, Schmitt, Gerhard]","[Wang, Xiangyu, Schnabel, Marc Aurel]",,,2017-06-08T23:42:55Z,2017-06-08T23:42:55Z,2008,,...,2018-10-01T08:32:38Z,True,,ECIT,,New methods for education and learning,,,,19843


In [46]:
research_data_e = research_data_df.explode("dc.contributor.author")

In [47]:
research_data_e.head()

Unnamed: 0,collection,dc.contributor,dc.contributor.author,dc.contributor.editor,dc.contributor.other,dc.contributor.supervisor,dc.date.accessioned,dc.date.available,dc.date.issued,dc.date.published,...,ethz.rosetta.lastUpdated,ethz.rosetta.versionExported,ethz.size,ethz.source,ethz.tag,ethz.title.subtitle,ethz.version,ethz.version.deposit,ethz.version.edition,id
0,,,"Schützeichel, Rainer","[Gerber, Andri, Kurath, Stefan]",,,2017-06-11T21:44:58Z,2017-06-11T21:44:58Z,2016,,...,2018-11-02T20:54:25Z,True,,ECIT,,,,,,108069
1,,,"Milman, Vitali","[Ball, Keith M., Milman, Vitali]",,,2017-10-23T13:08:25Z,2017-06-12T14:30:31Z||2017-08-21T11:33:24Z||20...,1999-04,,...,2020-02-15T08:15:29Z,True,,ECIT,,,,publishedVersion,,188444
1,,,"Wagner, Roy","[Ball, Keith M., Milman, Vitali]",,,2017-10-23T13:08:25Z,2017-06-12T14:30:31Z||2017-08-21T11:33:24Z||20...,1999-04,,...,2020-02-15T08:15:29Z,True,,ECIT,,,,publishedVersion,,188444
2,,,"Folkers, Gerd","[Abel, Günter, Plümacher, Martina]",,,2017-06-12T18:11:46Z,2017-06-12T18:11:46Z,2016,,...,2018-12-02T06:58:19Z,True,,ECIT,,,,,,125569
3,,,"Mack, Rüdiger","[Arendt, Wolfgang, Schleich, Wolfgang P.]",,,2017-06-14T12:24:19Z,2017-06-14T12:24:19Z,2008,,...,2017-07-13T00:14:23Z,True,,ECIT,,,,,,157378


In [48]:
set(research_data_e.columns)

{'collection',
 'dc.contributor',
 'dc.contributor.author',
 'dc.contributor.editor',
 'dc.contributor.other',
 'dc.contributor.supervisor',
 'dc.date.accessioned',
 'dc.date.available',
 'dc.date.issued',
 'dc.date.published',
 'dc.description.abstract',
 'dc.format',
 'dc.identifier.doi',
 'dc.identifier.isbn',
 'dc.identifier.issn',
 'dc.identifier.olduri',
 'dc.identifier.other',
 'dc.identifier.uri',
 'dc.identifier.wos',
 'dc.language',
 'dc.language.iso',
 'dc.publisher',
 'dc.relation.isnodouble',
 'dc.relation.ispartof',
 'dc.relation.isreplacedbydouble',
 'dc.rights.license',
 'dc.rights.uri',
 'dc.subject',
 'dc.title',
 'dc.title.alternative',
 'dc.type',
 'ethz.availability',
 'ethz.book.title',
 'ethz.code.ddc',
 'ethz.code.jel',
 'ethz.date.deposited',
 'ethz.date.embargoend',
 'ethz.date.retentionend',
 'ethz.date.retentionendDate',
 'ethz.doipreview',
 'ethz.ecitpid',
 'ethz.ecolpid',
 'ethz.edit.source',
 'ethz.edit.status',
 'ethz.eth',
 'ethz.event',
 'ethz.event.da

In [49]:
research_data_f = research_data_e[['dc.contributor.author','dc.description.abstract','dc.title','id', 'dc.language.iso']]

In [50]:
research_data_f.columns = ['author','abstract','title','id','language']
research_data_f.head(20)

Unnamed: 0,author,abstract,title,id,language
0,"Schützeichel, Rainer",,Die Perspektive des Städtebauers. Ein Blick au...,108069,de
1,"Milman, Vitali",The goal of this note is to introduce new clas...,Asymptotic versions for operators and operator...,188444,en
1,"Wagner, Roy",The goal of this note is to introduce new clas...,Asymptotic versions for operators and operator...,188444,en
2,"Folkers, Gerd",,On Re-Positioning,125569,en
3,"Mack, Rüdiger",,Factorization,157378,en
3,"Schleich, Wolfgang P.",,Factorization,157378,en
3,"Haase, Daniel",,Factorization,157378,en
3,"Maier, Helmut",,Factorization,157378,en
4,"Burkhard, Remo",,Visualising future cities in the ETH Value Lab,19843,en
4,"Schmitt, Gerhard",,Visualising future cities in the ETH Value Lab,19843,en


In [51]:
rc_m_test = research_data_f.merge(rc_m,how="outer",left_on=["author","title"],right_on=["author","title"])
rc_m_test.head(50)


Unnamed: 0,author,abstract,title,id,language,gender,department_name,department_code,name,Professor,Org. unit code,Organisation,publication date,publication type,journal
0,"Schützeichel, Rainer",,Die Perspektive des Städtebauers. Ein Blick au...,108069.0,de,,,,,,,,2016.0,Book Chapter,Grundlagen
1,"Milman, Vitali",The goal of this note is to introduce new clas...,Asymptotic versions for operators and operator...,188444.0,en,,,,,,,,,,
2,"Wagner, Roy",The goal of this note is to introduce new clas...,Asymptotic versions for operators and operator...,188444.0,en,,,,,,,,,,
3,"Folkers, Gerd",,On Re-Positioning,125569.0,en,,,,,,,,,,
4,"Mack, Rüdiger",,Factorization,157378.0,en,,,,,,,,2008.0,Book Chapter,
5,"Schleich, Wolfgang P.",,Factorization,157378.0,en,,,,,,,,2008.0,Book Chapter,
6,"Haase, Daniel",,Factorization,157378.0,en,,,,,,,,2008.0,Book Chapter,
7,"Maier, Helmut",,Factorization,157378.0,en,,,,,,,,2008.0,Book Chapter,
8,"Burkhard, Remo",,Visualising future cities in the ETH Value Lab,19843.0,en,,,,,,,,2008.0,Book Chapter,
9,"Schmitt, Gerhard",,Visualising future cities in the ETH Value Lab,19843.0,en,,,,,,,,2008.0,Book Chapter,


In [52]:
rc_m_test.shape

(953951, 15)

In [53]:
rc_m_full = rc_m_test[rc_m_test['abstract'].notna()]
rc_m_full.shape

(164537, 15)

In [54]:
rc_m_full.head(30)

Unnamed: 0,author,abstract,title,id,language,gender,department_name,department_code,name,Professor,Org. unit code,Organisation,publication date,publication type,journal
1,"Milman, Vitali",The goal of this note is to introduce new clas...,Asymptotic versions for operators and operator...,188444.0,en,,,,,,,,,,
2,"Wagner, Roy",The goal of this note is to introduce new clas...,Asymptotic versions for operators and operator...,188444.0,en,,,,,,,,,,
77,"Marti, Christian M.",Die Studie „Integration des Gesamtsystems öffe...,Integration des Gesamtsystems öffentlicher Ver...,172729.0,de,,,,,,,,2016.0,Book Chapter,
79,"Weidmann, Ulrich",Die Studie „Integration des Gesamtsystems öffe...,Integration des Gesamtsystems öffentlicher Ver...,172729.0,de,,,,,,,,2016.0,Book Chapter,
81,"Finger, Matthias",Die Studie „Integration des Gesamtsystems öffe...,Integration des Gesamtsystems öffentlicher Ver...,172729.0,de,,,,,,,,2016.0,Book Chapter,
84,"Milman, Vitali D.",We will review a Lemma published by Ran Raz in...,Some remarks on a lemma of Ran Raz,188623.0,en,,,,,,,,,,
85,"Wagner, Roy",We will review a Lemma published by Ran Raz in...,Some remarks on a lemma of Ran Raz,188623.0,en,,,,,,,,,,
106,"Larsen, Henrik",China’s growing influence in Europe has the po...,China as a Stress Test for Europe’s Coherence,346708.0,en,,,,,,,,,,
107,"Maduz, Linda",China’s growing influence in Europe has the po...,China as a Stress Test for Europe’s Coherence,346708.0,en,,,,,,,,,,
129,"Zogg, Benno",Nowhere is China's Belt and Road Initiative (B...,"On the Belt, on the Road: China’s Pivot to Eur...",346709.0,en,,,,,,,,,,


In [55]:
rc_m_full_en = rc_m_full[rc_m_full["language"] == "en"]

In [58]:
rc_final = rc_m_full_en[['abstract','id','department_name']].drop_duplicates(subset = ['id'], keep = 'first')

In [60]:
rc_final.shape

(20494, 3)

In [61]:
rc_m_final_dept = rc_final[rc_final['department_name'].notna()]

In [63]:
rc_m_final_dept.shape

(171, 3)

In [64]:
rc_m_final_dept.head()

Unnamed: 0,abstract,id,department_name
5232,"Over the last years, a vibrant global market f...",62692.0,"Geistes-, Sozial- und Staatswissenschaften"
9424,The Swiss Eduard Imhof (1895–1986) was the fou...,285287.0,"Bau, Umwelt und Geomatik"
28261,Trefftz methods are finite element-type scheme...,111320.0,Mathematik
48636,Autonomous robotic inspection of industrial si...,176767.0,Maschinenbau und Verfahrenstechnik
66808,Normal priors with unknown variance (NUV) have...,370440.0,Informationstechnologie und Elektrotechnik


In [59]:
rc_final.columns = ['abstract', 'id']
rc_final.to_csv("abstracts_eng.csv", index=False)

In [66]:
rc_m_final_dept.columns = ['abstract', 'id','department_name']
rc_m_final_dept.to_csv("abstracts_eng_with_dept.csv", index=False)

In [202]:
rc_m_unique = rc_m_dept.drop_duplicates(subset = ['department_name','id'], keep = 'first')

In [203]:
rc_m_unique.head()

Unnamed: 0,author,abstract,title,id,gender,department_name,department_code,name,Professor,Org. unit code,Organisation,publication date,publication type,journal
244,"Hiptmair, Ralf",We consider the two-dimensional Helmholtz equa...,Plane Wave Discontinuous Galerkin Methods,78087.0,M,Mathematik,2000.0,"Hiptmair, Ralf","Hiptmair, Ralf",2501.0,Seminar for Applied Mathematics,2015,Other Conference Item,
6264,"Bechtold, Stefan","Over the last years, a vibrant global market f...",The Fashion of TV Show Formats,62692.0,M,"Geistes-, Sozial- und Staatswissenschaften",2045.0,"Bechtold, Stefan","Bechtold, Stefan",3795.0,"Bechtold, Stefan",2013,Journal Article,Michigan State Law Review
9664,"Hurni, Lorenz",Die komplett überarbeitete Neuausgabe 2017 ent...,Schweizer Weltatlas,221546.0,M,"Bau, Umwelt und Geomatik",2115.0,"Hurni, Lorenz","Hurni, Lorenz",2648.0,Institute of Cartography&Geoinformation,2010,Book Chapter,
9673,"Habert, Guillaume",Life Cycle Assessment (LCA) is increasingly us...,Design-Integrated LCA Using Early BIM,283959.0,M,"Bau, Umwelt und Geomatik",2115.0,"Habert, Guillaume","Habert, Guillaume",2604.0,Inst. Construction&Infrastructure Manag.,2018,Book Chapter,
10456,"Hurni, Lorenz",The Swiss Eduard Imhof (1895–1986) was the fou...,Karte der Gegend um den Walensee - Eduard Imho...,285287.0,M,"Bau, Umwelt und Geomatik",2115.0,"Hurni, Lorenz","Hurni, Lorenz",2648.0,Institute of Cartography&Geoinformation,2018,Book Chapter,


In [204]:
rc_m_unique.shape

(2395, 14)

In [209]:
rc_m_unique[['department_name','id']].groupby(['department_name']).count().sort_values(by='id',ascending = False)

Unnamed: 0_level_0,id
department_name,Unnamed: 1_level_1
Umweltsystemwissenschaften,362
Physik,346
Biologie,264
Maschinenbau und Verfahrenstechnik,196
Gesundheitswissenschaften und Technologie,180
Informationstechnologie und Elektrotechnik,177
Chemie und Angewandte Biowissenschaften,170
Biosysteme,163
"Bau, Umwelt und Geomatik",160
"Management, Technologie und Ökonomie",122


#### 3.4 Sub-selecting the data

In [31]:
## first naive way to sub-select the data based on the departments with the most publications

rc_sub = rc_m.loc[((rc_m["department_name"] == "Maschinenbau und Verfahrenstechnik") | 
(rc_m["department_name"] == "Physik") | 
(rc_m["department_name"] == "Informationstechnologie und Elektrotechnik")),:]
rc_sub.head()

Unnamed: 0,gender,department_name,department_code,name,Professor,Org. unit code,Organisation,id,author,publication date,title,publication type,journal
12784,M,Informationstechnologie und Elektrotechnik,2140.0,"Benini, Luca","Benini, Luca",2636.0,Integrated Systems Laboratory,12751.0,"Benini, Luca",2008,Robust and low complexity rate control for sol...,Conference Paper,
12785,M,Informationstechnologie und Elektrotechnik,2140.0,"Benini, Luca","Benini, Luca",2636.0,Integrated Systems Laboratory,12749.0,"Benini, Luca",2008,Approximate control design for solar driven se...,Conference Paper,Lecture Notes in Computer Science
12786,M,Informationstechnologie und Elektrotechnik,2140.0,"Benini, Luca","Benini, Luca",2636.0,Integrated Systems Laboratory,12750.0,"Benini, Luca",2008,An efficient solar energy harvester for wirele...,Conference Paper,
12787,M,Informationstechnologie und Elektrotechnik,2140.0,"Benini, Luca","Benini, Luca",2636.0,Integrated Systems Laboratory,7491.0,"Benini, Luca",2008,Activity Recognition from On-Body Sensors by C...,Conference Paper,
12788,M,Informationstechnologie und Elektrotechnik,2140.0,"Benini, Luca","Benini, Luca",2636.0,Integrated Systems Laboratory,13091.0,"Benini, Luca",2008,Activity Recognition from On-Body Sensors,Conference Paper,Lecture Notes in Computer Science


In [32]:
# Note that in theses steps we include only the professors 

rc_sub1 = rc_sub.loc[rc_sub["title"].notnull(),:]
rc_sub2 = rc_sub1.loc[rc_sub1["Organisation"].notnull(),:]

#### 3.4 Final processing 

In [33]:
# creating boolean column to signal if the author is a professor 
# exclude for now
# rc_m["is_professor"] = rc_m["author"]==rc_m["Professor"]
# rc_m.head(3)

In [34]:
# drop the excessive columns

In [35]:
rc_sub2 = rc_sub2.drop(columns = ["Professor", "name"])

In [36]:
rc_sub2.columns

Index(['gender', 'department_name', 'department_code', 'Org. unit code',
       'Organisation', 'id', 'author', 'publication date', 'title',
       'publication type', 'journal'],
      dtype='object')

In [37]:
# Drop the excessive columns

rc_sub2.columns = ["gender", 'department_name', 'department_code', "organisation_code","organisation_name","publication_id",
                "person_name","publication_date","publication_title","publication_type",
                "publication_journal"]
           #"person_is_professor"]

#### Wierd missing values

In [38]:
# TODO : look into the null names 
# - where does this come from? 
sum(rc_m["person_name"].isnull())
# My suspicion is that 118/126 missing authors correspond to 
# 'organisation' rows that survived the outer join (meaning that 
# those are organisations which are not linked to any publication), 
# while we actually have only 8 missing authors 

KeyError: 'person_name'

In [39]:
# This mistery is solved: 118 entries have null publication values 
# because of the outer join -> it's entries about organisations which do 
# not have any publications associated! 

# sum(rc_m["publication_type"].isnull())
# sum(rc_m["publication_id"].isnull()) 
# sum(rc_m["publication_title"].isnull())

In [46]:
# this is the most plausible- but still we should check the information
# was missing in the original file as well
sum(rc_m["publication_date"].isnull())

1351

## Import/export :):

In [40]:
# On full dataset

rc_m.to_csv("metadata_final.csv", index=False)
rc_m.to_json("metadata_final.json", orient = "records")

In [135]:
# On sub-selection

rc_sub2.to_csv("metadata_final_sub.csv", index=False)

### Neo4j import details

The following nodes will be created: 
- **person** [name, gender]
- **publication** [id, title, date, type, journal]
- **organisation** [name, code]
- **department** [name, code]


--- 


**The commands**

    
To load the csv you first have to <u>copy it into your Neo4j base directory</u>. More info [here](https://neo4j.com/docs/cypher-manual/current/clauses/load-csv/#load-csv-import-data-from-a-csv-file). I did the following: 

```cp .\metadata_final.csv C:/Users/Giulia/.Neo4jDesktop/neo4jDatabases/database-befe90d3-7991-457e-9671-62c55c830654/installation-3.5.12/import```

<u>Constraints first</u>

The constraints are here to make sure we don't create duplicate nodes.

    CREATE CONSTRAINT ON (c:Person) ASSERT c.name IS UNIQUE;
    CREATE CONSTRAINT ON (c:Organisation) ASSERT c.name IS UNIQUE;
    CREATE CONSTRAINT ON (c:Publication) ASSERT c.title IS UNIQUE;
    CREATE CONSTRAINT ON (c:Department) ASSERT c.code IS UNIQUE;

 
Now we'll <u>load the data</u> in a very lightweight manner: 

1) person nodes <br>
```
    LOAD CSV WITH HEADERS FROM "file:///metadata_final.csv" AS line
    WITH line WHERE line.person_name IS NOT NULL
    MERGE (person:Person {name:line.person_name, is_professor:line.person_is_professor, gender:line.gender});
```
        > Added 176604 labels, created 176604 nodes, set 353208 properties, completed after 8880 ms.

2) publication nodes (this might take a while) <br>
       
        LOAD CSV WITH HEADERS FROM "file:///metadata_final.csv" AS line
        WITH line where line.publication_id IS NOT NULL
        MERGE (publication: Publication {title: line.publication_title})
        SET publication.id= line.publication_id,            
            publication.journal=line.publication_journal, 
            publication.type=line.publication_type, 
            publication.date=date(line.publication_date);

        > Added 96014 labels, created 96014 nodes, set 2121683 properties, completed after 9349 ms.
        
3) organisation nodes <br> 
    
        LOAD CSV WITH HEADERS FROM "file:///metadata_final.csv" AS line
        WITH line where line.organisation_code IS NOT NULL
        MERGE (organisation:Organisation {name:line.organisation_name, 
                                          code:line.organisation_code});
                                          
        > Added 383 labels, created 383 nodes, set 766 properties, completed after 1822 ms.                          
            
4) department nodes <br> 
    
        LOAD CSV WITH HEADERS FROM "file:///metadata_final.csv" AS line
        WITH line where line.department_code IS NOT NULL
        MERGE (department:Department {name:line.department_name, 
                                          code:line.department_code});
                                          
                               
            
        
5) finally all the edges <br> 
        
        LOAD CSV WITH HEADERS FROM "file:///metadata_final.csv" AS line
        MATCH (person:Person {name:line.person_name}), 
               (publication:Publication {id:line.publication_id})
        MERGE (person)-[:PUBLISHED]->(publication)
        MERGE (publication)-[:AUTHORED_BY]->(person);
        
        LOAD CSV WITH HEADERS FROM "file:///metadata_final.csv" AS line
        MATCH (person:Person {name:line.person_name}),
               (organisation:Organisation {code:line.organisation_code})
        MERGE (person)-[:BELONGS_TO]->(organisation)
        MERGE (organisation)-[:CONTAINS]->(person);
        
        LOAD CSV WITH HEADERS FROM "file:///metadata_final.csv" AS line
        MATCH (person:Person {name:line.person_name}),
               (department:Department {code:line.department_code})
        MERGE (person)-[:BELONGS_TO]->(department)
        MERGE (department)-[:CONTAINS]->(person);
        
Note: in case you did something wrong and you want to erase the network here's the query: 

        MATCH (n)
        DETACH DELETE n;

    


In [48]:
# question: is it correct to say that the title is a unique identifier of the paper? 
# same goes for organisation/author names