In [1]:
import os 
import csv
import pandas as pd

The goal of this notebook is to explore the content of the files listed below: **identifying which input fields hold the useful information, pinpointing the connections between the files, and merging all the data into a structure** that is ready to be fed into the graph.

In [2]:
# Available data: regular files in the current working directory.
# os.listdir() returns entries in arbitrary order, so sort them
# (case-insensitively) to keep the listing reproducible across runs/platforms.
files = sorted(
    (f for f in os.listdir('.') if os.path.isfile(f)),
    key=str.lower,
)
for f in files:
    print(f)

1.0.0
D-ARCH.xlsx
Data exploration.ipynb
Data Overview.docx
DSL_2020_Project_Data_Agreement.pdf
ETH Professor list.xlsx
Exercise12_GraphDatabases.ipynb
Maurer_Hansruedi_2020-10-08_16_28_49.doc
RCP2.csv
ResearchCollection.xlsx
ResearchCollectionPublications2008_2018.tsv
~$ta Overview.docx


# ResearchCollectionPublications2008_2018.tsv

In [2]:
# Load the tab-separated publications export.
# - encoding="latin-1": per the original note, decoding the file as UTF-8 did
#   not work.
# - low_memory=False: parse each column in a single pass so pandas infers one
#   dtype per column instead of raising a DtypeWarning about mixed types.
rcp = pd.read_csv(
    "ResearchCollectionPublications2008_2018.tsv",
    sep="\t",
    header=0,
    encoding="latin-1",
    low_memory=False,
)

  interactivity=interactivity, compiler=compiler, result=result)


In [12]:
# Preview the first five publication records.
rcp.head(n=5)

Unnamed: 0,RC_ID,COLLECTION,DC_CONTRIBUTOR_AUTHOR,DC_CONTRIBUTOR_EDITOR,DC_DATE_ISSUED,DC_DATE_PUBLISHED,DC_IDENTIFIER_DOI,DC_IDENTIFIER_ISBN,DC_IDENTIFIER_ISSN,DC_IDENTIFIER_OTHER,...,ETHZ_GRANT,ETHZ_GRANT_FUNDERDOI,ETHZ_GRANT_FUNDERNAME,ETHZ_GRANT_PROGRAM,ETHZ_IDENTIFIER_DISS,ETHZ_IDENTIFIER_PUBMED,DC_IDENTIFIER_WOS,ETHZ_IDENTIFIER_ORCIDWORKCODE,DC_CONTRIBUTOR_OTHER,ETHZ_IDENTIFIER_ARXIV
0,241217,,"Renner, Renato","Kao, Ming-Yang",2008,,,978-0-387-30770-1,,,...,,,,,,,,,,
1,11574,,"Diekmann, Andreas||Jann, Ben","Farzin, Sina||Jordan, Stefan",2008,,,978-3-15-010661-7,,,...,,,,,,,,,,
2,202843,20.500.11850/13,,"Glaser, Marie A.",2008,,,978-3-0377-8111-1,,,...,,,,,,,,,,
3,4209,,"Becchi, Antonio||Ducic, Sandra||Oechslin, Wern...",Stiftung Bibliothek Werner Oechslin,2008,,,3-85676-231-0||978-3-85676-231-5,,,...,,,,,,,,,,
4,203598,20.500.11850/13,,"Cramm, Wolf-Jürgen||Keil, Geert",2008,,,978-3-9388-0845-0,,,...,,,,,,,,,,


In [13]:
# Show the column labels of the publications table.
rcp.columns

Index(['RC_ID', 'COLLECTION', 'DC_CONTRIBUTOR_AUTHOR', 'DC_CONTRIBUTOR_EDITOR',
       'DC_DATE_ISSUED', 'DC_DATE_PUBLISHED', 'DC_IDENTIFIER_DOI',
       'DC_IDENTIFIER_ISBN', 'DC_IDENTIFIER_ISSN', 'DC_IDENTIFIER_OTHER',
       'DC_IDENTIFIER_URI', 'DC_TITLE', 'DC_TITLE_ALTERNATIVE', 'DC_TYPE',
       'ETHZ_AVAILABILITY', 'ETHZ_BOOK_TITLE', 'ETHZ_ETH', 'ETHZ_EVENT',
       'ETHZ_EVENT_DATE', 'ETHZ_EVENT_LOCATION', 'ETHZ_IDENTIFIER_SCOPUS',
       'ETHZ_IDENTIFIER_URL', 'ETHZ_IDENTIFIER_WOS',
       'ETHZ_JOURNAL_ABBREVIATED', 'ETHZ_JOURNAL_ISSUE', 'ETHZ_JOURNAL_TITLE',
       'ETHZ_JOURNAL_VOLUME', 'ETHZ_LEITZAHLIDENTIFIERS_CERT',
       'ETHZ_PAGES_END', 'ETHZ_PAGES_START', 'ETHZ_PUBLICATION_PLACE',
       'ETHZ_TITLE_SUBTITLE', 'ETHZ_GRANT', 'ETHZ_GRANT_FUNDERDOI',
       'ETHZ_GRANT_FUNDERNAME', 'ETHZ_GRANT_PROGRAM', 'ETHZ_IDENTIFIER_DISS',
       'ETHZ_IDENTIFIER_PUBMED', 'DC_IDENTIFIER_WOS',
       'ETHZ_IDENTIFIER_ORCIDWORKCODE', 'DC_CONTRIBUTOR_OTHER',
       'ETHZ_IDENTIFIER_ARXIV'],
      dtype='object')

In [31]:
# Number of missing values in each column
rcp.isna().sum()

RC_ID                                 0
COLLECTION                       100328
DC_CONTRIBUTOR_AUTHOR               879
DC_CONTRIBUTOR_EDITOR             98340
DC_DATE_ISSUED                      213
DC_DATE_PUBLISHED                 91510
DC_IDENTIFIER_DOI                 97804
DC_IDENTIFIER_ISBN                90800
DC_IDENTIFIER_ISSN                49248
DC_IDENTIFIER_OTHER               35152
DC_IDENTIFIER_URI                     5
DC_TITLE                              0
DC_TITLE_ALTERNATIVE             110582
DC_TYPE                               0
ETHZ_AVAILABILITY                    14
ETHZ_BOOK_TITLE                   89151
ETHZ_ETH                              0
ETHZ_EVENT                        79389
ETHZ_EVENT_DATE                   79476
ETHZ_EVENT_LOCATION               81407
ETHZ_IDENTIFIER_SCOPUS            82704
ETHZ_IDENTIFIER_URL              107909
ETHZ_IDENTIFIER_WOS               52280
ETHZ_JOURNAL_ABBREVIATED          61435
ETHZ_JOURNAL_ISSUE                51935


## Cleaning 

**What information should be extracted?**

- authors names from "DC_CONTRIBUTOR_AUTHOR"
- editors names from "DC_CONTRIBUTOR_EDITOR" (ideally using the same function as for the authors' names)
- date from "DC_DATE_ISSUED" OR "DC_DATE_PUBLISHED"
- title from "DC_TITLE" OR "ETHZ_BOOK_TITLE" 
- type of publication 
- place from "ETHZ_PUBLICATION_PLACE"
- journal of publication indirectly from "ETHZ_IDENTIFIER_ARXIV" and similar
- ETHZ journal information?
- ...

**Doubts/Observations**
- (Should be tested) The author field is NaN AND the collection field is filled - what does this represent?
- ...

# D-ARCH.xlsx

In [11]:
#necessary requirement for xlsx files 
!python3 -m pip install xlrd



In [12]:
# Read the D-ARCH workbook into a DataFrame.
da = pd.read_excel(io="D-ARCH.xlsx")

In [13]:
# Preview the first five rows of the D-ARCH table.
da.head(n=5)

Unnamed: 0,DEPARTMENT,RESEARCHER,TIMESTAMP,RESEARCH_OVERVIEW
0,Architecture,Tom_Avermaete,2020-10-08 15:32:09,Title: Information for Professor Tom.Avermaete...
1,Architecture,Philippe_Block,2020-10-08 15:32:17,Title: Information for Professor Philippe.Bloc...
2,Architecture,A._Caminada,2020-10-08 15:33:28,Title: Information for Professor A..Caminada f...
3,Architecture,Caruso_Adam,2020-10-08 15:33:29,Title: Information for Professor Caruso.Adam f...
4,Architecture,François_Charbonnet,2020-10-08 15:33:31,Title: Information for Professor François.Char...


In [15]:
# Number of missing values in each column
da.isna().sum()

DEPARTMENT           0
RESEARCHER           0
TIMESTAMP            0
RESEARCH_OVERVIEW    0
dtype: int64

**Complete file** (no missing values in any column)

# ETH Professor list.xlsx

In [16]:
# Read the ETH professor list workbook into a DataFrame.
pl = pd.read_excel(io="ETH Professor list.xlsx")

In [17]:
# Preview the first five rows of the professor list.
pl.head(n=5)

Unnamed: 0,Title,Name,First name,Org. unit code,Organisation
0,Prof. Dr.,Abhari,Reza S.,2627,Institute of Energy Technology (former)
1,Prof. Dr.,Acciaio,Beatrice,9727,"Acciaio, Beatrice"
2,Prof. Dr.,Ackermann,Martin,3743,"Ackermann, Martin"
3,Prof. Dr.,Ackermann,Martin,2721,Inst. Biogeochem. and Pollutant Dynamics
4,Prof. Dr.,Adey,Bryan T.,2604,Inst. Construction&Infrastructure Manag.


In [19]:
# Number of missing values in each column
pl.isna().sum()

Title             0
Name              0
First name        0
Org. unit code    0
Organisation      0
dtype: int64

**Complete file** (no missing values in any column)

# ResearchCollection.xlsx


In [21]:
# Read the ResearchCollection workbook into a DataFrame.
rc = pd.read_excel(io="ResearchCollection.xlsx")

In [22]:
# Preview the first five rows of the ResearchCollection table.
rc.head(n=5)

Unnamed: 0,NAME,SHORTNAME,TIMESTAMP,RESEARCH_OVERVIEW,DEPARTMENT_RESEARCH_WEBSITE
0,Architecture,D-ARCH,2020-05-08 17:50:35,D-ARCH_2020-05-08_17_50_35,https://arch.ethz.ch/en/forschung.html
1,"Civil, Environmental and Geomatic Engineering",D-BAUG,2020-10-08 15:38:04,D-BAUG_2020-10-08_15_38_04,https://baug.ethz.ch/en/research.html
2,Biosystems Science and Engineering,D-BSSE,2020-10-08 15:38:04,D-BSSE_2020-10-08_15_38_04,https://bsse.ethz.ch/research.html
3,Computer Science,D-INFK,2020-10-08 15:42:40,D-INFK_2020-10-08_15_42_40,https://inf.ethz.ch/research.html
4,Information Technology and Electrical Engineering,D-ITET,2020-10-08 15:42:40,D-ITET_2020-10-08_15_42_40,https://ee.ethz.ch/research.html


In [23]:
# Number of missing values in each column
rc.isna().sum()

NAME                           0
SHORTNAME                      0
TIMESTAMP                      0
RESEARCH_OVERVIEW              0
DEPARTMENT_RESEARCH_WEBSITE    0
dtype: int64

**Complete file** (no missing values in any column)

--- 