In [None]:
import os 
import csv
import pandas as pd
import numpy as np

The goal of this notebook is to explore the content of the files listed below: **identifying which input fields hold the useful information, pinpointing the connections between the files, and merging all the data into a structure** that is ready to be fed into the graph.

In [None]:
# Data files available in the current working directory (sub-directories are skipped)
files = []
for entry in os.listdir('.'):
    if os.path.isfile(entry):
        files.append(entry)
        print(entry)

# ResearchCollectionPublications2008_2018.tsv

In [None]:
# Load the tab-separated publications export; the first row holds the column names.
# It is decoded as latin-1 because reading it as UTF-8 did not work.
rcp = pd.read_csv("ResearchCollectionPublications2008_2018.tsv", sep="\t", header=0, encoding="latin-1")

In [None]:
# Preview the first 10 rows of the publications table.
rcp.head(10)

In [None]:
# Keep only the publication types we want to work with.
kept_types = ["Journal Article", "Conference Paper", "Other Conference Item", "Book Chapter"]
rcp_tf = rcp[rcp["DC_TYPE"].isin(kept_types)]

In [None]:
# Inspect the dtype pandas inferred for each column (open question: do any need converting?)
rcp.dtypes

In [None]:
# The pattern ".*" matches every string, so this counts the rows whose
# DC_DATE_ISSUED is a string (missing values yield NaN and are skipped by sum()).
rcp_tf["DC_DATE_ISSUED"].str.match(pat=".*").sum()

In [None]:
#force types on dataframe

In [None]:
# Number of distinct journal titles, counting "missing" once.
# nunique(dropna=False) replaces len(set(...)): a Python set only collapses NaNs
# that happen to be the same object, so the old count could be inflated.
rcp["ETHZ_JOURNAL_TITLE"].nunique(dropna=False)

In [None]:
import matplotlib as mpl 
import matplotlib.pyplot as plt

In [None]:
# Number of journal articles per journal title, most frequent first.
journal_articles = rcp_tf[rcp_tf["DC_TYPE"] == "Journal Article"]
journals = (
    journal_articles
    .groupby("ETHZ_JOURNAL_TITLE")["RC_ID"]
    .count()
    .sort_values(ascending=False)
)

In [None]:
# The 50 journals with the most articles.
journals.head(50)

In [None]:
# Non-null RC_ID / journal-title counts per publication place, largest first.
# The columns are selected with a list ([[...]]): the tuple-style selection
# ["a", "b"] on a GroupBy is deprecated and raises in pandas 2.0.
(
    rcp_tf.groupby("ETHZ_PUBLICATION_PLACE")[["RC_ID", "ETHZ_JOURNAL_TITLE"]]
    .count()
    .sort_values(by="RC_ID", ascending=False)
)

In [None]:
# Issue dates that are already a plain four-digit year.
# Uses rcp_tf rather than rcp_ff: rcp_ff is only created further down, so the
# old cell failed on a fresh kernel; rcp_tf has the same rows and this column.
rcp_tf.loc[rcp_tf["DC_DATE_ISSUED"].str.match(pat="^[0-9]{4}$", na=False), "DC_DATE_ISSUED"]

In [None]:
# Number of missing values in each column of rcp
rcp.isnull().sum(axis=0)

In [None]:
# Column names of the publications table
rcp.columns

## Cleaning 

**What information should be extracted?**

- authors names from "DC_CONTRIBUTOR_AUTHOR"
- editors names from "DC_CONTRIBUTOR_EDITOR" (ideally same function used for the above step) 
- date from "DC_DATE_ISSUED"
- title from "DC_TITLE"
- type of publication : Journal Article, Conference Paper, Other Conference Item,Book Chapter
- journal of publication from ETHZ_JOURNAL_TITLE (filtered)
- publication database from "ETHZ_IDENTIFIER_ARXIV" and similar


- ...

**Doubts/Observations**
- (Should be tested) Author field is NaN AND Collection field is filled - what does this represent? 
- Conference Poster may not have abstracts

- ...

## Filtering
-  'Journal Article' with 
-  'Journal Article' and conference papers 
-  Threshold filtering of journal titles

Postprocessing:
- convert date issued to single format (year) 

In [None]:
# Keep only the fields needed downstream.
# .copy() makes rcp_ff an independent frame, so assigning to its columns later
# does not trigger SettingWithCopyWarning or depend on rcp_tf's internals.
rcp_ff = rcp_tf[["RC_ID","DC_CONTRIBUTOR_AUTHOR","DC_DATE_ISSUED","DC_TITLE","DC_TYPE","ETHZ_JOURNAL_TITLE"]].copy()
rcp_ff.head()

### Cleaning utils 
The code below cleans up the raw fields.

In [None]:
def separate_names(names):
    """Split a string of the form name1||name2||name3||... into a list of names.

    Anything that is not a string (for instance the NaN pandas uses for a
    missing value) yields None.
    """
    # pandas DataFrames can hold lists of strings as cell values, so the result
    # can be stored straight back into a column.
    if not isinstance(names, str):
        return None
    return names.split("||")

In [None]:
# Convert the "||"-delimited author and editor strings of rcp into lists of names.
# Note: separate_names only handles strings, so running this cell a second time
# would replace the lists with None.

author_array = rcp["DC_CONTRIBUTOR_AUTHOR"].apply(separate_names)
missing_authors = len(author_array) - np.count_nonzero(author_array)
print("Check that we don't have more missing values: ", missing_authors)

editor_array = rcp["DC_CONTRIBUTOR_EDITOR"].apply(separate_names)
missing_editors = len(editor_array) - np.count_nonzero(editor_array)
print("Check that we don't have more missing values: ", missing_editors)

rcp["DC_CONTRIBUTOR_AUTHOR"] = author_array
rcp["DC_CONTRIBUTOR_EDITOR"] = editor_array
rcp.head()

In [None]:
# Replace the "||"-delimited author strings in rcp_ff with lists of names.
# The strings are read from rcp_tf (never modified, same index as rcp_ff), so the
# cell gives the same result when re-run; assign() returns a new frame instead of
# writing into a slice, which avoids SettingWithCopyWarning.
author_array = rcp_tf["DC_CONTRIBUTOR_AUTHOR"].apply(separate_names)
rcp_ff = rcp_ff.assign(DC_CONTRIBUTOR_AUTHOR=author_array)
rcp_ff.head()

In [None]:
def clean_date(date):
    """Get the dates into a single format (YYYY).

    Parameters
    ----------
    date : object
        Raw date value. Strings such as "2015" or "2015-03-17" are handled;
        the exact formats present in the data have not been verified.

    Returns
    -------
    str or None
        The first standalone group of exactly four digits found in ``date``,
        or None when ``date`` is not a string (e.g. NaN) or has no such group.
    """
    import re  # local import keeps the helper self-contained

    if not isinstance(date, str):
        return None
    # Look-arounds reject four digits that are part of a longer digit run.
    found = re.search(r"(?<![0-9])[0-9]{4}(?![0-9])", date)
    return found.group(0) if found else None

In [None]:
# Expand the author lists so that each (publication, author) pair gets its own row.
rcp_ff_e = rcp_ff.explode("DC_CONTRIBUTOR_AUTHOR")

In [None]:
# Preview the exploded table.
rcp_ff_e.head()

# D-ARCH.xlsx

In [None]:
# Excel reader backends needed by pd.read_excel.
# %pip installs into the environment of the running kernel (python3 on PATH may be
# a different interpreter). openpyxl is added because recent xlrd releases no longer
# read .xlsx files.
%pip install xlrd openpyxl

In [None]:
# Load the D-ARCH spreadsheet (first sheet, default settings).
da = pd.read_excel("D-ARCH.xlsx")

In [None]:
# Preview the first rows of the D-ARCH table.
da.head()

In [None]:
# Full RESEARCH_OVERVIEW text of the first row.
da["RESEARCH_OVERVIEW"].iloc[0]

In [None]:
# Number of missing values in each column of da
da.isnull().sum(axis=0)

**Complete file**

This file should be connected to the main one (Research Collection) through the professor name.<br>
Integrates research overview info.

In [None]:
# check whether the professors' names here and the authors' names there match

# ETH Professor list.xlsx

In [None]:
# Load the ETH professor list (first sheet, default settings).
pl = pd.read_excel("ETH Professor list.xlsx")

In [None]:
# Preview the first rows of the professor list.
pl.head()

In [None]:
# Number of missing values in each column of pl
pl.isnull().sum(axis=0)

**Complete file**

Again, this file matches the research collection through the professors' names. <br>
Integrates organisation info.

In [None]:
# Build a "Last name, First name" key in the same shape as the author strings.
full_name = pl["Name"] + ", " + pl["First name"]
pl = pl.assign(Professor=full_name)
pl.head()

In [None]:
# Spot-check a few professors by name. The commented-out lookups record
# earlier checks and what they showed; only the last lookup is executed.

#pl.loc[(pl["Professor"]=="Renner, Renato"),:] # matches with rcp
#pl.loc[(pl["Professor"]=="Diekmann, Andreas"),:]
#pl.loc[(pl["Name"]=="Diekmann"),:] # an ETH professor, but missing from pl
#pl.loc[(pl["Professor"]=="Mateo, Josep L."),:] 
#pl.loc[(pl["Name"]=="Mateo"),:]  # also missing from pl
#pl.loc[(pl["Professor"]=="Wenger, Andreas"),:] # matches with rcp
#pl.loc[(pl["Professor"]=="Krause, Andreas"),:] # computer science example
pl.loc[(pl["Professor"]=="Buhmann, Joachim M."),:]

Notice that the Organisation data is not homogeneous. Example: Krause is IML and Hofmann is dept. CS

In [None]:
# Give the unit-code column the same name it has in pl so the two can be merged.
rcp = rcp.rename(columns={"ETHZ_LEITZAHLIDENTIFIERS_CERT": "Org. unit code"})

In [None]:
# Confirm the renamed column is present.
rcp.columns

In [None]:
# Boolean mask: True where the unit code is exactly five digits.
# Comparing with True turns the NaN produced for missing values into False.
rcp["Org. unit code"].str.match(pat="^[0-9]{5}$")==True

In [None]:
# Convert the five-digit unit codes to integers. Rows whose code does not match
# are absent from the right-hand side and therefore become NaN when the column is
# assigned (index alignment). The column is numeric afterwards, so the .str
# accessor used here will not work if the cell is run again.
is_unit_code = rcp["Org. unit code"].str.match(pat="^[0-9]{5}$", na=False)
rcp["Org. unit code"] = rcp.loc[is_unit_code, "Org. unit code"].map(int)

In [None]:
# Inner join of the professor list with the publications on the unit code.
rcp_pl = pl.merge(rcp, on='Org. unit code')

In [None]:
# Merged rows belonging to the Institute for Machine Learning.
rcp_pl[rcp_pl["Organisation"] == "Institute for Machine Learning"]

### Merging rcp and pl on professors' names

In [None]:
# Keep only the professor-list columns needed for the name-based merge.
pl_f = pl.loc[:, ["Organisation", "Professor", "Org. unit code"]]

In [None]:
# Join professors to the exploded publications on the name string.
# how="outer" keeps unmatched rows from both sides (professors without
# publications and authors who are not in the professor list).
rc_m = pl_f.merge(rcp_ff_e,how="outer",left_on="Professor",right_on="DC_CONTRIBUTOR_AUTHOR")

In [None]:
# Preview the merged table.
rc_m.head()

In [None]:
# Per journal: number of merged rows with a non-null title / author.
rc_m_count = rc_m[['ETHZ_JOURNAL_TITLE', 'DC_TITLE', 'DC_CONTRIBUTOR_AUTHOR']].groupby('ETHZ_JOURNAL_TITLE').count()
rc_m_df = rc_m_count.reset_index()
# Keep the journals with more than 100 titled entries.
rc_m_reduced = rc_m_df.loc[rc_m_df['DC_TITLE'] > 100, :]
print(rc_m_reduced.head())
# rc_m_reduced has one row per journal, so its length is a number of journals
# (the previous message wrongly described it as a number of publication rows).
print('number of journals with more than 100 ETH entries:', len(rc_m_reduced))

In [None]:
# Write the per-journal counts (one row per journal) to disk, without the index.
rc_m_reduced.to_csv('publications.csv',index = False)

- find a way to import into neo4j (probably dumping into json -> organising into a dictionary)
- visualizations 

## Visualisation - not super useful right now but could be useful if we narrow down on area of research

In [None]:
# Column dtypes, non-null counts and memory usage of the merged table.
rc_m.info() # 47626 entries (recorded from an earlier run)

In [None]:
# Number of distinct journal titles in the merged table, counting "missing" once
# (11881 in the run this note was written for).
rc_m["ETHZ_JOURNAL_TITLE"].nunique(dropna=False)

In [None]:
# Keep only the rows whose journal appears more than 100 times in rc_m.
rows_per_journal = rc_m.groupby("ETHZ_JOURNAL_TITLE")["ETHZ_JOURNAL_TITLE"].transform("count")
rc_mp = rc_m[rows_per_journal > 100].copy()

In [None]:
# Number of journals left after the frequency filter.
rc_mp.groupby("ETHZ_JOURNAL_TITLE").ngroups

In [None]:
# Bar chart of the number of entries (non-null RC_ID) per journal, largest first.
# The counts are aggregated before plotting: calling .plot.bar() directly on the
# grouped column draws a separate plot for every journal.
ax = (
    rc_mp.groupby("ETHZ_JOURNAL_TITLE")["RC_ID"]
    .count()
    .sort_values(ascending=False)
    .plot.bar(figsize=(16, 6), title="Entries per journal (journals with more than 100 entries)")
)
ax.set_xlabel("Journal")
ax.set_ylabel("Number of entries");

# ResearchCollection.xlsx


In [None]:
# Load the ResearchCollection spreadsheet (first sheet, default settings).
rc = pd.read_excel("ResearchCollection.xlsx")

In [None]:
# Preview the first rows of rc.
rc.head()

In [None]:
# Number of missing values in each column of rc
rc.isnull().sum(axis=0)

# Journal papers

In [None]:
# Load the journal-papers export (comma-separated, default settings).
jp = pd.read_csv("journal-2020-10-12.csv")

In [None]:
# Preview the first rows of the journal papers.
jp.head()

In [None]:
# (rows, columns) of the journal-papers table
jp.shape
# Uncomment to display up to 120 columns when previewing wide frames:
#pd.set_option('display.max_columns', 120)

In [None]:
# Print all column names as a plain array.
print(jp.columns.values)

In [None]:
# Number of journal papers that have an abstract.
has_abstract = jp["dc.description.abstract"].notnull()
has_abstract.sum()

In [None]:
# The abstracts themselves, for the papers that have one.
jp.loc[jp["dc.description.abstract"].notnull(), "dc.description.abstract"]

## Exploring sub-selection options

#### This is code that is dumped from Data import

In [None]:
# Number of rows per department, largest first.
# NOTE(review): rc_m as built in this notebook (pl_f merged with rcp_ff_e) has no
# "department_name" column; presumably this cell expects the rc_m produced by the
# "Data import" notebook - confirm before running.
rc_m.groupby("department_name").size().sort_values(ascending=False)

In [None]:
# Per title: number of rows with a non-null department code.
# NOTE(review): "department_code" and "title" are not columns of the rc_m built in
# this notebook; presumably they come from the "Data import" version - confirm.
titlexdep = pd.DataFrame(rc_m[["department_code","title"]].groupby("title").count())

In [None]:
# Turn the "title" index back into a regular column.
titlexdep= titlexdep.reset_index()

In [None]:
# Rename the count column: num_dept = number of department rows for the title.
titlexdep.columns = ["title","num_dept"]

In [None]:
# Attach num_dept to every row of rc_m (right join keeps all rc_m rows).
# NOTE(review): relies on rc_m having a "title" column, which the rc_m built in
# this notebook does not have - confirm the intended input.
temp_df = titlexdep.merge(rc_m,how="right",left_on="title",right_on="title")

In [None]:
# Average num_dept per department, highest first.
temp_df[["department_name","num_dept"]].groupby("department_name").mean().sort_values(by="num_dept",ascending=False)

In [None]:
# Number of missing values in each column of rc_m
rc_m.isnull().sum(axis=0)

**Complete file**

This file matches the D-ARCH file through the departments' names. <br>
Integrates departments info and websites links.

Andreas: Is this data needed at this point?

--- 

# Creating a final structure

(In my opinion) the best final data structure is a dictionary (i.e. a tree), which can be dumped into a JSON file that is easy to load into a neo4j graph. <br>
This section will contain all the code to save the selected and integrated data into a dictionary.

Andreas: Yes good idea. But we could also just directly dump a dataframe into a json right? Since pandas supports nested structures. 

TODO:
* Write a function that joins authors in rcp with professors in pl
* Select relevant fields in first stage of the graph building and put in dataframe or dictionary
* Export as json

# Research-data-2020-10-12

In [None]:
# Load the research-data export (comma-separated, default settings).
papers = pd.read_csv("research-data-2020-10-12.csv")

In [None]:
# (rows, columns) of the research-data table
papers.shape

In [None]:
# Column names as a set (unordered, duplicates collapsed)
set(papers.columns)

In [None]:
# One-column frame holding only the abstracts.
abstracts = pd.DataFrame(papers["dc.description.abstract"])

In [None]:
# Move the existing index into a regular "index" column and renumber the rows.
abstracts = abstracts.reset_index(level=0)

In [None]:
# Discard the old index column and name the remaining column "text".
abstracts = abstracts.drop("index", axis=1)
abstracts.columns = ["text"]
abstracts.head(3)

In [None]:
# (rows, columns) of the abstracts frame
abstracts.shape

In [None]:
# Load the 2020-10-12 exports, one CSV per publication category.
# journ reads the same file that was loaded as jp earlier in the notebook.
ed = pd.read_csv("educational-2020-10-12.csv")
books = pd.read_csv("books-2020-10-12.csv")
conf = pd.read_csv("conference-2020-10-12.csv")
journ = pd.read_csv("journal-2020-10-12.csv")
oth = pd.read_csv("other-2020-10-12.csv")
pap = pd.read_csv("papers-2020-10-12.csv")
pat = pd.read_csv("patents-2020-10-12.csv")
pres = pd.read_csv("presentations-2020-10-12.csv")

In [None]:
# All category frames in a fixed order; the per-category lists below follow this order.
research_data = [ed,books,conf,journ,oth,pap,pat,pres]

In [None]:
# Total number of records across all categories.
tot = sum(len(frame) for frame in research_data)
tot

In [None]:
# Total number of records that have an abstract, across all categories.
non_null = sum(
    sum(frame["dc.description.abstract"].notnull()) for frame in research_data
)
non_null

In [None]:
# Number of records with an abstract, one entry per category (same order as research_data).
non_null_all = [
    sum(frame["dc.description.abstract"].notnull()) for frame in research_data
]
non_null_all

In [None]:
# Average abstract length per category, measured in space-separated tokens
# and computed over the records that have an abstract.
avg_len = [
    np.mean(
        frame["dc.description.abstract"]
        .dropna()
        .apply(lambda text: len(text.split(" ")))
    )
    for frame in research_data
]
avg_len