# Extraction and insertion

This notebook contains only the **final, necessary** code for the data extraction (cleaning/filtering/merging) and data import (into the graph database neo4j).

In [None]:
import pandas as pd
import os 
import numpy as np
import re
from zipfile import ZipFile

## Extraction

### 1. Filtering 

In [None]:
# read all data files, stratified by publication type

# The archive and every member stream are opened in context managers so the
# file handles are released as soon as each CSV has been parsed into memory.
files = {}
with ZipFile('RC export 2020-10-12.zip') as zip_file:
    for text_file in zip_file.infolist():
        # only the CSV members are data files; anything else in the archive is skipped
        if text_file.filename.endswith('.csv'):
            with zip_file.open(text_file.filename) as csv_stream:
                files[text_file.filename] = pd.read_csv(csv_stream)

# one DataFrame per publication type, keyed by the member file name
ed = files["educational-2020-10-12.csv"]
books = files["books-2020-10-12.csv"]
conf = files["conference-2020-10-12.csv"]
journ = files["journal-2020-10-12.csv"]
oth = files["other-2020-10-12.csv"]
pap = files["papers-2020-10-12.csv"]
pat = files["patents-2020-10-12.csv"]
pres = files["presentations-2020-10-12.csv"]

In [None]:
# we restrict ourselves to only book chapters, conference papers, journal papers and other papers

selected_frames = [books, conf, journ, pap]
research_data_df = pd.concat(selected_frames, ignore_index=True)

# number of rows overall, and number of rows with a non-missing abstract
n_documents = research_data_df.shape[0]
n_with_abstract = research_data_df["dc.description.abstract"].notna().sum()

print(research_data_df.shape)
print("There are ", n_documents, "documents, ", n_with_abstract, " of which have abstracts")
research_data_df.head()

In [None]:
# take a look at the columns (shown as a set, so the display order is not the column order)
set(research_data_df.columns)

In [None]:
# let's remove all irrelevant columns

# columns of the raw export that the rest of the notebook works with
kept_columns = [
    "id",
    "dc.contributor.author",
    "dc.title",
    "dc.date.issued",
    "dc.type",
    "dc.description.abstract",
    "dc.language.iso",
    "ethz.journal.title",
]

# raw export name -> short readable name ("id" keeps its name)
readable_names = {
    "dc.contributor.author": "author",
    "dc.date.issued": "publication date",
    "dc.title": "title",
    "dc.type": "publication type",
    "dc.description.abstract": "abstract",
    "dc.language.iso": "language",
    "ethz.journal.title": "journal",
}

rd_f = research_data_df[kept_columns].rename(columns=readable_names)
rd_f.head()

### 2. Cleaning

In [None]:
# define helper functions for cleaning

def separate_names(names):
    """Split a string of the form name1||name2||name3||... into a list of names.

    Anything that is not a string (for example a missing value) yields None.
    """
    if not isinstance(names, str):
        return None
    return names.split("||")

def date_to_year(date):
    """Get the dates into a single format (YYYY).

    Returns the first run of four consecutive digits found in `date` as a
    string. Returns None when `date` is not a string or contains no such run.
    """
    if not isinstance(date, str):
        return None
    # raw string: "\d" in a normal string literal is an invalid escape sequence
    year_match = re.search(r"\d{4}", date)
    return year_match.group(0) if year_match else None

def date_to_year_and_month(date):
    """Get dates into format YYYY-MM.

    Returns the first "YYYY-MM" occurrence found in `date`. A string without
    such an occurrence (for example a bare year) is returned unchanged.
    Returns None when `date` is not a string.
    """
    # if needed later
    if not isinstance(date, str):
        return None
    # raw string: "\d" in a normal string literal is an invalid escape sequence
    month_match = re.search(r"\d{4}-\d{2}", date)
    if month_match:
        return month_match.group(0)
    return str(date)

In [None]:
# Date format: the normaliser is chosen in one place so it can be swapped
# for date_to_year_and_month if month resolution is needed
date_format = date_to_year

# compute both cleaned columns first ...
author_array = rd_f["author"].apply(separate_names)
date_array = rd_f["publication date"].apply(date_format)

# ... then write them back over the raw columns
rd_f["author"] = author_array
rd_f["publication date"] = date_array

rd_f.head()

In [None]:
# a little detour: check how many papers were published each year

(
    rd_f[["id", "publication date"]]
    .groupby("publication date")
    .count()
    .sort_values("publication date", ascending=False)
    .rename({'id': 'count'}, axis=1)
    .head(20)
)

In [None]:
# now explode on author field: each element of an "author" list becomes its own row,
# with the other columns of that publication repeated on every such row

rd_fe = rd_f.explode("author")
print("Shape changed from ", rd_f.shape, " to ", rd_fe.shape)
rd_fe.head()

### 3. Merge with department and organisation data

#### 3.1 Organisation data

In [None]:
# load the professor list spreadsheet; the next cell uses its
# "Name", "First name", "Org. unit code" and "Organisation" columns
pl = pd.read_excel("ETH Professor list.xlsx")

In [None]:
# create professor column and keep only the relevant columns

# "<Name>, <First name>" is the form that is later matched against the author field
professor_names = pl["Name"] + ", " + pl["First name"]

pl = pl.assign(Professor=professor_names)[["Professor", "Org. unit code", "Organisation"]]
pl = pl.rename(columns={
    "Professor": "professor",
    "Org. unit code": "organisation unit code",
    "Organisation": "organisation",
})
print(pl.shape)
pl.head()

In [None]:
# merge with exploded research collection
# how="outer" keeps every row of both frames: authors without a matching professor get
# NaN in the professor/organisation columns, and professors without a matching author
# get NaN in the publication columns. The "Diff" printed below is the number of rows
# the merge added to the exploded research collection.

rd_m = rd_fe.merge(pl,how="outer",right_on="professor",left_on="author")
print("Shape of research collection changed from ", rd_fe.shape, " to ", rd_m.shape, ". Diff: ", 
      rd_m.shape[0]-rd_fe.shape[0])
rd_m.head(3)

#### 3.2 Leitzahl data

In [None]:
# load the cost centre / department mapping; it is decoded as ISO-8859-1
# (presumably because the file is not valid UTF-8 -- confirm)
leitzahl_mapping = pd.read_csv("cost_centre_dept_mapping.csv", encoding = "ISO-8859-1")
leitzahl_mapping.head()

In [None]:
# distinct values of the NAME column
set(leitzahl_mapping["NAME"])

In [None]:
# note: we have no department-specific code/leitzahl
# .copy() makes ln an independent frame: the cells below assign to its columns,
# which on a plain column selection triggers pandas' SettingWithCopyWarning
ln = leitzahl_mapping[["NAME","DEPT"]].copy()

In [None]:
# map department code to department names
dep_map = {
    "AGRL": "Agricultural Sciences",
    "ANBI": "Applied Biosciences",
    "ARCH": "Architecture",
    "BAUG": "Civil, Environmental and Geomatic Engineering",
    "BIOL": "Biology",
    "BSSE": "Biosystems Science and Engineering",
    "CHAB": "Chemistry and Applied Biosciences",
    "ERDW": "Earth Sciences",
    "GESS": "Humanities, Social and Political Sciences",
    "HEST": "Health Sciences and Technology",
    "INFK": "Computer Science",
    "ITET": "Information Technology and Electrical Engineering",
    "MATH": "Mathematics",
    "MATL": "Materials",
    "MAVT": "Mechanical and Process Engineering",
    "MTEC": "Management, Technology and Economics",
    "PHYS": "Physics",
    "USYS": "Environmental Systems Science",
}


def transform_dep_code(code):
    """Return the full department name for a department code such as 'MATH'.

    Raises KeyError when `code` is not a key of `dep_map`.
    """
    return dep_map[code]

In [None]:
# quick sanity check of transform_dep_code (defined in the previous cell;
# the second, identical definition that used to live here was redundant)
transform_dep_code('MATH')

In [None]:
# replace the department codes by the full department names.
# transform_dep_code looks the value up in dep_map, so a value that is not one of
# its keys raises KeyError -- which also means this cell cannot be run twice
# (after the first run the column holds names, not codes)
ln["DEPT"] = ln["DEPT"].apply(transform_dep_code)

In [None]:
# check the result of the code -> name replacement
ln.head()

In [None]:
# now try with deleting parentheses after the names

def delete_parenthesis(name):
    """Cut a name off at its first parenthesised tag such as " (em.)".

    A tag is a whitespace character followed by parentheses that enclose only
    letters and dots. The text before the first tag is returned; a string
    without a tag is returned unchanged. Returns None when `name` is not a
    string.
    """
    if not isinstance(name, str):
        return None
    # raw string: "\s" and "\(" are invalid escape sequences in a normal string
    # literal; maxsplit=1 because only the text before the first tag is used
    return re.split(r'\s\([a-zA-Z.]+\)', name, maxsplit=1)[0]

print(delete_parenthesis('Schlunegger (ehem.)'))
print(delete_parenthesis('Fontana, M. (em.)'))
print(delete_parenthesis('Smith, Roy (Tit.)'))
print(delete_parenthesis('Baccini, Peter (em.)'))

In [None]:
# apply to leitzahl names, merge and check coverage

ln["NAME"] = ln["NAME"].apply(delete_parenthesis)
rd_m2 = rd_m.merge(ln, how="left", left_on="author", right_on="NAME")

# rows for which both the author and the department are known
has_author_and_dept = rd_m2["author"].notna() & rd_m2["DEPT"].notna()
print("Number of entries with complete author and department: ", has_author_and_dept.sum())

### 4. Final touches

In [None]:
# drop the professor and name join columns, then rename for ease of neo4j import

# column name with spaces / upper case -> name used as line.<field> in the LOAD CSV queries
neo4j_names = {
    "publication date": "publication_date",
    "publication type": "publication_type",
    "organisation unit code": "organisation_unit_code",
    "DEPT": "department",
}

rd_final = rd_m2.drop(columns=["professor", "NAME"]).rename(columns=neo4j_names)
rd_final.head()

In [None]:
# issue: quotes need to be escaped in neo4j
# the following function helps us avoid errors when importing

def add_quote(abstract):
    """Escape the double quotes of an abstract by doubling them.

    A quote that is already backslash-escaped (\\") is treated like a plain
    quote, so both forms end up as exactly two quote characters.
    Returns None when `abstract` is not a string.
    """
    if not isinstance(abstract, str):
        return None
    # Strip the backslash first and double afterwards. Replacing \" by "" and then
    # doubling every quote would turn an escaped quote into four quote characters.
    return abstract.replace('\\"', '"').replace('"', '""')

# escape the quotes of every abstract; non-string (missing) abstracts become None
rd_final["abstract"] = rd_final["abstract"].apply(add_quote)

## Import/export :):

In [None]:
# write the merged table to the file that the LOAD CSV statements below read
# (file:///graph_data_final.csv); index=False keeps the row index out of the file
rd_final.to_csv("graph_data_final.csv", index=False)

### Neo4j import details

The following nodes will be created: 
- **person** [name, gender]
- **publication** [id, title, date, type, abstract, journal]
- **organisation** [name, code]
- **department** [name, code]


--- 


**The commands**

    
To load the csv you first have to <u>copy it into your Neo4j base directory</u>. More info [here](https://neo4j.com/docs/cypher-manual/current/clauses/load-csv/#load-csv-import-data-from-a-csv-file). For instance, do the following: 

```cp .\graph_data_final.csv C:/Users/Giulia/.Neo4jDesktop/neo4jDatabases/database-befe90d3-7991-457e-9671-62c55c830654/installation-3.5.12/import```

<u>Constraints first</u>

The constraints are here to make sure we don't create duplicate nodes.

    CREATE CONSTRAINT ON (c:Person) ASSERT c.name IS UNIQUE;
    CREATE CONSTRAINT ON (c:Organisation) ASSERT c.code IS UNIQUE;
    CREATE CONSTRAINT ON (c:Publication) ASSERT c.title IS UNIQUE;
    CREATE CONSTRAINT ON (c:Department) ASSERT c.name IS UNIQUE; // or c.code
    CREATE INDEX ON :Publication(id);

 
Now we'll <u>load the data</u> in a very lightweight manner: 

1) person nodes <br>
```
    LOAD CSV WITH HEADERS FROM "file:///graph_data_final.csv" AS line
    WITH line WHERE line.author IS NOT NULL
    MERGE (person:Person {name: line.author})
```

2) publication nodes (this might take a while) <br>
       
        LOAD CSV WITH HEADERS FROM "file:///graph_data_final.csv" AS line
        WITH line where line.id IS NOT NULL
        MERGE (publication: Publication {title: line.title})
        SET publication.id=line.id,             
            publication.type=line.publication_type, 
            publication.date=date(line.publication_date),
            publication.abstract=line.abstract,
            publication.journal=line.journal;

        
3) organisation nodes <br> 
    
        LOAD CSV WITH HEADERS FROM "file:///graph_data_final.csv" AS line
        WITH line where line.organisation_unit_code IS NOT NULL
        MERGE (organisation:Organisation {code:line.organisation_unit_code})
        SET organisation.name=line.organisation;
                                          
            
4) department nodes <br> 
    
        LOAD CSV WITH HEADERS FROM "file:///graph_data_final.csv" AS line
        WITH line where line.department IS NOT NULL
        MERGE (department:Department {name:line.department})
        SET department.name=line.department;
                                                                
            
        
5) finally all the edges <br> 
        
        LOAD CSV WITH HEADERS FROM "file:///graph_data_final.csv" AS line
        MATCH (person:Person {name:line.author}), 
               (publication:Publication {id:line.id})
        MERGE (person)-[:PUBLISHED]->(publication)
        
        
        LOAD CSV WITH HEADERS FROM "file:///graph_data_final.csv" AS line
        MATCH (person:Person {name:line.author}),
               (organisation:Organisation {code:line.organisation_unit_code})
        MERGE (person)-[:BELONGS_TO]->(organisation)
        
        
        LOAD CSV WITH HEADERS FROM "file:///graph_data_final.csv" AS line
        MATCH (person:Person {name:line.author}),
               (department:Department {name:line.department})
        MERGE (person)-[:WORKS_IN]->(department)
        
        Alternative without leitzahl:
        LOAD CSV WITH HEADERS FROM "file:///graph_data_final.csv" AS line
        MATCH (person:Person {name:line.author}),
               (department:Department {name:line.department})
        MERGE (person)-[:WORKS_IN]->(department)
        
        
Note: in case you did something wrong and you want to erase the network here's the query: 

        MATCH (n)
        DETACH DELETE n;

    


## Export for topic modeling

In [None]:
# select only english publications (rows whose language value is exactly "en")

rd_full_en = rd_m2[rd_m2["language"] == "en"]

In [None]:
# drop duplicates (keeping only the first): after the author explode a publication id
# can appear on several rows, and only one row per id is needed here

abstracts_data = rd_full_en.drop_duplicates(subset = ["id"], keep = "first")
print(abstracts_data.shape)

In [None]:
# keep only the entries that have an abstract

has_abstract = abstracts_data["abstract"].notna()
print("Number of abstracts: ", has_abstract.sum())
abstracts_only = abstracts_data[has_abstract]

In [None]:
# export only abstract, id and title

abstracts_only[["abstract","id","title"]].to_csv("abstracts_eng.csv", index = False)

## Function for estimating storage requirements

In [None]:
def compute_storage_upper_bound(nrpubs, nrdeps, nrorgs, nrpeople):
    """Give a rough upper bound of the storage (in GB) required for the graph.

    Parameters
    ----------
    nrpubs : number of publication nodes
    nrdeps : number of department nodes
    nrorgs : number of organisation nodes
    nrpeople : number of person nodes

    Returns
    -------
    Estimated size of nodes, edges and indices in GB (bytes / 10**9).
    With all counts equal to zero the estimate is 0.
    """
    # Record sizes in bytes, from
    # https://neo4j.com/developer/kb/understanding-data-on-disk/
    node_size = 15        # per node
    edge_size = 34        # per edge
    attr_size = 41        # per attribute
    str_attr_size = 128   # per string or array attribute

    # assume that the number of topics grows logarithmically with the number of publications;
    # the count is clamped to 1 so that zero publications give zero topics instead of log(0) = -inf
    nrtopics = np.log(np.maximum(nrpubs, 1))

    # for each type of node, multiply the number of nodes with the storage required
    # for the node and its attributes
    pub_node_size = node_size + 2*attr_size + 4*str_attr_size   # 2 plain + 4 string attributes
    small_node_size = node_size + attr_size + str_attr_size     # 1 plain + 1 string attribute
    node_storage = (nrpubs*pub_node_size + nrdeps*small_node_size + nrorgs*small_node_size
                    + nrpeople*small_node_size + nrtopics*small_node_size)

    dep_people_edges = nrdeps*40 # assume max 40 professors per department on average
    org_people_edges = nrorgs*5 # assume max 5 professors per organisation on average
    pub_people_edges = nrpubs*10 # assume max 10 authors per publication on average
    # NOTE(review): the factor is 20, while the earlier comment said "max 10 topics" -- confirm which is intended
    pub_topic_edges = nrpubs*20 # assume max 20 topics per publication on average

    # for each type of edge, multiply the number of edges with the storage required for the edge
    # and its attributes (publication-topic edges are counted with one string/array attribute)
    edge_storage = (dep_people_edges*edge_size + org_people_edges*edge_size
                    + pub_people_edges*edge_size + pub_topic_edges*(edge_size + str_attr_size))

    # storage required for indices
    # following neo4j heuristics: average property value size * (1/3)
    # we have four indices, one for each node type (publication, department, organisation, person)
    avg_prop_size = (6*attr_size + 9*str_attr_size)/15
    index_storage = avg_prop_size*(nrpubs + nrdeps + nrorgs + nrpeople)*(1/3)

    # add and return in GB
    return (node_storage + edge_storage + index_storage)/10**9

In [None]:
# example: 170000 publications, 16 departments, 400 organisations, 10000 people
compute_storage_upper_bound(170000, 16, 400, 10000)