In [1]:
"""
Temporary routine for generation of datasets
for demonstration purposes

It is a "Fake Data Mediator" for use with the first
version of the Interactive Topic Model Trainer
"""

import argparse
import json
import os
from pathlib import Path
#from langdetect import detect
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import StringType

# Simulate downloads from HDFS

## CORDIS UC3M

In [2]:
# CORDIS configuration: input tables, identifier column, selected fields,
# optional filter and output location read by the "Generate parquet files" cells.
# Every configuration cell assigns these same variable names, so only the one
# executed last takes effect.
# parquet_table = '/export/ml4ds/IntelComp/Datalake/CORDIS/20230425/parquet/projects.parquet' #In HDFS
parquet_table = 'file:///export/usuarios_ml4ds/jarenas/github/IntelComp/KPIs/WP3-pipelines/datasets/AI/cordis_Kwds3_AI.parquet' #In NFS
# lemmas_table = '/export/ml4ds/IntelComp/Datalake/CORDIS/20230425/parquet/projects_NLP.parquet' #In HDFS
lemmas_table = 'file:///export/usuarios_ml4ds/jarenas/github/IntelComp/KPIs/WP3-pipelines/datasets/AI/cordis_Kwds3_AI_NLP.parquet' #In NFS
id_fld = 'projectID'  # identifier column; aliased to `id` when the tables are read
selectFields = 'title, objective, acronym, frameworkProgramme, startDate, ecMaxContribution, euroSciVocCode, countryContr, coordinatorCountry, publicationID, patentID, fundingScheme, topic'
filterCondition = '' # optional SQL WHERE condition, e.g. 'frameworkProgramme = "HORIZON"'
path_dataset = "file:///export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/fromHDFS/cordis_Kwds3_AI.parquet" #In NFS

## Semantic Scholar

### Cancer dataset pregenerated from PMID given by HCERES

In [17]:
# Semantic Scholar / Cancer configuration for the "Generate parquet files" cells.
# parquet_table = '/export/ml4ds/IntelComp/Datalake/semanticscholar/20230418/parquet/papers_Cancer.parquet' # In HDFS
parquet_table = 'file:///export/usuarios_ml4ds/jarenas/github/IntelComp/KPIs/WP3-pipelines/datasets/cancer/S2_Cancer.parquet' #In NFS
# lemmas_table = '/export/ml4ds/IntelComp/Datalake/semanticscholar/20230418/parquet/papers_Cancer_NLP.parquet' # In HDFS
lemmas_table = 'file:///export/usuarios_ml4ds/jarenas/github/IntelComp/KPIs/WP3-pipelines/datasets/cancer/S2_Cancer_NLP.parquet' #In NFS (file:// path)
id_fld = 'id'  # identifier column of both tables
selectFields = 'doi, title, paperAbstract, year' # not selected: fieldsOfStudy, publicationtype, isopenaccess, referencecount, citationcount, influencialcitationcount
filterCondition = ''  # empty string: no WHERE clause is added
path_dataset = "file:///export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/fromHDFS/S2_Cancer.parquet" #In NFS

### AI dataset generated from AI keywords

In [23]:
# Semantic Scholar / AI-keywords configuration for the "Generate parquet files" cells.
parquet_table = 'file:///export/usuarios_ml4ds/jarenas/github/IntelComp/KPIs/WP3-pipelines/datasets/AI/S2_Kwds3_AI.parquet' #In NFS
lemmas_table = 'file:///export/usuarios_ml4ds/jarenas/github/IntelComp/KPIs/WP3-pipelines/datasets/AI/S2_Kwds3_AI_NLP.parquet' #In NFS (file:// path)
id_fld = 'id'  # identifier column of both tables
selectFields = 'doi, title, paperAbstract, year' # not selected: fieldsOfStudy, publicationtype, isopenaccess, referencecount, citationcount, influencialcitationcount
filterCondition = ''  # empty string: no WHERE clause is added
path_dataset = "file:///export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/fromHDFS/S2_Kwds3_AI.parquet" #In NFS

### Whole dataset filtering according to FOS

In [None]:
#parquet_table = '/export/ml4ds/IntelComp/Datalake/SemanticScholar/20220201/papers.parquet'
#selectFields = 'title, paperAbstract, doi, year, fieldsOfStudy'
#filterCondition = "array_contains(fieldsOfStudy, 'Computer Science')"
#path_dataset = "/export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/fromHDFS_scalability/S2CS_1.parquet"
#perc = float(1)/100

## OpenAIRE

### Cancer dataset pregenerated from PMID given by HCERES

In [28]:
# OpenAIRE / Cancer configuration for the "Generate parquet files" cells.
# NOTE(review): the two commented-out HDFS paths below point to the
# semanticscholar folder; they look copied from the Semantic Scholar cell --
# confirm the OpenAIRE location before re-enabling them.
# parquet_table = '/export/ml4ds/IntelComp/Datalake/semanticscholar/20230418/parquet/papers_Cancer.parquet' # In HDFS
parquet_table = 'file:///export/usuarios_ml4ds/jarenas/github/IntelComp/KPIs/WP3-pipelines/datasets/cancer/OA_Cancer.parquet' #In NFS
# lemmas_table = '/export/ml4ds/IntelComp/Datalake/semanticscholar/20230418/parquet/papers_Cancer_NLP.parquet' # In HDFS
lemmas_table = 'file:///export/usuarios_ml4ds/jarenas/github/IntelComp/KPIs/WP3-pipelines/datasets/cancer/OA_Cancer_NLP.parquet' #In NFS (file:// path)
id_fld = 'id'  # identifier column of both tables
selectFields = 'title, description' #, fieldsOfStudy, publicationtype, isopenaccess, referencecount, citationcount, influencialcitationcount'
filterCondition = ''  # empty string: no WHERE clause is added
path_dataset = "file:///export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/fromHDFS/OA_Cancer.parquet" #In NFS

### AI dataset generated from AI keywords

In [34]:
# OpenAIRE / AI-keywords configuration for the "Generate parquet files" cells.
parquet_table = 'file:///export/usuarios_ml4ds/jarenas/github/IntelComp/KPIs/WP3-pipelines/datasets/AI/OA_Kwds3_AI.parquet' #In NFS
lemmas_table = 'file:///export/usuarios_ml4ds/jarenas/github/IntelComp/KPIs/WP3-pipelines/datasets/AI/OA_Kwds3_AI_NLP.parquet' #In NFS (file:// path)
id_fld = 'id'  # identifier column of both tables
selectFields = 'title, description' #, fieldsOfStudy, publicationtype, isopenaccess, referencecount, citationcount, influencialcitationcount'
filterCondition = ''  # empty string: no WHERE clause is added
path_dataset = "file:///export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/fromHDFS/OA_Kwds3_AI.parquet" #In NFS

## Patstat

### Cancer dataset generated from CPCs identified by HCERES

In [2]:
# PATSTAT / Cancer (CPC-based) configuration for the "Generate parquet files" cells.
parquet_table = '/export/ml4ds/IntelComp/Datalake/patstat/2023_Spring/parquet/patents_Cancer_CPC.parquet' # In HDFS
lemmas_table = '/export/ml4ds/IntelComp/Datalake/patstat/2023_Spring/parquet/patents_Cancer_CPC_NLP.parquet' # In HDFS
id_fld = 'appln_id'  # identifier column; aliased to `id` when the tables are read
selectFields = 'appln_title, appln_abstract, docdb_family_id, appln_filing_year, earliest_filing_year, granted, appln_auth, receiving_office, ipr_type'
filterCondition = ''  # empty string: no WHERE clause is added
# NOTE(review): unlike the other NFS outputs this path has no `file://` scheme
# and lives under /export/clusterdata, which is not the folder scanned when
# datasetMeta.json is generated -- confirm this is intended.
path_dataset = "/export/clusterdata/jarenas/github/IntelComp/ITMT/topicmodeler/fromHDFS/Patstat_Cancer_CPC.parquet" #In NFS

### Cancer dataset generated from keywords given by HCERES

In [7]:
# PATSTAT / Cancer (keyword-based) configuration for the "Generate parquet files" cells.
parquet_table = '/export/ml4ds/IntelComp/Datalake/patstat/2023_Spring/parquet/patents_Cancer_Kwds.parquet' # In HDFS
lemmas_table = '/export/ml4ds/IntelComp/Datalake/patstat/2023_Spring/parquet/patents_Cancer_Kwds_NLP.parquet' # In HDFS
id_fld = 'appln_id'  # identifier column; aliased to `id` when the tables are read
selectFields = 'appln_title, appln_abstract, docdb_family_id, appln_filing_year, earliest_filing_year, granted, appln_auth, receiving_office, ipr_type, Kwd_count'
filterCondition = ''  # empty string: no WHERE clause is added
# NOTE(review): unlike the other NFS outputs this path has no `file://` scheme
# and lives under /export/clusterdata, which is not the folder scanned when
# datasetMeta.json is generated -- confirm this is intended.
path_dataset = "/export/clusterdata/jarenas/github/IntelComp/ITMT/topicmodeler/fromHDFS/Patstat_Cancer_Kwds.parquet" #In NFS

### AI dataset generated from AI keywords

In [43]:
# PATSTAT / AI-keywords configuration for the "Generate parquet files" cells.
parquet_table = 'file:///export/usuarios_ml4ds/jarenas/github/IntelComp/KPIs/WP3-pipelines/datasets/AI/PATSTAT_Kwds3_AI.parquet' #In NFS
lemmas_table = 'file:///export/usuarios_ml4ds/jarenas/github/IntelComp/KPIs/WP3-pipelines/datasets/AI/PATSTAT_Kwds3_AI_NLP.parquet' #In NFS (file:// path)
id_fld = 'appln_id'  # identifier column; aliased to `id` when the tables are read
selectFields = 'docdb_family_id, appln_title, appln_abstract, appln_filing_year, earliest_filing_year, granted, appln_auth, receiving_office, ipr_type' 
filterCondition = ''  # empty string: no WHERE clause is added
path_dataset = "file:///export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/fromHDFS/PATSTAT_Kwds3_AI.parquet" #In NFS

## HFRI projects

In [4]:
# HFRI projects configuration for the "Generate parquet files" cells.
parquet_table = 'file:///export/usuarios_ml4ds/jarenas/github/IntelComp/KPIs/WP3-pipelines/datasets/HFRI/202302_HFRI.parquet' #In NFS
lemmas_table = 'file:///export/usuarios_ml4ds/jarenas/github/IntelComp/KPIs/WP3-pipelines/datasets/HFRI/202302_HFRI_NLP.parquet' #In NFS (file:// path)
id_fld = 'id'  # identifier column of both tables
# Column names containing spaces or parentheses are back-quoted because the
# list is pasted verbatim into a SQL SELECT statement.
selectFields = 'Work, Title, Summary, keywords, Budget, Duration, `Host Institutions (Greek)`, Action, field1, `field1 code`, `field1 name`, field2, `Gender IP`' 
filterCondition = ''  # empty string: no WHERE clause is added
path_dataset = "file:///export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/fromHDFS/HFRI.parquet" #In NFS

## Generate parquet files

In [5]:
# Reuse the active Spark session, or create one when none exists, so that this
# cell also runs on a fresh kernel where `spark` has not been pre-defined.
spark = SparkSession.builder.getOrCreate()

# Read the main table: the identifier column is exposed as `id`, followed by
# the fields listed in `selectFields` (empty entries, e.g. from a trailing
# comma, are ignored so that the generated SQL stays valid).
flds = [el.strip() for el in selectFields.split(',') if el.strip()]
query = "SELECT " + (", ").join([id_fld + " AS id"] + flds) + \
                " FROM parquet.`" + parquet_table + "`"

# Add filtering condition to SELECT clause if necessary
if filterCondition.strip():
    query += " WHERE " + filterCondition
dataset = spark.sql(query)

# Quick visual check of the size and content of what was loaded
print('Number of documents in dataset:', dataset.count())
dataset.show(n=2, truncate=120, vertical=True)

23/09/29 18:23:30 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
23/09/29 18:23:30 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
23/09/29 18:23:34 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0
23/09/29 18:23:34 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore jarenas@192.168.148.147
23/09/29 18:23:34 WARN ObjectStore: Failed to get database global_temp, returning NoSuchObjectException
23/09/29 18:23:34 WARN ObjectStore: Failed to get database parquet, returning NoSuchObjectException
                                                                                

Number of documents in dataset: 688


[Stage 4:>                                                          (0 + 1) / 1]

-RECORD 0---------------------------------------------------------------------------------------------------------------------------------------------
 id                        | 1                                                                                                                        
 Work                      | 664                                                                                                                      
 Title                     | Development of sustainable chemoenzymatic processes for the production of optically active amines from alcohols and a... 
 Summary                   | Synthesis of optically pure amines has attracted significant research interest, due to the fact that this group of co... 
 keywords                  | cascade reactions;graphene oxide;amine transaminase;immobilization;bioprocess;optically pure amine;API synthons;chemo... 
 Budget                    | 200000.0                                                         

                                                                                

In [6]:
# Load the output of the NLP processes (raw text and lemmas), exposing the
# identifier column under the common name `id`
query = f"SELECT {id_fld} AS id, raw_text, lemmas FROM parquet.`{lemmas_table}`"
lemmas_df = spark.sql(query)

# Quick visual check of the size and content of what was loaded
print('Number of documents in dataset:', lemmas_df.count())
lemmas_df.show(2, truncate=120, vertical=True)

23/09/29 18:24:23 WARN ObjectStore: Failed to get database parquet, returning NoSuchObjectException
                                                                                

Number of documents in dataset: 687


[Stage 9:>                                                          (0 + 1) / 1]

-RECORD 0----------------------------------------------------------------------------------------------------------------------------
 id       | 1                                                                                                                        
 raw_text | Development of sustainable chemoenzymatic processes for the production of optically active amines from alcohols and a... 
 lemmas   | development sustainable chemoenzymatic process production active amine alcohol alkynes synthesis pure amine attract_s... 
-RECORD 1----------------------------------------------------------------------------------------------------------------------------
 id       | 2                                                                                                                        
 raw_text | Zr4+ MOFs with Excellent Adsorption Capacity of Toxic Ions from Water In the proposed research, we plan to demonstrat... 
 lemmas   | mof excellent adsorption capacity toxic ion water 

                                                                                

In [7]:
# Join tables
# Both frames already expose the identifier as `id` (it is aliased in their
# SELECT statements), so no extra rename is needed, and joining on the column
# name keeps a single `id` column in the result.
# Left join: rows of the main table without NLP output are kept, with null
# `raw_text` / `lemmas`.
# The guard makes the cell safe to re-run: joining a second time would add
# duplicated `raw_text` / `lemmas` columns to `dataset`.
if "lemmas" not in dataset.columns:
    dataset = dataset.join(lemmas_df, on="id", how="left")

print('Number of documents in dataset:', dataset.count())
dataset.show(n=10, truncate=120, vertical=True)

                                                                                

Number of documents in dataset: 688


                                                                                

-RECORD 0---------------------------------------------------------------------------------------------------------------------------------------------
 id                        | 1                                                                                                                        
 Work                      | 664                                                                                                                      
 Title                     | Development of sustainable chemoenzymatic processes for the production of optically active amines from alcohols and a... 
 Summary                   | Synthesis of optically pure amines has attracted significant research interest, due to the fact that this group of co... 
 keywords                  | cascade reactions;graphene oxide;amine transaminase;immobilization;bioprocess;optically pure amine;API synthons;chemo... 
 Budget                    | 200000.0                                                         

In [38]:
# Sampling dataset if necessary
# NOTE: the code below is wrapped in a string literal, so it is NOT executed;
# the cell only evaluates that string (which is why it is echoed as output).
# `perc` is assigned only in the commented-out "Whole dataset filtering
# according to FOS" configuration, so it has to be defined before this block
# is re-enabled.
"""
if perc < 1:
    dataset = dataset.sample(fraction=perc)

print('Number of documents in dataset:', dataset.count())
#dataset.show(n=10, truncate=120, vertical=True)
"""

"\nif perc < 1:\n    dataset = dataset.sample(fraction=perc)\n\nprint('Number of documents in dataset:', dataset.count())\n#dataset.show(n=10, truncate=120, vertical=True)\n"

In [8]:
# Write the joined dataset as parquet to `path_dataset`, replacing any
# previous export stored at that location
dataset.write.mode("overwrite").parquet(path_dataset)

                                                                                

# Generate datasetMeta.json

In [9]:
import pandas as pd
import pyarrow.parquet as pt
import os
import datetime as DT

In [10]:
# Folder where the generated parquet datasets are stored
fromHDFS_folder = Path("/export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/fromHDFS/")
# Show the entries of that folder whose name ends in ".parquet"
print(list(filter(lambda entry: entry.suffix == ".parquet",
                  fromHDFS_folder.iterdir())))

[PosixPath('/export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/fromHDFS/cordis_Kwds3_AI.parquet'), PosixPath('/export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/fromHDFS/PATSTAT_Kwds3_AI.parquet'), PosixPath('/export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/fromHDFS/S2_Kwds3_AI.parquet'), PosixPath('/export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/fromHDFS/S2_Cancer.parquet'), PosixPath('/export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/fromHDFS/HFRI.parquet'), PosixPath('/export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/fromHDFS/OA_Kwds3_AI.parquet'), PosixPath('/export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/fromHDFS/OA_Cancer.parquet')]


In [11]:
# Print, one per line, the parquet entries available in the export folder
for el in fromHDFS_folder.iterdir():
    if el.suffix != ".parquet":
        continue
    print(el)

/export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/fromHDFS/cordis_Kwds3_AI.parquet
/export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/fromHDFS/PATSTAT_Kwds3_AI.parquet
/export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/fromHDFS/S2_Kwds3_AI.parquet
/export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/fromHDFS/S2_Cancer.parquet
/export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/fromHDFS/HFRI.parquet
/export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/fromHDFS/OA_Kwds3_AI.parquet
/export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/fromHDFS/OA_Cancer.parquet


In [13]:
# Build one metadata record per parquet dataset found in `fromHDFS_folder`,
# keyed by the dataset file name.
metadata = {}

# Human-readable description of each data source. The key is the part of the
# dataset file name that precedes the first underscore.
desc_content = {
    "cordis"  : "FP7, H2020 and HE projects from Cordis",
    "S2"      : "Semantic Scholar publications",
    "OA"      : "OpenAIRE publications",
    "PATSTAT" : "Patents from PATSTAT",
    "HFRI" : "HFRI funded projects"
}
# Case-insensitive view of the table above, so that a prefix such as "Patstat"
# resolves too instead of raising KeyError.
desc_by_source = {key.lower(): text for key, text in desc_content.items()}

# Sorted so that the records are always generated in the same order
# (`iterdir` yields entries in arbitrary order).
for el in sorted(fromHDFS_folder.iterdir()):
    if el.suffix != ".parquet":
        continue
    source = el.name.split('.')[0].split('_')[0]
    df = spark.read.parquet('file://' + el.as_posix())
    datasetMeta = {
        "name": el.name,
        # Unknown sources get a generic description rather than aborting the loop
        "description": desc_by_source.get(source.lower(), source + " dataset"),
        "visibility": "Public",
        "type": "RAW",
        # Modification time of the parquet entry, kept as a datetime object;
        # it is turned into text when the metadata is dumped with `default=str`
        "download_date": DT.datetime.fromtimestamp(os.path.getmtime(el)),
        "records": df.count(),
        "source": source,
        "schema": df.columns,
    }
    metadata[el.name] = datasetMeta

                                                                                

In [14]:
# Display the collected metadata for a visual check before it is written to disk
metadata

{'cordis_Kwds3_AI.parquet': {'name': 'cordis_Kwds3_AI.parquet',
  'description': 'FP7, H2020 and HE projects from Cordis',
  'visibility': 'Public',
  'type': 'RAW',
  'download_date': datetime.datetime(2023, 9, 8, 23, 59, 26, 328399),
  'records': 852,
  'source': 'cordis',
  'schema': ['id',
   'title',
   'objective',
   'acronym',
   'frameworkProgramme',
   'startDate',
   'ecMaxContribution',
   'euroSciVocCode',
   'countryContr',
   'coordinatorCountry',
   'publicationID',
   'patentID',
   'fundingScheme',
   'topic',
   'raw_text',
   'lemmas']},
 'PATSTAT_Kwds3_AI.parquet': {'name': 'PATSTAT_Kwds3_AI.parquet',
  'description': 'Patents from PATSTAT',
  'visibility': 'Public',
  'type': 'RAW',
  'download_date': datetime.datetime(2023, 9, 9, 0, 39, 31, 728364),
  'records': 146022,
  'source': 'PATSTAT',
  'schema': ['id',
   'docdb_family_id',
   'appln_title',
   'appln_abstract',
   'appln_filing_year',
   'earliest_filing_year',
   'granted',
   'appln_auth',
   'receivi

In [15]:
# Serialise the metadata next to the datasets as datasetMeta.json.
# `default=str` converts values json cannot encode natively (the datetime in
# "download_date") into their string form.
path_datasetMeta = fromHDFS_folder / 'datasetMeta.json'
metadata_json = json.dumps(metadata, ensure_ascii=False, indent=2, default=str)
with path_datasetMeta.open('w', encoding='utf-8') as outfile:
    outfile.write(metadata_json)