In [1]:
"""
Temporary routine for generation of datasets
for demonstration purposes

It is a "Fake Data Mediator" for use with the first
version of the Interactive Topic Model Trainer
"""

import argparse
import json
import os
from pathlib import Path
#from langdetect import detect
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import StringType

# Simulate downloads from HDFS

## CORDIS UC3M

In [None]:
parquet_table = '/export/ml4ds/IntelComp/Datalake/CORDIS/20230425/parquet/projects.parquet' #In HDFS
lemmas_table = '/export/ml4ds/IntelComp/Datalake/CORDIS/20230425/parquet/projects_NLP.parquet' #In HDFS
id_fld = 'projectID'
selectFields = 'title, objective, frameworkProgramme, startDate, ecMaxContribution, euroSciVocCode'
filterCondition = 'frameworkProgramme = "HORIZON"'
path_dataset = "/export/clusterdata/jarenas/github/IntelComp/ITMT/topicmodeler/fromHDFS/CORDIS_HORIZON.parquet" #In NFS

## Semantic Scholar

### Cancer dataset pregenerated from PMID given by HCERES

In [6]:
parquet_table = '/export/ml4ds/IntelComp/Datalake/semanticscholar/20230418/parquet/papers_Cancer.parquet' # In HDFS
lemmas_table = '/export/ml4ds/IntelComp/Datalake/semanticscholar/20230418/parquet/papers_Cancer_NLP.parquet' # In HDFS
id_fld = 'corpusid'
selectFields = 'PMID, title, abstract, year, referencecount, citationcount, influentialcitationcount, S2fieldsofstudy'
filterCondition = ''
path_dataset = "/export/clusterdata/jarenas/github/IntelComp/ITMT/topicmodeler/fromHDFS/Scholar_Cancer.parquet" #In NFS

### AI dataset generated from AI keywords

In [14]:
parquet_table = '/export/ml4ds/IntelComp/Datalake/semanticscholar/20230418/parquet/papers_AI_Kwds.parquet' # In HDFS
lemmas_table = '/export/ml4ds/IntelComp/Datalake/semanticscholar/20230418/parquet/papers_AI_Kwds_NLP.parquet' # In HDFS
id_fld = 'corpusid'
selectFields = 'title, abstract, year, referencecount, citationcount, influentialcitationcount, S2fieldsofstudy'
filterCondition = ''
path_dataset = "/export/clusterdata/jarenas/github/IntelComp/ITMT/topicmodeler/fromHDFS/Scholar_AI_Kwds.parquet" #In NFS

### Whole dataset filtering according to FOS

In [None]:
#parquet_table = '/export/ml4ds/IntelComp/Datalake/SemanticScholar/20220201/papers.parquet'
#selectFields = 'title, paperAbstract, doi, year, fieldsOfStudy'
#filterCondition = "array_contains(fieldsOfStudy, 'Computer Science')"
#path_dataset = "/export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/fromHDFS_scalability/S2CS_1.parquet"
#perc = float(1)/100

## Generate parquet files

In [15]:
# We read the main table including selected fields and the identifier
flds = [el.strip() for el in selectFields.split(',')]
query = "SELECT " + id_fld + " AS id, " + (",").join(flds) + \
                " FROM parquet.`" + parquet_table + "`"

# Add filtering condition to SELECT clause if necessary
if len(filterCondition.strip()):
    query += " WHERE " + filterCondition
dataset = spark.sql(query)

print('Number of documents in dataset:', dataset.count())
dataset.show(n=2, truncate=120, vertical=True)

23/05/18 21:05:58 WARN ObjectStore: Failed to get database parquet, returning NoSuchObjectException
                                                                                

Number of documents in dataset: 1632946
-RECORD 0--------------------------------------------------------------------------------------------------------------------------------------------
 id                       | 59789696                                                                                                                 
 title                    | Computer-aided earthquake engineering                                                                                    
 abstract                 | Computer-aided earthquake engineering requires the integration of many new and developing computer technologies. It i... 
 year                     | 1987                                                                                                                     
 referencecount           | 0                                                                                                                        
 citationcount            | 0                               

In [16]:
# We read the table with the output of NLP processes and identify id field
query = "SELECT " + id_fld + " AS id, raw_text, lemmas FROM parquet.`" + lemmas_table + "`"
lemmas_df = spark.sql(query)

print('Number of documents in dataset:', lemmas_df.count())
lemmas_df.show(n=2, truncate=120, vertical=True)

23/05/18 21:06:21 WARN ObjectStore: Failed to get database parquet, returning NoSuchObjectException
                                                                                

Number of documents in dataset: 1575171




-RECORD 0----------------------------------------------------------------------------------------------------------------------------
 id       | 234639473                                                                                                                
 raw_text | Remarks on Recognition of Aromas from Tea Sources Using MQ3, MQ5, MQ7 Sensor Signal This study investigated the capac... 
 lemmas   | remark recognition aromas tea sources sensor signal study capacity deep neural network distinguish tea source base ar... 
-RECORD 1----------------------------------------------------------------------------------------------------------------------------
 id       | 61407153                                                                                                                 
 raw_text | Signal quality improvement of holographic data storage using adaptive two-dimensional filter Holographic data storage... 
 lemmas   | signal quality improvement holographic datum stora

                                                                                

In [17]:
# Join tables
lemmas_df = lemmas_df.withColumnRenamed(id_fld,"id")
dataset = (dataset.join(lemmas_df, dataset.id ==  lemmas_df.id, "left")
                      .drop(lemmas_df.id)
                )

print('Number of documents in dataset:', dataset.count())
dataset.show(n=10, truncate=120, vertical=True)

                                                                                

Number of documents in dataset: 1632946




-RECORD 0--------------------------------------------------------------------------------------------------------------------------------------------
 id                       | 685869                                                                                                                   
 title                    | New Bio-Inspired Coordination Strategies for Multi-Agent Systems Applied to Foraging Tasks                               
 abstract                 | Multiple agent systems can be applied to foraging tasks, thus solving this problem in a cooperative approach. The maj... 
 year                     | 2016                                                                                                                     
 referencecount           | 20                                                                                                                       
 citationcount            | 7                                                                       

                                                                                

In [None]:
# Sampling dataset if necessary
"""
if perc < 1:
    dataset = dataset.sample(fraction=perc)

print('Number of documents in dataset:', dataset.count())
#dataset.show(n=10, truncate=120, vertical=True)
"""

In [18]:
# Save dataset
dataset.write.parquet(f"file://{path_dataset}",
    mode="overwrite",
)

                                                                                

# Generate datasetMeta.json

In [19]:
import pandas as pd
import pyarrow.parquet as pt
import os
import datetime as DT

In [20]:
fromHDFS_folder = Path("/export/clusterdata/jarenas/github/IntelComp/ITMT/topicmodeler/fromHDFS/")
print([el for el in fromHDFS_folder.iterdir() if el.suffix==".parquet"])

[PosixPath('/export/clusterdata/jarenas/github/IntelComp/ITMT/topicmodeler/fromHDFS/CORDIS.parquet'), PosixPath('/export/clusterdata/jarenas/github/IntelComp/ITMT/topicmodeler/fromHDFS/Scholar_AI_Kwds.parquet'), PosixPath('/export/clusterdata/jarenas/github/IntelComp/ITMT/topicmodeler/fromHDFS/Scholar_Cancer.parquet'), PosixPath('/export/clusterdata/jarenas/github/IntelComp/ITMT/topicmodeler/fromHDFS/CORDIS_H2020.parquet'), PosixPath('/export/clusterdata/jarenas/github/IntelComp/ITMT/topicmodeler/fromHDFS/CORDIS_HORIZON.parquet'), PosixPath('/export/clusterdata/jarenas/github/IntelComp/ITMT/topicmodeler/fromHDFS/CORDIS_FP7.parquet')]


In [21]:
metadata = {}

for el in fromHDFS_folder.iterdir():
    if el.suffix==".parquet":
        df = pd.read_parquet(el)
        datasetMeta = {
            "name": el.name,
            "description": el.stem.replace('_',' ') + " projects",
            "visibility": "public",
            "type": "RAW",
            "download_date": DT.datetime.fromtimestamp(os.path.getmtime(el)),
            # "records"       : len(pd.read_parquet(path_dataset, columns=[])),
            "records": len(df),
            "source": el.name.split('_')[0],
            "schema": pt.read_schema([file_pt for file_pt in el.iterdir()
                                      if file_pt.name.endswith('.parquet')][0]).names
        }
    metadata[el.name] = datasetMeta

In [22]:
metadata

{'CORDIS.parquet': {'name': 'CORDIS.parquet',
  'description': 'CORDIS projects',
  'visibility': 'public',
  'type': 'RAW',
  'download_date': datetime.datetime(2023, 5, 5, 19, 34, 47, 490148),
  'records': 65760,
  'source': 'CORDIS.parquet',
  'schema': ['id',
   'title',
   'objective',
   'frameworkProgramme',
   'startDate',
   'ecMaxContribution',
   'euroSciVocCode',
   'raw_text',
   'lemmas']},
 'Scholar_AI_Kwds.parquet': {'name': 'Scholar_AI_Kwds.parquet',
  'description': 'Scholar AI Kwds projects',
  'visibility': 'public',
  'type': 'RAW',
  'download_date': datetime.datetime(2023, 5, 18, 21, 9, 23, 588181),
  'records': 1632946,
  'source': 'Scholar',
  'schema': ['id',
   'title',
   'abstract',
   'year',
   'referencecount',
   'citationcount',
   'influentialcitationcount',
   'S2fieldsofstudy',
   'raw_text',
   'lemmas']},
 'Scholar_Cancer.parquet': {'name': 'Scholar_Cancer.parquet',
  'description': 'Scholar Cancer projects',
  'visibility': 'public',
  'type': 'R

In [23]:
path_datasetMeta = fromHDFS_folder.joinpath('datasetMeta.json')
with path_datasetMeta.open('w', encoding='utf-8') as outfile:
    json.dump(metadata, outfile, ensure_ascii=False,
                          indent=2, default=str)