<a href="https://colab.research.google.com/github/MizanMustakim/big_data_processing_final_project/blob/Mizan/DataAcquisition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install pyspark
!pip install sparknlp

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.2.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 37 kB/s 
[?25hCollecting py4j==0.10.9.3
  Downloading py4j-0.10.9.3-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 59.2 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.1-py2.py3-none-any.whl size=281853642 sha256=e5094e45dfe2cfe365f05fb7fcc7b758083ad57f9f36bbc4222e325068dcdf04
  Stored in directory: /root/.cache/pip/wheels/9f/f5/07/7cd8017084dce4e93e84e92efd1e1d5334db05f2e83bcef74f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.3 pyspark-3.2.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sparknlp


In [3]:
import sparknlp
from pyspark.sql import SparkSession
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.sql.functions import json_tuple
from pyspark.sql.types import MapType,StringType,ArrayType,StructType,StructField
from pyspark.sql.functions import *

In [4]:
# Checking the versions of pyspark and sparknlp

spark = SparkSession.builder \
    .master("local") \
    .appName("Proj") \
    .config("spark.sql.warehouse.dir")\
    .getOrCreate()
print("Spark NLP version: {}".format(sparknlp.version()))
print("Apache Spark version: {}".format(spark.version))

Spark NLP version: 3.4.4
Apache Spark version: 3.2.1


In [5]:
# load the files to a data frame
# select only the fearures needed:
# paper_id, metadata
df = spark.read.option("multiline","true")\
          .json("/content/drive/MyDrive/pdf_json")\
          .select('paper_id','metadata')

In [6]:
# the data frame's schema
df.printSchema()

root
 |-- paper_id: string (nullable = true)
 |-- metadata: struct (nullable = true)
 |    |-- authors: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- affiliation: struct (nullable = true)
 |    |    |    |    |-- institution: string (nullable = true)
 |    |    |    |    |-- laboratory: string (nullable = true)
 |    |    |    |    |-- location: struct (nullable = true)
 |    |    |    |    |    |-- addrLine: string (nullable = true)
 |    |    |    |    |    |-- country: string (nullable = true)
 |    |    |    |    |    |-- postBox: string (nullable = true)
 |    |    |    |    |    |-- postCode: string (nullable = true)
 |    |    |    |    |    |-- region: string (nullable = true)
 |    |    |    |    |    |-- settlement: string (nullable = true)
 |    |    |    |-- email: string (nullable = true)
 |    |    |    |-- first: string (nullable = true)
 |    |    |    |-- last: string (nullable = true)
 |    |    |    |-- middle: array

In [7]:
# brief preview of the original data frame data before flattening
df.select('paper_id','metadata.*').show()

+--------------------+--------------------+--------------------+
|            paper_id|             authors|               title|
+--------------------+--------------------+--------------------+
|2c4cd65b373b8524d...|                  []|2 Lexikalischer H...|
|2b1cbb43a4f06e232...|[{{null, null, nu...|Level 3 guideline...|
|1faae7ce711b362a2...|                  []|                    |
|2bfe70ed52c64f411...|[{{University of ...|PA03 -Pattern Rec...|
|2a37e35929251d28d...|                  []|Protocol Protocol...|
|2bf80b20f4f497112...|[{{null, null, nu...|Oral Presentation...|
|2cf6a96f9b5336b13...|[{{Swinburne Univ...|Blood Pressure Se...|
|2ea7d915eb6c38431...|[{{null, null, nu...|Band 161 · Supple...|
|02cf6f1572731d9b3...|[{{null, null, nu...|                    |
|1f63075aa219ae291...|[{{null, null, nu...|P1 Cerebral autor...|
|02e359329ec030452...|[{{null, null, nu...|The 15th Congress...|
|2cf63588b20274781...|[{{null, null, nu...|                    |
|2e9f02301be41d902...|[{{

In [8]:
# extract the elements of the authors from array to a single element entries
df2=df.select('paper_id',col('metadata.title').alias('title'),\
              explode('metadata.authors').alias('author'))

In [9]:
# the new data frame schema does not have an array field
df2.printSchema()

root
 |-- paper_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- author: struct (nullable = true)
 |    |-- affiliation: struct (nullable = true)
 |    |    |-- institution: string (nullable = true)
 |    |    |-- laboratory: string (nullable = true)
 |    |    |-- location: struct (nullable = true)
 |    |    |    |-- addrLine: string (nullable = true)
 |    |    |    |-- country: string (nullable = true)
 |    |    |    |-- postBox: string (nullable = true)
 |    |    |    |-- postCode: string (nullable = true)
 |    |    |    |-- region: string (nullable = true)
 |    |    |    |-- settlement: string (nullable = true)
 |    |-- email: string (nullable = true)
 |    |-- first: string (nullable = true)
 |    |-- last: string (nullable = true)
 |    |-- middle: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- suffix: string (nullable = true)



In [10]:
# flatten the data frame
# drop the original rows and a few useless features 
final_df=df2.select('*','author.*')\
.select('*','affiliation.*')\
.select('*','location.*')\
.drop('affiliation').drop('location')\
.drop('postBox').drop('author')\
.drop('middle').drop('suffix')\

In [11]:
# concat the first and last name to a single feature
final_df=final_df.select('*',concat(col('first'),lit(' '),col('last')).alias('name'))\
       .drop('first').drop('last')

In [12]:
# replace the empty fields with null
final_df=final_df.na.replace('',None)

In [13]:
# remove the dublicate rows
final_df=final_df.distinct()

In [14]:
#sort by paper_id
final_df=final_df.sort('paper_id')

In [15]:
# combine rows, which are not identical, but has the same
# paper_id and name and are missing data
final_df = final_df.groupBy("paper_id", "name")\
        .agg(last('email',True).alias('email'),
            last('title',True).alias( 'title'),
            last('institution',True).alias( 'institution'),
            last('laboratory',True).alias( 'laboratory'),
            last('addrLine',True).alias( 'addrLine'),
            last('country',True).alias( 'country'),
            last('postCode',True).alias( 'postCode'),
            last('region',True).alias( 'region'),
            last('settlement',True).alias( 'settlement'),
            )

In [16]:
# the final data frame schema
final_df.printSchema()

root
 |-- paper_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- email: string (nullable = true)
 |-- title: string (nullable = true)
 |-- institution: string (nullable = true)
 |-- laboratory: string (nullable = true)
 |-- addrLine: string (nullable = true)
 |-- country: string (nullable = true)
 |-- postCode: string (nullable = true)
 |-- region: string (nullable = true)
 |-- settlement: string (nullable = true)



In [17]:
# data frame data preview
final_df.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------+--------+--------+--------------------+
|            paper_id|                name|               email|               title|         institution|          laboratory|            addrLine|    country|postCode|  region|          settlement|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------+--------+--------+--------------------+
|0000b6da665726420...|             Alex Ba|                null|The cell phone vi...|Uniformed Service...|                null|                null|        USA|    null|Maryland|            Bethesda|
|0000b6da665726420...|    Anthony Tolisano|anthony.m.tolisan...|The cell phone vi...|Walter Reed Natio...|                null|                null|        USA|    null|Maryland|            Bethesda|


In [18]:
# number of people researching a paper by institution 
institutions = final_df.groupBy(['paper_id','institution']).count()\
                  .na.drop(how='any')\
                  .sort(desc('count'))

In [19]:
institutions.show()

+--------------------+--------------------+-----+
|            paper_id|         institution|count|
+--------------------+--------------------+-----+
|2b06dda25000e6085...|I, Calvo AO, Gall...|  129|
|002b2e094126e78ea...|Harvard Medical S...|   64|
|2a155491b04182a59...|Imperial College ...|   48|
|2cb5497c065e3c1ef...|Leiden University...|   46|
|2d09cf04f003ed93e...|Aristotle Univers...|   41|
|2bfe70ed52c64f411...|University of Not...|   41|
|2cf60380bb05becd1...|Virginia Polytech...|   37|
|2ec268221c21259c0...|OpenSAFELY Collab...|   37|
|1e04601589c695728...|University of Pen...|   36|
|2da484bab5fea1c1e...|Azienda Ospedalie...|   32|
|2d5498a8adecb9440...|Hospital Clinic o...|   32|
|1e79d1c817213e5f7...|University of Pen...|   32|
|2bc16fdd79243daa5...|Imperial College ...|   31|
|02d6132a22879e828...|Nanogen Biopharma...|   30|
|2aed363d02c801222...|National Institut...|   30|
|2d677e610aa84b54b...|University of Liv...|   30|
|2da3afb5392440668...|       James Chodosh|   30|


In [20]:
# The number of total rows of this dataframe
institutions.count()

14556

In [22]:
# put the df into pandas and export it into a single file for our purpoces
institutions.toPandas().to_csv('/content/drive/MyDrive/processed_data/institutions.csv',index=False)

In [23]:
# map all the researchers participated  in covid research
# and the amount of papers they took part of
researchers =final_df.groupBy(['name']).count()\
                  .na.drop(how='any')\
                  .sort(desc('count'))

In [24]:
# Showing the preview of the researchers dataframe
researchers.show()

+------+-----+
|  name|count|
+------+-----+
|    † |  202|
|    M |  101|
|    J |   58|
|    A |   46|
|    Y |   45|
| Wang |   44|
|Zhang |   38|
|    D |   36|
| Chen |   35|
|    S |   34|
|    W |   32|
|    C |   29|
|    B |   29|
|    E |   27|
|    R |   26|
|    ; |   25|
|    ‡ |   24|
|    ✉ |   24|
| Zhou |   23|
| Zhao |   23|
+------+-----+
only showing top 20 rows



In [25]:
# The number of total rows of this researchers dataframe
researchers.count()

69181

In [26]:
# put the df into pandas and export it into a single file for our purpoces
researchers.toPandas().to_csv('/content/drive/MyDrive/processed_data/researchers.csv',index=False)

In [27]:
# researchers by country
countries=final_df.groupBy(['country']).count()\
                  .na.drop(how='any')\
                  .sort(desc('count'))

In [28]:
# Showing the preview of the countries dataframe
countries.show()

+--------------------+-----+
|             country|count|
+--------------------+-----+
|                 USA| 5656|
|               China| 3645|
|               Italy| 2444|
|                  UK| 1985|
|             Germany| 1377|
|              France| 1076|
|               Spain| 1025|
|               Japan| 1013|
|               India|  932|
|              Canada|  909|
|              Brazil|  874|
|           Australia|  805|
|                Iran|  663|
|      United Kingdom|  639|
|       United States|  634|
|United States of ...|  392|
|             Belgium|  387|
|         Switzerland|  379|
|              Taiwan|  356|
|     The Netherlands|  352|
+--------------------+-----+
only showing top 20 rows



In [29]:
# The number of total rows of this countries dataframe
countries.count()

634

In [30]:
# put the df into pandas and export it into a single file for our purpoces
countries.toPandas().to_csv('/content/drive/MyDrive/processed_data/countries.csv',index=False)