<a href="https://colab.research.google.com/github/MizanMustakim/big_data_processing_final_project/blob/Mizan/CORD_19_BigDataAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Big Data Analytical System Build on a portion of CORD-19 Dataset

**Mounting the google drive where the data was stored.**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**Installing pyspark & sparknlp**

In [2]:
!pip install pyspark
!pip install sparknlp

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.2.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 32 kB/s 
[?25hCollecting py4j==0.10.9.3
  Downloading py4j-0.10.9.3-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 53.6 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.1-py2.py3-none-any.whl size=281853642 sha256=3716df68d783ac6e101bf5ab7afb3379af3bd7e7f655fd064dd7d7d978c8ed43
  Stored in directory: /root/.cache/pip/wheels/9f/f5/07/7cd8017084dce4e93e84e92efd1e1d5334db05f2e83bcef74f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.3 pyspark-3.2.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sparknlp


**Import the libraries**

In [3]:
import sparknlp
from pyspark.sql import SparkSession
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.sql.functions import json_tuple
from pyspark.sql.types import MapType,StringType,ArrayType,StructType,StructField
from pyspark.sql.functions import *

## Data Processing using Big data framework *pyspark*


**Build the spark session. And also check the versions of pyspark and sparknlp**

In [4]:
# Checking the versions of pyspark and sparknlp

spark = SparkSession.builder \
    .master("local") \
    .appName("Proj") \
    .config("spark.sql.warehouse.dir")\
    .getOrCreate()
print("Spark NLP version: {}".format(sparknlp.version()))
print("Apache Spark version: {}".format(spark.version))

Spark NLP version: 3.4.4
Apache Spark version: 3.2.1


**Load the json datasets into *df* database**

In [5]:
# load the files to a data frame
df = spark.read.option("multiline","true").json("/content/drive/MyDrive/pdf_json").select('paper_id','metadata')

**Checking the schemas of df database**

In [6]:
# the data frame's schema
df.printSchema()

root
 |-- paper_id: string (nullable = true)
 |-- metadata: struct (nullable = true)
 |    |-- authors: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- affiliation: struct (nullable = true)
 |    |    |    |    |-- institution: string (nullable = true)
 |    |    |    |    |-- laboratory: string (nullable = true)
 |    |    |    |    |-- location: struct (nullable = true)
 |    |    |    |    |    |-- addrLine: string (nullable = true)
 |    |    |    |    |    |-- country: string (nullable = true)
 |    |    |    |    |    |-- postBox: string (nullable = true)
 |    |    |    |    |    |-- postCode: string (nullable = true)
 |    |    |    |    |    |-- region: string (nullable = true)
 |    |    |    |    |    |-- settlement: string (nullable = true)
 |    |    |    |-- email: string (nullable = true)
 |    |    |    |-- first: string (nullable = true)
 |    |    |    |-- last: string (nullable = true)
 |    |    |    |-- middle: array

In [7]:
# brief preview of the original data frame data before flattening
df.select('paper_id','metadata.*').show()

+--------------------+--------------------+--------------------+
|            paper_id|             authors|               title|
+--------------------+--------------------+--------------------+
|dc0795b37b5378b1c...|[{{, Hematopoieti...|ABSTRACTS FROM TH...|
|de6018b3043cdccd4...|[{{Stanford Unive...|Development/Novel...|
|dc2f210539245c7a0...|[{{null, null, nu...|OR-01. Distinct R...|
|2c4cd65b373b8524d...|                  []|2 Lexikalischer H...|
|2b1cbb43a4f06e232...|[{{null, null, nu...|Level 3 guideline...|
|1faae7ce711b362a2...|                  []|                    |
|dc91dd2b0be757c23...|                  []|                    |
|2bfe70ed52c64f411...|[{{University of ...|PA03 -Pattern Rec...|
|dc6e18ee4dbe5ac14...|[{{null, null, nu...|SCIENTIFIC REPORT...|
|2a37e35929251d28d...|                  []|Protocol Protocol...|
|2bf80b20f4f497112...|[{{null, null, nu...|Oral Presentation...|
|dcedc8ca7b10557df...|                  []|                    |
|2cf6a96f9b5336b13...|[{{

In [8]:
# extract the elements of the authors from array to a single element entries
df2=df.select('paper_id',col('metadata.title').alias('title'),\
              explode('metadata.authors').alias('author'))

In [9]:
# the new data frame schema does not have an array field
df2.printSchema()

root
 |-- paper_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- author: struct (nullable = true)
 |    |-- affiliation: struct (nullable = true)
 |    |    |-- institution: string (nullable = true)
 |    |    |-- laboratory: string (nullable = true)
 |    |    |-- location: struct (nullable = true)
 |    |    |    |-- addrLine: string (nullable = true)
 |    |    |    |-- country: string (nullable = true)
 |    |    |    |-- postBox: string (nullable = true)
 |    |    |    |-- postCode: string (nullable = true)
 |    |    |    |-- region: string (nullable = true)
 |    |    |    |-- settlement: string (nullable = true)
 |    |-- email: string (nullable = true)
 |    |-- first: string (nullable = true)
 |    |-- last: string (nullable = true)
 |    |-- middle: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- suffix: string (nullable = true)



In [10]:
# flatten the data frame
# drop the original rows and a few useless features 
final_df=df2.select('*','author.*')\
.select('*','affiliation.*')\
.select('*','location.*')\
.drop('affiliation').drop('location')\
.drop('postBox').drop('author')\
.drop('middle').drop('suffix')\

In [11]:
# concat the first and last name to a single feature
final_df=final_df.select('*',concat(col('first'),lit(' '),col('last')).alias('name'))\
       .drop('first').drop('last')

In [12]:
# replace the empty fields with null
final_df=final_df.na.replace('',None)

In [13]:
# remove the dublicate rows
final_df=final_df.distinct()

In [14]:
#sort by paper_id
final_df=final_df.sort('paper_id')

In [15]:
# combine rows, which are not identical, but has the same
# paper_id and name and are missing data
final_df = final_df.groupBy("paper_id", "name")\
        .agg(last('email',True).alias('email'),
            last('title',True).alias( 'title'),
            last('institution',True).alias( 'institution'),
            last('laboratory',True).alias( 'laboratory'),
            last('addrLine',True).alias( 'addrLine'),
            last('country',True).alias( 'country'),
            last('postCode',True).alias( 'postCode'),
            last('region',True).alias( 'region'),
            last('settlement',True).alias( 'settlement'),
            )

In [16]:
# the final data frame schema
final_df.printSchema()

root
 |-- paper_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- email: string (nullable = true)
 |-- title: string (nullable = true)
 |-- institution: string (nullable = true)
 |-- laboratory: string (nullable = true)
 |-- addrLine: string (nullable = true)
 |-- country: string (nullable = true)
 |-- postCode: string (nullable = true)
 |-- region: string (nullable = true)
 |-- settlement: string (nullable = true)



In [17]:
# data frame data preview
final_df.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------+--------+--------+--------------------+
|            paper_id|                name|               email|               title|         institution|          laboratory|            addrLine|    country|postCode|  region|          settlement|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------+--------+--------+--------------------+
|0000b6da665726420...|             Alex Ba|                null|The cell phone vi...|Uniformed Service...|                null|                null|        USA|    null|Maryland|            Bethesda|
|0000b6da665726420...|    Anthony Tolisano|anthony.m.tolisan...|The cell phone vi...|Walter Reed Natio...|                null|                null|        USA|    null|Maryland|            Bethesda|


In [18]:
# number of people researching a paper by institution 
institutions = final_df.groupBy(['paper_id','institution']).count()\
                  .na.drop(how='any')\
                  .sort(desc('count'))

In [19]:
institutions.show()

+--------------------+--------------------+-----+
|            paper_id|         institution|count|
+--------------------+--------------------+-----+
|2b06dda25000e6085...|I, Calvo AO, Gall...|  129|
|db0c0308a7405e6ba...|Orange County Hea...|  104|
|db6db211b8abddafe...|Northern Illinois...|   77|
|002b2e094126e78ea...|Harvard Medical S...|   64|
|db1d31b294ab30c2e...|Zentrum für Rhino...|   64|
|de41dc462ed4d3576...|Sechenov First Mo...|   51|
|2a155491b04182a59...|Imperial College ...|   48|
|dd863cabb9e0edd30...|Hospital Sírio Li...|   46|
|2cb5497c065e3c1ef...|Leiden University...|   46|
|2d09cf04f003ed93e...|Aristotle Univers...|   41|
|2bfe70ed52c64f411...|University of Not...|   41|
|db3dbfeec25904ea1...|Duke University S...|   39|
|2cf60380bb05becd1...|Virginia Polytech...|   37|
|2ec268221c21259c0...|OpenSAFELY Collab...|   37|
|1e04601589c695728...|University of Pen...|   36|
|dc6a1bf7e383925a8...|    Emory University|   36|
|2da484bab5fea1c1e...|Azienda Ospedalie...|   32|


In [21]:
# The shape of this dataframe
print((institutions.count(), len(institutions.columns)))

(21302, 3)


In [23]:
# Save the cleaned institution dataset into csv file
institutions.toPandas().to_csv('/content/drive/MyDrive/processed_data/institutions.csv',index=False)

In [24]:
# map all the researchers participated  in covid research and the amount of papers they took part of
researchers =final_df.groupBy(['name']).count().na.drop(how='any').sort(desc('count'))

In [25]:
# Showing the preview of the first 10 of the researchers dataframe
researchers.show(10)

+------+-----+
|  name|count|
+------+-----+
|    † |  305|
|    M |  155|
|    J |   86|
| Wang |   74|
|    A |   72|
|    Y |   68|
|Zhang |   61|
|    D |   58|
| Chen |   51|
|    W |   47|
+------+-----+
only showing top 10 rows



In [26]:
# The shape of this researchers dataframe
print((researchers.count(), len(researchers.columns)))

(98841, 2)


In [28]:
# Save the cleaned researchers dataset into csv file
researchers.toPandas().to_csv('/content/drive/MyDrive/processed_data/researchers.csv',index=False)

In [29]:
# researchers by country
countries=final_df.groupBy(['country']).count().na.drop(how='any').sort(desc('count'))

In [30]:
# Showing the preview of the first 10 of the countries dataframe
countries.show(10)

+-------+-----+
|country|count|
+-------+-----+
|    USA| 8506|
|  China| 5622|
|  Italy| 3663|
|     UK| 2910|
|Germany| 1989|
|  Spain| 1515|
|  Japan| 1449|
| France| 1392|
|  India| 1376|
| Canada| 1292|
+-------+-----+
only showing top 10 rows



In [31]:
# The shape of of this countries dataframe
print((countries.count(), len(countries.columns)))

(823, 2)


In [32]:
# Save the cleaned countries dataset into csv file
countries.toPandas().to_csv('/content/drive/MyDrive/processed_data/countries.csv',index=False)

## Data Analytical System

**Import the necessary libraries for the analysis**

In [33]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px

### Institution

In [34]:
institute_df = pd.read_csv('/content/drive/MyDrive/processed_data/institutions.csv')
institute_df.head()

Unnamed: 0,paper_id,institution,count
0,2b06dda25000e6085207ba00b338b9932cc3d751,"I, Calvo AO, Galleymore PR, Carmona SA",129
1,db0c0308a7405e6ba90a205f26b146bcec1ecf0d,Orange County Healthcare Agency,104
2,db6db211b8abddafe781607e981c6e348edd1562,Northern Illinois University,77
3,002b2e094126e78ea17748c0b4febecb903fb42c,Harvard Medical School,64
4,db1d31b294ab30c2e8cdd12e47d680c21ab32f7f,Zentrum für Rhinologie und Allergologie,64


**The view of the institution column and the count column of the dataframe in a plotting fame**

In [37]:
fig1 = go.Figure(data=[go.Table(header=dict(values=['Institution Name', 'Count']),
                 cells=dict(values=[institute_df['institution'], institute_df['count']]))
                     ])
fig1.show()

In [38]:
dfb = institute_df.head(30)
fig2 = px.bar(dfb, x='institution', y='count',title='Top 30 quantity of article researches')

fig2.update_traces(marker_color='rgb(158,200,225)', marker_line_color='rgb(8,48,107)',
                  marker_line_width=1.5, opacity=0.6)

fig2.show()

In [39]:
dfp = institute_df.query("count > 30")
fig3 = px.pie(dfp, values='count', names='institution', title='More than 30 article researches')
fig3.show()

### Countries

In [40]:
country_df = pd.read_csv('/content/drive/MyDrive/processed_data/countries.csv')
country_df.head()

Unnamed: 0,country,count
0,USA,8506
1,China,5622
2,Italy,3663
3,UK,2910
4,Germany,1989


In [41]:
Country = [
    ('US', 'USA'),
    ('AF', 'Afghanistan'),
    ('AL', 'Albania'),
    ('DZ', 'Algeria'),
    ('AS', 'American Samoa'),
    ('AD', 'Andorra'),
    ('AO', 'Angola'),
    ('AI', 'Anguilla'),
    ('AQ', 'Antarctica'),
    ('AG', 'Antigua And Barbuda'),
    ('AR', 'Argentina'),
    ('AM', 'Armenia'),
    ('AW', 'Aruba'),
    ('AU', 'Australia'),
    ('AT', 'Austria'),
    ('AZ', 'Azerbaijan'),
    ('BS', 'Bahamas'),
    ('BH', 'Bahrain'),
    ('BD', 'Bangladesh'),
    ('BB', 'Barbados'),
    ('BY', 'Belarus'),
    ('BE', 'Belgium'),
    ('BZ', 'Belize'),
    ('BJ', 'Benin'),
    ('BM', 'Bermuda'),
    ('BT', 'Bhutan'),
    ('BO', 'Bolivia'),
    ('BA', 'Bosnia And Herzegowina'),
    ('BW', 'Botswana'),
    ('BV', 'Bouvet Island'),
    ('BR', 'Brazil'),
    ('BN', 'Brunei Darussalam'),
    ('BG', 'Bulgaria'),
    ('BF', 'Burkina Faso'),
    ('BI', 'Burundi'),
    ('KH', 'Cambodia'),
    ('CM', 'Cameroon'),
    ('CA', 'Canada'),
    ('CV', 'Cape Verde'),
    ('KY', 'Cayman Islands'),
    ('CF', 'Central African Rep'),
    ('TD', 'Chad'),
    ('CL', 'Chile'),
    ('CN', 'China'),
    ('CX', 'Christmas Island'),
    ('CC', 'Cocos Islands'),
    ('CO', 'Colombia'),
    ('KM', 'Comoros'),
    ('CG', 'Congo'),
    ('CK', 'Cook Islands'),
    ('CR', 'Costa Rica'),
    ('CI', 'Cote D`ivoire'),
    ('HR', 'Croatia'),
    ('CU', 'Cuba'),
    ('CY', 'Cyprus'),
    ('CZ', 'Czech Republic'),
    ('DK', 'Denmark'),
    ('DJ', 'Djibouti'),
    ('DM', 'Dominica'),
    ('DO', 'Dominican Republic'),
    ('TP', 'East Timor'),
    ('EC', 'Ecuador'),
    ('EG', 'Egypt'),
    ('SV', 'El Salvador'),
    ('GQ', 'Equatorial Guinea'),
    ('ER', 'Eritrea'),
    ('EE', 'Estonia'),
    ('ET', 'Ethiopia'),
    ('FK', 'Falkland Islands (Malvinas)'),
    ('FO', 'Faroe Islands'),
    ('FJ', 'Fiji'),
    ('FI', 'Finland'),
    ('FR', 'France'),
    ('GF', 'French Guiana'),
    ('PF', 'French Polynesia'),
    ('TF', 'French S. Territories'),
    ('GA', 'Gabon'),
    ('GM', 'Gambia'),
    ('GE', 'Georgia'),
    ('DE', 'Germany'),
    ('GH', 'Ghana'),
    ('GI', 'Gibraltar'),
    ('GR', 'Greece'),
    ('GL', 'Greenland'),
    ('GD', 'Grenada'),
    ('GP', 'Guadeloupe'),
    ('GU', 'Guam'),
    ('GT', 'Guatemala'),
    ('GN', 'Guinea'),
    ('GW', 'Guinea-bissau'),
    ('GY', 'Guyana'),
    ('HT', 'Haiti'),
    ('HN', 'Honduras'),
    ('HK', 'Hong Kong'),
    ('HU', 'Hungary'),
    ('IS', 'Iceland'),
    ('IN', 'India'),
    ('ID', 'Indonesia'),
    ('IR', 'Iran'),
    ('IQ', 'Iraq'),
    ('IE', 'Ireland'),
    ('IL', 'Israel'),
    ('IT', 'Italy'),
    ('JM', 'Jamaica'),
    ('JP', 'Japan'),
    ('JO', 'Jordan'),
    ('KZ', 'Kazakhstan'),
    ('KE', 'Kenya'),
    ('KI', 'Kiribati'),
    ('KP', 'Korea (North)'),
    ('KR', 'Korea'),
    ('KW', 'Kuwait'),
    ('KG', 'Kyrgyzstan'),
    ('LA', 'Laos'),
    ('LV', 'Latvia'),
    ('LB', 'Lebanon'),
    ('LS', 'Lesotho'),
    ('LR', 'Liberia'),
    ('LY', 'Libya'),
    ('LI', 'Liechtenstein'),
    ('LT', 'Lithuania'),
    ('LU', 'Luxembourg'),
    ('MO', 'Macau'),
    ('MK', 'Macedonia'),
    ('MG', 'Madagascar'),
    ('MW', 'Malawi'),
    ('MY', 'Malaysia'),
    ('MV', 'Maldives'),
    ('ML', 'Mali'),
    ('MT', 'Malta'),
    ('MH', 'Marshall Islands'),
    ('MQ', 'Martinique'),
    ('MR', 'Mauritania'),
    ('MU', 'Mauritius'),
    ('YT', 'Mayotte'),
    ('MX', 'Mexico'),
    ('FM', 'Micronesia'),
    ('MD', 'Moldova'),
    ('MC', 'Monaco'),
    ('MN', 'Mongolia'),
    ('MS', 'Montserrat'),
    ('MA', 'Morocco'),
    ('MZ', 'Mozambique'),
    ('MM', 'Myanmar'),
    ('NA', 'Namibia'),
    ('NR', 'Nauru'),
    ('NP', 'Nepal'),
    ('NL', 'Netherlands'),
    ('AN', 'Netherlands Antilles'),
    ('NC', 'New Caledonia'),
    ('NZ', 'New Zealand'),
    ('NI', 'Nicaragua'),
    ('NE', 'Niger'),
    ('NG', 'Nigeria'),
    ('NU', 'Niue'),
    ('NF', 'Norfolk Island'),
    ('MP', 'Northern Mariana Islands'),
    ('NO', 'Norway'),
    ('OM', 'Oman'),
    ('PK', 'Pakistan'),
    ('PW', 'Palau'),
    ('PA', 'Panama'),
    ('PG', 'Papua New Guinea'),
    ('PY', 'Paraguay'),
    ('PE', 'Peru'),
    ('PH', 'Philippines'),
    ('PN', 'Pitcairn'),
    ('PL', 'Poland'),
    ('PT', 'Portugal'),
    ('PR', 'Puerto Rico'),
    ('QA', 'Qatar'),
    ('RE', 'Reunion'),
    ('RO', 'Romania'),
    ('RU', 'Russia'),
    ('RW', 'Rwanda'),
    ('KN', 'Saint Kitts And Nevis'),
    ('LC', 'Saint Lucia'),
    ('VC', 'St Vincent/Grenadines'),
    ('WS', 'Samoa'),
    ('SM', 'San Marino'),
    ('ST', 'Sao Tome'),
    ('SA', 'Saudi Arabia'),
    ('SN', 'Senegal'),
    ('SC', 'Seychelles'),
    ('SL', 'Sierra Leone'),
    ('SG', 'Singapore'),
    ('SK', 'Slovakia'),
    ('SI', 'Slovenia'),
    ('SB', 'Solomon Islands'),
    ('SO', 'Somalia'),
    ('ZA', 'South Africa'),
    ('ES', 'Spain'),
    ('LK', 'Sri Lanka'),
    ('SH', 'St. Helena'),
    ('PM', 'St.Pierre'),
    ('SD', 'Sudan'),
    ('SR', 'Suriname'),
    ('SZ', 'Swaziland'),
    ('SE', 'Sweden'),
    ('CH', 'Switzerland'),
    ('SY', 'Syrian Arab Republic'),
    ('TW', 'Taiwan'),
    ('TJ', 'Tajikistan'),
    ('TZ', 'Tanzania'),
    ('TH', 'Thailand'),
    ('TG', 'Togo'),
    ('TK', 'Tokelau'),
    ('TO', 'Tonga'),
    ('TT', 'Trinidad And Tobago'),
    ('TN', 'Tunisia'),
    ('TR', 'Turkey'),
    ('TM', 'Turkmenistan'),
    ('TV', 'Tuvalu'),
    ('UG', 'Uganda'),
    ('UA', 'Ukraine'),
    ('AE', 'United Arab Emirates'),
    ('UK', 'UK'),
    ('UY', 'Uruguay'),
    ('UZ', 'Uzbekistan'),
    ('VU', 'Vanuatu'),
    ('VA', 'Vatican City State'),
    ('VE', 'Venezuela'),
    ('VN', 'Viet Nam'),
    ('VG', 'Virgin Islands (British)'),
    ('VI', 'Virgin Islands (U.S.)'),
    ('EH', 'Western Sahara'),
    ('YE', 'Yemen'),
    ('YU', 'Yugoslavia'),
    ('ZR', 'Zaire'),
    ('ZM', 'Zambia'),
    ('ZW', 'Zimbabwe')
]

In [42]:
Country1 = pd.DataFrame(list(Country), columns=["Code_name", "Country"])
Country1.head()

Unnamed: 0,Code_name,Country
0,US,USA
1,AF,Afghanistan
2,AL,Albania
3,DZ,Algeria
4,AS,American Samoa


In [43]:
data = pd.read_csv('/content/drive/MyDrive/processed_data/countries.csv')
filter1 = data["country"].isin(Country1["Country"].to_numpy())
data=data[filter1]

In [44]:

fig4 = go.Figure(data=go.Choropleth(
    locations =  data['country'],
    locationmode = 'country names',
    z = data['count'],
    text = data['country'],
    colorscale = 'Blues',
    autocolorscale=True,
    reversescale=True,
    marker_line_color='darkgray',
    marker_line_width=0.5,
    colorbar_tickprefix = '',
    colorbar_title = 'count',
))

fig4.update_layout(
    title_text='Number of Article Researches of Different Countries',
    geo=dict(
        showframe=False,
        showcoastlines=False,
        projection_type='equirectangular'
    ),
    annotations = [dict(
        x=0.55,
        y=0.1,
        xref='paper',
        yref='paper'
    )]
)

fig4.show()

In [46]:
fig5 = px.bar(data.head(30), x='country', y='count',title='Top 30 countries with highest number of article researches')
fig5.show()

### Researchers

In [47]:
researcher_df = pd.read_csv('/content/drive/MyDrive/processed_data/researchers.csv')

researcher_df = researcher_df[np.where((researcher_df['name'].str.len()>1) & 
         (researcher_df['name'].str.len()>2), True, False)]
researcher_df.head()


Unnamed: 0,name,count
3,Wang,74
6,Zhang,61
8,Chen,51
17,Liu,34
18,Zhou,33


In [48]:
fig6 = px.pie(researcher_df.head(10).reset_index(), values='count', names='name', title='Top 10 article creator')


# Add dropdown
fig6.update_layout(
    updatemenus=[
        dict(
            buttons=list([
                dict(
                    args=["type", "pie"],
                    label="Pie Chart",
                    method="restyle"
                ),                
                dict(
                    args=["type", "bar"],
                    label="Bar Chart",
                    method="restyle"
                ),
                dict(
                    args=["type", "line"],
                    label="Line Graph",
                    method="restyle",
                    
                ),

            ]),
            direction="down",
            pad={"r": 10, "t": 10},
            showactive=True,
            x=0.1,
            xanchor="left",
            y=1.1,
            yanchor="top"
        ),
    ]
)

# Add annotation
fig6.update_layout(
    annotations=[
        dict(text="Type:", showarrow=False,
        x=0, y=1.085, yref="paper", align="left")
    ]
)


fig6.show()

In [49]:
fig7 = px.line_polar(researcher_df.head(10).reset_index(), r='count', theta='name', line_close=True,title='Top 10 article creator')
fig7.show()