In [10]:
import pandas as pd
import numpy as np
import math

import time

from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

import os

## Webscraping of the information regarding articles:

In [2]:
def webscraping_wos_range(website_path, start_article, end_article):
    """
    Scrapes scientific articles from Web of Science within a specified range.

    Parameters:
    - website_path (str): The URL to the Web of Science search results page.
    - start_article (int): The starting article number for scraping.
    - end_article (int): The ending article number for scraping.

    Description:
    This function automates the web scraping process of scientific articles from Web of Science.
    It allows you to specify a range of articles you want to scrape (e.g., articles 5000 to 6000).
    The function will visit the provided website, reject cookies, and then scrape articles within
    the specified range.

    Note:
    - Ensure you have the ChromeDriver installed and its path specified in the function.
    - The function interacts with the Web of Science website, so it may need adjustments
      if the website's structure changes.
    - When you are scraping too many articles the function might stop because it asks to prove that
      you are human.

    Example Usage:
    website_path = "URL_TO_WEB_OF_SCIENCE_SEARCH_RESULTS"
    start_article = 5000
    end_article = 6000
    webscraping_wos_range(website_path, start_article, end_article)
    """

    # path for chrome driver
    path = Service(ChromeDriverManager().install())

    # create the driver
    driver = webdriver.Chrome(service = path)

    # open chromedriver window
    driver.get(website_path)
    time.sleep(10)
    
    # reject cookies
    driver.find_element('xpath', '//*[@id="onetrust-reject-all-handler"]').click()
    
    # calculate total number of publications, nem é bem preciso para nada
    n_publications = int(driver.find_element(By.CLASS_NAME, "brand-blue").text.replace(',', ''))
    
    for page in range(math.ceil((end_article - start_article) / 500)):

        time.sleep(2)
        
        # calculate the range of articles to scrape on the current page
        page_start_article = page * 500 + 1 + start_article
        page_end_article = min((page + 1) * 500 + start_article, end_article)
        
        try:
            # open "Export" options
            driver.find_element('xpath', '//*[@id="snRecListTop"]/app-export-menu/div/button').click()

            time.sleep(2)

            print(f"Scraping articles {page_start_article} to {page_end_article}")
            
            # export to tab-delimited file
            driver.find_element('xpath', '//*[@id="exportToTabWinButton"]').click()

            # export to bibtex file
            #driver.find_element('xpath', '//*[@id="exportToBibtexButton"]').click()

        except:
            print(f'Page number {page_start_article} until {page_end_article} was not exported')
            continue
        
        time.sleep(3)
        driver.find_element('xpath', '//*[@id="radio3"]/label/span[1]/span[2]').click()
        # choose range of papers to export
        # if page == 0:
        #     driver.find_element('xpath', '//*[@id="radio3"]/label/span[1]/span[2]').click()
        # else:
        #     driver.find_element('xpath', '//*[@id="radio3"]/label/span[1]/span[1]').click()

            #//*[@id="radio3"]/label/span[1]/span[2]
        #driver.find_element(By.CLASS_NAME, "mat-radio-inner-circle")
      
        time.sleep(1)
        
        # starting range
        driver.find_element(By.NAME, "markFrom").clear()
        time.sleep(0.5)
        driver.find_element(By.NAME, "markFrom").send_keys(page_start_article)
        time.sleep(0.5)
        
        # ending range
        driver.find_element(By.NAME, "markTo").clear()
        time.sleep(0.5)
        driver.find_element(By.NAME, "markTo").send_keys(page_end_article)
        time.sleep(0.5)
        
        # dropdown "Record Content:"
        driver.find_element('xpath', '/html/body/app-wos/main/div/div/div[2]/div/div/div[2]/app-input-route[1]/app-export-overlay/div/div[3]/div[2]/app-export-out-details/div/div[2]/form/div/div[1]/wos-select/button').click()

        time.sleep(2)
        
        # choose the record content to be "Full Record and Cited References"
        driver.find_element('xpath', '//*[@id="global-select"]/div/div/div[4]/span').click()

        time.sleep(2)
        
        # click export
        driver.find_element('xpath', '/html/body/app-wos/main/div/div/div[2]/div/div/div[2]/app-input-route[1]/app-export-overlay/div/div[3]/div[2]/app-export-out-details/div/div[2]/form/div/div[2]/button[1]').click()

        time.sleep(13)
    
    # close the chrome window
    driver.quit()

In [3]:
# 2022
# define website to do scraping
# website = "https://www.webofscience.com/wos/woscc/summary/5da50233-0285-43be-8a71-7988704ec650-b14b1657/relevance/1"
# webscraping_wos_range(website, 0, 2239)

In [4]:
# 2021
# define website to do scraping
# website = "https://www.webofscience.com/wos/woscc/summary/91e4b578-c1f9-49cc-bd35-86dbfd429de1-b1766607/relevance/1"
# webscraping_wos_range(website, 0, 1817)

In [5]:
# 2020
# define website to do scraping
# website = "https://www.webofscience.com/wos/woscc/summary/bee91581-b996-4fc4-90e1-c4146250e604-b176f092/relevance/1"
# webscraping_wos_range(website, 0, 1596)

In [6]:
def webscraping_wos_range(website_path, first_year, last_year):
    """
    Scrapes scientific articles from Web of Science within a specified range.

    Parameters:
    - website_path (str): The URL to the Web of Science search results page.
    - start_article (int): The starting article number for scraping.
    - end_article (int): The ending article number for scraping.

    Description:
    This function automates the web scraping process of scientific articles from Web of Science.
    It allows you to specify a range of articles you want to scrape (e.g., articles 5000 to 6000).
    The function will visit the provided website, reject cookies, and then scrape articles within
    the specified range.

    Note:
    - Ensure you have the ChromeDriver installed and its path specified in the function.
    - The function interacts with the Web of Science website, so it may need adjustments
      if the website's structure changes.
    - When you are scraping too many articles the function might stop because it asks to prove that
      you are human.

    Example Usage:
    website_path = "URL_TO_WEB_OF_SCIENCE_SEARCH_RESULTS"
    start_article = 5000
    end_article = 6000
    webscraping_wos_range(website_path, start_article, end_article)
    """

    # path for chrome driver
    path = Service(ChromeDriverManager().install())

    # create the driver
    driver = webdriver.Chrome(service = path)

    # open chromedriver window
    driver.get(website_path)
    time.sleep(10)
    
    # reject cookies
    driver.find_element('xpath', '//*[@id="onetrust-reject-all-handler"]').click()
    
    id = 1
    for year in range(first_year, last_year+1):
        # click on search bar
        driver.find_element('xpath', '/html/body/app-wos/main/div/div/div[2]/div/div/div[2]/app-input-route/app-base-summary-component/app-search-friendly-display/div[1]/app-general-search-friendly-display/app-query-modifier/div/div[1]/div').click()

        # select correct year
        driver.find_element(By.CSS_SELECTOR,'input[aria-label="Search box 2"]').clear()
        time.sleep(0.5)
        driver.find_element(By.CSS_SELECTOR,'input[aria-label="Search box 2"]').send_keys(year)

        # click search 
        driver.find_element('xpath', '//*[@id="snSearchType"]/div[5]/button[2]/span[1]').click()
        time.sleep(3)

        # calculate total number of publications, nem é bem preciso para nada
        n_publications = int(driver.find_element(By.CLASS_NAME, "brand-blue").text.replace(',', ''))
        
        for page in range(math.ceil((n_publications - 0) / 500)):

            time.sleep(2)
            
            # calculate the range of articles to scrape on the current page
            page_start_article = page * 500 + 1 + 0
            page_end_article = min((page + 1) * 500 + 0, n_publications)
            
            try:
                # open "Export" options
                driver.find_element('xpath', '//*[@id="snRecListTop"]/app-export-menu/div/button').click()

                time.sleep(2)

                print(f"Scraping articles {page_start_article} to {page_end_article}")
                
                # export to tab-delimited file
                driver.find_element('xpath', '//*[@id="exportToTabWinButton"]').click()

                # export to bibtex file
                #driver.find_element('xpath', '//*[@id="exportToBibtexButton"]').click()

            except:
                print(f'Page number {page_start_article} until {page_end_article} was not exported')
                continue
            
            time.sleep(3)
            driver.find_element('xpath', '//*[@id="radio3"]/label/span[1]/span[2]').click()
            # choose range of papers to export
            # if page == 0:
            #     driver.find_element('xpath', '//*[@id="radio3"]/label/span[1]/span[2]').click()
            # else:
            #     driver.find_element('xpath', '//*[@id="radio3"]/label/span[1]/span[1]').click()

                #//*[@id="radio3"]/label/span[1]/span[2]
            #driver.find_element(By.CLASS_NAME, "mat-radio-inner-circle")
        
            time.sleep(1)
            
            # starting range
            driver.find_element(By.NAME, "markFrom").clear()
            time.sleep(0.5)
            driver.find_element(By.NAME, "markFrom").send_keys(page_start_article)
            time.sleep(0.5)
            
            # ending range
            driver.find_element(By.NAME, "markTo").clear()
            time.sleep(0.5)
            driver.find_element(By.NAME, "markTo").send_keys(page_end_article)
            time.sleep(0.5)
            
            # dropdown "Record Content:"
            driver.find_element('xpath', '/html/body/app-wos/main/div/div/div[2]/div/div/div[2]/app-input-route[1]/app-export-overlay/div/div[3]/div[2]/app-export-out-details/div/div[2]/form/div/div[1]/wos-select/button').click()

            time.sleep(2)
            
            # choose the record content to be "Full Record and Cited References"
            driver.find_element('xpath', '//*[@id="global-select"]/div/div/div[4]/span').click()

            time.sleep(2)
            
            # click export
            driver.find_element('xpath', '/html/body/app-wos/main/div/div/div[2]/div/div/div[2]/app-input-route[1]/app-export-overlay/div/div[3]/div[2]/app-export-out-details/div/div[2]/form/div/div[2]/button[1]').click()

            time.sleep(13)
        id +=1
        
    # close the chrome window
    driver.quit()

In [8]:
website = "https://www.webofscience.com/wos/woscc/summary/91e4b578-c1f9-49cc-bd35-86dbfd429de1-b1766607/relevance/1"
webscraping_wos_range(website, 2011, 2019)

Scraping articles 1 to 500
Scraping articles 501 to 1000
Scraping articles 1001 to 1500
Scraping articles 1501 to 1634
Scraping articles 1 to 500
Scraping articles 501 to 909
Scraping articles 1 to 500
Scraping articles 501 to 1000
Scraping articles 1001 to 1028
Scraping articles 1 to 500
Scraping articles 501 to 1000
Scraping articles 1001 to 1004
Scraping articles 1 to 500
Scraping articles 501 to 1000
Scraping articles 1001 to 1284
Scraping articles 1 to 500
Scraping articles 501 to 1000
Scraping articles 1001 to 1240
Scraping articles 1 to 500
Scraping articles 501 to 1000
Scraping articles 1001 to 1319
Scraping articles 1 to 500
Scraping articles 501 to 1000
Scraping articles 1001 to 1496
Scraping articles 1 to 500
Scraping articles 501 to 1000
Scraping articles 1001 to 1500
Scraping articles 1501 to 1535


In [9]:
webscraping_wos_range(website, 2000, 2009)

Scraping articles 1 to 163
Scraping articles 1 to 172
Scraping articles 1 to 206
Scraping articles 1 to 246
Scraping articles 1 to 305
Scraping articles 1 to 398
Scraping articles 1 to 500
Scraping articles 501 to 536
Scraping articles 1 to 500
Scraping articles 501 to 668
Scraping articles 1 to 500
Scraping articles 501 to 716
Scraping articles 1 to 500
Scraping articles 501 to 1000
Scraping articles 1001 to 1129


In [16]:
webscraping_wos_range(website, 1990, 1999)

Scraping articles 1 to 29
Scraping articles 1 to 33
Scraping articles 1 to 24
Scraping articles 1 to 34
Scraping articles 1 to 43
Scraping articles 1 to 56
Scraping articles 1 to 54
Scraping articles 1 to 64
Scraping articles 1 to 84
Scraping articles 1 to 90


In [17]:
webscraping_wos_range(website, 1980, 1989)

Scraping articles 1 to 16
Scraping articles 1 to 6
Scraping articles 1 to 10
Scraping articles 1 to 7
Scraping articles 1 to 3
Scraping articles 1 to 6
Scraping articles 1 to 12
Scraping articles 1 to 15
Scraping articles 1 to 14
Scraping articles 1 to 11


In [18]:
downloads_folder =  os.path.expanduser('~') + '/Downloads' #"C:/Users/isabe/Downloads"
count = 0

# List all files in the Downloads folder
for filename in os.listdir(downloads_folder):
    # Check if the filename contains "savedrecs" and ends with ".csv"
    if "savedrecs" in filename and filename.endswith(".txt"):
        count += 1

print(f"Number of 'savedrecs' CSV files in Downloads: {count}")

Number of 'savedrecs' CSV files in Downloads: 79


In [27]:
retractions_data = pd.DataFrame()
for file in range(count):
    if file == 0:
        data = pd.read_csv("C:/Users/isabe/Downloads/savedrecs.txt", sep = '\t')
        retractions_data = pd.concat([retractions_data, data], ignore_index= True)
    else:
        data = pd.read_csv(f"C:/Users/isabe/Downloads/savedrecs ({file}).txt", sep = '\t')
        retractions_data = pd.concat([retractions_data, data], ignore_index= True)

In [31]:
rename_columns = {
    "FN": "File Name",
    "VR": "Version Number",
    "PT": "Publication Type", # (J=Journal; B=Book; S=Series; P=Patent)
    "AU": "Authors",
    "AF": "Author Full Name",
    "BA": "Book Authors",
    "BF": "Book Authors Full Name",
    "CA": "Group Authors",
    "GP": "Book Group Authors",
    "BE": "Editors",
    "TI": "Document Title",
    "SO": "Publication Name",
    "SE": "Book Series Title",
    "BS": "Book Series Subtitle",
    "LA": "Language",
    "DT": "Document Type",
    "CT": "Conference Title",
    "CY": "Conference Date",
    "CL": "Conference Location",
    "SP": "Conference Sponsors",
    "HO": "Conference Host",
    "DE": "Author Keywords",
    "ID": "Keywords Plus",
    "AB": "Abstract",
    "C1": "Author Address",
    "RP": "Reprint Address",
    "EM": "E-mail Address",
    "RI": "ResearcherID Number",
    "OI": "ORCID Identifier (Open Researcher and Contributor ID)",
    "FU": "Funding Agency and Grant Number",
    "FX": "Funding Text",
    "CR": "Cited References",
    "NR": "Cited Reference Count",
    "TC": "Web of Science Core Collection Times Cited Count",
    "Z9": "Total Times Cited Count",
    "U1": "Usage Count (Last 180 Days)",
    "U2": "Usage Count (Since 2013)",
    "PU": "Publisher",
    "PI": "Publisher City",
    "PA": "Publisher Address",
    "SN": "International Standard Serial Number (ISSN)",
    "EI": "Electronic International Standard Serial Number (eISSN)",
    "BN": "International Standard Book Number (ISBN)",
    "J9": "29-Character Source Abbreviation",
    "JI": "ISO Source Abbreviation",
    "PD": "Publication Date",
    "PY": "Year Published",
    "VL": "Volume",
    "IS": "Issue",
    "SI": "Special Issue",
    "PN": "Part Number",
    "SU": "Supplement",
    "MA": "Meeting Abstract",
    "BP": "Beginning Page",
    "EP": "Ending Page",
    "AR": "Article Number",
    "DI": "Digital Object Identifier (DOI)",
    "D2": "Book Digital Object Identifier (DOI)",
    "EA": "Early access date",
    "EY": "Early access year",
    "PG": "Page Count",
    "P2": "Chapter Count (Book Citation Index)",
    "WC": "Web of Science Categories",
    "SC": "Research Areas",
    "GA": "Document Delivery Number",
    "PM": "PubMed ID",
    "UT": "Accession Number",
    "OA": "Open Access Indicator",
    "HP": "ESI Hot Paper", # Note that this field is valued only for ESI subscribers.
    "HC": "ESI Highly Cited Paper", # Note that this field is valued only for ESI subscribers.
    "DA": "Date this report was generated",
    "ER": "End of Record",
    "EF": "End of File"
}

retractions_data.rename(columns = rename_columns, inplace = True)
retractions_data.columns

Index(['Publication Type', 'Authors', 'Book Authors', 'Editors',
       'Book Group Authors', 'Author Full Name', 'Book Authors Full Name',
       'Group Authors', 'Document Title', 'Publication Name',
       'Book Series Title', 'Book Series Subtitle', 'Language',
       'Document Type', 'Conference Title', 'Conference Date',
       'Conference Location', 'Conference Sponsors', 'Conference Host',
       'Author Keywords', 'Keywords Plus', 'Abstract', 'Author Address', 'C3',
       'Reprint Address', 'E-mail Address', 'ResearcherID Number',
       'ORCID Identifier (Open Researcher and Contributor ID)',
       'Funding Agency and Grant Number', 'FP', 'Funding Text',
       'Cited References', 'Cited Reference Count',
       'Web of Science Core Collection Times Cited Count',
       'Total Times Cited Count', 'Usage Count (Last 180 Days)',
       'Usage Count (Since 2013)', 'Publisher', 'Publisher City',
       'Publisher Address', 'International Standard Serial Number (ISSN)',
       '

In [32]:
retractions_data

Unnamed: 0,Publication Type,Authors,Book Authors,Editors,Book Group Authors,Author Full Name,Book Authors Full Name,Group Authors,Document Title,Publication Name,...,Web of Science Categories,WE,Research Areas,Document Delivery Number,PubMed ID,Open Access Indicator,ESI Highly Cited Paper,ESI Hot Paper,Date this report was generated,Accession Number
0,J,"Qiang, RL; Guang, LF; Hua, SY; Maia, D",,,,"Qiang, Ran Li; Guang, Li Fa; Hua, Sun Yan; Mai...",,,RETRACTED: Art Research Based on Machine Learn...,SECURITY AND COMMUNICATION NETWORKS,...,"Computer Science, Information Systems; Telecom...",Science Citation Index Expanded (SCI-EXPANDED),Computer Science; Telecommunications,1M9VB,,gold,,,2023-11-05,WOS:000800311400007
1,J,"Fan, L; Xia, MY; Huang, P; Hu, JM",,,,"Fan, Ling; Xia, Meiyi; Huang, Ping; Hu, Jianmin",,,RETRACTED: Research on Educational Information...,SECURITY AND COMMUNICATION NETWORKS,...,"Computer Science, Information Systems; Telecom...",Science Citation Index Expanded (SCI-EXPANDED),Computer Science; Telecommunications,UW7ON,,gold,,,2023-11-05,WOS:000700342400003
2,J,"Peng, AH; Liu, CW; Yang, L",,,,"Peng, Anhua; Liu, Chengwen; Yang, Le",,,RETRACTED: Intelligent electrical equipment fi...,EURASIP JOURNAL ON ADVANCES IN SIGNAL PROCESSING,...,"Engineering, Electrical & Electronic",Science Citation Index Expanded (SCI-EXPANDED),Engineering,UU7XW,,gold,,,2023-11-05,WOS:000699010400001
3,J,"Cui, LH",,,,"Cui, Lianhe",,,RETRACTED: Research on English translation dis...,EURASIP JOURNAL ON IMAGE AND VIDEO PROCESSING,...,"Engineering, Electrical & Electronic; Imaging ...",Science Citation Index Expanded (SCI-EXPANDED),Engineering; Imaging Science & Photographic Te...,HJ4MN,,gold,,,2023-11-05,WOS:000457148200002
4,J,"Zhu, SS",,,,"Zhu, Shanshan",,,RETRACTED: Real-time detection of aerobics pos...,EURASIP JOURNAL ON ADVANCES IN SIGNAL PROCESSING,...,"Engineering, Electrical & Electronic",Science Citation Index Expanded (SCI-EXPANDED),Engineering,XA5VI,,gold,,,2023-11-05,WOS:000720713700001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23553,J,"GROSSARTHMATICEK, R; EYSENCK, HJ",,,,"GROSSARTHMATICEK, R; EYSENCK, HJ",,,RETRACTED: LENGTH OF SURVIVAL AND LYMPHOCYTE P...,PSYCHOLOGICAL REPORTS,...,"Psychology, Multidisciplinary",Social Science Citation Index (SSCI),Psychology,AN942,2780939.0,,,,2023-11-05,WOS:A1989AN94200055
23554,J,"CHANDRA, RK; PURI, S; HAMED, A",,,,"CHANDRA, RK; PURI, S; HAMED, A",,,RETRACTED: INFLUENCE OF MATERNAL DIET DURING L...,BMJ-BRITISH MEDICAL JOURNAL,...,"Medicine, General & Internal",Science Citation Index Expanded (SCI-EXPANDED),General & Internal Medicine,AG617,2504375.0,"Green Published, hybrid",,,2023-11-05,WOS:A1989AG61700016
23555,J,"AITKEN, JC; THOMPSON, J",,,,"AITKEN, JC; THOMPSON, J",,,RETRACTED: THE EFFECTS OF DIETARY MANIPULATION...,EUROPEAN JOURNAL OF APPLIED PHYSIOLOGY,...,Physiology; Sport Sciences,Science Citation Index Expanded (SCI-EXPANDED),Physiology; Sport Sciences,U8775,2737192.0,,,,2023-11-05,WOS:A1989U877500008
23556,J,"BOLDT, J; KLING, D; VONBORMANN, B; ZUGE, M; SC...",,,,"BOLDT, J; KLING, D; VONBORMANN, B; ZUGE, M; SC...",,,RETRACTED: BLOOD CONSERVATION IN CARDIAC OPERA...,JOURNAL OF THORACIC AND CARDIOVASCULAR SURGERY,...,Cardiac & Cardiovascular Systems; Respiratory ...,Science Citation Index Expanded (SCI-EXPANDED),Cardiovascular System & Cardiology; Respirator...,AA700,2786116.0,,,,2023-11-05,WOS:A1989AA70000004


In [33]:
retractions_data.to_csv('./retractions_data/wos_retractions_data.csv', index= False)

### Importing the information to python

In [160]:
# def import_text_files_as_dataframe(folder_path, prefix = ''):
#     """
#     Import all text files from a folder whose names start with a specific prefix
#     and concatenate them into a single pandas DataFrame.

#     Parameters:
#     - folder_path (str): The path to the folder containing the text files.
#     - prefix (str, optional): The prefix that the filenames should start with. Default is an empty string.

#     Returns:
#     - pandas.DataFrame: A DataFrame containing the content of all matching text files.

#     Description:
#     This function reads all text files within a specified folder that have filenames
#     starting with the provided prefix. It imports the content of these files and
#     concatenates them into a single pandas DataFrame.

#     - folder_path: The path to the folder containing the text files.
#     - prefix: The prefix that the filenames should start with. If not provided, all text files
#       in the folder will be imported.

#     Example Usage:
#     folder_path = '/path/to/your/folder'
#     prefix = 'your_prefix'
#     df = import_text_files_as_dataframe(folder_path, prefix)
#     """

#     files_in_folder = os.listdir(folder_path)
#     matching_files = [file for file in files_in_folder if file.startswith(prefix) and file.endswith('.txt')]

#     if not matching_files:
#         print("No matching files found.")
#         return None

#     concat_df = pd.DataFrame()
#     for file_name in matching_files:
#         file_path = os.path.join(folder_path, file_name)
#         df = pd.read_csv(file_path, sep = '\t')
#         concat_df = pd.concat([concat_df, df])
        
#     return concat_df

In [161]:
# folder_path = os.path.join(work_path, "WoS Electric Car tsv")
# prefix = 'savedrecs'

# df = import_text_files_as_dataframe(folder_path, prefix)
# df.head()

In [162]:
# rename_columns = {
#     "FN": "File Name",
#     "VR": "Version Number",
#     "PT": "Publication Type", # (J=Journal; B=Book; S=Series; P=Patent)
#     "AU": "Authors",
#     "AF": "Author Full Name",
#     "BA": "Book Authors",
#     "BF": "Book Authors Full Name",
#     "CA": "Group Authors",
#     "GP": "Book Group Authors",
#     "BE": "Editors",
#     "TI": "Document Title",
#     "SO": "Publication Name",
#     "SE": "Book Series Title",
#     "BS": "Book Series Subtitle",
#     "LA": "Language",
#     "DT": "Document Type",
#     "CT": "Conference Title",
#     "CY": "Conference Date",
#     "CL": "Conference Location",
#     "SP": "Conference Sponsors",
#     "HO": "Conference Host",
#     "DE": "Author Keywords",
#     "ID": "Keywords Plus",
#     "AB": "Abstract",
#     "C1": "Author Address",
#     "RP": "Reprint Address",
#     "EM": "E-mail Address",
#     "RI": "ResearcherID Number",
#     "OI": "ORCID Identifier (Open Researcher and Contributor ID)",
#     "FU": "Funding Agency and Grant Number",
#     "FX": "Funding Text",
#     "CR": "Cited References",
#     "NR": "Cited Reference Count",
#     "TC": "Web of Science Core Collection Times Cited Count",
#     "Z9": "Total Times Cited Count",
#     "U1": "Usage Count (Last 180 Days)",
#     "U2": "Usage Count (Since 2013)",
#     "PU": "Publisher",
#     "PI": "Publisher City",
#     "PA": "Publisher Address",
#     "SN": "International Standard Serial Number (ISSN)",
#     "EI": "Electronic International Standard Serial Number (eISSN)",
#     "BN": "International Standard Book Number (ISBN)",
#     "J9": "29-Character Source Abbreviation",
#     "JI": "ISO Source Abbreviation",
#     "PD": "Publication Date",
#     "PY": "Year Published",
#     "VL": "Volume",
#     "IS": "Issue",
#     "SI": "Special Issue",
#     "PN": "Part Number",
#     "SU": "Supplement",
#     "MA": "Meeting Abstract",
#     "BP": "Beginning Page",
#     "EP": "Ending Page",
#     "AR": "Article Number",
#     "DI": "Digital Object Identifier (DOI)",
#     "D2": "Book Digital Object Identifier (DOI)",
#     "EA": "Early access date",
#     "EY": "Early access year",
#     "PG": "Page Count",
#     "P2": "Chapter Count (Book Citation Index)",
#     "WC": "Web of Science Categories",
#     "SC": "Research Areas",
#     "GA": "Document Delivery Number",
#     "PM": "PubMed ID",
#     "UT": "Accession Number",
#     "OA": "Open Access Indicator",
#     "HP": "ESI Hot Paper", # Note that this field is valued only for ESI subscribers.
#     "HC": "ESI Highly Cited Paper", # Note that this field is valued only for ESI subscribers.
#     "DA": "Date this report was generated",
#     "ER": "End of Record",
#     "EF": "End of File"
# }

# df.rename(columns = rename_columns, inplace = True)
# df.columns

In [163]:
# df.shape

In [164]:
# df.drop_duplicates(subset = "Accession Number", inplace = True)
# df.shape

In [165]:
# # saving to csv the dataframe to later use

# csv_path = os.path.join(work_path, "WoSElectricCarData.csv")
# df.to_csv(csv_path, index = False)

## Analysis

In [97]:
# # importing the data from the csv file

# csv_path = os.path.join(work_path, "WoSElectricCarData.csv")

# df = pd.read_csv(csv_path)
# df.head()

In [98]:
# df.info()

In [99]:
# df['Year Published'].describe()

In [100]:
# df_year = df.groupby("Year Published", as_index = False).count()[["Year Published", "Publication Name"]]
# df_year

In [101]:
# # fig = px.line(df_year, x = 'Year Published', y = 'Publication Name', markers = True, text = 'Publication Name', title = "Number of Articles by Year", labels = {"Publication Name": "", "Year Published": "Year"}, width = 800, height = 600)
# # fig.update_traces(textposition = 'top center')

# fig = px.line(df_year, x = 'Year Published', y = 'Publication Name', markers = True, title = "Number of Articles by Year", labels = {"Publication Name": "", "Year Published": "Year"}, width = 800, height = 600)

# number_articles_year_path = os.path.join(work_path, "NumberArticlesYear.pdf")
# fig.write_image(number_articles_year_path)

# fig.show()

In [102]:
# # importing the data from the csv file

# csv_path = os.path.join(work_path, "WoS_Rdata.csv")

# df = pd.read_csv(csv_path)
# df.head()

In [103]:
# df.columns

In [104]:
# df['PY'].describe()

In [105]:
# df_year = df.groupby("PY", as_index = False).count()[["PY", "TI"]]
# df_year

In [106]:
# # fig = px.line(df_year, x = 'Year Published', y = 'Publication Name', markers = True, text = 'Publication Name', title = "Number of Articles by Year", labels = {"Publication Name": "", "Year Published": "Year"}, width = 800, height = 600)
# # fig.update_traces(textposition = 'top center')

# fig = px.line(df_year, x = 'PY', y = 'TI', markers = True, title = "Number of Articles by Year", labels = {"TI": "", "PY": "Year"}, width = 800, height = 600)

# number_articles_year_path = os.path.join(work_path, "NumberArticlesYear.pdf")
# fig.write_image(number_articles_year_path)

# fig.show()