## 1. Download the webpage

In [1]:
import requests

In [7]:
# Page whose table of Scopus subject areas is scraped later in this notebook.
url = "https://www.ilovephd.com/list-of-subject-areas-covered-by-scopus-database/"
# requests applies no timeout by default, so an unresponsive server would hang
# this cell forever; cap the wait (in seconds) instead.
response = requests.get(url, timeout=30)
if response.status_code == 200:
    print("Downloaded the page successfully")
else:
    print("Failed to download the page")
    print(f"Status code: {response.status_code}")

Downloaded the page successfully


In [8]:
# Abort instead of saving an error page as raw data: raise_for_status() raises
# requests.HTTPError when the server answered with a 4xx/5xx status.
response.raise_for_status()
# Keep an on-disk copy of the downloaded page.
with open("../../data/raw/classification_codes_raw.html", "w") as file:
    file.write(response.text)

## 2. Scrape the webpage

### 2.1 Turn the webpage into a dataframe

In [13]:
from bs4 import BeautifulSoup
import pandas as pd
import re

In [14]:
# Parse the saved copy of the page. The parser is named explicitly: without it
# BeautifulSoup picks whichever parser happens to be installed (and emits a
# GuessedAtParserWarning), so the parse tree could differ between machines.
with open("../../data/raw/classification_codes_raw.html") as file:
    soup = BeautifulSoup(file, "html.parser")

In [15]:
# Find the first text node mentioning "Supergroup", then take the closest
# <table> that encloses it. Failing loudly here prevents later cells from
# crashing on an undefined `table` or silently reusing one left over from a
# previous run of the kernel.
supergroup_label = soup.find(string=re.compile("Supergroup"))
if supergroup_label is None:
    raise ValueError('No text containing "Supergroup" was found in the page')

table = supergroup_label.find_parent("table")
if table is None:
    raise ValueError('The "Supergroup" text is not inside a <table> element')

In [16]:
# The first <tr> supplies the column names; every later <tr> becomes one row.
table_rows = table.find_all("tr")
subjects = pd.DataFrame(
    [
        [cell.text.strip() for cell in row.find_all(["th", "td"])]
        for row in table_rows[1:]
    ],
    # Strip header text exactly like the body cells: stray whitespace in a
    # header would make a later rename by exact column name silently do nothing.
    columns=[cell.text.strip() for cell in table_rows[0].find_all(["th", "td"])],
)
subjects.head()

Unnamed: 0,Scopus Code,Subject Areas,Supergroup
0,1000,Multidisciplinary,
1,1100,Agricultural and Biological Sciences,Life Sciences
2,1200,Arts and Humanities,Social Sciences
3,1300,"Biochemistry, Genetics and Molecular Biology",Life Sciences
4,1400,"Business, Management, and Accounting",Social Sciences


In [18]:
# Switch from the page's column headers to short snake_case names.
snake_case_names = {
    "Scopus Code": "code",
    "Subject Areas": "subject_area",
    "Supergroup": "supergroup",
}
subjects = subjects.rename(columns=snake_case_names)
subjects.head()

Unnamed: 0,code,subject_area,supergroup
0,1000,Multidisciplinary,
1,1100,Agricultural and Biological Sciences,Life Sciences
2,1200,Arts and Humanities,Social Sciences
3,1300,"Biochemistry, Genetics and Molecular Biology",Life Sciences
4,1400,"Business, Management, and Accounting",Social Sciences


### 2.2 Clean the data

In [19]:
# The column is reduced to a code prefix in this cell, so name it accordingly.
subjects = subjects.rename(columns={"code": "code_prefix"})

def get_code_prefix(code):
    """Return the first two characters of a classification code.

    Args:
        code: The code, normally a string such as ``"2208"``. Non-string
            values (e.g. integers) are converted with ``str()`` first.

    Returns:
        str: The first two characters of the code; input shorter than two
        characters is returned whole.
    """
    return str(code)[:2]

# Replace every full code with its prefix.
code_prefixes = subjects["code_prefix"].map(get_code_prefix)
subjects["code_prefix"] = code_prefixes

subjects.head()

Unnamed: 0,code_prefix,subject_area,supergroup
0,10,Multidisciplinary,
1,11,Agricultural and Biological Sciences,Life Sciences
2,12,Arts and Humanities,Social Sciences
3,13,"Biochemistry, Genetics and Molecular Biology",Life Sciences
4,14,"Business, Management, and Accounting",Social Sciences


In [20]:
# List the distinct subject area names for a visual check.
subjects["subject_area"].unique()

array(['Multidisciplinary', 'Agricultural and Biological Sciences',
       'Arts and Humanities',
       'Biochemistry, Genetics and Molecular Biology',
       'Business, Management, and Accounting', 'Chemical Engineering',
       'Chemistry', 'Computer Science', 'Decision Sciences',
       'Earth and Planetary Sciences',
       'Economics, Econometrics and Finance', 'Energy', 'Engineering',
       'Environmental Science', 'Immunology and Microbiology',
       'Materials Science', 'Mathematics', 'Medicine', 'Neuroscience',
       'Nursing', 'Pharmacology, Toxicology, and Pharmaceutics',
       'Physics and Astronomy', 'Psychology', 'Social Sciences',
       'Veterinary', 'Dentistry', 'Health Professions'], dtype=object)

In [21]:
# List the distinct supergroup values for a visual check.
subjects["supergroup"].unique()

array(['', 'Life Sciences', 'Social Sciences', 'Physical Sciences',
       'Health Sciences'], dtype=object)

In [22]:
# Show the rows whose supergroup is the empty string.
subjects.loc[subjects["supergroup"].eq("")]

Unnamed: 0,code_prefix,subject_area,supergroup
0,10,Multidisciplinary,


In [23]:
# Label rows that have an empty supergroup as "Multidisciplinary", then
# re-check the distinct values.
blank_supergroup = subjects["supergroup"] == ""
subjects["supergroup"] = subjects["supergroup"].mask(blank_supergroup, "Multidisciplinary")
subjects["supergroup"].unique()

array(['Multidisciplinary', 'Life Sciences', 'Social Sciences',
       'Physical Sciences', 'Health Sciences'], dtype=object)

### 2.3 Join with the classification codes table to get subject abbreviations

In [30]:
# Load the classification codes table (name, code, abbreviation).
codes_csv_path = "../../data/processed/classification_codes.csv"
with open(codes_csv_path) as codes_csv:
    classification_codes = pd.read_csv(codes_csv)
classification_codes.head()

Unnamed: 0,name,code,abbreviation
0,Medicine (all),2700,MEDI
1,Electrical and Electronic Engineering,2208,ENGI
2,"Electronic, Optical and Magnetic Materials",2504,MATE
3,Chemistry (all),1600,CHEM
4,Chemical Engineering (all),1500,CENG


In [31]:
# Print each column's dtype and non-null count.
classification_codes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 321 entries, 0 to 320
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   name          321 non-null    object
 1   code          321 non-null    int64 
 2   abbreviation  321 non-null    object
dtypes: int64(1), object(2)
memory usage: 7.7+ KB


In [32]:
# Store the codes as text so they can be sliced character by character.
classification_codes = classification_codes.assign(
    code=classification_codes["code"].astype("str")
)

In [33]:
# Derive the prefix of every classification code.
classification_codes = classification_codes.assign(
    code_prefix=classification_codes["code"].map(get_code_prefix)
)
classification_codes.head()

Unnamed: 0,name,code,abbreviation,code_prefix
0,Medicine (all),2700,MEDI,27
1,Electrical and Electronic Engineering,2208,ENGI,22
2,"Electronic, Optical and Magnetic Materials",2504,MATE,25
3,Chemistry (all),1600,CHEM,16
4,Chemical Engineering (all),1500,CENG,15


In [35]:
# Build a lookup Series indexed by code prefix, keeping the first abbreviation
# found within each prefix group.
abbreviations_by_prefix = classification_codes.groupby("code_prefix")["abbreviation"]
code_prefix_to_abbreviation = abbreviations_by_prefix.first()
code_prefix_to_abbreviation.head()

code_prefix
10    MULT
11    AGRI
12    ARTS
13    BIOC
14    BUSI
Name: abbreviation, dtype: object

In [37]:
# Look up each subject's abbreviation by its code prefix. The lookup Series has
# exactly one entry per prefix, so this gives the same result as a left join
# (prefixes without a match become NaN). Unlike reassigning the result of
# merge(), re-running this cell simply overwrites the column instead of
# producing abbreviation_x / abbreviation_y duplicates.
subjects["abbreviation"] = subjects["code_prefix"].map(code_prefix_to_abbreviation)
subjects.head()

Unnamed: 0,code_prefix,subject_area,supergroup,abbreviation
0,10,Multidisciplinary,Multidisciplinary,MULT
1,11,Agricultural and Biological Sciences,Life Sciences,AGRI
2,12,Arts and Humanities,Social Sciences,ARTS
3,13,"Biochemistry, Genetics and Molecular Biology",Life Sciences,BIOC
4,14,"Business, Management, and Accounting",Social Sciences,BUSI


In [39]:
# Show any subjects that ended up without an abbreviation.
lacks_abbreviation = subjects["abbreviation"].isna()
subjects.loc[lacks_abbreviation]

Unnamed: 0,code_prefix,subject_area,supergroup,abbreviation


### 2.4 Save the dataframe to a CSV file

In [40]:
# newline="" is what pandas asks for when to_csv() receives an open file
# object; without it, text-mode newline translation on Windows turns every
# line ending into "\r\r\n", which shows up as blank rows. The encoding is
# pinned so the output does not depend on the platform default.
with open("../../data/processed/subjects.csv", "w", newline="", encoding="utf-8") as file:
    subjects.to_csv(file, index=False)