**Part 1 - Web-scraping**


In [1]:
import bs4
import requests

from fuzzywuzzy import process



In [2]:
url = "https://ic2s2-2023.org/program"

response = requests.get(url)
soup = bs4.BeautifulSoup(response.content, 'html.parser')

#get content in section with id "main"
main = soup.find_all('section', id='main')

#get the list of each <\i> tag in the main section
authors_list = []

for section in main:
    #find italic tags since authors are in italic
    items = section.find_all('i')
    for item in items:
        authors = item.text
        for author in authors.split(","):
            author_strip = author.strip()
            #remove Chair: in the beginning of the name if it exists
            if author_strip.startswith("Chair:"):
                author_strip = author_strip[6:]
            authors_list.append(author_strip)

#remove duplicates
authors_list = list(set(authors_list))
print(f"initial length of authors list: {len(authors_list)}")

initial length of authors list: 1524


In [3]:
#use fuzzywuzzy to remove similar authors to further prevent misspelling and duplicates
def remove_similar_authors(authors, similarity_threshold = 85):
    unique_authors = []
    for author in authors:
        # Check if the author is too similar to any already in unique_authors
        if not unique_authors:
            unique_authors.append(author)
        else:
            similarities = [process.extractOne(author, [ua])[1] for ua in unique_authors]
            if all(sim < similarity_threshold for sim in similarities):
                unique_authors.append(author)
    return unique_authors

# #sort authors list
# authors_list.sort()

# #split authors list into lists of n authors to speed up the process
# n = 100
# split_authors = [authors_list[i:i + n] for i in range(0, len(authors_list), n)]
# cleaned_authors = []
# for split in split_authors:
#     cleaned_authors += remove_similar_authors(split)

# print(f"length of cleaned authors: {len(cleaned_authors)}")
# # Save the list to a file with UTF-8 encoding to handle special characters
# with open("authors.txt", "w", encoding="utf-8") as f:
#     for author in cleaned_authors:
#         f.write(author + "\n")

1. How many researchers did you get?\
Initially, we retrieved 1524 authors, After cleaning the list of authors, we retrieved 1394 authors

2. Explain the process you followed to web-scrape the page \
For scraping, we noticed authors names were italicized so we retrieved this text. This also contained people with the Chair: tag in front, so we removed tag this by a simple comparison while retaining the name in the set. After this we used fuzzywuzzy to extract and compare similar author names. This removes duplicates as a result of misspellings or alternative names and such.

**Part 2 - Ready Made vs Custom Made Data**
1. What are the pros and cons \
...
2. How can these differences influence interpretation (max 150. words)\
...

**Part 3 - Gathering Research Articles using the OpenAlex API**

In [4]:
import pandas as pd
import tqdm
import concurrent.futures
import ast

In [5]:
#load author txt
def load_author_txt(file_path):
    with open(file_path, 'r', encoding="utf-8") as f:
        lines = f.readlines()
        authors = []
        for line in lines:
            line = line.strip()
            if len(line) > 0:
                authors.append(line)
    return authors

authors = load_author_txt('authors.txt')

In [6]:
endpoint = "https://api.openalex.org/authors?search="


def get_author_response(author):
    url = endpoint + author
    response = requests.get(url)
    if response.status_code == 200:
        return response.json()
    else:
        return None

#helper function to get the concepts from the author dictionary and the intercection if both required concept groups are present
def get_concepts(author_dict):
    concepts = [[c.get('display_name') for c in author_dict.get('results')[a].get('x_concepts')] for a in range(len(author_dict.get('results')))]
    return concepts

def get_best_match(concepts):
    target_1 = ['Sociology', 'Psychology', 'Economics', 'Political Science']
    target_2 = ['Mathematics', 'Physics', 'Computer Science']

    for i, author in enumerate(concepts):
        #get authors with concepts in target_1 AND target_2
        if (x in author for x in target_1) and (x in author for x in target_2):
            return i
    
    return 0

def get_info(auth_info):
    concepts = get_concepts(auth_info)
    c_index = get_best_match(concepts)
    
    #get their id
    author_id = auth_info.get('results')[c_index].get('id')[-11:]
    #get their display_name
    display_name = auth_info.get('results')[c_index].get('display_name')
    #get their works_api_url
    works_api_url = "https://api.openalex.org/works?filter=author.id:" + author_id
    #get their h_index
    h_index = auth_info.get('results')[c_index].get('summary_stats').get('h_index')
    #get their works count
    works_count = auth_info.get('results')[c_index].get('works_count')
    #get their cited_by_count
    cited_by_count = auth_info.get('results')[c_index].get('cited_by_count')
    
    #get their country code
    country_code = auth_info.get('results')[c_index].get('last_known_institution').get('country_code')

    return {
        'author_id': author_id,
        'display_name': display_name,
        'works_api_url': works_api_url,
        'h_index': h_index,
        'works_count': works_count,
        'cited_by_count': cited_by_count,
        'country_code': country_code
    }

def get_author_df(authors,*, n_workers = 3, load_if_exists = True, save_path = 'authors_df.csv'):
    
    if load_if_exists:
        try:
            authors_df = pd.read_csv(save_path)
            print(f'Loaded {len(authors_df)} authors from {save_path}')
            return authors_df
        except:
            print(f'Failed to load {save_path}, will create a new one..')
            
    authors_df = pd.DataFrame(columns=['author_id', 'display_name', 'works_api_url', 'h_index', 'works_count', 'cited_by_count', 'country_code'])
    err_authors = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=n_workers) as executor:
        future_to_author = {executor.submit(get_author_response, author): author for author in authors}
        futures = tqdm.tqdm(concurrent.futures.as_completed(future_to_author), total=len(authors))
        for future in futures:
            author = future_to_author[future]
            try:
                res = future.result()
                if res:
                    try:
                        info = get_info(res)
                        authors_df = pd.concat([authors_df, pd.DataFrame([info])])
                    except:
                        err_authors.append(author)
                else:
                    err_authors.append(author)
            except Exception as e:
                err_authors.append(author)
    print(f'Done!, Failed authors: {len(err_authors)}')
    #save to csv

    authors_df.to_csv(save_path, index=False)
    return authors_df

authors_df = get_author_df(authors)

Loaded 1237 authors from authors_df.csv


**Part 4 - The Network of Computational Social Scientists**