In [1]:
# EconWebscrape.ipynb
'''
    Author: Kevin Yao (email: kevinxy00@gmail.com)

    Purpose: To get abstracts from the top 10 Economics Journals,
    in terms of impact factors for the last 10 years. 
    To accomplish this, I will webscrape 
    https://ideas.repec.org/top/top.journals.simple10.html
    for links to each journal stored on the site. 
'''

# Dependencies
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import time

In [2]:
# Simple Impact Factors (Last 10 Years) for Journals:
''' Fetches the HTML from ideas.repec.org.
    This webpage contains a link to each journal,
    which we will need in order to get the abstracts.
'''

base_url = "https://ideas.repec.org"
journs_route = "/top/top.journals.simple10.html"
url = base_url + journs_route

# A timeout keeps a stalled connection from hanging the kernel forever, and
# raise_for_status() stops here on an HTTP error instead of letting the
# following cells parse an error page.
r = requests.get(url, timeout=30)
r.raise_for_status()
data = r.text  # response body as a str
soup = BeautifulSoup(data, "html.parser")  # parse the HTML text into a searchable tree

In [3]:
# Locate the element with class "toplist" and collect every anchor inside it
# that carries an href attribute.
soup_topList = soup.find(class_="toplist")
if soup_topList is None:
    # find() returns None when nothing matches; fail with a clear message
    # rather than an AttributeError on the next line.
    raise RuntimeError('No element with class "toplist" was found in the fetched page.')
soup_href = soup_topList.find_all("a", href=True)


In [11]:
# get list of links to the top 10 journals 
''' 
NOTE: (includes occasional multiple links to same journal at different times)
# e.g. https://ideas.repec.org/s/wly/emetrp.html (Econometrica from sep. 2014 to present)
vs https://ideas.repec.org/s/ecm/emetrp.html (Econometrica from 1950 to Nov. 2013)
'''
# Build the absolute URL for each of the first 14 anchors in soup_href,
# printing a 1-based counter next to every URL.
journ_ls = []
for number, entry in enumerate(range(0, 14), start=1):
    url = base_url + soup_href[entry]["href"]
    print(number, url)
    journ_ls.append(url)
    


1 https://ideas.repec.org/s/oup/qjecon.html
2 https://ideas.repec.org/s/aea/jeclit.html
3 https://ideas.repec.org/s/aea/aejmac.html
4 https://ideas.repec.org/s/wly/emetrp.html
5 https://ideas.repec.org/s/ecm/emetrp.html
6 https://ideas.repec.org/s/aea/aejapp.html
7 https://ideas.repec.org/s/ucp/jpolec.html
8 https://ideas.repec.org/s/oup/restud.html
9 https://ideas.repec.org/s/bla/jfinan.html
10 https://ideas.repec.org/s/oup/ecpoli.html
11 https://ideas.repec.org/s/bla/ecpoli.html
12 https://ideas.repec.org/s/oup/jeurec.html
13 https://ideas.repec.org/s/bla/jeurec.html
14 https://ideas.repec.org/s/tpr/jeurec.html


In [4]:
def get_article_links(url):
    '''
    Get a list of links to journal articles from the first page of a journal.

    Parameters:
        url: address of the journal page to download.

    Returns:
        A list of absolute URLs (base_url + href), one for every anchor with
        an href found inside the elements whose class is
        "list-group paperlist", in page order. Each URL is also printed
        with a running 1-based counter.
    '''
    # A timeout keeps one stalled connection from hanging the whole scrape.
    r = requests.get(url, timeout=30)
    data = r.text
    soup = BeautifulSoup(data, "html.parser")

    # the page is broken up into one list per volume/issue
    soup_issues = soup.find_all(class_="list-group paperlist")

    # turn every anchor of every issue list into an absolute URL
    number = 0
    issues_link_ls = []
    for issue in soup_issues:
        for link in issue.find_all("a", href=True):
            number = number + 1
            # use a separate name so the `url` parameter is not overwritten
            article_url = base_url + link["href"]
            print(number, article_url)
            issues_link_ls.append(article_url)

    return issues_link_ls


In [None]:
''' 
# Test for above section: for journal number 1 (Quarterly Journal of Economics),
# get list of all hrefs
number = 0
issues_link_ls = []
for issue in soup_issues:
    soup_issues_link = issue.find_all("a", href=True)
    for link in soup_issues_link:
        number = number + 1
        base_link = link["href"]
        url = base_url + base_link
        print(number, url)
        issues_link_ls.append(url)
'''

In [None]:
'''
# preliminary TEST for TEST: get abstract 
r = requests.get(issues_link_ls[0])
data = r.text
soup = BeautifulSoup(data, "html.parser")
soup_abstract = soup.find(id = "abstract-body")
print(soup_abstract.text)
'''

In [None]:
'''
# TEST for getting the abstract from each issues link for the first journal
abstract_ls = []
for url in issues_link_ls:
    r = requests.get(url)
    data = r.text
    soup = BeautifulSoup(data, "html.parser")
    soup_abstract = soup.find(id = "abstract-body")
    abstract_ls.append(soup_abstract.text)
    # sleep for 1 second in between requests so webpage doesn't hate me too much
    time.sleep(1) 
abstract_ls
'''    


In [5]:
# PUTTING IT ALL TOGETHER

def get_abstracts(ls):
    ''' For each journal URL in ls, get the list of article URLs from the
        first page of that journal (via get_article_links). Then download
        each article page and save the text of its "abstract-body" element
        to a list.

        Parameters:
            ls: iterable of journal page URLs.

        Returns:
            List of abstract strings in the order visited. Article pages
            that have no "abstract-body" element are reported and skipped.
    '''
    abstract_ls = []
    art_count = 0
    for journ_count, journal_url in enumerate(ls, start=1):
        if journ_count > 1:
            # sleep in between journals so webpage doesn't hate me too much;
            # announced before the pause, and skipped after the last journal
            print("resting for a moment so webpage doesn't hate me too much")
            time.sleep(40)
        article_links_ls = get_article_links(journal_url)
        print("****** Processing Journal # " + str(journ_count) + " ******")
        for url in article_links_ls:
            art_count = art_count + 1
            print(str(art_count))
            # a timeout keeps one stalled connection from hanging the run
            r = requests.get(url, timeout=30)
            soup = BeautifulSoup(r.text, "html.parser")
            soup_abstract = soup.find(id="abstract-body")
            if soup_abstract is None:
                # find() returns None when the element is missing; skip the
                # page instead of losing everything collected so far
                print("no abstract-body element found, skipping " + url)
            else:
                abstract_ls.append(soup_abstract.text)
            time.sleep(.25)
    return abstract_ls
    

In [None]:
'''
    **************************************************************************
    Finding the top 10 or so Econ journal abstracts in terms of impact factor
    according to https://ideas.repec.org.
    
    Runtime: estimated to be 21 mins (1.5 minutes per journal, 14 journals including
    occasional links to the same journal at different times)
    **************************************************************************

note: output cleared for brevity.
'''
# Scrape every journal URL currently held in journ_ls; get_abstracts returns
# the collected abstract texts as a list.
abstract_ls = get_abstracts(journ_ls)

In [None]:
# Get rid of entries where abstracts are not available.
# A new list is built instead of calling remove() inside the loop: removing
# while iterating shifts the remaining items, so the element right after each
# removed one was skipped and back-to-back placeholders survived.
abstract_ls = [
    entry for entry in abstract_ls
    if entry != 'No abstract is available for this item.'
]

len(abstract_ls)

In [None]:
# Wrap the list of abstracts in a one-column DataFrame and preview it.
abstract_df = pd.DataFrame({"abstract": abstract_ls})
abstract_df.head()

In [None]:
# Save the top-journal abstracts to a CSV file (path is relative to the
# working directory). The index is written as well, since to_csv's index
# option is left at its default.
# NOTE(review): presumably raw_data_econ/ must already exist — confirm.
abstract_df.to_csv(r'raw_data_econ/topJournals_abstract.csv')

### Getting journals rank 11-20, in case we need it

In [29]:
''' Getting journals url, including same journals at multiple time 
periods as laid out in the webpage. '''
# Build the absolute URL for the anchors at positions 14..26 of soup_href,
# printing a 1-based counter next to every URL.
journ_ls = []
for number, entry in enumerate(range(14, 27), start=1):
    url = base_url + soup_href[entry]["href"]
    print(number, url)
    journ_ls.append(url)
    

1 https://ideas.repec.org/s/aea/aecrev.html
2 https://ideas.repec.org/s/bin/bpeajo.html
3 https://ideas.repec.org/s/tpr/restat.html
4 https://ideas.repec.org/s/aea/jecper.html
5 https://ideas.repec.org/s/oup/rfinst.html
6 https://ideas.repec.org/s/kap/jecgro.html
7 https://ideas.repec.org/s/eee/moneco.html
8 https://ideas.repec.org/s/eee/crcspp.html
9 https://ideas.repec.org/s/eee/jfinec.html
10 https://ideas.repec.org/s/anr/reveco.html
11 https://ideas.repec.org/s/now/fnteco.html
12 https://ideas.repec.org/s/wly/econjl.html
13 https://ideas.repec.org/s/ecj/econjl.html


In [None]:
'''
*********************************************
Getting journals rank 11-20 by impact factors over 10 years.

Estimated run time: 19.5 minutes (now set sleep to .25 seconds between articles
(primarily 200 articles per journal) and 40 seconds between journals (13 nondistinct))
note: output cleared for brevity.
*********************************************
'''
# Scrape every journal URL currently held in journ_ls; get_abstracts returns
# the collected abstract texts as a list.
abstract_ls = get_abstracts(journ_ls)

In [None]:
# Get rid of entries where abstracts are not available.
# A new list is built instead of calling remove() inside the loop: removing
# while iterating shifts the remaining items, so the element right after each
# removed one was skipped and back-to-back placeholders survived.
abstract_ls = [
    entry for entry in abstract_ls
    if entry != 'No abstract is available for this item.'
]

# len(abstract_ls)

In [32]:
# Wrap the list of abstracts in a one-column DataFrame and preview it.
abstract_df = pd.DataFrame({"abstract": abstract_ls})
abstract_df.head()

Unnamed: 0,abstract
0,How large are the benefits of transportation i...
1,Macro developments leading up to the 2008 cris...
2,"This paper provides a simple, yet robust frame..."
3,We establish the importance of team-specific c...
4,We document substantial increases in agricultu...


In [33]:
# Save the abstracts scraped for this batch of journals to a CSV file (path is
# relative to the working directory). The index is written as well, since
# to_csv's index option is left at its default.
# NOTE(review): presumably raw_data_econ/ must already exist — confirm.
abstract_df.to_csv(r'raw_data_econ/hiRankJournals_abstract.csv')

### Getting the "Not top journals" abstracts

In [7]:
'''
    For the journals not at the top, random sampling is used to generate a list of
    ranks between 20 and 100, with the 127th web scraped href being a link to rank 100. 
'''
# Fixed seed so the same ten positions are drawn on every run.
# randint's upper bound is exclusive, so the positions fall in 20..126.
# NOTE(review): positions 20-26 of soup_href were already used by the
# range(14, 27) cell, and the draw is with replacement, so values can repeat —
# confirm both are intended.
np.random.seed(42)
random_ranks = np.random.randint(low=20, high=127, size=10)
random_ranks

array([122,  71, 112,  34, 126,  91,  80,  40, 122, 102])

In [28]:
# Build the absolute URL for every sampled position in random_ranks,
# printing a 1-based counter next to every URL.
journ_ls = []
for number, entry in enumerate(random_ranks, start=1):
    url = base_url + soup_href[entry]["href"]
    print(number, url)
    journ_ls.append(url)
 

1 https://ideas.repec.org/s/bla/germec.html
2 https://ideas.repec.org/s/aea/aejmic.html
3 https://ideas.repec.org/s/ucp/ecdecc.html
4 https://ideas.repec.org/s/ucp/jlabec.html
5 https://ideas.repec.org/s/pia/review.html
6 https://ideas.repec.org/s/aen/journl.html
7 https://ideas.repec.org/s/cup/jfinqa.html
8 https://ideas.repec.org/s/mcb/jmoncb.html
9 https://ideas.repec.org/s/bla/germec.html
10 https://ideas.repec.org/s/ekn/ekonom.html


In [None]:
'''
*********************************************
Getting random sample of journals rank 20-100 by impact factors over 10 years.

Estimated run time: 19.5 minutes (now set sleep to .25 seconds between articles
(primarily 200 articles per journal) and 40 seconds between journals (13 nondistinct))
note: output cleared for brevity.
*********************************************
'''
# Scrape every journal URL currently held in journ_ls; get_abstracts returns
# the collected abstract texts as a list.
abstract_ls = get_abstracts(journ_ls)

In [68]:
# Get rid of entries where abstracts are not available, and cut the leading
# markup off entries that start with ' type="main"'.
# A new list is built for two reasons: calling remove() inside the loop skipped
# the element right after each removal, and `entry = entry[41:]` only rebound
# the loop variable, so the list itself was never changed.
# NOTE(review): 41 is the prefix length used by the original code — confirm it
# matches the markup that actually precedes the abstract text.
abstract_ls = [
    entry[41:] if entry.startswith(' type="main"') else entry
    for entry in abstract_ls
    if entry != 'No abstract is available for this item.'
]

len(abstract_ls)

1336

In [75]:
# Wrap the list of abstracts in a one-column DataFrame and preview it.
abstract_df = pd.DataFrame({"abstract": abstract_ls})
abstract_df.head()

Unnamed: 0,abstract
0,We conducted six treatments of a standard mora...
1,This paper analyzes blindfolded versus informe...
2,We analyze the drivers of the size of the audi...
3,Japan has been in a benign liquidity trap sinc...
4,This paper addresses tax loopholes that allow ...


In [76]:
# Get rid of beginnings which start with ' type="main"'.
# The rows yielded by iterrows() may be copies, so assigning to them is not
# guaranteed to change abstract_df; select the affected rows with a boolean
# mask and write back through .loc instead.
# NOTE(review): 41 is the prefix length used by the original code — confirm it
# matches the markup that actually precedes the abstract text.
has_markup_prefix = abstract_df["abstract"].str.startswith(' type="main"', na=False)
abstract_df.loc[has_markup_prefix, "abstract"] = (
    abstract_df.loc[has_markup_prefix, "abstract"].str[41:]
)

abstract_df.iloc[10]

abstract    We study how subjects in an experiment use dif...
Name: 10, dtype: object

In [78]:
# Save the abstracts of the randomly sampled journals to a CSV file (path is
# relative to the working directory). The index is written as well, since
# to_csv's index option is left at its default.
# NOTE(review): presumably raw_data_econ/ must already exist — confirm.
abstract_df.to_csv(r'raw_data_econ/notHiJournals_abstract.csv')

#### More data for not top rank journals

In [8]:
'''
    For the journals not at the top, random sampling is used to generate a list of
    ranks between 20 and 100, with the 127th web scraped href being a link to rank 100. 
'''
# Draw 20 more positions (upper bound exclusive, so 20..126) with a fixed seed.
np.random.seed(25)
random_ranks2 = np.random.randint(20, 127, 20)

# Discard positions that were already sampled in random_ranks. NumPy arrays
# have no .drop() method, so the original loop would raise AttributeError on
# the first overlap; a boolean mask keeps the remaining values in order.
# Repeated values within random_ranks2 itself are left as they are.
random_ranks2 = random_ranks2[~np.isin(random_ranks2, random_ranks)]

random_ranks2

array([ 24,  82, 110,  35,  81,  43,  64,  70,  28,  48,  24, 109,  51,
        89,  21,  59,  23, 108,  75,  23])

In [10]:
# Build the absolute URL for every sampled position in random_ranks2,
# printing a 1-based counter next to every URL.
journ_ls = []
for number, entry in enumerate(random_ranks2, start=1):
    url = base_url + soup_href[entry]["href"]
    print(number, url)
    journ_ls.append(url)
 

1 https://ideas.repec.org/s/now/fnteco.html
2 https://ideas.repec.org/s/ags/ajaeap.html
3 https://ideas.repec.org/s/rpo/leanco.html
4 https://ideas.repec.org/s/eee/jfinin.html
5 https://ideas.repec.org/s/eee/respol.html
6 https://ideas.repec.org/s/eee/pubeco.html
7 https://ideas.repec.org/s/eee/jimfin.html
8 https://ideas.repec.org/s/oup/wbecrv.html
9 https://ideas.repec.org/s/aea/aejpol.html
10 https://ideas.repec.org/s/ijc/ijcjou.html
11 https://ideas.repec.org/s/now/fnteco.html
12 https://ideas.repec.org/s/eee/enepol.html
13 https://ideas.repec.org/s/wly/quante.html
14 https://ideas.repec.org/s/spr/weltar.html
15 https://ideas.repec.org/s/eee/crcspp.html
16 https://ideas.repec.org/s/eee/jeeman.html
17 https://ideas.repec.org/s/anr/reveco.html
18 https://ideas.repec.org/s/now/jirere.html
19 https://ideas.repec.org/s/eee/finsta.html
20 https://ideas.repec.org/s/anr/reveco.html


In [None]:
'''
*********************************************
Getting random sample of journals rank 20-100 by impact factors over 10 years.

Estimated run time: 39 minutes (now set sleep to .25 seconds between articles
(primarily 200 articles per journal) and 40 seconds between journals (20 nondistinct))

*********************************************
'''
# Scrape every journal URL currently held in journ_ls; get_abstracts returns
# the collected abstract texts as a list.
abstract_ls = get_abstracts(journ_ls)

1 https://ideas.repec.org/a/now/fnteco/0800000033.html
2 https://ideas.repec.org/a/now/fnteco/0800000030.html
3 https://ideas.repec.org/a/now/fnteco/0800000026.html
4 https://ideas.repec.org/a/now/fnteco/0800000023.html
5 https://ideas.repec.org/a/now/fnteco/0800000028.html
6 https://ideas.repec.org/a/now/fnteco/0800000022.html
7 https://ideas.repec.org/a/now/fnteco/0800000019.html
8 https://ideas.repec.org/a/now/fnteco/0800000018.html
9 https://ideas.repec.org/a/now/fnteco/0800000017.html
10 https://ideas.repec.org/a/now/fnteco/0800000020.html
11 https://ideas.repec.org/a/now/fnteco/0800000011.html
12 https://ideas.repec.org/a/now/fnteco/0800000016.html
13 https://ideas.repec.org/a/now/fnteco/0800000014.html
14 https://ideas.repec.org/a/now/fnteco/0800000015.html
15 https://ideas.repec.org/a/now/fnteco/0800000013.html
16 https://ideas.repec.org/a/now/fnteco/0800000010.html
17 https://ideas.repec.org/a/now/fnteco/0800000002.html
18 https://ideas.repec.org/a/now/fnteco/0800000009.html
1

In [None]:
# Get rid of entries where abstracts are not available, and cut the leading
# markup off entries that start with ' type="main"'.
# A new list is built for two reasons: calling remove() inside the loop skipped
# the element right after each removal, and `entry = entry[41:]` only rebound
# the loop variable, so the list itself was never changed.
# NOTE(review): 41 is the prefix length used by the original code — confirm it
# matches the markup that actually precedes the abstract text.
abstract_ls = [
    entry[41:] if entry.startswith(' type="main"') else entry
    for entry in abstract_ls
    if entry != 'No abstract is available for this item.'
]

len(abstract_ls)