In [None]:
import time
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup 
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [None]:
def get_articles_name() :
    '''
    Collect the article titles shown on the page currently loaded in the
    module-level `driver`
    ----
    Args : - 
    Output : list of article's name (the text of every <div class="title"> element)
    '''
    title_xpath = "//div[@class='title']"

    # Block for up to 15 seconds until a title element is visible
    WebDriverWait(driver, 15).until(EC.visibility_of_element_located((By.XPATH, title_xpath)))

    return [title_element.text for title_element in driver.find_elements(By.XPATH, title_xpath)]

In [None]:
def get_articles_link(journal_link) :
    '''
    Extract article's link
    ----
    Downloads the page with `requests`, parses it with BeautifulSoup and keeps
    every <a href> that starts with the article URL prefix and is followed by
    a 4- or 5-character suffix (presumably the article id -- TODO confirm).

    Args : link of the journal
    Output : list of article's link (empty list when the page could not be retrieved)
    '''
    article_prefix = 'https://jurnal.ipb.ac.id/index.php/jikk/article/view/'

    # A timeout keeps the scraper from hanging forever on an unresponsive server
    response = requests.get(journal_link, timeout=30)

    # Anything other than status code 200 is treated as a failed retrieval
    if response.status_code != 200:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")
        return []

    # Parse the HTML content of the page
    parsed_html = BeautifulSoup(response.content, 'html.parser')

    articles_link = []
    for link in parsed_html.find_all('a') :
        href_link = link.get('href')
        # Anchors without an href attribute yield None; skip them instead of
        # crashing on len(None)
        if not href_link or not href_link.startswith(article_prefix) :
            continue
        # Same filter as a total URL length of 57 or 58 characters
        if len(href_link) - len(article_prefix) in (4, 5) :
            articles_link.append(href_link)
    return articles_link

In [None]:
def get_pdf_articles_link(journal_link) :
    '''
    Extract pdf's link of an article 
    ----
    Downloads the page with `requests`, parses it with BeautifulSoup and keeps
    every <a href> that starts with the article URL prefix and is followed by
    a suffix of 8-11 or 15-16 characters (presumably the
    "<article id>/<file id>" forms used for PDF links -- TODO confirm).

    Args : link of the journal
    Output : list of pdf's link (empty list when the page could not be retrieved)
    '''
    article_prefix = 'https://jurnal.ipb.ac.id/index.php/jikk/article/view/'

    # A timeout keeps the scraper from hanging forever on an unresponsive server
    response = requests.get(journal_link, timeout=30)

    # Anything other than status code 200 is treated as a failed retrieval
    if response.status_code != 200:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")
        return []

    # Parse the HTML content of the page
    parsed_html = BeautifulSoup(response.content, 'html.parser')

    pdf_articles_link = []
    for link in parsed_html.find_all('a') :
        href_link = link.get('href')
        # Anchors without an href attribute yield None; skip them instead of
        # crashing on len(None)
        if not href_link or not href_link.startswith(article_prefix) :
            continue
        # Same filter as a total URL length of 61-64 or 68-69 characters
        suffix_length = len(href_link) - len(article_prefix)
        if 8 <= suffix_length <= 11 or 15 <= suffix_length <= 16 :
            pdf_articles_link.append(href_link)
    return pdf_articles_link

In [None]:
def get_abstracts(articles_link) :
    '''
    Extract abstract from an article
    ----
    Starts its own Chrome session, visits every article page in turn and takes
    the second line of the text of the <div class="item abstract"> block (the
    first line is presumably the section heading -- TODO confirm). The browser
    is always closed before returning, even when a page fails to load.

    Args : link of the articles
    Output : list of abstracts, one per link and in the same order; an empty
             string is stored when the abstract block has no second line
    '''
    abstract_xpath = "//div[@class='item abstract']"
    abstracts = []

    driver = webdriver.Chrome()
    try:
        for article_link in articles_link :
            driver.get(article_link)
            # Block for up to 15 seconds until the abstract block is visible
            WebDriverWait(driver, 15).until(EC.visibility_of_element_located((By.XPATH, abstract_xpath)))
            abstract_lines = driver.find_element(By.XPATH, abstract_xpath).text.split("\n")
            # Keep the output aligned with the input links even when the block
            # holds a single line (indexing [1] directly would raise IndexError)
            abstracts.append(abstract_lines[1] if len(abstract_lines) > 1 else "")
    finally:
        # Every call opens a new browser; without quit() each call would leave
        # a Chrome session running
        driver.quit()

    return abstracts

In [None]:
def get_df(list_articles_name, list_pdf_articles, list_abstracts) :
    '''
    Assemble the scraped lists into one table
    ----
    Args : list article's name, list pdf's link of articles, list abstracts of articles
    Output : dataframe with the columns pdf_article_link, article_name and abstract (in that order)
    '''
    return pd.DataFrame({
        'pdf_article_link': list_pdf_articles,
        'article_name': list_articles_name,
        'abstract': list_abstracts,
    })

In [None]:
# Main Program
# Visit each archive page, open every issue listed on it and collect the
# article names, PDF links and abstracts of that issue. The per-issue tables
# are combined into `dataframe_full` at the end.

list_url = ['https://jurnal.ipb.ac.id/index.php/jikk/issue/archive','https://jurnal.ipb.ac.id/index.php/jikk/issue/archive/2']
issue_title_xpath = "//a[@class='title']"
archive_link_xpath = '//a[@href="https://jurnal.ipb.ac.id/index.php/jikk/issue/archive"]'

# One table per scraped issue. Created once, outside the URL loop, so the
# results of the first archive page are not thrown away when the second page
# is processed.
issue_frames = []

driver = webdriver.Chrome()

try:
    driver.maximize_window()

    for url in list_url:
        driver.get(url)

        WebDriverWait(driver, 60).until(EC.visibility_of_element_located((By.XPATH, issue_title_xpath)))

        issue_count = len(driver.find_elements(By.XPATH, issue_title_xpath))

        for i in range(issue_count):
            # Fixed 2 second pause before opening each issue
            time.sleep(2)

            # Look the issue links up again on every pass; presumably the
            # elements found earlier are no longer usable after navigating
            # away and back
            list_title = driver.find_elements(By.XPATH, issue_title_xpath)
            title = list_title[i]
            driver.execute_script("arguments[0].click();", title)

            list_articles_name = get_articles_name()

            current_url = driver.current_url
            list_articles_link = get_articles_link(current_url)
            list_pdf_articles = get_pdf_articles_link(current_url)
            list_abstracts = get_abstracts(list_articles_link)

            issue_frames.append(get_df(list_articles_name, list_pdf_articles, list_abstracts))

            # Wait for the archive link, go back one page in the browser
            # history, then wait for the archive link again
            WebDriverWait(driver, 60).until(EC.visibility_of_element_located((By.XPATH, archive_link_xpath)))
            driver.execute_script("window.history.go(-1);")
            WebDriverWait(driver, 60).until(EC.visibility_of_element_located((By.XPATH, archive_link_xpath)))

except Exception as e:
    print(f"An error occurred: {str(e)}")
finally:
    driver.quit()

# Built after the scrape so the name always exists for later cells, holding
# whatever was collected before a failure (or an empty table if nothing was).
# pd.concat raises on an empty list, hence the guard.
dataframe_full = pd.concat(issue_frames, ignore_index=True) if issue_frames else pd.DataFrame()

In [None]:
# Write the collected table to journal_scrape.csv without the row index column
dataframe_full.to_csv('journal_scrape.csv', index=False)