In [108]:
import os
from tqdm import tqdm
from bs4 import BeautifulSoup
import re
import json
from concurrent.futures import ThreadPoolExecutor
from multiprocessing import Pool, cpu_count

In [132]:
'''

def extract_related_journal_article_link(soup):
    related_journal_article_link = None
    related_journal_article_block = soup.find(class_='widget hidden-print')
    print(related_journal_article_block)
    if related_journal_article_block:
        related_journal_article_link_tag = related_journal_article_block.find('a', rel='nofollow')
        if related_journal_article_link_tag:
            related_journal_article_link = related_journal_article_link_tag['href']
    return related_journal_article_link
'''

def extract_links(soup):
    
    original_source_link = None
    related_journal_article = None
    
    original_source_tag = soup.find('h4', text='Original Source')
    if original_source_tag:
        original_source_link = original_source_tag.find_next('a')['href']
    
    related_journal_article_tag = soup.find('h4', text='Related Journal Article')
    if related_journal_article_tag:
        related_journal_article = related_journal_article_tag.find_next('a')['href']
    
    return original_source_link, related_journal_article

def extract_php_file_data(file_path):
    try:
        with open(file_path, 'r') as php_file:
            php_code = php_file.read()
            soup = BeautifulSoup(php_code, 'html.parser')
            
            release_date = soup.find(class_='release_date').text.strip()
            page_title = soup.find(class_='page_title').text.strip()
            meta_institute = soup.find(class_='meta_institute').text.strip()
            
            media_contact_div = soup.find(class_='contact-info')
            media_contact = extract_media_contact(media_contact_div)
            
            keywords_ul = soup.find(class_='tags')
            keywords = extract_keywords(keywords_ul)
            
            source_link, doi_link = extract_links(soup)
            
            full_text = clean_text(soup.get_text())
            
            return {
                'release_date': release_date,
                'page_title': page_title,
                'meta_institute': meta_institute,
                'media_contact': media_contact,
                'keywords': keywords,
                'full_text': full_text,
                'original_source': source_link,
                'related_journal_article_link': doi_link
            }
    except Exception as e:
        #print(f"An error occurred while parsing {file_path}: {str(e)}")
        return None

def extract_media_contact(media_contact_div):
    contact_info = {}
    if media_contact_div:
        strong_tag = media_contact_div.find('strong', string='Media Contact')
        
        if strong_tag:
            paragraphs = media_contact_div.find_all('p')
            if paragraphs:
                contact_info['name'] = paragraphs[1].get_text().strip().split('\n')[0]
                email_link = paragraphs[1].find('a', href=re.compile(r'mailto:'))
                if email_link:
                    contact_info['email'] = email_link['href'].replace('mailto:', '').strip()
                phone_match = re.match(r'(\d{2,}-\d{2,})', paragraphs[1].get_text().strip())
                if phone_match:
                    contact_info['phone'] = phone_match.group(1)
                twitter_link = paragraphs[1].find('a', href=re.compile(r'twitter\.com'))
                if twitter_link:
                    contact_info['twitter'] = twitter_link.get_text().replace('@', '').strip()
                website_link = paragraphs[-1].find('a', href=re.compile(r'http'))
                if website_link:
                    contact_info['website'] = website_link['href'].strip()
    return contact_info
    
#def extract_media_contact(media_contact_div):
#    if media_contact_div:
#        media_contact_text = media_contact_div.get_text()
#        return media_contact_text.strip()
#    return None

def extract_keywords(keywords_ul):
    if keywords_ul:
        keywords = [keyword.get_text() for keyword in keywords_ul.find_all('a')]
        return keywords
    return []

def clean_text(text):
    # Replace multiple consecutive newline characters with one newline character using regex
    cleaned_text = re.sub(r'\n+', '\n', text)
    return cleaned_text.strip()

def save_to_jsonl(parsed_data, output_file):
    with open(output_file, 'w') as jsonl_file:
        for data in parsed_data:
            jsonl_file.write(json.dumps(data) + '\n')

In [124]:
https://archive.eurekalert.org/pub_releases/2018-09/hu-mts090418.php

SyntaxError: invalid syntax (<ipython-input-124-9ffec3e89e8c>, line 1)

In [133]:
file_path = "/shared/3/projects/jiaxin/datasets/eurekalert/wget/archive.eurekalert.org/pub_releases/2018-09/hu-mts090418.php"
extract_php_file_data(file_path)

  original_source_tag = soup.find('h4', text='Original Source')
  related_journal_article_tag = soup.find('h4', text='Related Journal Article')


{'release_date': 'Public Release:\xa04-Sep-2018',
 'page_title': 'Methane to syngas catalyst: two for the price of one',
 'meta_institute': 'Hokkaido University',
 'media_contact': {'name': 'Naoki Namba',
  'twitter': 'hokkaidouni',
  'website': 'https://www.global.hokudai.ac.jp/'},
 'keywords': ['ATOMIC/MOLECULAR/PARTICLE PHYSICS',
  'CHEMISTRY/PHYSICS/MATERIALS SCIENCES',
  'CLIMATE CHANGE',
  'CLIMATE CHANGE',
  'ENERGY SOURCES',
  'ENERGY/FUEL (NON-PETROLEUM)',
  'MOLECULAR PHYSICS'],
 'full_text': 'Methane to syngas catalyst: two for the price of one | EurekAlert! Science News\n Skip to main content\nAdvanced Search\nHome\nCOVID-19\nNews Releases\nLatest News Releases\nNews By Subject \nAgriculture\nArchaeology\nAtmospheric Science\nBiology\nBusiness & Economics\nChemistry & Physics\nEarth Science\nEducation\nMathematics\nMedicine & Health\nPolicy & Ethics\nSocial & Behavioral\nSpace & Planetary\nTech & Engineering\nScience Business Announcements\nGrants, Awards, Books\nLanguages 

In [114]:
def process_directory(subdir_path):
    parsed_data = []
    for file in os.listdir(subdir_path):
        if file.endswith('.php'):
            file_path = os.path.join(subdir_path, file)
            data = extract_php_file_data(file_path)
            if data:
                parsed_data.append(data)
    
    output_file = output_file = subdir_path.replace('pub_releases', 'extracted') + '.jsonl'
    save_to_jsonl(parsed_data, output_file)
    print(len(parsed_data), 'lines saved to', output_file)

def process(base_directory):
    with Pool(processes=12) as pool:
        pool.map(process_directory, [os.path.join(base_directory, subdir) for subdir in os.listdir(base_directory) if os.path.isdir(os.path.join(base_directory, subdir))])

In [115]:
directory_path = '/shared/3/projects/jiaxin/datasets/eurekalert/wget/archive.eurekalert.org/pub_releases/'
process(directory_path)

An error occurred while parsing /shared/3/projects/jiaxin/datasets/eurekalert/wget/archive.eurekalert.org/pub_releases/2000-09/AAft-Cmit-1109100.php: 'NoneType' object has no attribute 'text'
An error occurred while parsing /shared/3/projects/jiaxin/datasets/eurekalert/wget/archive.eurekalert.org/pub_releases/2000-09/AAft-Gmff-1109100.php: 'NoneType' object has no attribute 'text'
An error occurred while parsing /shared/3/projects/jiaxin/datasets/eurekalert/wget/archive.eurekalert.org/pub_releases/2000-09/AAft-Pcfo-2809100.php: 'NoneType' object has no attribute 'text'
An error occurred while parsing /shared/3/projects/jiaxin/datasets/eurekalert/wget/archive.eurekalert.org/pub_releases/1999-07/AFMP-BbAm-170799.php: 'NoneType' object has no attribute 'text'
An error occurred while parsing /shared/3/projects/jiaxin/datasets/eurekalert/wget/archive.eurekalert.org/pub_releases/2000-09/AAft-TeiU-1909100.php: 'NoneType' object has no attribute 'text'
An error occurred while parsing /shared/3

Process ForkPoolWorker-22:
Process ForkPoolWorker-21:
Process ForkPoolWorker-13:
Process ForkPoolWorker-23:
Process ForkPoolWorker-15:
Process ForkPoolWorker-24:
Process ForkPoolWorker-20:
Process ForkPoolWorker-16:


KeyboardInterrupt: 

In [95]:
# Usage
directory_path = '/shared/3/projects/jiaxin/datasets/eurekalert/wget/archive.eurekalert.org/pub_releases/'
save_dir = '/shared/3/projects/jiaxin/datasets/eurekalert/wget/archive.eurekalert.org/extracted/'


filepaths = []
for subdir in tqdm(os.listdir(directory_path)):
    subdir_path = os.path.join(directory_path, subdir)
    if os.path.isdir(subdir_path):
        for file in os.listdir(subdir_path):
            if file.endswith('.php'):
                file_path = os.path.join(subdir_path, file)
                filepaths.append(file_path)
            
print('find %s valid files'%len(filepaths))

parsed_data = []
for file_path in tqdm(filepaths[:1000]):
    data = extract_php_file_data(file_path)
    if data:
        parsed_data.append(data)

save_to_jsonl(parsed_data, '/shared/3/projects/jiaxin/datasets/eurekalert/wget/archive.eurekalert.org/all_extracted.jsonl')
print(len(parsed_data), 'lines saved')

100%|██████████| 311/311 [00:01<00:00, 248.51it/s]


find 472821 valid files


  9%|▉         | 92/1000 [00:04<00:38, 23.71it/s]

An error occurred while parsing /shared/3/projects/jiaxin/datasets/eurekalert/wget/archive.eurekalert.org/pub_releases/1996-06/CUNS-TDOC-240696.php: 'utf-8' codec can't decode byte 0xad in position 1512: invalid start byte


 63%|██████▎   | 632/1000 [00:28<00:14, 25.80it/s]

An error occurred while parsing /shared/3/projects/jiaxin/datasets/eurekalert/wget/archive.eurekalert.org/pub_releases/1996-11/UoG-UOGS-081196.php: 'utf-8' codec can't decode byte 0x9e in position 41626: invalid start byte


100%|██████████| 1000/1000 [00:50<00:00, 19.74it/s]


998 lines saved


In [89]:
# Usage
directory_path = '/shared/3/projects/jiaxin/datasets/eurekalert/wget/archive.eurekalert.org/pub_releases/'
save_dir = '/shared/3/projects/jiaxin/datasets/eurekalert/wget/archive.eurekalert.org/extracted/'


for root, _, files in tqdm(os.walk(directory_path)):
    print(root)
    parsed_data = []
    for file in files:
        if file.endswith('.php'):
            file_path = os.path.join(root, file)
            data = extract_php_file_data(file_path)
            if data:
                parsed_data.append(data)
    if len(parsed_data) > 0:
        save_to_jsonl(parsed_data, root.replace('pub_releases', 'extracted') + '.jsonl')
        print(len(parsed_data), 'lines saved')



0it [00:00, ?it/s]

/shared/3/projects/jiaxin/datasets/eurekalert/wget/archive.eurekalert.org/pub_releases/
/shared/3/projects/jiaxin/datasets/eurekalert/wget/archive.eurekalert.org/pub_releases/1969-12


2it [00:00,  3.91it/s]

10 lines saved
/shared/3/projects/jiaxin/datasets/eurekalert/wget/archive.eurekalert.org/pub_releases/1996-01


3it [00:00,  4.06it/s]

4 lines saved
/shared/3/projects/jiaxin/datasets/eurekalert/wget/archive.eurekalert.org/pub_releases/1996-03
1 lines saved
/shared/3/projects/jiaxin/datasets/eurekalert/wget/archive.eurekalert.org/pub_releases/1996-04


5it [00:01,  2.36it/s]

27 lines saved
/shared/3/projects/jiaxin/datasets/eurekalert/wget/archive.eurekalert.org/pub_releases/1996-05


6it [00:03,  1.37it/s]

38 lines saved
/shared/3/projects/jiaxin/datasets/eurekalert/wget/archive.eurekalert.org/pub_releases/1996-06
An error occurred while parsing /shared/3/projects/jiaxin/datasets/eurekalert/wget/archive.eurekalert.org/pub_releases/1996-06/CUNS-TDOC-240696.php: 'utf-8' codec can't decode byte 0xad in position 1512: invalid start byte


7it [00:05,  1.15s/it]

54 lines saved
/shared/3/projects/jiaxin/datasets/eurekalert/wget/archive.eurekalert.org/pub_releases/1996-07


8it [00:09,  1.83s/it]

87 lines saved
/shared/3/projects/jiaxin/datasets/eurekalert/wget/archive.eurekalert.org/pub_releases/1996-08


9it [00:12,  2.29s/it]

84 lines saved
/shared/3/projects/jiaxin/datasets/eurekalert/wget/archive.eurekalert.org/pub_releases/1996-09


10it [00:15,  2.62s/it]

84 lines saved
/shared/3/projects/jiaxin/datasets/eurekalert/wget/archive.eurekalert.org/pub_releases/1996-10


11it [00:20,  3.34s/it]

127 lines saved
/shared/3/projects/jiaxin/datasets/eurekalert/wget/archive.eurekalert.org/pub_releases/1996-11
An error occurred while parsing /shared/3/projects/jiaxin/datasets/eurekalert/wget/archive.eurekalert.org/pub_releases/1996-11/UoG-UOGS-081196.php: 'utf-8' codec can't decode byte 0x9e in position 41626: invalid start byte


12it [00:26,  4.06s/it]

145 lines saved
/shared/3/projects/jiaxin/datasets/eurekalert/wget/archive.eurekalert.org/pub_releases/1996-12


12it [00:27,  2.29s/it]


KeyboardInterrupt: 