In [62]:
import os
import pandas as pd
from tqdm import tqdm
import numpy as np

In [7]:
def load_jsonl_folder(folder_path):
    # Get a list of all files in the folder with the .jsonl extension
    jsonl_files = [file for file in os.listdir(folder_path) if file.endswith('.jsonl')]
    
    # Initialize an empty list to store the data from all JSONL files
    all_data = []
    
    # Loop through each JSONL file and load its contents
    for file_name in tqdm(jsonl_files):
        file_path = os.path.join(folder_path, file_name)
        with open(file_path, 'r') as file:
            lines = file.readlines()
            for line in lines:
                try:
                    data = pd.read_json(line, lines=True)
                    all_data.append(data)
                except:
                    print(file_path)
    
    # Concatenate all loaded data into a single DataFrame
    df = pd.concat(all_data, ignore_index=True)
    
    return df

def process_eurek(df):
    df = pd.concat([df, pd.DataFrame().from_records(list(df.media_contact))] , axis=1)
    df['from_university'] = [str(it)[-4:]=='.edu' for it in list(df['email'])]
    df['release_year'] = [str(it)[-4:] for it in df.release_date]
    df['doi'] = [str(it)[18:] if len(str(it))>4 else np.nan for it in tqdm(df['related_journal_article_link'])]
    
    return df

In [32]:
# Replace 'path_to_folder' with the actual path to your folder containing JSONL files
folder_path = '/shared/3/projects/jiaxin/datasets/eurekalert/wget/archive.eurekalert.org/extracted4match/'
dataframe = load_jsonl_folder(folder_path)

 28%|██▊       | 30/107 [02:49<06:59,  5.45s/it]

/shared/3/projects/jiaxin/datasets/eurekalert/wget/archive.eurekalert.org/extracted4match/2014-06.jsonl


100%|██████████| 107/107 [11:46<00:00,  6.60s/it]


In [44]:
df = dataframe

In [45]:
len(df)

261542

In [47]:
1

1

In [None]:
df.media_contact.value_counts()

In [68]:
df = pd.concat([df, pd.DataFrame().from_records(list(df.media_contact))] , axis=1)

In [69]:
df['from_university'] = [str(it)[-4:]=='.edu' for it in list(df['email'])]

In [70]:
df['release_year'] = [str(it)[-4:] for it in df.release_date]

In [63]:
df['doi'] = [str(it)[18:] if len(str(it))>4 else np.nan for it in tqdm(df['related_journal_article_link'])]



  0%|          | 0/261542 [00:00<?, ?it/s][A[A

 34%|███▎      | 87916/261542 [00:00<00:00, 879110.54it/s][A[A

100%|██████████| 261542/261542 [00:00<00:00, 870539.35it/s][A[A


In [64]:
df['doi']

0                                  NaN
1                                  NaN
2                                  NaN
3                                  NaN
4                                  NaN
                      ...             
261537                             NaN
261538                             NaN
261539                             NaN
261540    10.1371/journal.pone.0252373
261541                             NaN
Name: doi, Length: 261542, dtype: object

In [78]:
dois = set(df['doi'])
len(dois)

93567

In [71]:
df.drop_duplicates('email').from_university.value_counts()

from_university
False    29039
True     10743
Name: count, dtype: int64

In [77]:
df.drop_duplicates('email').meta_institute.value_counts()[:40]

meta_institute
Proceedings of the National Academy of Sciences        1827
PLOS                                                   1340
The JAMA Network Journals                               960
Pensoft Publishers                                      704
                                                        663
Science China Press                                     616
Oregon State University                                 484
Ohio State University                                   451
Aarhus University                                       409
University of Wisconsin-Madison                         388
University of Gothenburg                                376
Max-Planck-Gesellschaft                                 366
University of Helsinki                                  356
Lund University                                         346
Washington State University                             343
The Lancet                                              330
ETH Zurich               

In [80]:
t_df = df.drop_duplicates('email')

In [84]:
t_df[t_df.meta_institute == 'University of Wisconsin-Madison']

Unnamed: 0,release_date,page_title,meta_institute,media_contact,keywords,full_text,original_source,related_journal_article_link,doi,name,email,website,twitter,phone,from_university,release_year
1783,Public Release: 16-Jan-2014,Renewable chemical ready for biofuels scale-up,University of Wisconsin-Madison,"{'name': 'Jeremy Luterbacher', 'email': 'luter...",[BIOMEDICAL/ENVIRONMENTAL/CHEMICAL ENGINEERING...,Renewable chemical ready for biofuels scale-up...,,,,Jeremy Luterbacher,luterbacher@wisc.edu,http://www.wisc.edu,UWMadScience,,True,2014
1785,Public Release: 9-Jan-2014,SHY hypothesis explains that sleep is the pric...,University of Wisconsin-Madison,"{'name': 'Susan Lampert Smith', 'email': 'ssmi...","[MEDICINE/HEALTH, NEUROBIOLOGY, SLEEP/SLEEP DI...",SHY hypothesis explains that sleep is the pric...,,,,Susan Lampert Smith,ssmith5@uwhealth.org,http://www.wisc.edu,UWMadScience,,False,2014
1789,Public Release: 6-Jan-2014,UW-Madison researchers link protein with breas...,University of Wisconsin-Madison,"{'name': 'Lisa Brunette', 'email': 'lbrunette@...","[BREAST CANCER, CANCER]",UW-Madison researchers link protein with breas...,,,,Lisa Brunette,lbrunette@uwhealth.org,http://www.wisc.edu,UWMadScience,,False,2014
1790,Public Release: 23-Jan-2014,Wisconsin researchers identify key pathway for...,University of Wisconsin-Madison,"{'name': 'Mike Sussman', 'email': 'msussman@wi...","[AGRICULTURAL PRODUCTION/ECONOMICS, AGRICULTUR...",Wisconsin researchers identify key pathway for...,,,,Mike Sussman,msussman@wisc.edu,http://www.wisc.edu,UWMadScience,,True,2014
3659,Public Release: 24-Jan-2013,Chance finding reveals new control on blood ve...,University of Wisconsin-Madison,"{'name': 'Zhen Huang', 'email': 'z.huang@neuro...","[CELL BIOLOGY, MOLECULAR BIOLOGY, NEUROBIOLOGY...",Chance finding reveals new control on blood ve...,,,,Zhen Huang,z.huang@neurology.wisc.edu,http://www.wisc.edu,UWMadScience,,True,2013
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257816,News Release 10-Mar-2021,IceCube detection of high-energy particle prov...,University of Wisconsin-Madison,"{'name': 'Lu Lu', 'email': 'lu.lu@icecube.wisc...","[ASTRONOMY, ASTROPHYSICS, ASTROPHYSICS, ATOMIC...",IceCube detection of high-energy particle prov...,,,,Lu Lu,lu.lu@icecube.wisc.edu,http://www.wisc.edu,UWMadScience,,True,2021
257820,News Release 18-Mar-2021,New method targets disease-causing proteins fo...,University of Wisconsin-Madison,"{'name': 'Weiping Tang', 'email': 'weiping.tan...","[BIOCHEMISTRY, BIOLOGY, CHEMISTRY/PHYSICS/MATE...",New method targets disease-causing proteins fo...,https://news.wisc.edu/new-method-targets-disea...,,,Weiping Tang,weiping.tang@wisc.edu,http://www.wisc.edu,UWMadScience,,True,2021
261331,News Release 16-Jun-2021,Correlated errors in quantum computers emphasi...,University of Wisconsin-Madison,"{'name': 'Robert McDermott', 'email': 'rfmcder...","[CHEMISTRY/PHYSICS/MATERIALS SCIENCES, COMPUTE...",Correlated errors in quantum computers emphasi...,,,,Robert McDermott,rfmcdermott@wisc.edu,http://www.wisc.edu,UWMadScience,,True,2021
261334,News Release 29-Jun-2021,Counties with state prisons had 11% more first...,University of Wisconsin-Madison,"{'name': 'Jeremy Foltz', 'email': 'jdfoltz@wis...","[INFECTIOUS/EMERGING DISEASES, MEDICINE/HEALTH...",Counties with state prisons had 11% more first...,,,,Jeremy Foltz,jdfoltz@wisc.edu,http://www.wisc.edu,UWMadScience,,True,2021


In [40]:
df[~df['related_journal_article_link'].isna()]

Unnamed: 0,release_date,page_title,meta_institute,media_contact,keywords,full_text,original_source,related_journal_article_link,name,email,website,twitter,phone,from_university,release_year
19375,Public Release: 30-Jul-2015,North America's salamanders at risk of epidemi...,American Association for the Advancement of Sc...,"{'name': 'Natasha Pinol', 'email': 'scipak@aaa...","[BIODIVERSITY, BIOLOGY, ECOLOGY/ENVIRONMENT, G...",North America's salamanders at risk of epidemi...,,http://dx.doi.org/10.1126/science.aab1052,Natasha Pinol,scipak@aaas.org,http://www.aaas.org,AAAS,,False,2015
24093,Public Release: 8-Oct-2015,Ben-Gurion U. and MIT researchers develop rapi...,"American Associates, Ben-Gurion University of ...","{'name': 'andrew Lavin', 'email': 'andrewlavin...","[AGRICULTURE, CHEMICAL/BIOLOGICAL WEAPONS, DIA...",Ben-Gurion U. and MIT researchers develop rapi...,,http://dx.doi.org/10.1007/s11270-015-2560-x,andrew Lavin,andrewlavin@alavin.com,http://www.aabgu.org,,,False,2015
24102,Public Release: 15-Oct-2015,Artificial 'skin' could provide prosthetics wi...,American Association for the Advancement of Sc...,"{'name': 'Natasha Pinol', 'email': 'scipak@aaa...",[MEDICINE/HEALTH],Artificial 'skin' could provide prosthetics wi...,,http://dx.doi.org/10.1126/science.aaa9306,Natasha Pinol,scipak@aaas.org,http://www.aaas.org,AAAS,,False,2015
24103,Public Release: 29-Oct-2015,An enhanced lithium-air battery,American Association for the Advancement of Sc...,"{'name': 'Natasha Pinol', 'email': 'scipak@aaa...","[CHEMISTRY/PHYSICS/MATERIALS SCIENCES, ENERGY/...",An enhanced lithium-air battery | EurekAlert! ...,,http://dx.doi.org/10.1126/science.aac7730,Natasha Pinol,scipak@aaas.org,http://www.aaas.org,AAAS,,False,2015
24106,Public Release: 8-Oct-2015,Evidence for long-lasting lakes on Mars,American Association for the Advancement of Sc...,"{'name': 'Natasha Pinol', 'email': 'scipak@aaa...","[EXPERIMENTS IN SPACE, PLANETS/MOONS, SATELLIT...",Evidence for long-lasting lakes on Mars | Eure...,,http://dx.doi.org/10.1126/science.aac7575,Natasha Pinol,scipak@aaas.org,http://www.aaas.org,AAAS,,False,2015
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
261533,News Release 3-Jun-2021,Yale-NUS College scientist discovers how leafb...,Yale-NUS College,"{'name': 'Joshua Wong', 'email': 'joshua.wong@...","[BIOLOGY, BIOMECHANICS/BIOPHYSICS, BIOTECHNOLO...",Yale-NUS College scientist discovers how leafb...,https://www.yale-nus.edu.sg/newsroom/yale-nus-...,http://dx.doi.org/10.1073/pnas.2101357118,Joshua Wong,joshua.wong@yale-nus.edu.sg,http://www.yale-nus.edu.sg/,yalenus,,False,2021
261534,News Release 10-Jun-2021,New family of atomic-thin electride materials ...,Yokohama National University,"{'name': 'Akiko Tsumura', 'email': 'kenkyu-koh...","[CHEMISTRY/PHYSICS/MATERIALS SCIENCES, MATERIALS]",New family of atomic-thin electride materials ...,,http://dx.doi.org/10.1002/adfm.202100009,Akiko Tsumura,kenkyu-koho@ynu.ac.jp,,,,False,2021
261535,News Release 8-Jun-2021,Tree diversity may save the forest: Advocating...,Yokohama National University,"{'name': 'Akiko Tsumura', 'email': 'kenkyu-koh...","[ATMOSPHERIC SCIENCE, CLIMATE CHANGE, CLIMATE ...",Tree diversity may save the forest: Advocating...,,http://dx.doi.org/10.1038/s41558-021-01062-1,Akiko Tsumura,kenkyu-koho@ynu.ac.jp,,,,False,2021
261536,News Release 3-Jun-2021,Wearable accelerometer and vibrator 'thimble' ...,Yokohama National University,"{'name': 'Akiko Tsumura', 'email': 'kenkyu-koh...",[BIOMEDICAL/ENVIRONMENTAL/CHEMICAL ENGINEERING...,Wearable accelerometer and vibrator 'thimble' ...,,http://dx.doi.org/10.1038/s41598-021-85687-4,Akiko Tsumura,kenkyu-koho@ynu.ac.jp,,,,False,2021


In [55]:
s_df = pd.read_csv('/shared/2/projects/jiaxin/media_ethnicity/data/reg_data_single_full.csv')

  0%|          | 0/261542 [02:53<?, ?it/s]


In [56]:
s_df[s_df['doi'].isin(dois)]

Unnamed: 0,has_indirect_mention,mentions_first_author_institution,has_first_author_quote,url,doi,first_aut_name,first_aut_last_name,first_fname_length,first_fname_prob,eth_first_author_ethnea,...,DNA_sequencing,Amino_acid,text_200,hedging_ratio,text_freq,text_all,hedging_word_cnt,normalized_hedging_word_cnt,6hedging_word_cnt,normalized_6hedging_word_cnt
6,False,False,True,http://ct.moreover.com/?a=34465483750&p=1pl&v=...,10.1038/s41467-018-04959-2,Thomas B. Hildebrandt,Hildebrandt,11,0.615186,GERMAN,...,0.0,0.0,Scientists Hope Lab-Grown Embryos Can Save Rhi...,0.454545,1,Scientists Hope Lab-Grown Embryos Can Save Rhi...,7.0,0.636364,0.0,0.0
7,False,False,False,http://ct.moreover.com/?a=34466001587&p=1pl&v=...,10.1038/s41467-018-04959-2,Thomas B. Hildebrandt,Hildebrandt,11,0.615186,GERMAN,...,0.0,0.0,The northern white rhinoceros (NWR) is the wor...,0.454545,1,The northern white rhinoceros (NWR) is the wor...,6.0,0.545455,0.0,0.0
8,False,False,True,http://ct.moreover.com/?a=34465618161&p=1pl&v=...,10.1038/s41467-018-04959-2,Thomas B. Hildebrandt,Hildebrandt,11,0.615186,GERMAN,...,0.0,0.0,By Kate Kelland\n\nLONDON (Reuters) - Scientis...,0.300000,1,By Kate Kelland\n\nLONDON (Reuters) - Scientis...,4.0,0.400000,0.0,0.0
9,False,False,False,http://ct.moreover.com/?a=34469101546&p=1pl&v=...,10.1038/s41467-018-04959-2,Thomas B. Hildebrandt,Hildebrandt,11,0.615186,GERMAN,...,0.0,0.0,An international team of zoologists and reprod...,0.250000,1,An international team of zoologists and reprod...,1.0,0.250000,0.0,0.0
10,False,False,True,http://ct.moreover.com/?a=34471303900&p=1pl&v=...,10.1038/s41467-018-04959-2,Thomas B. Hildebrandt,Hildebrandt,11,0.615186,GERMAN,...,0.0,0.0,The northern white rhino may be functionally e...,0.318182,1,The northern white rhino may be functionally e...,10.0,0.454545,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133681,False,False,False,http://ct.moreover.com/?a=34510571345&p=1pl&v=...,10.1063/1.5037074,S. Mironov,Mironov,7,-2.302585,SLAV,...,0.0,0.0,"Newswise -- WASHINGTON, D.C., July 10, 2018 --...",0.500000,1,"Newswise -- WASHINGTON, D.C., July 10, 2018 --...",8.0,0.800000,0.0,0.0
133682,False,True,True,http://ct.moreover.com/?a=37358100162&p=1pl&v=...,10.1016/j.fusengdes.2018.09.007,A.Q. Kuang,Kuang,5,0.246860,CHINESE,...,0.0,0.0,A new path to solving a longstanding fusion ch...,0.176471,1,A new path to solving a longstanding fusion ch...,11.0,0.323529,0.0,0.0
133683,False,False,True,http://ct.moreover.com/?a=37344630237&p=1pl&v=...,10.1016/j.fusengdes.2018.09.007,A.Q. Kuang,Kuang,5,0.246860,CHINESE,...,0.0,0.0,"A class exercise at MIT, aided by industry res...",0.461538,1,"A class exercise at MIT, aided by industry res...",11.0,0.846154,0.0,0.0
133684,False,False,True,http://feedproxy.google.com/~r/TechnologyOrg/~...,10.1016/j.fusengdes.2018.09.007,A.Q. Kuang,Kuang,5,0.246860,CHINESE,...,0.0,0.0,"A class exercise at MIT, aided by industry res...",0.428571,1,"A class exercise at MIT, aided by industry res...",11.0,0.785714,0.0,0.0


In [66]:
df.to_csv('/shared/3/projects/jiaxin/datasets/eurekalert/wget/archive.eurekalert.org/extracted_since_2013.csv',index=False)

  0%|          | 0/261542 [01:26<?, ?it/s]
