In [1]:
# For timing
from math import floor
try:
    # time.clock was removed in Python 3.8; fall back to perf_counter there.
    from time import clock
except ImportError:
    from time import perf_counter as clock
from time import perf_counter

import pandas as pd
import textstat

from text_processing import get_full_text_by_pmcid

In [2]:
# Index of all entries: load the table stored in 'PMC-ids.csv'.
# Later cells join this frame to the other tables on its 'PMCID' column.
df = pd.read_csv('PMC-ids.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Raw list of retracted entries; the txt file is produced by a Bash script
# that is not part of this notebook.
# header=None: do not treat the first line as a header row.
# skip_blank_lines=False: blank lines are kept and read as NaN rows.
# NOTE(review): clean() calls .split() on every value, which would fail on
# NaN -- confirm the file contains no blank lines.
retracted_df = pd.read_csv('pubmed_retracted_pmcids.txt', header=None, skip_blank_lines=False)

In [4]:
def clean(pmcid):
    """Extract the bare identifier from one raw line of the retracted list.

    The second whitespace-separated token of ``pmcid`` is taken, and anything
    from the first ';' onwards in that token is discarded.
    (Presumably the raw lines look like '<label> <PMCID>;<rest>' -- verify
    against the generated txt file.)

    Args:
        pmcid: Raw string holding at least two whitespace-separated tokens.

    Returns:
        The part of the second token that precedes the first ';'.

    Raises:
        IndexError: If ``pmcid`` has fewer than two whitespace-separated tokens.
    """
    second_token = pmcid.split()[1]
    identifier, _, _ = second_token.partition(';')
    return identifier

In [5]:
# Name the only column 'PMCID', then reduce every raw entry to its bare id.
retracted_df.columns = ['PMCID']
retracted_df['PMCID'] = retracted_df['PMCID'].map(clean)

In [6]:
# Open-access subset, loaded from 'oa_file_list.csv'.
oa_df = pd.read_csv('oa_file_list.csv')

oa_df.columns

# The id column is called 'Accession ID' in this file; call it 'PMCID' so it
# lines up with the join key used by the other tables.
oa_df = oa_df.rename({'Accession ID': 'PMCID'}, axis='columns')

In [7]:
# Keep only the articles present in BOTH the main PMC-id index and the
# open-access subset (inner join on the shared 'PMCID' column).
joined = pd.merge(df, oa_df, how='inner', on='PMCID')

In [8]:
# Open-access articles that also appear in the retracted list
# (inner join on 'PMCID').
oa_retracted_df = pd.merge(joined, retracted_df, how='inner', on='PMCID')

oa_retracted_df.shape

oa_retracted_df.head()

Unnamed: 0,Journal Title,ISSN,eISSN,Year,Volume,Issue,Page,DOI,PMCID,PMID_x,Manuscript Id,Release Date,File,Article Citation,Last Updated (YYYY-MM-DD HH:MM:SS),PMID_y,License
0,Mol Cancer,,1476-4598,2003,2,,28,10.1186/1476-4598-2-28,PMC184456,12935295.0,,live,oa_package/b9/6e/PMC184456.tar.gz,Mol Cancer. 2003 Aug 7; 2:28,2012-09-17 12:08:47,12935295.0,NO-CC CODE
1,Mol Cancer,,1476-4598,2003,2,,33,10.1186/1476-4598-2-33,PMC222989,14572313.0,,live,oa_package/18/49/PMC222989.tar.gz,Mol Cancer. 2003 Sep 23; 2:33,2012-09-17 12:08:47,14572313.0,NO-CC CODE
2,Reprod Biol Endocrinol,,1477-7827,2003,1,,125,10.1186/1477-7827-1-125,PMC317376,14678567.0,,live,oa_package/3f/83/PMC317376.tar.gz,Reprod Biol Endocrinol. 2003 Dec 16; 1:125,2015-04-17 20:12:16,14678567.0,NO-CC CODE
3,Biomed Eng Online,,1475-925X,2004,3,,13,10.1186/1475-925X-3-13,PMC419711,15125779.0,,live,oa_package/90/77/PMC419711.tar.gz,Biomed Eng Online. 2004 May 5; 3:13,2014-11-05 19:21:09,15125779.0,NO-CC CODE
4,BMC Evol Biol,,1471-2148,2004,4,,18,10.1186/1471-2148-4-18,PMC459214,15222900.0,,live,oa_package/c4/42/PMC459214.tar.gz,BMC Evol Biol. 2004 Jun 28; 4:18,2018-12-23 03:55:57,15222900.0,NO-CC CODE


In [9]:
# PMID_x / PMID_y are the two suffixed PMID columns left by the earlier merge.
# Verify that they really are duplicates BEFORE discarding one of them; the
# result of a bare comparison in the middle of a cell would be thrown away.
pmid_x = oa_retracted_df['PMID_x']
pmid_y = oa_retracted_df['PMID_y']
# NaN != NaN, so rows where both values are missing are counted as equal.
assert ((pmid_x == pmid_y) | (pmid_x.isna() & pmid_y.isna())).all(), \
    "PMID_x and PMID_y differ; refusing to drop PMID_y"

# Drop the redundant copy and give the survivor its plain name (PMID_x -> PMID).
oa_retracted_df = (
    oa_retracted_df
    .drop(columns=['PMID_y'])
    .rename(columns={'PMID_x': 'PMID'})
)
oa_retracted_df.head()

Unnamed: 0,Journal Title,ISSN,eISSN,Year,Volume,Issue,Page,DOI,PMCID,PMID,Manuscript Id,Release Date,File,Article Citation,Last Updated (YYYY-MM-DD HH:MM:SS),License
0,Mol Cancer,,1476-4598,2003,2,,28,10.1186/1476-4598-2-28,PMC184456,12935295.0,,live,oa_package/b9/6e/PMC184456.tar.gz,Mol Cancer. 2003 Aug 7; 2:28,2012-09-17 12:08:47,NO-CC CODE
1,Mol Cancer,,1476-4598,2003,2,,33,10.1186/1476-4598-2-33,PMC222989,14572313.0,,live,oa_package/18/49/PMC222989.tar.gz,Mol Cancer. 2003 Sep 23; 2:33,2012-09-17 12:08:47,NO-CC CODE
2,Reprod Biol Endocrinol,,1477-7827,2003,1,,125,10.1186/1477-7827-1-125,PMC317376,14678567.0,,live,oa_package/3f/83/PMC317376.tar.gz,Reprod Biol Endocrinol. 2003 Dec 16; 1:125,2015-04-17 20:12:16,NO-CC CODE
3,Biomed Eng Online,,1475-925X,2004,3,,13,10.1186/1475-925X-3-13,PMC419711,15125779.0,,live,oa_package/90/77/PMC419711.tar.gz,Biomed Eng Online. 2004 May 5; 3:13,2014-11-05 19:21:09,NO-CC CODE
4,BMC Evol Biol,,1471-2148,2004,4,,18,10.1186/1471-2148-4-18,PMC459214,15222900.0,,live,oa_package/c4/42/PMC459214.tar.gz,BMC Evol Biol. 2004 Jun 28; 4:18,2018-12-23 03:55:57,NO-CC CODE


In [10]:
# Write to file
# oa_retracted_df.to_csv('oa-retracted-table.csv', sep=',', encoding='utf-8')

In [11]:
# Outer join of the open-access table with the retracted list on 'PMCID'.
# indicator=True adds a `_merge` column recording which side(s) each row
# came from: 'left_only', 'right_only' or 'both'.
table = pd.merge(joined, retracted_df, how='outer', on='PMCID', indicator=True)

table.head()

# table[table['_merge'] == 'both'] # Retracted and in open access subset

# table[table['_merge'] == 'right_only'] # Retracted but not in open access subset

# table[table['_merge'] == 'left_only'] # Not retracted but in open access subset

Unnamed: 0,Journal Title,ISSN,eISSN,Year,Volume,Issue,Page,DOI,PMCID,PMID_x,Manuscript Id,Release Date,File,Article Citation,Last Updated (YYYY-MM-DD HH:MM:SS),PMID_y,License,_merge
0,Breast Cancer Res,1465-5411,1465-542X,2000,3,1,55,,PMC13900,11250746.0,,live,oa_package/08/e0/PMC13900.tar.gz,Breast Cancer Res. 2001 Nov 2; 3(1):55-60,2017-04-26 12:15:50,11250746.0,NO-CC CODE,left_only
1,Breast Cancer Res,1465-5411,1465-542X,2000,3,1,61,,PMC13901,11250747.0,,live,oa_package/b0/ac/PMC13901.tar.gz,Breast Cancer Res. 2001 Nov 9; 3(1):61-65,2016-01-20 10:58:46,11250747.0,NO-CC CODE,left_only
2,Breast Cancer Res,1465-5411,1465-542X,2000,3,1,66,,PMC13902,11250748.0,,live,oa_package/f7/98/PMC13902.tar.gz,Breast Cancer Res. 2001 Nov 8; 3(1):66-75,2006-02-02 19:37:52,11250748.0,NO-CC CODE,left_only
3,Breast Cancer Res,1465-5411,1465-542X,1999,2,1,59,10.1186/bcr29,PMC13911,11056684.0,,live,oa_package/9c/7f/PMC13911.tar.gz,Breast Cancer Res. 2000 Nov 16; 2(1):59-63,2013-03-17 14:00:52,11056684.0,NO-CC CODE,left_only
4,Breast Cancer Res,1465-5411,1465-542X,1999,2,1,64,,PMC13912,11400682.0,,live,oa_package/c6/fb/PMC13912.tar.gz,Breast Cancer Res. 2000 Dec 6; 2(1):64-72,2013-03-17 14:00:52,11400682.0,NO-CC CODE,left_only


In [12]:
# Remove the second, redundant copy of the PMID column.
table = table.drop(columns='PMID_y')

In [13]:
# Strip the merge suffix from the PMID column: PMID_x -> PMID.
table = table.rename({'PMID_x': 'PMID'}, axis='columns')

In [14]:
# Rows flagged 'right_only' exist in the retracted list but have no match in
# `joined`; discard them and keep everything else.
not_right_only = table['_merge'] != 'right_only'
table = table[not_right_only]

In [15]:
# Turn the merge indicator into a boolean flag: a row is retracted when it
# was found on both sides of the merge. The helper `_merge` column is then
# discarded.
# assign/drop build a new frame instead of writing into `table` in place;
# `table` may be a boolean-filtered slice, and assigning a column to such a
# slice triggers pandas' SettingWithCopyWarning.
table = (
    table
    .assign(Retracted=table['_merge'] == 'both')
    .drop(columns=['_merge'])
)

In [16]:
# Preview the first rows of the labelled table.
table.head()

Unnamed: 0,Journal Title,ISSN,eISSN,Year,Volume,Issue,Page,DOI,PMCID,PMID,Manuscript Id,Release Date,File,Article Citation,Last Updated (YYYY-MM-DD HH:MM:SS),License,Retracted
0,Breast Cancer Res,1465-5411,1465-542X,2000,3,1,55,,PMC13900,11250746.0,,live,oa_package/08/e0/PMC13900.tar.gz,Breast Cancer Res. 2001 Nov 2; 3(1):55-60,2017-04-26 12:15:50,NO-CC CODE,False
1,Breast Cancer Res,1465-5411,1465-542X,2000,3,1,61,,PMC13901,11250747.0,,live,oa_package/b0/ac/PMC13901.tar.gz,Breast Cancer Res. 2001 Nov 9; 3(1):61-65,2016-01-20 10:58:46,NO-CC CODE,False
2,Breast Cancer Res,1465-5411,1465-542X,2000,3,1,66,,PMC13902,11250748.0,,live,oa_package/f7/98/PMC13902.tar.gz,Breast Cancer Res. 2001 Nov 8; 3(1):66-75,2006-02-02 19:37:52,NO-CC CODE,False
3,Breast Cancer Res,1465-5411,1465-542X,1999,2,1,59,10.1186/bcr29,PMC13911,11056684.0,,live,oa_package/9c/7f/PMC13911.tar.gz,Breast Cancer Res. 2000 Nov 16; 2(1):59-63,2013-03-17 14:00:52,NO-CC CODE,False
4,Breast Cancer Res,1465-5411,1465-542X,1999,2,1,64,,PMC13912,11400682.0,,live,oa_package/c6/fb/PMC13912.tar.gz,Breast Cancer Res. 2000 Dec 6; 2(1):64-72,2013-03-17 14:00:52,NO-CC CODE,False


In [17]:
# table[(table.Retracted) & (table.ISSN.isnull())]
# Save the labelled table as comma-separated UTF-8, without the index column.
table.to_csv(
    'full-table.csv',
    sep=',',
    encoding='utf-8',
    index=False,
)

## Sampling with GroupBy

In [18]:
# Split the table on the boolean 'Retracted' flag.
is_retracted = table['Retracted']
retracted = table[is_retracted]
unretracted = table[~is_retracted]
print(unretracted.shape)

(2345880, 17)


In [19]:
# Restrict the control population to journals that contain at least one
# retracted article, so that both groups draw from the same journal titles.
retracted_journals = set(retracted['Journal Title'])
in_retracted_journal = unretracted['Journal Title'].isin(retracted_journals)
unretracted_pop = unretracted[in_retracted_journal]
unretracted_journals = set(unretracted_pop['Journal Title'])

In [20]:
# Sanity check: True means every journal with a retraction also has at least
# one unretracted article left in the filtered control population.
retracted_journals == unretracted_journals

True

In [21]:
# (rows, columns) of the control population restricted to journals that
# have at least one retraction.
unretracted_pop.shape

(1260979, 17)

In [22]:
# Spot check: this journal must be present in both journal sets.
all('Breast Cancer Res' in journals for journals in (unretracted_journals, retracted_journals))

True

In [23]:
# Per-journal grouping of the retracted articles.
rgb = retracted.groupby(by='Journal Title', as_index=False, group_keys=False) # population
# Journal title -> number of retracted articles in that journal. This groupby
# keeps the default as_index so the titles become the keys of the dict.
r_dict = retracted.groupby(by='Journal Title', group_keys=False)['Retracted'].count().to_dict()
# Per-journal grouping of the control population.
ugb = unretracted_pop.groupby(by='Journal Title', as_index=False, group_keys=False)

In [24]:
# Number of distinct journal titles among the retracted articles.
print(len(retracted_journals))

429


In [25]:
# Number of distinct journal titles in the filtered control population.
print(len(unretracted_journals))

429


In [26]:
# rgb.count()

In [27]:
# ugb.count()

In [28]:
# Fixed seed so the control sample is identical on every run of the notebook.
SAMPLE_SEED = 42


def custom_sample(df, random_state=None):
    """Draw as many rows from one journal's group as that journal has retractions.

    Args:
        df: The rows of a single journal; the journal is read from the
            'Journal Title' value of the first row.
        random_state: Seed (or random state) forwarded to ``DataFrame.sample``.
            The default ``None`` keeps the previous, unseeded behaviour.

    Returns:
        A DataFrame with ``r_dict[journal]`` rows sampled from ``df``.

    Raises:
        KeyError: If the journal title is not a key of ``r_dict``.
        ValueError: Raised by pandas when more rows are requested than ``df``
            holds (sampling is done without replacement).
    """
    name = df.iloc[0]['Journal Title']
    n = r_dict[name]
    return df.sample(n, random_state=random_state)


# Extra keyword arguments to GroupBy.apply are passed through to the function.
control_sample = ugb.apply(custom_sample, random_state=SAMPLE_SEED)

In [29]:
# Display the sampled control articles.
control_sample

Unnamed: 0,Journal Title,ISSN,eISSN,Year,Volume,Issue,Page,DOI,PMCID,PMID,Manuscript Id,Release Date,File,Article Citation,Last Updated (YYYY-MM-DD HH:MM:SS),License,Retracted
1053358,AIDS Care,0954-0121,1360-0451,2015,27,8,990,10.1080/09540121.2015.1020281,PMC4440621,25771903.0,,live,oa_package/5f/28/PMC4440621.tar.gz,AIDS Care. 2015 Aug 3; 27(8):9908-994,2015-12-23 21:43:06,CC BY,False
838515,ARYA Atheroscler,1735-3955,2251-6638,2014,10,1,46,,PMC4063513,24963314.0,,live,oa_package/9a/44/PMC4063513.tar.gz,ARYA Atheroscler. 2014 Jan; 10(1):46-54,2014-06-26 14:32:22,CC BY-NC,False
2045390,Acta Crystallogr D Struct Biol,,2059-7983,2018,74,Pt 2,132,10.1107/S2059798317009834,PMC5947777,29533239.0,,live,oa_package/90/fc/PMC5947777.tar.gz,Acta Crystallogr D Struct Biol. 2018 Feb 1; 74...,2018-05-15 20:08:27,CC BY,False
802598,Acta Crystallogr Sect E Struct Rep Online,,1600-5368,2014,70,Pt 4,o464,10.1107/S1600536814005972,PMC3998579,24826163.0,,live,oa_package/95/59/PMC3998579.tar.gz,Acta Crystallogr Sect E Struct Rep Online. 201...,2014-06-03 12:14:37,CC BY,False
281606,Acta Crystallogr Sect E Struct Rep Online,,1600-5368,2008,64,Pt 9,o1832,10.1107/S1600536808026913,PMC2960533,21201806.0,,live,oa_package/c3/75/PMC2960533.tar.gz,Acta Crystallogr Sect E Struct Rep Online. 200...,2018-10-12 13:36:48,CC BY,False
292723,Acta Crystallogr Sect E Struct Rep Online,,1600-5368,2009,66,Pt 1,e13,10.1107/S1600536809049976,PMC2980240,21579904.0,,live,oa_package/51/3b/PMC2980240.tar.gz,Acta Crystallogr Sect E Struct Rep Online. 200...,2018-12-31 09:55:56,NO-CC CODE,False
323127,Acta Crystallogr Sect E Struct Rep Online,,1600-5368,2010,67,Pt 1,o169,10.1107/S1600536810052086,PMC3050273,21522676.0,,live,oa_package/5c/ec/PMC3050273.tar.gz,Acta Crystallogr Sect E Struct Rep Online. 201...,2018-10-12 13:36:48,CC BY,False
342917,Acta Crystallogr Sect E Struct Rep Online,,1600-5368,2011,67,Pt 4,m493,10.1107/S1600536811010579,PMC3099758,21754003.0,,live,oa_package/2c/84/PMC3099758.tar.gz,Acta Crystallogr Sect E Struct Rep Online. 201...,2018-10-12 13:36:48,CC BY,False
343177,Acta Crystallogr Sect E Struct Rep Online,,1600-5368,2011,67,Pt 4,m486,10.1107/S1600536811009895,PMC3100018,21753997.0,,live,oa_package/e0/08/PMC3100018.tar.gz,Acta Crystallogr Sect E Struct Rep Online. 201...,2018-10-12 13:36:48,CC BY,False
947604,Acta Crystallogr Sect E Struct Rep Online,,1600-5368,2014,70,Pt 12,m399,10.1107/S1600536814024544,PMC4257430,25553009.0,,live,oa_package/a1/65/PMC4257430.tar.gz,Acta Crystallogr Sect E Struct Rep Online. 201...,2015-01-02 15:20:46,CC BY,False


In [30]:
# Data we are working with: the retracted articles followed by the sampled
# controls, with the original row labels kept.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and gives the same result here.
sample_df = pd.concat([retracted, control_sample])

In [31]:
# Get full text for every sampled article and report the elapsed time.
# perf_counter() replaces time.clock(), which was removed in Python 3.8 and
# reported CPU time rather than wall-clock time on Unix.
print('START')
start = perf_counter()
sample_df['Text'] = sample_df['PMCID'].map(get_full_text_by_pmcid)
stop = perf_counter()
# Split the elapsed seconds into whole minutes and the remaining seconds.
elapsed_minutes, elapsed_seconds = divmod(stop - start, 60)
print("Time elapsed =", floor(elapsed_minutes), "minute(s), and", round(elapsed_seconds, 1), "second(s)")

START
Time elapsed = 4 minute(s), and 4.6 second(s)


In [32]:
# Add gunning fog, takes ~2 minutes
# perf_counter() replaces time.clock(), which was removed in Python 3.8 and
# reported CPU time rather than wall-clock time on Unix.
print('START')
start = perf_counter()
sample_df['gunning_fog'] = sample_df['Text'].map(textstat.gunning_fog)
stop = perf_counter()
# Split the elapsed seconds into whole minutes and the remaining seconds.
elapsed_minutes, elapsed_seconds = divmod(stop - start, 60)
print("Time elapsed =", floor(elapsed_minutes), "minute(s), and", round(elapsed_seconds, 1), "second(s)")

START
Time elapsed = 2 minute(s), and 1.2 second(s)


In [33]:
# Save the assembled sample as comma-separated UTF-8, without the index column.
sample_df.to_csv(
    'full-sample.csv',
    sep=',',
    encoding='utf-8',
    index=False,
)

In [34]:
# Takes 16 seconds for 5, so ~1 hour for whole set.
# print('START')
# start = clock()
# lst = []
# for text in sample_df['Text'][0:5]:
# # for text in sample_df.text:
#     res = sentiment_analyzer_scores(text)
#     lst.append(res)
# stop = clock()
# print("Time elapsed =", floor ((stop-start)/60), "minute(s), and", round((stop-start)%60, 1), "second(s)")

In [35]:
# Second name for the same DataFrame object -- this is an alias, not a copy,
# so changes made through `tp` are also visible through `sample_df`.
tp = sample_df

In [37]:
# tmp = tp.iloc[0:5,].copy()
# print('START')
# start = clock()
# tmp[['pos', 'neg', 'neu']] = tmp.apply(lambda row: sentiment_analyzer_scores(row['Text']), axis=1)
# stop = clock()
# print("Time elapsed =", floor ((stop-start)/60), "minute(s), and", round((stop-start)%60, 1), "second(s)")