In [2]:
#import pandas, numpy, matplotlib, and seaborn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
#Not sure why this has to be encoded

df = pd.read_csv('WELLCOME_APCspend2013_forThinkful.csv', encoding = "ISO-8859-1")
df.head()

Unnamed: 0,PMID/PMCID,Publisher,Journal title,Article title,COST (£) charged to Wellcome (inc VAT when charged)
0,,CUP,Psychological Medicine,Reduced parahippocampal cortical thickness in ...,£0.00
1,PMC3679557,ACS,Biomacromolecules,Structural characterization of a Model Gram-ne...,£2381.04
2,23043264 PMC3506128,ACS,J Med Chem,"Fumaroylamino-4,5-epoxymorphinans and related ...",£642.56
3,23438330 PMC3646402,ACS,J Med Chem,Orvinols with mixed kappa/mu opioid receptor a...,£669.64
4,23438216 PMC3601604,ACS,J Org Chem,Regioselective opening of myo-inositol orthoes...,£685.88


In [4]:
#Take a closer look to further explore the data
df.describe()

Unnamed: 0,PMID/PMCID,Publisher,Journal title,Article title,COST (£) charged to Wellcome (inc VAT when charged)
count,1928,2127,2126,2127,2127
unique,1880,299,984,2126,1402
top,Not yet available,Elsevier,PLoS One,"Exclusive breastfeeding, diarrhoel morbidity a...",£2040.00
freq,7,387,92,2,94


In [5]:
#Let's see what this might look like
df.groupby('Journal title')['Article title'].count().sort_values(ascending = False)

#casing and spacing seem to be an issue for this, lets fix that

Journal title
PLoS One                                                         92
PLoS ONE                                                         62
Journal of Biological Chemistry                                  48
Nucleic Acids Research                                           21
Proceedings of the National Academy of Sciences                  19
Human Molecular Genetics                                         18
PLoS Neglected Tropical Diseases                                 18
Nature Communications                                            17
PLoS Pathogens                                                   15
PLoS Genetics                                                    15
Neuroimage                                                       15
Brain                                                            14
NeuroImage                                                       14
PLOS ONE                                                         14
BMC Public Health                 

In [6]:
#First, lets see if there is any data missing a journal title?
df['Journal title'].isnull().values.any()

True

In [7]:
#Lets drop the records where the Journal title is null
df = df.dropna(subset=['Journal title'])

In [8]:
#Maybe one way of tackling this issue is removing the spacing and making the casing uniform
df['Journal title'] = df['Journal title'].str.upper()
df['Journal title'] = df['Journal title'].str.replace(' ', '')
df.head()

Unnamed: 0,PMID/PMCID,Publisher,Journal title,Article title,COST (£) charged to Wellcome (inc VAT when charged)
0,,CUP,PSYCHOLOGICALMEDICINE,Reduced parahippocampal cortical thickness in ...,£0.00
1,PMC3679557,ACS,BIOMACROMOLECULES,Structural characterization of a Model Gram-ne...,£2381.04
2,23043264 PMC3506128,ACS,JMEDCHEM,"Fumaroylamino-4,5-epoxymorphinans and related ...",£642.56
3,23438330 PMC3646402,ACS,JMEDCHEM,Orvinols with mixed kappa/mu opioid receptor a...,£669.64
4,23438216 PMC3601604,ACS,JORGCHEM,Regioselective opening of myo-inositol orthoes...,£685.88


In [9]:
#Here are the five most common journals and the total number of articles for each
df.groupby('Journal title')['Article title'].count().sort_values(ascending = False)[:5]

Journal title
PLOSONE                         200
JOURNALOFBIOLOGICALCHEMISTRY     53
NEUROIMAGE                       29
NUCLEICACIDSRESEARCH             25
PLOSPATHOGENS                    24
Name: Article title, dtype: int64

In [10]:
#Calculate mean, median, and standard deviation of the open-access cost per article for each journal

In [11]:
#First, I want to see if there are any missing values in the cost parameters
df['COST (£) charged to Wellcome (inc VAT when charged)'].isnull().values.any()

False

In [12]:
#Let's remove the money signs, currency agnostic, hopefully
df['COST (£) charged to Wellcome (inc VAT when charged)'] = df['COST (£) charged to Wellcome (inc VAT when charged)'].str.replace('£', '')
df['COST (£) charged to Wellcome (inc VAT when charged)'] = df['COST (£) charged to Wellcome (inc VAT when charged)'].str.replace('$', '')

In [33]:
#Change the type to numeric
df['COST (£) charged to Wellcome (inc VAT when charged)'] = pd.to_numeric(df['COST (£) charged to Wellcome (inc VAT when charged)'])

In [34]:
df.head()

Unnamed: 0,PMID/PMCID,Publisher,Journal title,Article title,COST (£) charged to Wellcome (inc VAT when charged)
0,,CUP,PSYCHOLOGICALMEDICINE,Reduced parahippocampal cortical thickness in ...,0.0
1,PMC3679557,ACS,BIOMACROMOLECULES,Structural characterization of a Model Gram-ne...,2381.04
2,23043264 PMC3506128,ACS,JMEDCHEM,"Fumaroylamino-4,5-epoxymorphinans and related ...",642.56
3,23438330 PMC3646402,ACS,JMEDCHEM,Orvinols with mixed kappa/mu opioid receptor a...,669.64
4,23438216 PMC3601604,ACS,JORGCHEM,Regioselective opening of myo-inositol orthoes...,685.88


In [40]:
df['COST (£) charged to Wellcome (inc VAT when charged)'].mean()

24072.451608654806

In [42]:
#999999.00 appears to be a problem - let's just remove these entries
df_not_999 = df[df['COST (£) charged to Wellcome (inc VAT when charged)'] != 999999.00]

In [44]:
#PLOSONE

plosone = df_not_999[df_not_999['Journal title'] == 'PLOSONE']
plosone

Unnamed: 0,PMID/PMCID,Publisher,Journal title,Article title,COST (£) charged to Wellcome (inc VAT when charged)
1282,3517619,PLoS,PLOSONE,HCN1 and HCN2 in Rat DRG Neurons: Levels in No...,1001.03
1283,3498109,PLoS,PLOSONE,Fetal alcohol exposure and IQ at age 8: Eviden...,1004.15
1284,3515553,PLoS,PLOSONE,Vitamin B-12 status during pregnancy and child...,1011.45
1285,3522679,PLoS,PLOSONE,Validation of Dual Energy X-ray Absorptiometry...,1011.45
1286,3485223,PLoS,PLOSONE,Associations of different phenotypes of wheezi...,1015.73
1287,PMC3547059,PLoS,PLOSONE,"""Involvement of EphB1 receptors signalling in ...",1023.41
1288,3573029,PLoS,PLOSONE,Reactive oxygen species modulate the barrier f...,1039.87
1289,3769269,PLoS,PLOSONE,Chronic pravastatin but not atorvastatin treat...,1061.24
1290,3782430,PLoS,PLOSONE,Expression of HIV-1 Vpu Leads to Loss of the V...,1061.24
1291,3797097,PLoS,PLOSONE,Molecular phylogeny of a RING E3 ubiquitin lig...,1061.24


In [45]:
plosone['COST (£) charged to Wellcome (inc VAT when charged)'].mean()

1929.9894270833322

In [47]:
#JOURNALOFBIOLOGICALCHEMISTRY

jbc = df_not_999[df_not_999['Journal title'] == 'JOURNALOFBIOLOGICALCHEMISTRY']
jbc

Unnamed: 0,PMID/PMCID,Publisher,Journal title,Article title,COST (£) charged to Wellcome (inc VAT when charged)
16,22610094,AMBSB,JOURNALOFBIOLOGICALCHEMISTRY,Annexin-1 interaction with FPR2/ALX,265.67
60,PMC3576085,American Soc for Biochemistry and Molecular Bi...,JOURNALOFBIOLOGICALCHEMISTRY,Understanding how noncatalytic carbohydrate bi...,1100.0
61,23239883 PMC3561570,American Society for Biochemistry and Molecula...,JOURNALOFBIOLOGICALCHEMISTRY,Molecular architecture and functional analysis...,2259.64
64,PMCID: PMC3642348,American Society for Biochemistry and Molecula...,JOURNALOFBIOLOGICALCHEMISTRY,Human and viral golgi anti-apoptotic protein (...,1019.71
65,PMID: 22992744 PMC3493908,American Society for Biochemistry and Molecula...,JOURNALOFBIOLOGICALCHEMISTRY,Dynamic exchange of myosin VI on endocytic str...,1119.61
66,PMCID: PMC3531748,American Society for Biochemistry and Molecula...,JOURNALOFBIOLOGICALCHEMISTRY,Uncoupling proteostasis and development in vit...,1131.01
67,PMCID: PMC3436574\n,American Society for Biochemistry and Molecula...,JOURNALOFBIOLOGICALCHEMISTRY,Structural Requirements for Recognition of Maj...,1137.51
68,PMID: 23223336 PMC3543027,American Society for Biochemistry and Molecula...,JOURNALOFBIOLOGICALCHEMISTRY,Visualization of structural changes accompanyi...,1152.72
69,PMCID: PMC3795252,American Society for Biochemistry and Molecula...,JOURNALOFBIOLOGICALCHEMISTRY,SLP-76 sterile alpha motif (SAM) and individua...,1160.09
70,PMCID: PMC3724652,American Society for Biochemistry and Molecula...,JOURNALOFBIOLOGICALCHEMISTRY,?-Amino-3-hydroxy-5-methyl-4-isoxazole propion...,1166.85


In [48]:
jbc['COST (£) charged to Wellcome (inc VAT when charged)'].mean()

1423.5884615384614

In [49]:
#NEUROIMAGE

ni = df_not_999[df_not_999['Journal title'] == 'NEUROIMAGE']
ni

Unnamed: 0,PMID/PMCID,Publisher,Journal title,Article title,COST (£) charged to Wellcome (inc VAT when charged)
437,23672768,Elseveier Science,NEUROIMAGE,Gearing up for action: attentive tracking dyna...,1758.89
438,23046981,Elseveier Science,NEUROIMAGE,Good practice for conducting and reporting MEG...,2345.0
689,PMC3734349\n,Elsevier,NEUROIMAGE,Connectivity-based neurofeedback: dynamic caus...,1747.16
690,PMC3734350\n,Elsevier,NEUROIMAGE,The impact of distractor congruency on stimulu...,1760.94
691,PMC3734351\n,Elsevier,NEUROIMAGE,Distinct encoding of risk and value in economi...,1762.69
692,PMC3734352\n,Elsevier,NEUROIMAGE,Parcellation of the human substantia nigra bas...,1762.69
693,,Elsevier,NEUROIMAGE,Characterising reward outcome signals in senso...,1779.76
694,PMC3677092,Elsevier,NEUROIMAGE,Neural correlates of working memory in Tempora...,1961.25
695,PMCID: PMC3021391,Elsevier,NEUROIMAGE,Dynamic causal modelling of effective connecti...,2100.54
696,PMCID: PMC2877799,Elsevier,NEUROIMAGE,Action selection: a race model for selected an...,2118.57


In [50]:
ni['COST (£) charged to Wellcome (inc VAT when charged)'].mean()

2215.168275862069

In [51]:
#NUCLEICACIDSRESEARCH

nar = df_not_999[df_not_999['Journal title'] == 'NUCLEICACIDSRESEARCH']
nar

Unnamed: 0,PMID/PMCID,Publisher,Journal title,Article title,COST (£) charged to Wellcome (inc VAT when charged)
1148,23595147 PMCID: PMC3675483,OUP,NUCLEICACIDSRESEARCH,Nucleocapsid protein structures from orthobuny...,1704.0
1149,PMID: 23771140 /PMCID: PMC3753647,OUP,NUCLEICACIDSRESEARCH,Impact of Target Site Distribution for Type I ...,2184.0
1150,3467080,OUP,NUCLEICACIDSRESEARCH,Protein kinase CK2 inactivates PRH/Hhex using ...,852.0
1151,3553950,OUP,NUCLEICACIDSRESEARCH,The Type ISP Restriction-Modification enzymes ...,852.0
1152,3553963,OUP,NUCLEICACIDSRESEARCH,DNA cleavage by Type ISP Restriction-Modificat...,852.0
1153,3592466,OUP,NUCLEICACIDSRESEARCH,Organization of the BcgI restriction-modificat...,852.0
1154,3592470,OUP,NUCLEICACIDSRESEARCH,Organization of the BcgI restriction-modificat...,852.0
1155,PMC3575838,OUP,NUCLEICACIDSRESEARCH,Unwinding of primer-templates by archaeal fami...,852.0
1156,PMC3627570\n\n\n\n\n\n,OUP,NUCLEICACIDSRESEARCH,Resolving the polymorphism-in-probe problem is...,852.0
1157,PMC3627603,OUP,NUCLEICACIDSRESEARCH,Human SIRT1 regulates DNA-binding and stabilit...,852.0


In [52]:
nar['COST (£) charged to Wellcome (inc VAT when charged)'].mean()

1160.88

In [53]:
#PLOSPATHOGENS

pp = df_not_999[df_not_999['Journal title'] == 'PLOSPATHOGENS']
pp

Unnamed: 0,PMID/PMCID,Publisher,Journal title,Article title,COST (£) charged to Wellcome (inc VAT when charged)
1300,PMC3597521,PLOS,PLOSPATHOGENS,Th2 Cell-intrinsic Hypo-responsiveness determi...,1343.73
1301,PMC3798605,PLOS,PLOSPATHOGENS,Independent pathways can transduce the life-cy...,1438.45
1583,PMCID:\n PMC3486875\n,Public Library of Science,PLOSPATHOGENS,Histone H1 plays a role in heterochromatin for...,1254.02
1584,23326235,Public Library of Science,PLOSPATHOGENS,"Schmallenberg virus pathogenesis, tropism and ...",1397.0
1585,21483485,Public Library of Science,PLOSPATHOGENS,Lung adenocarcinoma originates from retrovirus...,1404.0
1586,22241985,Public Library of Science,PLOSPATHOGENS,Identification and characterization of a novel...,1440.0
1587,PMCID:\n PMC3578823,Public Library of Science,PLOSPATHOGENS,Induction of p16(INK4a) Is the Major Barrier t...,1600.79
1588,PMCID:\n PMC3764221,Public Library of Science,PLOSPATHOGENS,The zebrafish as a new model for the in vivo s...,1602.17
1589,PMCID:\n PMC3192839,Public Library of Science,PLOSPATHOGENS,KIR2DL2 enhances protective and detrimental HL...,1644.84
1590,PMID: 23468629 /PMCID: PMC3585132,Public Library of Science,PLOSPATHOGENS,"The plasmodium berghei Ca2+/H+Exchanger, PbCAX...",1723.16


In [54]:
pp['COST (£) charged to Wellcome (inc VAT when charged)'].mean()

1572.8668181818182