# Pyrfume data processing pipeline for Bushdid et al., 2014

In [3]:
import pandas as pd
from pyrfume.odorants import get_cids, from_cids

### Supplementary Table S1 containing information about the molecules

In [5]:
# Read Table S1, keep its first four columns, and discard rows with missing values
s1 = (
    pd.read_csv('tableS1.csv', encoding='latin1')
    .iloc[:, :4]
    .dropna()
)
# Strip leading/trailing white space from every odorant name
s1['Odorant name'] = s1['Odorant name'].map(str.strip)
s1.head()

Unnamed: 0,Odorant name,C.A.S.,% odorant,Solvent
0,"(3S)-3,7-dimethyloct-6-en-1-ol",7540-51-4,1.0,"1,2-propanediol"
1,(methyldisulfanyl) methane,624-92-0,0.025,"1,2-propanediol"
2,1-hexanol,111-27-3,1.0,mineral oil
3,1-Isopropyl-4-methylbenzene,99-87-6,0.5,"1,2-propanediol"
4,1-octen-3-ol,3391-86-4,0.04,mineral oil


In [6]:
# Get PubChem IDs from CAS registry numbers
# The lookup reports C.A.S. numbers that match several CIDs or none at all
# (see the messages printed below); those entries are resolved manually in a later cell.
cids = get_cids(s1['C.A.S.'])

HBox(children=(FloatProgress(value=0.0, max=128.0), HTML(value='')))

Multiple CIDs for 87-44-5: [5281515, 5281522, 26318, 6887, 5322111, 5354499, 6429274]
Multiple CIDs for 17369-59-4: [5373603, 28500, 6259976]
Multiple CIDs for 5655-61-8: [93009, 6448, 44630108, 442460, 12097317, 6950274]
Multiple CIDs for 18172-67-3: [440967, 14896, 12212059, 6992019, 10290825, 24848167]
Could not find 110-01-1





In [7]:
# Which molecules could not be mapped to unique PubChem IDs?
# (The C.A.S. numbers listed here are the ones reported by the CID lookup output above.)
s1[s1['C.A.S.'].isin(['87-44-5', '17369-59-4', '5655-61-8', '18172-67-3', '110-01-1'])]

Unnamed: 0,Odorant name,C.A.S.,% odorant,Solvent
35,caryophyllene,87-44-5,15.0,"1,2-propanediol"
36,celeriax,17369-59-4,0.2,"1,2-propanediol"
75,iso-bornyl acetate,5655-61-8,5.0,mineral oil
80,laevo-beta-pinene,18172-67-3,10.0,"1,2-propanediol"
119,thiolane,110-01-1,0.0005,"1,2-propanediol"


In [8]:
# Add those CIDs manually
# Each key must match the 'C.A.S.' value in Table S1 exactly, because the CIDs are
# later joined onto the table through that column.
cids.update({
    '17369-59-4': 5373603, # celeriax is 3-propylidene phthalide
    '87-44-5': 5281515, # caryophyllene is beta-caryophyllene
    '5655-61-8': 6448, # Assumed racemic mixture for iso-bornyl acetate
    '18172-67-3': 440967, # laevo-beta-pinene is (-)-beta-pinene
    '110-01-1': 1127, # C.A.S. for thiolane was wrong in source file (Excel error)
})
# Turn the C.A.S. -> CID mapping into an integer Series named 'CID'
cids = pd.Series(cids, name='CID').astype(int)

In [9]:
# Look up each row's CID through its C.A.S. number, then index Table S1 by CID
s1 = (
    s1
    .join(cids, on='C.A.S.')
    .set_index('CID')
    # Correct the thiolane C.A.S. number, which is wrong in the source file.
    # This happens after the join because `cids` is keyed by the value as it appears in the file.
    .replace('110-01-1', '110-01-0')
)
s1.head()

Unnamed: 0_level_0,Odorant name,C.A.S.,% odorant,Solvent
CID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
7793,"(3S)-3,7-dimethyloct-6-en-1-ol",7540-51-4,1.0,"1,2-propanediol"
12232,(methyldisulfanyl) methane,624-92-0,0.025,"1,2-propanediol"
8103,1-hexanol,111-27-3,1.0,mineral oil
7463,1-Isopropyl-4-methylbenzene,99-87-6,0.5,"1,2-propanediol"
18827,1-octen-3-ol,3391-86-4,0.04,mineral oil


In [10]:
# Get standardized information for these molecules
# `info` is converted to a DataFrame and indexed by its 'CID' field in the next cell.
info = from_cids(cids.values)

HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))

Retrieving 0 through 99
Retrieving 100 through 128



In [11]:
# Index the standardized records by CID, attach the Table S1 columns, and write the result to disk
molecules = (
    pd.DataFrame(info)
    .set_index('CID')
    .join(s1)
)
molecules.to_csv('molecules.csv')

### Supplementary Table S2 containing information about the mixtures and the triangle tests that used them

In [12]:
# Read Table S2, keep only the rows whose Test UID is numeric, and store that UID as an integer
s2 = (
    pd.read_csv('tableS2.csv', encoding='latin1')
    .loc[lambda table: table['Test UID'].str.isnumeric() > 0]
    .assign(**{'Test UID': lambda table: table['Test UID'].astype(int)})
)

In [13]:
# Build one row per (Test UID, subject) holding whether that subject answered correctly
behavior = (
    s2.melt(
        id_vars=['Test UID'],
        value_vars=['subject %d' % i for i in range(1, 27)],
        var_name='Subject',
        value_name='Correct',
    )
    # Turn the 'subject N' labels into the integer N
    .replace({'subject %d' % i: i for i in range(1, 27)})
    # Drop the rows with missing values
    .dropna()
    .set_index(['Test UID', 'Subject'])
    .sort_index()
    # Encode the outcome as a boolean
    .replace({'wrong': False, 'right': True})
    .astype(bool)
)

# Write the behavioral data to disk and preview it
behavior.to_csv('behavior.csv')
behavior.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Correct
Test UID,Subject,Unnamed: 2_level_1
1,1,False
1,2,True
1,3,False
1,4,True
1,5,True


In [14]:
# Trial metadata (including mixture composition): every column to the left of 'subject 1'
mixtures = s2.iloc[:, :s2.columns.get_loc('subject 1')]

# Map each raw odorant name to its CID (`molecules` is indexed by CID) and substitute it in
cid_map = dict(zip(molecules['Odorant name'], molecules.index))
mixtures = (
    mixtures
    .replace(cid_map)
    # The unnamed columns 6..35 become 'Molecule 1'..'Molecule 30'
    .rename(columns={'Unnamed: %d' % (n + 5): 'Molecule %d' % n for n in range(1, 31)})
)

# Express the dilution as a number and the component counts as integers
mixtures['Stimulus dilution'] = mixtures['Stimulus dilution'].replace({'1/4': 0.25, '1/2': 0.5, 'not diluted': 1})
mixtures = mixtures.astype({'Components in mixtures': int, 'Components that differ': int})
mixtures = mixtures.set_index(['Test UID', 'Answer']).sort_index()

# Sanity check: after sorting, the rows must come in groups of three,
# one 'right' row followed by two 'wrong' rows
for i, answer in enumerate(mixtures.index.get_level_values('Answer')):
    assert answer == ('right' if i % 3 == 0 else 'wrong')
mixtures.head(9)

Unnamed: 0_level_0,Unnamed: 1_level_0,Components in mixtures,Components that differ,% mixture overlap,Stimulus dilution,Molecule 1,Molecule 2,Molecule 3,Molecule 4,Molecule 5,Molecule 6,...,Molecule 21,Molecule 22,Molecule 23,Molecule 24,Molecule 25,Molecule 26,Molecule 27,Molecule 28,Molecule 29,Molecule 30
Test UID,Answer,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,right,30,30,0.0,1.0,12232,7731,7888,7966,7848,18827,...,8103.0,7344.0,8051.0,7921.0,460.0,7583.0,11002.0,12367.0,6590.0,7991.0
1,wrong,30,30,0.0,0.25,440917,5281515,8030,3314,31272,798,...,6561.0,22386.0,11509.0,443162.0,8797.0,16666.0,7749.0,7793.0,7799.0,7762.0
1,wrong,30,30,0.0,0.5,440917,5281515,8030,3314,31272,798,...,6561.0,22386.0,11509.0,443162.0,8797.0,16666.0,7749.0,7793.0,7799.0,7762.0
2,right,10,4,60.0,0.5,3314,62433,5281515,7749,6561,12178,...,,,,,,,,,,
2,wrong,10,4,60.0,0.25,3314,62433,5281515,7749,6561,12178,...,,,,,,,,,,
2,wrong,10,4,60.0,1.0,3314,62433,5281515,7749,6561,12178,...,,,,,,,,,,
3,right,10,1,90.0,0.25,8797,2879,7888,6501,443158,22873,...,,,,,,,,,,
3,wrong,10,1,90.0,0.5,8797,2879,7888,6501,443158,22873,...,,,,,,,,,,
3,wrong,10,1,90.0,1.0,8797,2879,7888,6501,443158,22873,...,,,,,,,,,,


In [15]:
# Save mixtures data
# The ('Test UID', 'Answer') index set in the previous cell is written along with the columns.
mixtures.to_csv('mixtures.csv')