In [1]:
import os
import scanpy as sc

# Project root that every relative "./data/..." path below is resolved against.
# The TRVAE_HOME environment variable overrides it so the notebook can run on
# another machine; the original location is kept as the default.
home = os.environ.get('TRVAE_HOME', '/home/kojo/Code/temporary-trvae-clone')
os.chdir(home)

In [2]:
import pandas as pd

## Loading Data

In [3]:
# One CSV per timepoint (3m, 6m, 9m), read in that order.
timepoint_csvs = [
    './data/alzPro/peaks_proteins_lf_3m.csv',
    './data/alzPro/peaks_proteins_lf_6m.csv',
    './data/alzPro/peaks_proteins_lf_9m.csv',
]
pro3m, pro6m, pro9m = map(pd.read_csv, timepoint_csvs)

# Show each header so the differing column layouts can be compared.
for frame in (pro3m, pro6m, pro9m):
    print(frame.columns)

Index(['Protein Group', 'Protein ID', 'Accession', 'Significance',
       'Coverage (%)', '#Peptides', '#Unique', 'PTM', 'Sample 3131',
       'Sample 3132', 'Sample 3140', 'Sample 3141', 'Sample 3157',
       'Sample 3153', 'Sample 3154', 'Sample 3156', 'Sample 3608',
       'Sample 3609', 'Sample 3606', 'Sample 3607', 'Sample 3347',
       'Sample 3346', 'Sample 3389', 'Sample 3388', 'Sample Profile (Ratio)',
       'M HET Intensity', 'F HET Intensity', 'M WT Intensity',
       'F WT Intensity', 'Group Profile (Ratio)', 'Avg. Mass', 'Description'],
      dtype='object')
Index(['Protein Group', 'Protein ID', 'Accession', 'Significance',
       'Coverage (%)', '#Peptides', '#Unique', 'PTM', 'Sample 3400',
       'Sample 3405', 'Sample 3526', 'Sample 3003', 'Sample 3513',
       'Sample 3392', 'Sample 3393', 'Sample 3703', 'Sample 3011',
       'Sample 3012', 'Sample 3013', 'Sample 3177', 'Sample 3325',
       'Sample 2998', 'Sample 3121', 'Sample 2997', 'Sample Profile (Ratio)',
      

## Column Selection and Renaming

In [4]:
# Columns removed from the 3m table; only 'Accession' and the per-sample
# columns remain afterwards.
drop_cols_3m = [
    'Protein Group', 'Protein ID', 'Significance', 'Coverage (%)',
    '#Peptides', '#Unique', 'PTM', 'Sample Profile (Ratio)',
    'M HET Intensity', 'F HET Intensity', 'M WT Intensity', 'F WT Intensity',
    'Group Profile (Ratio)', 'Avg. Mass', 'Description',
]
pro3m_v2 = pro3m.drop(columns=drop_cols_3m)

# Order the remaining columns alphabetically by name.
pro3m_v2 = pro3m_v2.reindex(columns=sorted(pro3m_v2.columns))
pro3m_v2.columns

Index(['Accession', 'Sample 3131', 'Sample 3132', 'Sample 3140', 'Sample 3141',
       'Sample 3153', 'Sample 3154', 'Sample 3156', 'Sample 3157',
       'Sample 3346', 'Sample 3347', 'Sample 3388', 'Sample 3389',
       'Sample 3606', 'Sample 3607', 'Sample 3608', 'Sample 3609'],
      dtype='object')

In [5]:
# Columns removed from the 6m table; only 'Accession' and the per-sample
# columns remain afterwards.
drop_cols_6m = [
    'Protein Group', 'Protein ID', 'Significance', 'Coverage (%)',
    '#Peptides', '#Unique', 'PTM', 'Sample Profile (Ratio)',
    'WT Intensity', 'HET Intensity', 'Group Profile (Ratio)',
    'Avg. Mass', 'Description',
]
pro6m_v2 = pro6m.drop(columns=drop_cols_6m)

# Order the remaining columns alphabetically by name.
pro6m_v2 = pro6m_v2.reindex(columns=sorted(pro6m_v2.columns))
pro6m_v2.columns

Index(['Accession', 'Sample 2997', 'Sample 2998', 'Sample 3003', 'Sample 3011',
       'Sample 3012', 'Sample 3013', 'Sample 3121', 'Sample 3177',
       'Sample 3325', 'Sample 3392', 'Sample 3393', 'Sample 3400',
       'Sample 3405', 'Sample 3513', 'Sample 3526', 'Sample 3703'],
      dtype='object')

In [6]:
# Columns removed from the 9m table; only 'Accession' and the per-sample
# columns remain afterwards.
drop_cols_9m = [
    'Protein Group', 'Protein ID', 'Significance', 'Coverage (%)',
    '#Peptides', '#Unique', 'PTM', 'Sample Profile (Ratio)',
    'WT Intensity', 'Het Intensity', 'Group Profile (Ratio)',
    'Avg. Mass', 'Description',
]
pro9m_v2 = pro9m.drop(columns=drop_cols_9m)

# Every column except 'Accession' loses its last 10 characters.
# NOTE(review): presumably this strips a trailing " Intensity" from the sample
# columns -- confirm against the raw 9m header, since a column without that
# suffix would be silently truncated.
pro9m_v2 = pro9m_v2.rename(
    columns=lambda col: col if col == 'Accession' else col[:-10]
)

# Order the remaining columns alphabetically by name.
pro9m_v2 = pro9m_v2.reindex(columns=sorted(pro9m_v2.columns))
pro9m_v2.columns

Index(['Accession', 'Sample 2143', 'Sample 2145', 'Sample 2146', 'Sample 2147',
       'Sample 2148', 'Sample 2150', 'Sample 2156', 'Sample 2227',
       'Sample 2228', 'Sample 2229', 'Sample 2232', 'Sample 2582',
       'Sample 2583', 'Sample 2721', 'Sample 3207', 'Sample 3234'],
      dtype='object')

## Data Combination

### Removing Proteins Not Present at Every Timepoint

In [7]:
# Step 1: keep the 3m rows whose accession also occurs in the 6m table.
shared_with_6m = pro3m_v2["Accession"].isin(pro6m_v2["Accession"])
proteins_list_df = pro3m_v2[shared_with_6m]

# Step 2: of those, keep the accessions that also occur in the 9m table.
shared_with_9m = proteins_list_df["Accession"].isin(pro9m_v2["Accession"])
proteins_list = proteins_list_df.loc[shared_with_9m, "Accession"]
len(proteins_list)

1729

In [8]:
# Restrict each timepoint table to the accessions found in all three.
pro3m_v2, pro6m_v2, pro9m_v2 = (
    frame.loc[frame["Accession"].isin(proteins_list)]
    for frame in (pro3m_v2, pro6m_v2, pro9m_v2)
)

### Actual Merging

In [9]:
# Sort every table by accession and renumber rows 0..n-1 so that the same row
# position refers to the same protein in all three frames.
# Reassignment is used instead of inplace=True: these frames come from boolean
# .loc filtering, and mutating such results in place is discouraged by pandas
# (it can emit SettingWithCopyWarning, depending on the pandas version).
pro3m_v2 = pro3m_v2.sort_values(by=["Accession"]).reset_index(drop=True)
pro6m_v2 = pro6m_v2.sort_values(by=["Accession"]).reset_index(drop=True)
pro9m_v2 = pro9m_v2.sort_values(by=["Accession"]).reset_index(drop=True)

pro3m_v2.head()

Unnamed: 0,Accession,Sample 3131,Sample 3132,Sample 3140,Sample 3141,Sample 3153,Sample 3154,Sample 3156,Sample 3157,Sample 3346,Sample 3347,Sample 3388,Sample 3389,Sample 3606,Sample 3607,Sample 3608,Sample 3609
0,A2A432|CUL4B_MOUSE,231000.0,336000.0,278000.0,251000.0,279000.0,238000.0,249000.0,298000.0,270000.0,285000.0,335000.0,273000.0,393000.0,341000.0,343000.0,398000.0
1,A2A699|F1712_MOUSE,413000.0,255000.0,278000.0,243000.0,443000.0,257000.0,224000.0,331000.0,418000.0,439000.0,467000.0,0.0,417000.0,479000.0,449000.0,286000.0
2,A2AGT5|CKAP5_MOUSE,463000.0,727000.0,534000.0,638000.0,619000.0,786000.0,496000.0,619000.0,529000.0,513000.0,504000.0,588000.0,515000.0,626000.0,502000.0,615000.0
3,A2AJI0|MA7D1_MOUSE,310000.0,453000.0,349000.0,449000.0,380000.0,437000.0,241000.0,426000.0,340000.0,420000.0,403000.0,402000.0,385000.0,364000.0,379000.0,491000.0
4,A2ALS5|RPGP1_MOUSE,440000.0,763000.0,734000.0,761000.0,584000.0,565000.0,380000.0,866000.0,339000.0,638000.0,637000.0,468000.0,422000.0,767000.0,664000.0,947000.0


In [10]:
# Preview the first five rows of the 6m table.
pro6m_v2.head(n=5)

Unnamed: 0,Accession,Sample 2997,Sample 2998,Sample 3003,Sample 3011,Sample 3012,Sample 3013,Sample 3121,Sample 3177,Sample 3325,Sample 3392,Sample 3393,Sample 3400,Sample 3405,Sample 3513,Sample 3526,Sample 3703
0,A2A432|CUL4B_MOUSE,140000.0,241000.0,275000.0,64400.0,277000.0,182000.0,261000.0,222000.0,177000.0,185000.0,105000.0,219000.0,145000.0,170000.0,248000.0,296000.0
1,A2A699|F1712_MOUSE,797000.0,572000.0,462000.0,296000.0,397000.0,446000.0,1070000.0,742000.0,693000.0,854000.0,1150000.0,564000.0,703000.0,415000.0,807000.0,1230000.0
2,A2AGT5|CKAP5_MOUSE,638000.0,568000.0,494000.0,552000.0,604000.0,560000.0,625000.0,567000.0,543000.0,1340000.0,577000.0,550000.0,483000.0,451000.0,543000.0,547000.0
3,A2AJI0|MA7D1_MOUSE,323000.0,188000.0,182000.0,370000.0,199000.0,431000.0,462000.0,305000.0,511000.0,359000.0,401000.0,482000.0,562000.0,448000.0,303000.0,515000.0
4,A2ALS5|RPGP1_MOUSE,889000.0,720000.0,890000.0,918000.0,876000.0,773000.0,753000.0,411000.0,733000.0,720000.0,657000.0,847000.0,620000.0,761000.0,775000.0,834000.0


In [11]:
# Preview the first five rows of the 9m table.
pro9m_v2.head(n=5)

Unnamed: 0,Accession,Sample 2143,Sample 2145,Sample 2146,Sample 2147,Sample 2148,Sample 2150,Sample 2156,Sample 2227,Sample 2228,Sample 2229,Sample 2232,Sample 2582,Sample 2583,Sample 2721,Sample 3207,Sample 3234
0,A2A432|CUL4B_MOUSE,65400.0,68700.0,52100.0,52900.0,77600.0,77700.0,75800.0,89500.0,88200.0,95400.0,55000.0,84300.0,69100.0,56300.0,49100.0,58100.0
1,A2A699|F1712_MOUSE,95900.0,216000.0,143000.0,104000.0,100000.0,175000.0,89900.0,227000.0,120000.0,172000.0,165000.0,227000.0,176000.0,30000.0,80300.0,156000.0
2,A2AGT5|CKAP5_MOUSE,194000.0,210000.0,109000.0,142000.0,139000.0,234000.0,149000.0,202000.0,158000.0,173000.0,163000.0,191000.0,216000.0,159000.0,165000.0,131000.0
3,A2AJI0|MA7D1_MOUSE,98700.0,217000.0,118000.0,126000.0,100000.0,310000.0,183000.0,210000.0,213000.0,209000.0,147000.0,188000.0,258000.0,120000.0,145000.0,145000.0
4,A2ALS5|RPGP1_MOUSE,194000.0,288000.0,177000.0,191000.0,200000.0,407000.0,264000.0,281000.0,272000.0,171000.0,194000.0,237000.0,317000.0,214000.0,217000.0,201000.0


In [12]:
# A column-wise concat aligns rows on the index, so the result is only
# meaningful if all three frames list the same accessions in the same order
# under the same index. Fail loudly instead of silently pairing intensities
# from different proteins.
for timepoint, frame in (("6m", pro6m_v2), ("9m", pro9m_v2)):
    if not pro3m_v2["Accession"].equals(frame["Accession"]):
        raise ValueError(
            f"Accession rows of the {timepoint} table are not aligned with the 3m table; "
            "filter to shared proteins, sort by 'Accession' and reset the index first."
        )

# Keep the accession column once (from 3m) and append the 6m and 9m sample columns.
proteins_v2 = pd.concat(
    [
        pro3m_v2,
        pro6m_v2.drop(columns=["Accession"]),
        pro9m_v2.drop(columns=["Accession"]),
    ],
    axis=1,
)
proteins_v2

Unnamed: 0,Accession,Sample 3131,Sample 3132,Sample 3140,Sample 3141,Sample 3153,Sample 3154,Sample 3156,Sample 3157,Sample 3346,...,Sample 2156,Sample 2227,Sample 2228,Sample 2229,Sample 2232,Sample 2582,Sample 2583,Sample 2721,Sample 3207,Sample 3234
0,A2A432|CUL4B_MOUSE,231000.0,336000.0,278000.0,251000.0,279000.0,238000.0,249000.0,298000.0,270000.0,...,75800.0,89500.0,88200.0,95400.0,55000.0,84300.0,69100.0,56300.0,49100.0,58100.0
1,A2A699|F1712_MOUSE,413000.0,255000.0,278000.0,243000.0,443000.0,257000.0,224000.0,331000.0,418000.0,...,89900.0,227000.0,120000.0,172000.0,165000.0,227000.0,176000.0,30000.0,80300.0,156000.0
2,A2AGT5|CKAP5_MOUSE,463000.0,727000.0,534000.0,638000.0,619000.0,786000.0,496000.0,619000.0,529000.0,...,149000.0,202000.0,158000.0,173000.0,163000.0,191000.0,216000.0,159000.0,165000.0,131000.0
3,A2AJI0|MA7D1_MOUSE,310000.0,453000.0,349000.0,449000.0,380000.0,437000.0,241000.0,426000.0,340000.0,...,183000.0,210000.0,213000.0,209000.0,147000.0,188000.0,258000.0,120000.0,145000.0,145000.0
4,A2ALS5|RPGP1_MOUSE,440000.0,763000.0,734000.0,761000.0,584000.0,565000.0,380000.0,866000.0,339000.0,...,264000.0,281000.0,272000.0,171000.0,194000.0,237000.0,317000.0,214000.0,217000.0,201000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1724,Q9Z2W8|GRIA4_MOUSE,719000.0,797000.0,743000.0,984000.0,907000.0,806000.0,659000.0,781000.0,715000.0,...,451000.0,370000.0,389000.0,333000.0,318000.0,291000.0,367000.0,313000.0,287000.0,326000.0
1725,Q9Z2W9|GRIA3_MOUSE,770000.0,1060000.0,707000.0,1050000.0,894000.0,840000.0,597000.0,1230000.0,846000.0,...,241000.0,471000.0,315000.0,330000.0,428000.0,421000.0,369000.0,280000.0,388000.0,301000.0
1726,Q9Z2X1|HNRPF_MOUSE,614000.0,1130000.0,878000.0,720000.0,1070000.0,1770000.0,655000.0,585000.0,594000.0,...,380000.0,282000.0,343000.0,347000.0,251000.0,340000.0,288000.0,230000.0,224000.0,208000.0
1727,Q9Z2Y3|HOME1_MOUSE,1170000.0,1410000.0,1310000.0,1270000.0,977000.0,127000.0,1400000.0,1760000.0,1400000.0,...,606000.0,620000.0,736000.0,541000.0,568000.0,740000.0,781000.0,563000.0,648000.0,538000.0


## Cleaning and Saving

In [13]:
# Remove the trailing "_MOUSE" from each accession
# (e.g. "A2A432|CUL4B_MOUSE" -> "A2A432|CUL4B").
# The pattern is anchored to the end of the string, so an accession without
# that suffix is left untouched; a fixed six-character slice would silently
# truncate it.
proteins_v2["Accession"] = proteins_v2["Accession"].str.replace(r"_MOUSE$", "", regex=True)

# Use the cleaned accession as the row index and drop it from the data columns.
proteins_v2 = proteins_v2.set_index("Accession")
proteins_v2

Unnamed: 0_level_0,Sample 3131,Sample 3132,Sample 3140,Sample 3141,Sample 3153,Sample 3154,Sample 3156,Sample 3157,Sample 3346,Sample 3347,...,Sample 2156,Sample 2227,Sample 2228,Sample 2229,Sample 2232,Sample 2582,Sample 2583,Sample 2721,Sample 3207,Sample 3234
Accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A2A432|CUL4B,231000.0,336000.0,278000.0,251000.0,279000.0,238000.0,249000.0,298000.0,270000.0,285000.0,...,75800.0,89500.0,88200.0,95400.0,55000.0,84300.0,69100.0,56300.0,49100.0,58100.0
A2A699|F1712,413000.0,255000.0,278000.0,243000.0,443000.0,257000.0,224000.0,331000.0,418000.0,439000.0,...,89900.0,227000.0,120000.0,172000.0,165000.0,227000.0,176000.0,30000.0,80300.0,156000.0
A2AGT5|CKAP5,463000.0,727000.0,534000.0,638000.0,619000.0,786000.0,496000.0,619000.0,529000.0,513000.0,...,149000.0,202000.0,158000.0,173000.0,163000.0,191000.0,216000.0,159000.0,165000.0,131000.0
A2AJI0|MA7D1,310000.0,453000.0,349000.0,449000.0,380000.0,437000.0,241000.0,426000.0,340000.0,420000.0,...,183000.0,210000.0,213000.0,209000.0,147000.0,188000.0,258000.0,120000.0,145000.0,145000.0
A2ALS5|RPGP1,440000.0,763000.0,734000.0,761000.0,584000.0,565000.0,380000.0,866000.0,339000.0,638000.0,...,264000.0,281000.0,272000.0,171000.0,194000.0,237000.0,317000.0,214000.0,217000.0,201000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q9Z2W8|GRIA4,719000.0,797000.0,743000.0,984000.0,907000.0,806000.0,659000.0,781000.0,715000.0,827000.0,...,451000.0,370000.0,389000.0,333000.0,318000.0,291000.0,367000.0,313000.0,287000.0,326000.0
Q9Z2W9|GRIA3,770000.0,1060000.0,707000.0,1050000.0,894000.0,840000.0,597000.0,1230000.0,846000.0,726000.0,...,241000.0,471000.0,315000.0,330000.0,428000.0,421000.0,369000.0,280000.0,388000.0,301000.0
Q9Z2X1|HNRPF,614000.0,1130000.0,878000.0,720000.0,1070000.0,1770000.0,655000.0,585000.0,594000.0,642000.0,...,380000.0,282000.0,343000.0,347000.0,251000.0,340000.0,288000.0,230000.0,224000.0,208000.0
Q9Z2Y3|HOME1,1170000.0,1410000.0,1310000.0,1270000.0,977000.0,127000.0,1400000.0,1760000.0,1400000.0,1340000.0,...,606000.0,620000.0,736000.0,541000.0,568000.0,740000.0,781000.0,563000.0,648000.0,538000.0


In [84]:
# Per-sample annotation sheet, read from the Excel workbook.
sample_sheet_path = "./data/alzPro/alz_sample_id.xlsx"
proObs = pd.read_excel(sample_sheet_path)

proObs

Unnamed: 0,Qi ID,sex,Group,Timepoint
0,3131,M,HET,3m
1,3132,M,HET,3m
2,3140,M,HET,3m
3,3141,M,HET,3m
4,3153,F,HET,3m
5,3154,F,HET,3m
6,3156,F,HET,3m
7,3157,F,HET,3m
8,3346,F,WT,3m
9,3347,F,WT,3m


In [112]:
# Transpose so that samples are rows and proteins are columns.
t1 = proteins_v2.T

# The sample labels are dropped below and the annotation sheet is paired with
# the matrix purely by row position, so check first that both list the same
# sample IDs in the same order. IDs are compared as the first run of digits in
# each label (e.g. "Sample 3131" vs. 3131).
matrix_sample_ids = t1.index.str.extract(r"(\d+)", expand=False).tolist()
sheet_sample_ids = proObs["Qi ID"].astype(str).str.extract(r"(\d+)", expand=False).tolist()
if matrix_sample_ids != sheet_sample_ids:
    raise ValueError(
        "Sample order of the proteins_v2 columns does not match proObs['Qi ID']; "
        "reorder the sample sheet before building the AnnData object."
    )

# Positional row index for the matrix.
t1 = t1.reset_index(drop=True)
t2 = proObs
# Empty frame that only carries the accession index.
t3 = pd.DataFrame(index=proteins_v2.index)
t3.head()

A2A432|CUL4B
A2A699|F1712
A2AGT5|CKAP5
A2AJI0|MA7D1
A2ALS5|RPGP1


In [113]:
# Assemble the AnnData object: t1 is the sample-by-protein matrix, t2 the
# sample sheet, t3 the frame indexed by accession.
adata = sc.AnnData(
    X=t1,
    obs=t2,
    var=t3,
)



In [114]:
# Print the AnnData summary (dimensions and obs columns) as a sanity check.
print(adata)

AnnData object with n_obs × n_vars = 48 × 1729
    obs: 'Qi ID', 'sex', 'Group', 'Timepoint'


In [119]:
# Write the assembled AnnData object to disk in h5ad format.
h5ad_path = "./data/alzPro_count.h5ad"
adata.write_h5ad(h5ad_path)

  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sex' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Group' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Timepoint' as categorical
