Tasks: 
- [x] Read in the significant-families tables
- [x] Merge with the family annotations
- [x] Write out the merged tables


In [1]:
import os

import pandas as pd
# Input locations: base directory plus the directories to get files from.
basedir = '.\\'
dirnames = [
    'Cafe_twoPlus_3_levels_Aug2021\\',
    'Cafe_twoPlus_3_levels_Aug2021_background\\',
    'Cafe_twoPlus_3_levels_Aug2021_canonical\\',
    'Cafe_twoPlus_3_levels_Aug2021_noncanonical\\',
]

# Short labels; presumably one per entry of dirnames, in the same order.
names = ['all3', 'backg3', 'canon3', 'noncan3']

# Setup for outputs
cwd = os.getcwd()
# The leading backslash is escaped explicitly: a bare '\o' is an invalid
# escape sequence and triggers a warning on recent Python versions.
# NOTE(review): the outputTables directory is not created here - confirm it
# exists before the cells that save tables are run.
path = cwd + '\\outputTables\\'


In [2]:
def readFile(baseDir, dirname, filename): # github copilot version
    """Load one tab-separated table into a pandas DataFrame.

    The file is looked up at ``baseDir/dirname/filename``. The first row is
    taken as the header and the first column becomes the index. If the file
    has a column named ``count``, each of its values is converted with
    ``int``.

    Returns the resulting DataFrame.
    """
    table_path = os.path.join(baseDir, dirname, filename)
    read_options = {
        'sep': '\t',
        'header': 0,
        'index_col': 0,
        'converters': {'count': int},
    }
    return pd.read_csv(table_path, **read_options)


In [3]:
# Read two files from each of the four directories in dirnames (indexes 0-3)
# with readFile: first every 'Significant_families.txt', then every
# 'Base_change.tab'.
famAllSig, famBackgSig, famCanonSig, famNoncanonSig = (
    readFile(basedir, dirnames[i], 'Significant_families.txt') for i in range(4)
)

allChange, backgChange, canChange, ncanChange = (
    readFile(basedir, dirnames[i], 'Base_change.tab') for i in range(4)
)

print(famNoncanonSig.head())


            pvalue Significant at 0.05
#FamilyID                             
ORTHOMCL14     0.0                   y
ORTHOMCL30     0.0                   y
ORTHOMCL48     0.0                   y
ORTHOMCL53     0.0                   y
ORTHOMCL69     0.0                   y


In [5]:
# Paths of the annotation file and the counts table.
annotname = '.\\inputs\\FastOrtho_Amel_annotated.tsv'
countsname = '.\\inputs\\FastOrtho_countsTable.tsv'

# Both files are tab-separated; each is indexed by its own family-ID column
# (note the differing column names: 'Family.ID' vs 'Family ID').
annots = pd.read_csv(annotname, index_col='Family.ID', sep='\t')
counts = pd.read_csv(countsname, index_col='Family ID', sep='\t')


In [6]:
# Preview the first five rows of the annotation table.
print(annots.head(5))

                  GeneID            name source    TranscriptID Class
Family.ID                                                            
ORTHOMCL38          18-w  NP_001013379.1   Amel  NM_001013361.1   NaN
ORTHOMCL4153       5-HT1  NP_001164579.1   Amel  NM_001171108.1   NaN
ORTHOMCL1395  5-HT2alpha  NP_001189389.1   Amel  NM_001202460.1   NaN
ORTHOMCL3325   5-HT2beta  NP_001191178.1   Amel  NM_001204249.1   NaN
ORTHOMCL7819       5-ht7  NP_001071289.1   Amel  NM_001077821.1   NaN


In [7]:
# Rows of annots whose Class is NaN are labelled 'Background'.
annots['Class'] = annots['Class'].fillna('Background')
print(annots.tail())
print(annots['Class'].unique())


              GeneID            name source    TranscriptID       Class
Family.ID                                                              
ORTHOMCL7825     Wat  NP_001011562.1   Amel  NM_001011562.1  Background
ORTHOMCL8232  WRNexo  NP_001229369.1   Amel  NM_001242440.1  Background
ORTHOMCL5170     Y-f  NP_001011635.1   Amel  NM_001011635.1  Background
ORTHOMCL4018     Y-h  NP_001091687.1   Amel  NM_001098217.1  Background
ORTHOMCL5152     Y-y  NP_001091693.1   Amel  NM_001098223.1  Background
['Background' 'NonCanon' 'Canon']


In [10]:
def mergeTabs(sigTable, annotsTable):
    """Join two DataFrames on their indexes and return the joined table.

    Rows are matched by index label using the default (inner) merge, so only
    labels present in both tables are kept. Nothing is written to disk; to
    merge on a column instead, set it as the index before calling.
    """
    return sigTable.merge(annotsTable, left_index=True, right_index=True)

def mergeTabsSave(sigTable, annotsTable, outTable):
    """Join two DataFrames on their indexes, save the result, and return it.

    The join is the same index-on-index merge performed by mergeTabs. The
    joined table is written as a tab-separated file named ``outTable`` under
    the module-level ``path`` string, then returned to the caller.
    """
    joined = mergeTabs(sigTable, annotsTable)
    destination = path + '\\' + outTable
    joined.to_csv(destination, sep='\t')
    return joined

In [11]:
# Attach the annotations to each significant-family table, then discard the
# 'Significant at 0.05' flag column from the merged result.
backgSigTab = mergeTabs(famBackgSig, annots).drop(columns='Significant at 0.05')
canSigTab = mergeTabs(famCanonSig, annots).drop(columns='Significant at 0.05')
noncanSigTab = mergeTabs(famNoncanonSig, annots).drop(columns='Significant at 0.05')


In [12]:
# Preview the first five rows of the annotated non-canonical table.
print(noncanSigTab.head(5))

            pvalue     GeneID            name source    TranscriptID     Class
ORTHOMCL14     0.0     Gat-1B  NP_001011643.1   Amel  NM_001011643.1  NonCanon
ORTHOMCL30     0.0     Dnmt1a  NP_001164522.1   Amel  NM_001171051.1  NonCanon
ORTHOMCL48     0.0       Hbg3  NP_001011608.1   Amel  NM_001011608.1  NonCanon
ORTHOMCL53     0.0  LOC409670  XP_006561279.1   Amel  XM_006561216.3  NonCanon
ORTHOMCL69     0.0  LOC725439  XP_016768850.2   Amel  XM_016913361.2  NonCanon


In [14]:
# Now merge each annotated table with the gene counts table; mergeTabsSave
# also writes every merged result out as a tab-separated file.
backgSigTabCounts = mergeTabsSave(
    backgSigTab,
    counts,
    'significantChangingBackgroundGenes_counts_Aug21.tab',
)
canSigTabCounts = mergeTabsSave(
    canSigTab,
    counts,
    'significantChangingCanonicalGenes_counts_Aug21.tab',
)
noncanSigTabCounts = mergeTabsSave(
    noncanSigTab,
    counts,
    'significantChangingNoncanonicalGenes_counts_Aug21.tab',
)

In [15]:
# Show the merged non-canonical counts table (the whole frame is printed,
# not just a head() preview).
print(noncanSigTabCounts)

              pvalue     GeneID            name source    TranscriptID  \
ORTHOMCL14     0.000     Gat-1B  NP_001011643.1   Amel  NM_001011643.1   
ORTHOMCL30     0.000     Dnmt1a  NP_001164522.1   Amel  NM_001171051.1   
ORTHOMCL48     0.000       Hbg3  NP_001011608.1   Amel  NM_001011608.1   
ORTHOMCL53     0.000  LOC409670  XP_006561279.1   Amel  XM_006561216.3   
ORTHOMCL69     0.000  LOC725439  XP_016768850.2   Amel  XM_016913361.2   
...              ...        ...             ...    ...             ...   
ORTHOMCL1955   0.028  LOC410263  XP_016770706.1   Amel  XM_016915217.2   
ORTHOMCL1980   0.037  LOC409171     XP_392696.5   Amel     XM_392696.7   
ORTHOMCL2048   0.020  LOC552049  XP_006564745.1   Amel  XM_006564682.3   
ORTHOMCL2971   0.028      GstD1  NP_001171499.1   Amel  NM_001178028.1   
ORTHOMCL2975   0.002  LOC413789     XP_397228.2   Amel     XM_397228.7   

                 Class  Desc  Aflo  Amel  Bimp  Bter  Ccal  Dnov  Emex  Hlab  \
ORTHOMCL14    NonCanon   NaN   

In [16]:
# Now merge each annotated table with its gene count changes table;
# mergeTabsSave also writes every merged result out as a tab-separated file.
backgSigTabChanges = mergeTabsSave(
    backgSigTab,
    backgChange,
    'significantChangingBackgroundGenes_changes_Aug21.tab',
)
canSigTabChanges = mergeTabsSave(
    canSigTab,
    canChange,
    'significantChangingCanonicalGenes_changes_Aug21.tab',
)
# NOTE(review): 'noncanSigTabChages' is misspelled ("Chages"); the name is
# kept because the following cell prints it under this spelling.
noncanSigTabChages = mergeTabsSave(
    noncanSigTab,
    ncanChange,
    'significantChangingNoncanonicalGenes_changes_Aug21.tab',
)

In [17]:
# Show the merged non-canonical changes table. The variable is spelled
# 'noncanSigTabChages' to match the cell that assigns it.
print(noncanSigTabChages)

              pvalue     GeneID            name source    TranscriptID  \
ORTHOMCL14     0.000     Gat-1B  NP_001011643.1   Amel  NM_001011643.1   
ORTHOMCL30     0.000     Dnmt1a  NP_001164522.1   Amel  NM_001171051.1   
ORTHOMCL48     0.000       Hbg3  NP_001011608.1   Amel  NM_001011608.1   
ORTHOMCL53     0.000  LOC409670  XP_006561279.1   Amel  XM_006561216.3   
ORTHOMCL69     0.000  LOC725439  XP_016768850.2   Amel  XM_016913361.2   
...              ...        ...             ...    ...             ...   
ORTHOMCL1955   0.028  LOC410263  XP_016770706.1   Amel  XM_016915217.2   
ORTHOMCL1980   0.037  LOC409171     XP_392696.5   Amel     XM_392696.7   
ORTHOMCL2048   0.020  LOC552049  XP_006564745.1   Amel  XM_006564682.3   
ORTHOMCL2971   0.028      GstD1  NP_001171499.1   Amel  NM_001178028.1   
ORTHOMCL2975   0.002  LOC413789     XP_397228.2   Amel     XM_397228.7   

                 Class  Bimp<0>  Bter<1>  Mqua<2>  <3>  ...  <10>  Ccal<11>  \
ORTHOMCL14    NonCanon        3 