# Create lists of ohnologs, small scale duplicates, and singletons in human and cow

## Analysis

In [1]:
import pandas as pd

### List of all human protein-coding genes from Ensembl GRCh37 site

Ensembl human protein-coding genes on chromosomes 1-22+X+Y. Ensembl version 75 (last version that used human genome version GRCh37)

In [2]:
# Load the Ensembl v75 protein-coding gene table and keep only the gene ID column.
EnsV75ProteinCodingGenes1Y = pd.read_csv(
    '../../datasets/geneLists/Ensembl/EnsV75ProteinCodingGenes1-Y.txt',
    sep="\t",
).loc[:, ['Ensembl Gene ID']]
EnsV75ProteinCodingGenes1Y.head()

Unnamed: 0,Ensembl Gene ID
0,ENSG00000215405
1,ENSG00000268343
2,ENSG00000230031
3,ENSG00000138593
4,ENSG00000268531


In [3]:
# Number of protein-coding gene rows loaded.
EnsV75ProteinCodingGenes1Y.shape[0]

20314

### Get list of human paralogs

Ensembl human paralogs from version 75

In [4]:
# Load the Ensembl v75 human paralog pair table (one row per gene/paralog pair).
humanParalogs = pd.read_csv(
    '../../datasets/geneLists/Ensembl/EnsV75HumanParalogs.txt',
    sep="\t",
)
humanParalogs.head()

Unnamed: 0,Ensembl Gene ID,Human Paralog Ensembl Gene ID,Homology Type,Ancestor
0,ENSG00000215405,ENSG00000259455,within_species_paralog,Homo sapiens
1,ENSG00000215405,ENSG00000197414,within_species_paralog,Homo sapiens
2,ENSG00000215405,ENSG00000174450,within_species_paralog,Hominoidea
3,ENSG00000215405,ENSG00000175265,within_species_paralog,Euarchontoglires
4,ENSG00000215405,ENSG00000215252,within_species_paralog,Euarchontoglires


In [5]:
# Number of rows in the paralog pair table.
humanParalogs.shape[0]

99258

Concatenate columns 1 and 2 and keep the unique IDs to get a list of human paralogs

In [6]:
# Collect every gene that appears on either side of a paralog pair.
# Rows missing either ID are dropped first, so a gene exported without a
# paralog is never counted as a duplicate (the cow paralog table is filtered
# the same way when it is loaded).
paralogPairs = humanParalogs.dropna(
    subset=['Ensembl Gene ID', 'Human Paralog Ensembl Gene ID']
)
paralogGeneIDs = pd.concat(
    [paralogPairs['Ensembl Gene ID'], paralogPairs['Human Paralog Ensembl Gene ID']],
    ignore_index=True,
).drop_duplicates()  # keeps the first occurrence, so the original ordering is preserved
duplicates = paralogGeneIDs.to_frame(name='Ensembl Gene ID').reset_index(drop=True)
# Keep only genes that are on the Ensembl v75 protein-coding list.
duplicates = pd.merge(duplicates, EnsV75ProteinCodingGenes1Y, on='Ensembl Gene ID', how='inner')
duplicates.head()

Unnamed: 0,Ensembl Gene ID
0,ENSG00000215405
1,ENSG00000268343
2,ENSG00000230031
3,ENSG00000138593
4,ENSG00000268531


In [7]:
# Number of protein-coding genes that appear in at least one paralog pair.
duplicates.shape[0]

14184

### Ohnolog lists from Singh and Isambert (2019)

Three lists available:
* Strict:             q-score(outgroup) < 0.001 AND q-score(self) < 0.001
* Intermediate: q-score(outgroup) < 0.01 AND q-score(self) < 0.01
* Relaxed:         q-score(outgroup) < 0.05 AND q-score(self) < 0.3

In [8]:
# Strict 2R ohnolog pairs -> de-duplicated list of every gene occurring in a pair.
strictPairs = pd.read_csv(
    '../../datasets/geneLists/Singh and Isambert/hsapiens.Pairs.Strict.2R.txt',
    sep="\t",
)
strict = (
    pd.concat([strictPairs['Ohno1'], strictPairs['Ohno2']], ignore_index=True)
    .drop_duplicates()
    .reset_index(drop=True)
    .to_frame(name='Ensembl Gene ID')
)
strict.head()

Unnamed: 0,Ensembl Gene ID
0,ENSG00000164236
1,ENSG00000147465
2,ENSG00000095464
3,ENSG00000078804
4,ENSG00000155744


In [9]:
# Number of distinct genes in the strict ohnolog pairs.
strict.shape[0]

4940

In [10]:
# Intermediate 2R ohnolog pairs -> de-duplicated list of every gene occurring in a pair.
intermediatePairs = pd.read_csv(
    '../../datasets/geneLists/Singh and Isambert/hsapiens.Pairs.Intermediate.2R.txt',
    sep="\t",
)
intermediate = (
    pd.concat([intermediatePairs['Ohno1'], intermediatePairs['Ohno2']], ignore_index=True)
    .drop_duplicates()
    .reset_index(drop=True)
    .to_frame(name='Ensembl Gene ID')
)
intermediate.head()

Unnamed: 0,Ensembl Gene ID
0,ENSG00000164236
1,ENSG00000147465
2,ENSG00000095464
3,ENSG00000078804
4,ENSG00000155744


In [11]:
# Number of distinct genes in the intermediate ohnolog pairs.
intermediate.shape[0]

5964

In [12]:
# Relaxed 2R ohnolog pairs -> de-duplicated list of every gene occurring in a pair.
relaxedPairs = pd.read_csv(
    '../../datasets/geneLists/Singh and Isambert/hsapiens.Pairs.Relaxed.2R.txt',
    sep="\t",
)
relaxed = (
    pd.concat([relaxedPairs['Ohno1'], relaxedPairs['Ohno2']], ignore_index=True)
    .drop_duplicates()
    .reset_index(drop=True)
    .to_frame(name='Ensembl Gene ID')
)
relaxed.head()

Unnamed: 0,Ensembl Gene ID
0,ENSG00000164236
1,ENSG00000147465
2,ENSG00000095464
3,ENSG00000078804
4,ENSG00000155744


In [13]:
# Number of distinct genes in the relaxed ohnolog pairs.
relaxed.shape[0]

6870

### Make list of ohnologs, SSDs and singletons

#### Ensure Singh ohnologs are on Ensembl V75 gene list and listed as a paralog

In [14]:
# Keep strict ohnologs that are both on the Ensembl v75 protein-coding list
# and in the paralog-derived duplicates list, then save the filtered list.
strictEns = (
    strict
    .merge(EnsV75ProteinCodingGenes1Y, on='Ensembl Gene ID', how='inner')
    .merge(duplicates, on='Ensembl Gene ID', how='inner')
)
strictEns.to_csv(
    '../../datasets/geneLists/Singh and Isambert/hsapiens.Pairs.Strict.2R.Ens75.1-Y.txt',
    sep='\t',
    index=False,
)
strictEns.shape[0]

4833

In [15]:
# Keep intermediate ohnologs that are both on the Ensembl v75 protein-coding list
# and in the paralog-derived duplicates list, then save the filtered list.
intermediateEns = (
    intermediate
    .merge(EnsV75ProteinCodingGenes1Y, on='Ensembl Gene ID', how='inner')
    .merge(duplicates, on='Ensembl Gene ID', how='inner')
)
intermediateEns.to_csv(
    '../../datasets/geneLists/Singh and Isambert/hsapiens.Pairs.Intermediate.2R.Ens75.1-Y.txt',
    sep='\t',
    index=False,
)
intermediateEns.shape[0]

5823

In [16]:
# Keep relaxed ohnologs that are both on the Ensembl v75 protein-coding list
# and in the paralog-derived duplicates list, then save the filtered list.
relaxedEns = (
    relaxed
    .merge(EnsV75ProteinCodingGenes1Y, on='Ensembl Gene ID', how='inner')
    .merge(duplicates, on='Ensembl Gene ID', how='inner')
)
relaxedEns.to_csv(
    '../../datasets/geneLists/Singh and Isambert/hsapiens.Pairs.Relaxed.2R.Ens75.1-Y.txt',
    sep='\t',
    index=False,
)
relaxedEns.shape[0]

6693

#### Get list of SSDs at different stringency levels

In [17]:
# SSDs (strict): duplicated genes that are not on the filtered strict ohnolog list.
inStrictOhnologs = duplicates['Ensembl Gene ID'].isin(strictEns['Ensembl Gene ID'])
SSDsStrictOhnos = duplicates[~inStrictOhnologs]
SSDsStrictOhnos.to_csv(
    '../../datasets/geneLists/SSDsStrictOhnos.txt',
    sep='\t',
    index=False,
)
SSDsStrictOhnos.shape[0]

9351

In [18]:
# SSDs (intermediate): duplicated genes that are not on the filtered intermediate ohnolog list.
inIntermediateOhnologs = duplicates['Ensembl Gene ID'].isin(intermediateEns['Ensembl Gene ID'])
SSDsIntermediateOhnos = duplicates[~inIntermediateOhnologs]
SSDsIntermediateOhnos.to_csv(
    '../../datasets/geneLists/SSDsIntermediateOhnos.txt',
    sep='\t',
    index=False,
)
SSDsIntermediateOhnos.shape[0]

8361

In [19]:
# SSDs (relaxed): duplicated genes that are not on the filtered relaxed ohnolog list.
inRelaxedOhnologs = duplicates['Ensembl Gene ID'].isin(relaxedEns['Ensembl Gene ID'])
SSDsRelaxedOhnos = duplicates[~inRelaxedOhnologs]
SSDsRelaxedOhnos.to_csv(
    '../../datasets/geneLists/SSDsRelaxedOhnos.txt',
    sep='\t',
    index=False,
)
SSDsRelaxedOhnos.shape[0]

7491

#### Get list of singletons

In [20]:
# Singletons: protein-coding genes that do not appear in the duplicates list.
hasParalog = EnsV75ProteinCodingGenes1Y['Ensembl Gene ID'].isin(duplicates['Ensembl Gene ID'])
singletons = EnsV75ProteinCodingGenes1Y[~hasParalog]
singletons.to_csv(
    '../../datasets/geneLists/singletons.txt',
    sep='\t',
    index=False,
)
singletons.shape[0]

6130

## Cow genes

### List of all cow protein-coding genes from Ensembl

Ensembl cow protein-coding genes on chromosomes 1-X. Ensembl version 96

In [4]:
# Load the Ensembl v96 cow protein-coding gene table, keep only the gene ID
# column, and rename it to 'Ensembl Gene ID' so later merges share one key name.
# Built as a single non-mutating chain: renaming a column-subset frame with
# inplace=True can raise SettingWithCopyWarning on some pandas versions.
EnsV96CowProteinCodingGenes1X = (
    pd.read_csv(
        '../../datasets/geneLists/Ensembl/CowV96/EnsV96CowProteinCodingGenes1-X.txt',
        sep="\t",
    )
    .loc[:, ['Gene stable ID']]
    .rename(columns={'Gene stable ID': 'Ensembl Gene ID'})
)
EnsV96CowProteinCodingGenes1X.head()

Unnamed: 0,Ensembl Gene ID
0,ENSBTAG00000006648
1,ENSBTAG00000054829
2,ENSBTAG00000001753
3,ENSBTAG00000046015
4,ENSBTAG00000021251


In [5]:
# Number of cow protein-coding gene rows loaded.
EnsV96CowProteinCodingGenes1X.shape[0]

21630

### Get list of cow paralogs

Ensembl cow paralogs from version 96

In [11]:
# Load the Ensembl v96 cow paralog pair table and discard rows where either
# the gene ID or the paralog ID is missing.
cowParalogs = pd.read_csv(
    '../../datasets/geneLists/Ensembl/CowV96/EnsV96CowParalogs.txt',
    sep="\t",
).dropna(subset=['Gene stable ID', 'Cow paralogue gene stable ID'])
cowParalogs.head()

Unnamed: 0,Gene stable ID,Cow paralogue gene stable ID,Cow paralogue homology type,Paralogue last common ancestor with Cow
0,ENSBTAG00000006648,ENSBTAG00000040490,within_species_paralog,Eutheria
1,ENSBTAG00000006648,ENSBTAG00000054829,within_species_paralog,Eutheria
2,ENSBTAG00000054829,ENSBTAG00000040490,within_species_paralog,Eutheria
3,ENSBTAG00000054829,ENSBTAG00000006648,within_species_paralog,Eutheria
4,ENSBTAG00000001753,ENSBTAG00000000170,other_paralog,Opisthokonta


In [12]:
# Number of paralog pair rows remaining after dropping incomplete rows.
cowParalogs.shape[0]

564767

In [13]:
# Preview only the first rows instead of rendering the whole paralog table.
cowParalogs.head(10)

Unnamed: 0,Gene stable ID,Cow paralogue gene stable ID,Cow paralogue homology type,Paralogue last common ancestor with Cow
0,ENSBTAG00000006648,ENSBTAG00000040490,within_species_paralog,Eutheria
1,ENSBTAG00000006648,ENSBTAG00000054829,within_species_paralog,Eutheria
2,ENSBTAG00000054829,ENSBTAG00000040490,within_species_paralog,Eutheria
3,ENSBTAG00000054829,ENSBTAG00000006648,within_species_paralog,Eutheria
4,ENSBTAG00000001753,ENSBTAG00000000170,other_paralog,Opisthokonta
5,ENSBTAG00000001753,ENSBTAG00000053662,other_paralog,Opisthokonta
6,ENSBTAG00000001753,ENSBTAG00000011843,other_paralog,Opisthokonta
7,ENSBTAG00000001753,ENSBTAG00000003989,other_paralog,Opisthokonta
8,ENSBTAG00000001753,ENSBTAG00000038540,other_paralog,Opisthokonta
9,ENSBTAG00000001753,ENSBTAG00000015085,other_paralog,Opisthokonta


Concatenate columns 1 and 2 and keep the unique IDs to get a list of cow paralogs

In [14]:
# Every cow gene that occurs on either side of a paralog pair, de-duplicated
# in order of first appearance.
cowParalogGeneIDs = pd.concat(
    [cowParalogs['Gene stable ID'], cowParalogs['Cow paralogue gene stable ID']],
    ignore_index=True,
).drop_duplicates()
duplicates = cowParalogGeneIDs.to_frame(name='Ensembl Gene ID').reset_index(drop=True)
# Restrict to genes on the Ensembl v96 cow protein-coding list.
duplicates = duplicates.merge(EnsV96CowProteinCodingGenes1X, on='Ensembl Gene ID', how='inner')
duplicates.head()

Unnamed: 0,Ensembl Gene ID
0,ENSBTAG00000006648
1,ENSBTAG00000054829
2,ENSBTAG00000001753
3,ENSBTAG00000046015
4,ENSBTAG00000020035


In [15]:
# Number of cow protein-coding genes that appear in at least one paralog pair.
duplicates.shape[0]

17515

### Ohnolog lists from Singh and Isambert (2019)

Three lists available:
* Strict:             q-score(outgroup) < 0.001 AND q-score(self) < 0.001
* Intermediate: q-score(outgroup) < 0.01 AND q-score(self) < 0.01
* Relaxed:         q-score(outgroup) < 0.05 AND q-score(self) < 0.3

In [16]:
# Strict 2R cow ohnolog pairs -> de-duplicated list of every gene occurring in a pair.
strictPairs = pd.read_csv(
    '../../datasets/geneLists/Singh and Isambert/btaurus.Pairs.Strict.2R.txt',
    sep="\t",
)
strict = (
    pd.concat([strictPairs['Ohno1'], strictPairs['Ohno2']], ignore_index=True)
    .drop_duplicates()
    .reset_index(drop=True)
    .to_frame(name='Ensembl Gene ID')
)
strict.head()

Unnamed: 0,Ensembl Gene ID
0,ENSBTAG00000021457
1,ENSBTAG00000003097
2,ENSBTAG00000016838
3,ENSBTAG00000014526
4,ENSBTAG00000005810


In [17]:
# Number of distinct genes in the strict cow ohnolog pairs.
strict.shape[0]

4854

In [18]:
# Intermediate 2R cow ohnolog pairs -> de-duplicated list of every gene occurring in a pair.
intermediatePairs = pd.read_csv(
    '../../datasets/geneLists/Singh and Isambert/btaurus.Pairs.Intermediate.2R.txt',
    sep="\t",
)
intermediate = (
    pd.concat([intermediatePairs['Ohno1'], intermediatePairs['Ohno2']], ignore_index=True)
    .drop_duplicates()
    .reset_index(drop=True)
    .to_frame(name='Ensembl Gene ID')
)
intermediate.head()

Unnamed: 0,Ensembl Gene ID
0,ENSBTAG00000021457
1,ENSBTAG00000003097
2,ENSBTAG00000016838
3,ENSBTAG00000014526
4,ENSBTAG00000003727


In [19]:
# Number of distinct genes in the intermediate cow ohnolog pairs.
intermediate.shape[0]

5860

In [20]:
# Relaxed 2R cow ohnolog pairs -> de-duplicated list of every gene occurring in a pair.
relaxedPairs = pd.read_csv(
    '../../datasets/geneLists/Singh and Isambert/btaurus.Pairs.Relaxed.2R.txt',
    sep="\t",
)
relaxed = (
    pd.concat([relaxedPairs['Ohno1'], relaxedPairs['Ohno2']], ignore_index=True)
    .drop_duplicates()
    .reset_index(drop=True)
    .to_frame(name='Ensembl Gene ID')
)
relaxed.head()

Unnamed: 0,Ensembl Gene ID
0,ENSBTAG00000032079
1,ENSBTAG00000021457
2,ENSBTAG00000003097
3,ENSBTAG00000016838
4,ENSBTAG00000016794


In [21]:
# Number of distinct genes in the relaxed cow ohnolog pairs.
relaxed.shape[0]

6774

### Make list of ohnologs, SSDs and singletons

#### Ensure Singh ohnologs are on Ensembl V96 cow gene list and listed as a paralog

In [22]:
# Keep strict cow ohnologs that are both on the Ensembl v96 protein-coding list
# and in the paralog-derived duplicates list, then save the filtered list.
strictEns = (
    strict
    .merge(EnsV96CowProteinCodingGenes1X, on='Ensembl Gene ID', how='inner')
    .merge(duplicates, on='Ensembl Gene ID', how='inner')
)
strictEns.to_csv(
    '../../datasets/geneLists/Singh and Isambert/btaurus.Pairs.Strict.2R.Ens96.1-X.txt',
    sep='\t',
    index=False,
)
strictEns.shape[0]

4561

In [23]:
# Keep intermediate cow ohnologs that are both on the Ensembl v96 protein-coding list
# and in the paralog-derived duplicates list, then save the filtered list.
intermediateEns = (
    intermediate
    .merge(EnsV96CowProteinCodingGenes1X, on='Ensembl Gene ID', how='inner')
    .merge(duplicates, on='Ensembl Gene ID', how='inner')
)
intermediateEns.to_csv(
    '../../datasets/geneLists/Singh and Isambert/btaurus.Pairs.Intermediate.2R.Ens96.1-X.txt',
    sep='\t',
    index=False,
)
intermediateEns.shape[0]

5465

In [24]:
# Keep relaxed cow ohnologs that are both on the Ensembl v96 protein-coding list
# and in the paralog-derived duplicates list, then save the filtered list.
relaxedEns = (
    relaxed
    .merge(EnsV96CowProteinCodingGenes1X, on='Ensembl Gene ID', how='inner')
    .merge(duplicates, on='Ensembl Gene ID', how='inner')
)
relaxedEns.to_csv(
    '../../datasets/geneLists/Singh and Isambert/btaurus.Pairs.Relaxed.2R.Ens96.1-X.txt',
    sep='\t',
    index=False,
)
relaxedEns.shape[0]

6272

#### Get list of SSDs at different stringency levels

In [25]:
# Cow SSDs (strict): duplicated genes that are not on the filtered strict ohnolog list.
inStrictOhnologs = duplicates['Ensembl Gene ID'].isin(strictEns['Ensembl Gene ID'])
SSDsStrictOhnos = duplicates[~inStrictOhnologs]
SSDsStrictOhnos.to_csv(
    '../../datasets/geneLists/SSDsStrictOhnos.cow.txt',
    sep='\t',
    index=False,
)
SSDsStrictOhnos.shape[0]

12954

In [26]:
# Cow SSDs (intermediate): duplicated genes that are not on the filtered intermediate ohnolog list.
inIntermediateOhnologs = duplicates['Ensembl Gene ID'].isin(intermediateEns['Ensembl Gene ID'])
SSDsIntermediateOhnos = duplicates[~inIntermediateOhnologs]
SSDsIntermediateOhnos.to_csv(
    '../../datasets/geneLists/SSDsIntermediateOhnos.cow.txt',
    sep='\t',
    index=False,
)
SSDsIntermediateOhnos.shape[0]

12050

In [27]:
# Cow SSDs (relaxed): duplicated genes that are not on the filtered relaxed ohnolog list.
inRelaxedOhnologs = duplicates['Ensembl Gene ID'].isin(relaxedEns['Ensembl Gene ID'])
SSDsRelaxedOhnos = duplicates[~inRelaxedOhnologs]
SSDsRelaxedOhnos.to_csv(
    '../../datasets/geneLists/SSDsRelaxedOhnos.cow.txt',
    sep='\t',
    index=False,
)
SSDsRelaxedOhnos.shape[0]

11243

#### Get list of singletons

In [28]:
# Cow singletons: protein-coding genes that do not appear in the duplicates list.
hasParalog = EnsV96CowProteinCodingGenes1X['Ensembl Gene ID'].isin(duplicates['Ensembl Gene ID'])
singletons = EnsV96CowProteinCodingGenes1X[~hasParalog]
singletons.to_csv(
    '../../datasets/geneLists/singletons.cow.txt',
    sep='\t',
    index=False,
)
singletons.shape[0]

4115