# Welcome to the Python notebook for Neuronal Differentiation 2021 Clustering

**In this notebook, all the major steps involving the scraping and filtering of orthologous genes related to protein degradation ontologies (using python) will be shown**

### 4 scripts can be found in this notebook:

**1. ontologyscraper.py**

Used to retrieve all genes in select protein degradation ontologies, obtaining a **master.csv** (which can be found at **pipe-ABC/Clustering/GenesOfInterest/master.csv**)

**2. mergedvsmaster.py**

Used to compare **master.csv**, which contains the relevant genes, with the list of genes obtained through our RNASeqPipeline. That list has been merged with a metadata file matching ENSG IDs to Gene IDs (**merged.csv**, which can be found at **pipe-ABC/Clustering/GenesOfInterest/merged.csv**), allowing a gene-to-gene comparison between the two files.

**3. clusteryoink.py**

Used to pull out of **vsd_transformed.csv** (which can be found at **pipe-ABC/Clustering/GenesOfInterest/vsd_transformed.csv**) only the genes that are orthologous between the two species, by matching them against **master.csv**. The resulting genes serve as the dataset for clustering.

**4. clusterfilter.py**

Used to pull out significant recurring clusters (from **filtered_clusters_human.csv** and **filtered_clusters_mouse.csv**, which can both be found at **pipe-ABC/Clustering/GenesOfInterest/**)  between species for further analysis. Note that this step was executed AFTER clustering was performed.

*In total, this process found 779 genes of interest, which resulted in a total of 204 genes after all 4 steps were executed.*



## ontologyscraper.py

In [None]:
import pandas as pd
import numpy as np

# Build one ontology's worth of shared human/mouse genes and append it to
# master.csv. Re-run this cell once per ontology, editing the ID below each time.

# Both ontology exports are read as header-less TSVs with the same four columns.
ontology_columns = ["Label", "Gene", "Taxon", "Reference"]

df_human = pd.read_csv('human_ontology.tsv', sep='\t', names=ontology_columns)
df_mouse = pd.read_csv('mouse_ontology.tsv', sep='\t', names=ontology_columns)

# Upper-case the mouse labels so they can be compared with the human labels.
df_mouse['Label'] = df_mouse['Label'].str.upper()

# Default (inner) merge: only labels present in BOTH species survive.
df_final = pd.merge(df_human, df_mouse, on=['Label'])

# Keep a single row per label.
df_master = df_final.drop_duplicates(subset='Label', keep='first', ignore_index=True)

df_master = df_master.assign(Ontology='OGXXXXXX')  # this line requires the user to input the ontology ID they are actively scraping

# Append through a single handle. The header row is written only when the file
# is still empty, because the later scripts load master.csv with pd.read_csv and
# select the "Label" and "Ontology" columns by name.
# NOTE(review): if an existing master.csv was started without a header row, it
# is left as-is here - confirm it has one before running the downstream cells.
with open('master.csv', 'a', newline='') as out_file:
    write_header = out_file.tell() == 0
    df_master.to_csv(out_file, header=write_header, index=False)


## mergedvsmaster.py

In [None]:
import csv
import pandas as pd

def _annotate_with_ontology(merged_path, output_path, ontology_by_label, normalise=None):
    """Copy rows of *merged_path* whose gene is in master.csv to *output_path*.

    The gene symbol is read from the third column of each row. Every matching
    row is written unchanged, with the gene's ontology ID appended as an extra
    trailing column.

    Args:
        merged_path: CSV file to filter.
        output_path: CSV file to create (overwritten if it already exists).
        ontology_by_label: mapping of master.csv "Label" -> "Ontology".
        normalise: optional callable applied to the gene symbol before the
            lookup only (e.g. ``str.upper``); the written row keeps the
            original spelling.
    """
    with open(merged_path, "r", newline="") as merged_file:
        with open(output_path, "w", newline="") as out_file:
            writer = csv.writer(out_file)

            for row in csv.reader(merged_file, delimiter=","):
                # Blank or truncated lines have no third column; skip them
                # instead of failing with an IndexError.
                if len(row) < 3:
                    continue

                gene = row[2]
                if normalise is not None:
                    gene = normalise(gene)

                if gene in ontology_by_label:
                    row.append(ontology_by_label[gene])
                    writer.writerow(row)


# Load master.csv once and index it by label so each row needs a single lookup.
# setdefault keeps the FIRST ontology listed for a label that appears more than once.
df = pd.read_csv("master.csv")
ontology_by_label = {}
for label, ontology in zip(df["Label"], df["Ontology"]):
    ontology_by_label.setdefault(label, ontology)

#for human
_annotate_with_ontology("merged.csv", "output.csv", ontology_by_label)

#for mouse
# Mouse gene symbols are upper-cased before the lookup.
# NOTE(review): this pass reads the same "merged.csv" and overwrites the same
# "output.csv" as the human pass above, so only the mouse result remains on
# disk when both run back to back - presumably the file names are swapped by
# hand per species; confirm.
_annotate_with_ontology("merged.csv", "output.csv", ontology_by_label, normalise=str.upper)


## clusteryoink.py

In [None]:
import csv
import pandas as pd

# Keep only the rows of vsd_transformed.csv whose first column is a gene label
# listed in master.csv, and write them to cluster_output_combined.csv.

# A set gives constant-time membership tests for every row checked below.
master_labels = set(pd.read_csv("master.csv")["Label"])

with open("vsd_transformed.csv", "r", newline="") as vsd_file:
    with open("cluster_output_combined.csv", "w", newline="") as out_file:
        writer = csv.writer(out_file)

        for row in csv.reader(vsd_file, delimiter=","):
            # csv.reader yields an empty list for a blank line; there is no
            # gene to read in that case.
            if not row:
                continue

            gene = row[0]
            print(gene)  # progress output: every gene that is examined

            if gene in master_labels:
                writer.writerow(row)

## clusterfilter.py

In [None]:
import pandas as pd
import csv 
import numpy as np

def _write_rows_with_shared_gene(writer, clusters_path, shared_genes):
    """Write every row of *clusters_path* whose gene is in *shared_genes*.

    The gene is taken from the second column of each row.

    Args:
        writer: ``csv.writer`` that receives the matching rows unchanged.
        clusters_path: cluster CSV file to scan.
        shared_genes: set of gene names to keep.
    """
    with open(clusters_path, "r", newline="") as in_file:
        for row in csv.reader(in_file, delimiter=","):
            # Blank or truncated lines have no second column; skip them
            # instead of failing with an IndexError.
            if len(row) < 2:
                continue

            gene = row[1]
            print(gene)  # progress output: every gene that is examined

            if gene in shared_genes:
                writer.writerow(row)


# Gene names present in each species' filtered cluster table. Sets give
# constant-time membership tests while the files are scanned.
# NOTE(review): the sets come from the column NAMED "gene", while the scan reads
# the SECOND column by position - assumes these are the same column; confirm.
# NOTE(review): the comparison is exact and case-sensitive; no upper-casing is
# applied here - verify both files use the same gene spelling.
human_genes = set(pd.read_csv("filtered_clusters_human.csv")["gene"])
mouse_genes = set(pd.read_csv("filtered_clusters_mouse.csv")["gene"])

with open("recurring_clusters.csv", "w", newline="") as out_file:
    writer = csv.writer(out_file)

    # Mouse rows whose gene also occurs in the human table ...
    _write_rows_with_shared_gene(writer, "filtered_clusters_mouse.csv", human_genes)

    print('MOUSE DONE, MOVING ONTO HUMAN')

    # ... followed by human rows whose gene also occurs in the mouse table.
    _write_rows_with_shared_gene(writer, "filtered_clusters_human.csv", mouse_genes)