In [21]:
# Load packages 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import json 


# 1) Create DEG data for the subsets that are tested against the background 

In [22]:
# LOAD DEG TABLES
# Map each comparison name to the CSV file holding its DEG table
input_files = {
    'NvsA': '1_prepare_data_for_GO_analysis/comp_anoxia_normoxia.csv',
    'NvsR': '1_prepare_data_for_GO_analysis/comp_reoxygenation_normoxia.csv',
    'AvsR': '1_prepare_data_for_GO_analysis/comp_anoxia_reoxygenation.csv',
}

# Read every file (comma-separated, first row is the header) into a DataFrame,
# stored under its comparison name
deg = {
    comparison: pd.read_csv(csv_path, header=0, sep=',')
    for comparison, csv_path in input_files.items()
}


# LOAD DMR GENE IDS
# Parsed JSON content; later cells look it up by the same comparison names as `deg`
with open('1_prepare_data_for_GO_analysis/dmr_unique_geneids.json', 'r') as handle:
    dmr_geneids = json.load(handle)



In [23]:
# Index every DEG table by gene id so that rows can be selected by label later on.
# The id column is called 'Unnamed: 0' after read_csv and is renamed to 'gene_id'.
# Tables already indexed by 'gene_id' are skipped: without this guard, re-running
# the cell raises KeyError in set_index because the column has already been moved
# into the index.
for name, df in list(deg.items()):
    if df.index.name == 'gene_id':
        continue
    # build a new frame instead of mutating in place and store it under the same key
    deg[name] = df.rename(columns={'Unnamed: 0': 'gene_id'}).set_index('gene_id')


In [24]:
# Build the subset for each comparison: the DEG rows whose gene id is listed in
# dmr_geneids under the same name (this subset is tested against the background)
subset_go = {}

for name, table in deg.items():
    # label-based row lookup, then turn the index back into a regular column
    selection = table.loc[dmr_geneids[name]].reset_index()
    subset_go[name] = selection
    # export to csv; index=True also writes the new 0..n-1 row index as the first column
    selection.to_csv(f'1_prepare_data_for_GO_analysis/goseq_{name}.csv', sep=',', index=True)

# 2) Get gene length of all genes 

In [25]:
# Load ccar_annotation.gtf as a pandas DataFrame
import pandas as pd

# Names for the first nine tab-separated columns, in file order
gtf_columns = ['seqname', 'source', 'feature', 'start', 'stop',
               'score', 'strand', 'frame', 'attribute']

# Tab-separated, no header row; the first line of the file is skipped
annotation = pd.read_csv(
    '/home/maggy/WholeGenomeBisulphiteSequencing/DNAseq_pipeline/7_match_dmr_to_gene/ccar_annotation.gtf',
    sep='\t',
    skiprows=1,
    header=None,
)
# Replace the default integer column labels 0..8 with the names above
annotation = annotation.rename(columns=dict(enumerate(gtf_columns)))

    
# define a function to extract the geneid
def extract_gene_id(attributes):
    """Return the gene_id value found in an attribute string.

    Parameters
    ----------
    attributes : str
        Text expected to contain an entry of the form ``gene_id "<id>"``.

    Returns
    -------
    str or int
        The text between the double quotes that follow ``gene_id``, or 0 when
        there is no such entry or ``attributes`` is not a string (for example
        a NaN coming from an empty cell).
    """
    # Non-string cells (e.g. float NaN for a missing value) would make
    # re.search raise TypeError; give them the same "no match" sentinel.
    if not isinstance(attributes, str):
        return 0
    # Using a regular expression to match and extract the desired part
    match = re.search(r'gene_id "([^"]+)"', attributes)
    return match.group(1) if match else 0  # 0 if there is no match
    
# Keep only the rows whose feature is 'transcript'. .copy() makes the result an
# independent frame, so the column assignments below do not raise
# SettingWithCopyWarning.
annotation = annotation[annotation['feature'] == 'transcript'].copy()

# Apply the function to the "attribute" column and create a new column with the extracted gene ids
annotation['gene_id'] = annotation['attribute'].apply(extract_gene_id)

# DEDUPLICATE
# A gene can have several transcripts, so gene ids repeat; keep one row per gene id.
# Length of each transcript row: GTF start and stop are 1-based and both
# inclusive, hence the +1.
# NOTE(review): this is the span between start and stop on the sequence, not a
# sum of exon lengths - confirm that this is the length wanted downstream.
annotation['gene_length'] = annotation['stop'] - annotation['start'] + 1
# sort by gene id, then by length ascending, so the longest row of each gene id comes last
annotation = annotation.sort_values(['gene_id', 'gene_length'])
# keep only that last (longest) row per gene id
annotation = annotation.drop_duplicates(subset=['gene_id'], keep='last')

In [26]:
# Two-column table (gene id and its length) taken from the annotation frame
gene_length = annotation.loc[:, ['gene_id', 'gene_length']]
# write as comma-separated csv, leaving out the row index
gene_length.to_csv('1_prepare_data_for_GO_analysis/gene_length.csv', sep=',', index=False)

# 3) Reformat the gene_id / GO term table to fit the expected layout 

In [27]:
# Read the headerless, tab-separated gene id / GO term table, name its first two
# columns and discard the column labelled 2
geneid_goterm = (
    pd.read_csv(
        '/home/maggy/WholeGenomeBisulphiteSequencing/DNAseq_pipeline/9_DMR_functional_annotation/gene_id_goterms_df_fullgenome.tsv',
        sep='\t',
        header=None,
    )
    .rename(columns={0: 'gene_id', 1: 'GO annotations'})
    .drop(columns=[2])
)
# export as tab-separated txt without the row index
geneid_goterm.to_csv('1_prepare_data_for_GO_analysis/geneid_goterm.txt', sep='\t', index=False)

# 4) Total number of genes

In [28]:
# Read the comma-separated gene list and relabel its 'x' column as 'gene_id'
total = (
    pd.read_csv('1_prepare_data_for_GO_analysis/total_number_of_genes.csv', sep=',')
    .rename(columns={'x': 'gene_id'})
)
# export as tab-separated txt without the row index
total.to_csv('1_prepare_data_for_GO_analysis/total_number_of_genes.txt', sep='\t', index=False)