# Set up the environment

In [None]:
%%bash
# Install the third-party packages that the import cell of this notebook uses.
# Each package is installed with its own pip invocation, one after another.
pip install itables
pip install matplotlib
pip install pandas
pip install scipy
pip install numpy
pip install seaborn

# Import libraries

In [6]:
import os as os
import pandas as pd
import numpy as np
import scipy.stats
from scipy.stats import shapiro
import seaborn as sns
import matplotlib.pyplot as plt
import shutil
import glob as glob
import itables
from itables import init_notebook_mode
init_notebook_mode(all_interactive=True)

# Define functions

In [4]:
def count_number_of_unique_struct_columns_by_value(df):
    """
    Tally how many rows carry each distinct value of the 'struct' column.

    Parameters:
        df (DataFrame): Table that has a 'struct' column.

    Returns:
        Series: Row counts indexed by the distinct 'struct' values.
    """
    rows_per_struct = df.groupby('struct')
    struct_counts = rows_per_struct['struct'].count()
    return struct_counts

def classify_struct_of_genes(df):
    """
    Label each row as 'Structural', 'Non-Structural' or 'Unknown' from 'New_Name'.

    The match is a case-insensitive substring search of 'New_Name' against two
    keyword lists. Structural keywords take precedence over non-structural ones.
    Rows whose 'New_Name' is missing are labelled 'Unknown'.

    Parameters:
        df (DataFrame): The input DataFrame; must have a 'New_Name' column.

    Returns:
        DataFrame: A copy of the input with a 'struct' column holding the label.
        The input DataFrame is not modified.
    """
    list_of_struct = ["CP", "Coat", "Glycoprotein", "Nucleoprotein", "Membrane", "Capsid", "Envelope"]
    list_of_non_struct = ["RdRp", "Helicase", "Polymerase", "Methyltransferase", "Cap_MTase-GTase", "MTase", "GTase", "Pro", "Maturation"]

    df2 = df.copy()
    names = df2['New_Name']

    def _name_contains_any(keywords):
        # na=False: a missing name matches nothing. Without it the match result
        # is NaN, which is truthy, so unnamed rows were labelled 'Structural'.
        hits = names.str.contains('|'.join(keywords), case=False, na=False)
        return hits.to_numpy(dtype=bool)

    # np.select picks the first condition that holds, so the structural list
    # wins when both match (e.g. "Pro" is a substring of "Nucleoprotein").
    df2['struct'] = np.select(
        [_name_contains_any(list_of_struct), _name_contains_any(list_of_non_struct)],
        ['Structural', 'Non-Structural'],
        default='Unknown',
    )
    return df2

def intersect_genes_by_ORFID(df1, df2):
    """
    Keep the rows of the first table whose 'ORFID' also appears in the second.

    Parameters:
        df1 (DataFrame): Table to filter; its columns and row order are kept.
        df2 (DataFrame): Table supplying the 'ORFID' values to keep.

    Returns:
        DataFrame: The rows of df1 whose 'ORFID' occurs in df2['ORFID'].
    """
    shared_orf = df1['ORFID'].isin(df2['ORFID'])
    return df1.loc[shared_orf]

def set_struct_column_to_unknown_if_ORFID_not_in_hmm(genes, hmm_with_struct):
    """
    Give every gene a 'struct' label, using 'Unknown' where no HMM match exists.

    Each gene's 'ORFID' is looked up in 'hmm_with_struct'. A gene found there
    receives that row's 'struct' value; a gene that is absent, or whose matched
    'struct' value is missing, receives 'Unknown'. If an ORFID occurs several
    times in 'hmm_with_struct', its first occurrence is used.

    Parameters:
        genes (DataFrame): The input DataFrame containing genes ('ORFID' column).
        hmm_with_struct (DataFrame): DataFrame with 'ORFID' and 'struct' columns.

    Returns:
        DataFrame: A copy of 'genes' with the 'struct' column set as described.
        Neither input DataFrame is modified.
    """
    # Key the labels by ORFID instead of combining the two tables positionally:
    # they generally differ in length and row order.
    struct_by_orfid = (
        hmm_with_struct.drop_duplicates(subset='ORFID', keep='first')
        .set_index('ORFID')['struct']
    )
    labelled_genes = genes.copy()
    labelled_genes['struct'] = labelled_genes['ORFID'].map(struct_by_orfid).fillna('Unknown')
    return labelled_genes

def remove_row_if_ORFID_not_in_genes(hmm_with_struct, genes):
    """
    Drop the rows of 'hmm_with_struct' whose 'ORFID' has no match in 'genes'.

    Parameters:
        hmm_with_struct (DataFrame): Table to filter; must have an 'ORFID' column.
        genes (DataFrame): Table supplying the 'ORFID' values to keep.

    Returns:
        DataFrame: The rows of 'hmm_with_struct' whose 'ORFID' occurs in genes['ORFID'].
    """
    known_orfids = genes['ORFID']
    has_gene = hmm_with_struct['ORFID'].isin(known_orfids)
    return hmm_with_struct.loc[has_gene]

def remove_lines_of_uncompleted_genes(df):
    """
    Filter out rows whose 'partial' value marks the gene as incomplete.

    A row is dropped when its 'partial' value, converted to text, is one of
    '01', '10', '1' or '11'. All other rows are kept.

    Parameters:
        df (DataFrame): Table that has a 'partial' column.

    Returns:
        DataFrame: The rows of df that were not dropped.
    """
    incomplete_codes = ['01', '10', '1', '11']
    partial_text = df['partial'].astype(str)
    is_incomplete = partial_text.isin(incomplete_codes)
    return df[~is_incomplete]


In [None]:
# Read the three input tables from the local Downloads folder.
genome_enc = pd.read_csv("~/Downloads/genome_enc.csv")

# The gene table is comma-separated despite its .tsv extension.
genes = pd.read_csv("~/Downloads/Gene_tsv.tsv", sep=",")
# Discard the first column (presumably a saved row index - TODO confirm).
genes = genes.drop(columns=genes.columns[0])

# The HMM match table is tab-separated.
hmm = pd.read_csv("~/Downloads/RiboV1.4/RiboV1.4_HMMatches.tsv", sep="\t")

In [None]:
# Label every HMM match as Structural / Non-Structural / Unknown,
# and keep only the genes that are not flagged as partial.
hmm_with_struct = classify_struct_of_genes(hmm)
complete_genes = remove_lines_of_uncompleted_genes(genes)

# HMM matches that received a definite (non-'Unknown') label.
hmm_with_struct_without_unkown = hmm_with_struct.loc[
    hmm_with_struct['struct'].ne('Unknown')
]

In [None]:
# Needed for the final report: number of HMM matches per struct label.
hmm_with_struct['struct'].value_counts()

# Order complete genes by sequence and ENC, then rank ENC within each sequence.
# NOTE(review): rank() uses average ranks by default, so tied ENC values give
# fractional ranks that astype(int) truncates, and a missing ENC would make the
# integer cast fail - confirm this is intended.
complete_genes_sorted = complete_genes.sort_values(by=['seqid', 'ENC'])
complete_genes_sorted['ENC_rank'] = (
    complete_genes_sorted.groupby('seqid')['ENC'].rank().astype(int)
)

# One annotation row per ORFID: exact duplicates are collapsed first, then any
# ORFID that still has more than one row is removed entirely (keep=False).
hmm_with_struct_slim = (
    hmm_with_struct[['ORFID', 'struct', 'Family', 'New_Name']]
    .drop_duplicates()
    .drop_duplicates(subset='ORFID', keep=False)
)

# Attach the annotation to the ranked genes; genes without an annotation drop out.
hmm_genes = complete_genes_sorted.merge(hmm_with_struct_slim, on='ORFID', how='inner')

# ENC relative to the largest ENC among the merged genes of the same sequence.
max_enc_per_seqid = hmm_genes.groupby('seqid')['ENC'].transform('max')
hmm_genes['prop_ENC'] = hmm_genes['ENC'].div(max_enc_per_seqid)