In [1]:
#Imports
import os, sys
path_to_package = os.path.abspath(os.path.join('../..'))
if path_to_package not in sys.path:
    sys.path.append(path_to_package)


import scipy.stats as stats
import pandas as pd
import plotly.express as px
import plotly.io as pio
pio.renderers.default = "notebook_connected"

from src.io import *
from src.anova import *
from src.utils import *
from src.visualization import *

In [2]:
# Root directory of the sc_regmod dataset. The default is the original author's
# checkout; set the SC_REGMOD_DATASET_DIR environment variable to run the notebook
# against another checkout (for example a Windows one) without editing this cell.
DATASET_ROOT = os.environ.get(
    "SC_REGMOD_DATASET_DIR", "/home/khaldrem/code/sc_regmod/dataset"
).rstrip("/")

# Sub-directories are joined with "/" because the cells below build file names
# as f"{PATH}/{name}.<ext>".
ANOVA_DATASET_PATH = f"{DATASET_ROOT}/anova"
ANOVA_CSV_DATASET_PATH = f"{DATASET_ROOT}/anova_res"
INDEXES_DATASET_PATH = f"{DATASET_ROOT}/indexes"

In [3]:
# Build one record per file found under ANOVA_DATASET_PATH: its path, its name
# and whatever read_phylip_file returns for it.
filepaths = get_filepaths(ANOVA_DATASET_PATH)
dataset = []
for filepath in filepaths:
    filename = get_filename(filepath=filepath)
    data = {
        "filepath": filepath,
        "filename": filename,
        "data": read_phylip_file(filepath),
    }
    dataset.append(data)

In [4]:
# Get columns size by file
def get_columns_size_by_file(dataset):
    """Build a table with the alignment length of every record in ``dataset``.

    Parameters
    ----------
    dataset : iterable of dict
        Records with a "filename" entry and a "data" entry whose value
        exposes ``get_alignment_length()``.

    Returns
    -------
    pandas.DataFrame
        Columns "filename" and "alignment_length", one row per record, in
        input order.
    """
    columns = ["filename", "alignment_length"]
    rows = [
        {
            "filename": record["filename"],
            "alignment_length": record["data"].get_alignment_length(),
        }
        for record in dataset
    ]

    # Naming the columns explicitly keeps the schema when `dataset` is empty,
    # so callers can still sort or plot by "alignment_length" without a KeyError.
    return pd.DataFrame(rows, columns=columns)

# Per-file alignment lengths, ordered from the longest alignment to the shortest.
res_1 = get_columns_size_by_file(dataset)
res_1 = res_1.sort_values("alignment_length", ascending=False)



In [5]:
# Figure: alignment length vs. file name, ALL files
fig = px.bar(data_frame=res_1, x="filename", y="alignment_length")
fig.update_traces(marker_color="red")
fig.show()

In [6]:
# Figure: alignment length vs. file name, TOP 50 (first 50 rows of res_1)
fig = px.bar(data_frame=res_1.iloc[:50], x="filename", y="alignment_length")
fig.show()

In [7]:
# SHOW BASE OF INTEREST

def get_bases_data_in_file(filename, dataset):
    """Return the per-base values stored in the index file of ``filename``.

    The index ``{INDEXES_DATASET_PATH}/{filename}.json`` is read only when some
    record in ``dataset`` carries that name (the last "/"-separated component
    of its "filename" entry).

    Parameters
    ----------
    filename : str
        File name without extension, e.g. "YPL283C".
    dataset : iterable of dict
        Records with a "filename" entry.

    Returns
    -------
    pandas.DataFrame
        Columns "key" (the base) and "val" (the value stored for it under
        ``anova_data.bases``). Empty, with the same two columns, when
        ``filename`` is not part of ``dataset``.

    Raises
    ------
    OSError
        If the index file cannot be opened.
    KeyError
        If the index lacks the "anova_data" / "bases" entries.
    """
    # Imported here so the function does not depend on `json` leaking into the
    # notebook namespace through one of the star imports.
    import json

    bases = {}
    for data in dataset:
        if data["filename"].split("/")[-1] == filename:
            # The context manager closes the file even if parsing fails.
            with open(f"{INDEXES_DATASET_PATH}/{filename}.json", "r") as f:
                bases = json.load(f)["anova_data"]["bases"]
            # Any further match would read the very same file again.
            break

    return pd.DataFrame({"key": list(bases.keys()), "val": list(bases.values())})


def get_percentage_bases(data):
    """Add a "percentage" column holding each row's share of the "val" total.

    The share is a fraction of the total (it is not multiplied by 100).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a numeric "val" column. It is modified in place.

    Returns
    -------
    None
    """
    # Compute the total once and reuse it for the division.
    total = data["val"].sum()
    data["percentage"] = data["val"] / total



def get_base_info_per_file(filename, dataset):
    """Print and plot the per-base values of a single file.

    Fetches the table with get_bases_data_in_file, adds the "percentage"
    column with get_percentage_bases, prints the table and shows a bar chart
    of "val" per "key". Nothing is returned.
    """
    bases_df = get_bases_data_in_file(filename, dataset)
    get_percentage_bases(bases_df)

    print(bases_df)

    px.bar(bases_df, x="key", y="val").show()

    
#For every file, report how many occurrences of one single base it contains
#Why: to find the files that contain the largest number of "-"

def get_info_per_base_all_files(base_interest, dataset):
    """Collect, for every file in ``dataset``, the value stored for one base.

    For each record the index ``{INDEXES_DATASET_PATH}/{filename}.json`` is
    read, where ``filename`` is the last "/"-separated component of the
    record's "filename" entry. Files whose index has no entry for
    ``base_interest`` are left out.

    Parameters
    ----------
    base_interest : str
        Key to look up under ``anova_data.bases``, e.g. "-".
    dataset : iterable of dict
        Records with a "filename" entry.

    Returns
    -------
    pandas.DataFrame
        Columns "filename" and "base_values" (integers), one row per file that
        has the base, in input order, indexed 0..k-1.

    Raises
    ------
    OSError
        If an index file cannot be opened.
    KeyError
        If an index lacks the "anova_data" / "bases" entries.
    """
    # Imported here so the function does not depend on `json` leaking into the
    # notebook namespace through one of the star imports.
    import json

    records = []
    for data in dataset:
        filename = data["filename"].split("/")[-1]

        # The context manager closes the file even if parsing fails.
        with open(f"{INDEXES_DATASET_PATH}/{filename}.json", "r") as f:
            bases = json.load(f)["anova_data"]["bases"]

        if base_interest in bases:
            records.append({"filename": filename, "base_values": int(bases[base_interest])})

    # Building the frame once from the collected rows avoids growing it row by
    # row and keeps "base_values" as integers; the explicit columns keep the
    # schema when no file matches.
    return pd.DataFrame(records, columns=["filename", "base_values"])




In [8]:
# Files of interest:
# YPL283C YGR296W
# The call below prints the per-base table of YPL283C and shows its bar chart.

get_base_info_per_file("YPL283C", dataset)


  key      val  percentage
0   a  1584331    0.292074
1   t  1411977    0.260300
2   g  1329588    0.245112
3   c  1098520    0.202514


In [9]:
# Value stored for "-" in every file whose index lists it
# (presumably "-" is the alignment gap symbol; confirm against the index files).
base_interest_df = get_info_per_base_all_files("-", dataset)

In [10]:
# Rank the files by their "-" value (highest first) and preview the first ten.
base_interest_df = base_interest_df.sort_values("base_values", ascending=False)
base_interest_df.iloc[:10]


Unnamed: 0,filename,base_values
2558,YMR173W,1576912.0
2560,YOL155C,1522949.0
2990,YIR019C,1392686.0
3250,YAR050W,1141329.0
1949,YJL020C,1035410.0
568,YCR089W,727073.0
814,YHR211W,674465.0
175,YDL039C,672043.0
2562,YOL051W,510093.0
1964,YIL169C,504486.0


In [11]:
# Figure: "-" value vs. file name, every row of base_interest_df
fig = px.bar(data_frame=base_interest_df, x="filename", y="base_values")
fig.show()

In [12]:
# Figure: "-" value vs. file name, top 50 (first 50 rows of base_interest_df)
fig = px.bar(data_frame=base_interest_df.iloc[:50], x="filename", y="base_values")
fig.show()

In [13]:
# Process the CSV files that hold the ANOVA results
# For now, rows are not distinguished by ID
# For each file, compute the total sum of every column
# Each file's result is stored as one row of a dataframe
# filename, SM300-Efficiency,SM300-Rate,SM300-Lag,SM300-AUC,SM60-Efficiency,SM60-Rate,SM60-Lag,SM60-AUC,Ratio-Efficiency,Ratio-Rate,Ratio-Lag,Ratio-AUC 

def get_anova_results_info_all_files(dataset):
    """Sum every ANOVA result column of each file's CSV into one summary row.

    For each record the file ``{ANOVA_CSV_DATASET_PATH}/{filename}.csv`` is
    read, where ``filename`` is the last "/"-separated component of the
    record's "filename" entry.

    Returns a DataFrame with a "filename" column followed by the twelve
    SM300-*, SM60-* and Ratio-* column totals, one row per record in input
    order. Row labels are the record positions as strings ("0", "1", ...).
    """
    metric_columns = [
        "SM300-Efficiency", "SM300-Rate", "SM300-Lag", "SM300-AUC",
        "SM60-Efficiency", "SM60-Rate", "SM60-Lag", "SM60-AUC",
        "Ratio-Efficiency", "Ratio-Rate", "Ratio-Lag", "Ratio-AUC",
    ]

    rows_by_position = {}
    for position, record in enumerate(dataset):
        name = record["filename"].split("/")[-1]
        anova_table = pd.read_csv(f"{ANOVA_CSV_DATASET_PATH}/{name}.csv")

        # One total per metric, in the same order as the output columns.
        totals = [anova_table[column].sum() for column in metric_columns]
        rows_by_position[str(position)] = [name] + totals

    return pd.DataFrame.from_dict(
        rows_by_position,
        orient="index",
        columns=["filename"] + metric_columns,
    )

In [14]:
# ETA: about 18 s (one CSV is read per record in dataset)
anova_res_data = get_anova_results_info_all_files(dataset)

In [16]:
# Show the per-file column totals (the rendered table abbreviates the middle rows).
anova_res_data

Unnamed: 0,filename,SM300-Efficiency,SM300-Rate,SM300-Lag,SM300-AUC,SM60-Efficiency,SM60-Rate,SM60-Lag,SM60-AUC,Ratio-Efficiency,Ratio-Rate,Ratio-Lag,Ratio-AUC
0,YNL237W,22.0,23.0,31.0,24.0,37.0,25.0,20.0,30.0,32.0,30.0,13.0,19.0
1,YER140W,19.0,18.0,22.0,22.0,42.0,25.0,16.0,27.0,29.0,34.0,11.0,29.0
2,YPL111W,13.0,12.0,33.0,33.0,43.0,37.0,36.0,36.0,22.0,24.0,11.0,15.0
3,YDL214C,49.0,46.0,42.0,56.0,95.0,70.0,49.0,75.0,80.0,85.0,32.0,53.0
4,YKL188C,54.0,45.0,45.0,48.0,62.0,54.0,40.0,69.0,62.0,65.0,27.0,42.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6010,YML123C,46.0,41.0,23.0,45.0,71.0,47.0,19.0,51.0,53.0,54.0,32.0,48.0
6011,YPR196W,70.0,71.0,57.0,86.0,78.0,72.0,49.0,79.0,53.0,67.0,24.0,48.0
6012,YGL228W,76.0,72.0,50.0,73.0,90.0,66.0,52.0,62.0,79.0,77.0,47.0,77.0
6013,YNL330C,19.0,16.0,9.0,16.0,23.0,23.0,9.0,20.0,18.0,18.0,7.0,15.0


In [19]:
# Rank the files by their SM300-Efficiency total (highest first) and plot the first 50.
anova_res_data = anova_res_data.sort_values("SM300-Efficiency", ascending=False)

fig = px.bar(data_frame=anova_res_data.iloc[:50], x="filename", y="SM300-Efficiency")
fig.show()