In [1]:
#Imports
import os, sys
path_to_package = os.path.abspath(os.path.join('../..'))
if path_to_package not in sys.path:
    sys.path.append(path_to_package)

import numpy as np
import pandas as pd
from src.io import *

from sklearn.model_selection import train_test_split


In [2]:
#Archivo de prueba: YBL088C

phenotypes = read_phenotypes_file("/home/khaldrem/code/sc_regmod/dataset/fenotipos/fenotipos_clean.csv")
data = read_phylip_file("/home/khaldrem/code/sc_regmod/dataset/anova/YBL088C.phylip")

In [3]:
#Transform data into a DataFrame

df = pd.DataFrame()

#Id row
list_ids = []
for row in data:
    list_ids.append(row.id)

df.insert(0, "ids", list_ids)

#Data rows
index = 1
for col in range(data.get_alignment_length()):
    col_name = "x" + str(index)
    df.insert(index, col_name, list(data[:, col]))
    index += 1


#Y column
phenotypes_ordered = []
SELECTED_PHENOTYPE = "SM300-Efficiency"

for ind in df.index:
    value = float(phenotypes.loc[phenotypes['Standard'] == df['ids'][ind]][SELECTED_PHENOTYPE])
    phenotypes_ordered.append(value)

df.insert(index, "y", phenotypes_ordered)

df.head()

  if await self.run_code(code, result, async_=asy):


Unnamed: 0,ids,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x539,x540,x541,x542,x543,x544,x545,x546,x547,y
0,SACE_YAU,g,g,t,a,t,t,c,g,c,...,a,c,c,t,a,a,c,c,g,0.4674
1,SACE_YCF,g,g,t,a,t,t,c,g,g,...,a,c,c,t,a,a,c,c,g,0.6397
2,SACE_YAY,g,g,t,a,t,t,c,g,g,...,a,c,c,t,a,a,c,c,g,0.746
3,SACE_YCR,g,g,t,a,t,t,g,g,g,...,a,c,c,t,a,a,c,c,g,0.7786
4,SACE_YBG,g,g,t,a,t,t,c,g,g,...,a,c,c,t,a,a,c,c,g,1.028


In [4]:
#Drop ids col
#One hot encoding
df_ohe = df.drop(columns=["ids"])
df_ohe = pd.get_dummies(df_ohe)

df_ohe.head()

Unnamed: 0,y,x1_a,x1_g,x1_r,x2_a,x2_g,x2_r,x3_g,x3_t,x4_a,...,x544_a,x544_c,x545_a,x545_c,x546_c,x546_g,x546_s,x547_a,x547_g,x547_r
0,0.4674,0,1,0,0,1,0,0,1,1,...,1,0,0,1,1,0,0,0,1,0
1,0.6397,0,1,0,0,1,0,0,1,1,...,1,0,0,1,1,0,0,0,1,0
2,0.746,0,1,0,0,1,0,0,1,1,...,1,0,0,1,1,0,0,0,1,0
3,0.7786,0,1,0,0,1,0,0,1,1,...,1,0,0,1,1,0,0,0,1,0
4,1.028,0,1,0,0,1,0,0,1,1,...,1,0,0,1,1,0,0,0,1,0


In [5]:
#Train Test Data!

#Labels
labels = np.array(df_ohe["y"])

#Features
df_ohe_wo_y = df_ohe.drop(columns=["y"])
features_list = list(df_ohe_wo_y.columns)

features = np.array(df_ohe_wo_y)

#Train & test set
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.25, random_state=42)

In [6]:
# Model

from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=100, random_state=42)

#Train
rf.fit(train_features, train_labels)



In [7]:
#Predictions

predictions = rf.predict(test_features)

#Error
errors = abs(predictions - test_labels)

print(f"Mean Abs Error: {round(np.mean(errors), 2)}")

Mean Abs Error: 0.12


In [8]:
#MAPE
mape = 100 * (errors/test_labels)

accuracy = 100 - np.mean(mape)
print(f"Accuracy: {round(accuracy, 2)}%")

Accuracy: 81.15%


In [9]:
#Feature importance
importances = list(rf.feature_importances_)

feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(features_list, importances)]

#sort
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse=True)

[print("Var: {:20} Importance: {}".format(*pair)) for pair in feature_importances];

Var: x107_t               Importance: 0.22
Var: x199_g               Importance: 0.1
Var: x4_a                 Importance: 0.02
Var: x29_a                Importance: 0.02
Var: x4_g                 Importance: 0.01
Var: x7_c                 Importance: 0.01
Var: x7_g                 Importance: 0.01
Var: x9_c                 Importance: 0.01
Var: x9_g                 Importance: 0.01
Var: x41_g                Importance: 0.01
Var: x45_a                Importance: 0.01
Var: x45_g                Importance: 0.01
Var: x49_a                Importance: 0.01
Var: x49_g                Importance: 0.01
Var: x60_t                Importance: 0.01
Var: x107_c               Importance: 0.01
Var: x111_g               Importance: 0.01
Var: x131_g               Importance: 0.01
Var: x199_t               Importance: 0.01
Var: x292_g               Importance: 0.01
Var: x388_a               Importance: 0.01
Var: x390_a               Importance: 0.01
Var: x390_g               Importance: 0.01
Var: x391_g 