In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# For the regression part
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score
from scipy.optimize import curve_fit

# For the classification part
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score, classification_report

from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA, KernelPCA

from sklearn.ensemble import RandomForestClassifier

from sklearn.manifold import SpectralEmbedding


#sns.set_context('talk')
sns.set_theme()

import warnings
warnings.filterwarnings("ignore")


En este práctico, utilizaremos el archivo original *fiji_datos_0al7mo_labels.csv* que se encuentra en la carpeta *data/raw/*.

Además sumaremos los siguientes datos sintéticos que se encuentran en la carpeta *data/datos_sinteticos/*:
  * datos_sinteticos_dias_3_y_5.csv
  * synthetic_3y5_sint2.csv
  * synthetic_data_dia_3_si.csv
  * synthetic_data_dia_4_si.csv
  * synthetic_data_dia_5_si.csv

Como también dos archivos extras que están en la capeta *data/03_AS/*:
  * fiji_datos_mean_diam.csv
  * fiji_datos_noise.csv
  

## Análisis y exploración de los datos sinteticos dia 3 y 5

In [14]:
df_sinteticos_3_5=pd.read_csv("data/datos_sinteticos/datos_sinteticos_dias_3_y_5.csv")

In [15]:
df_sinteticos_3_5.head()

Unnamed: 0,labels,Area,Perim.,Circ.,Feret,MinFeret,AR,Round,Solidity,Esferoide,dia,Diameter,n_diam
0,0001u,2108.856029,158.5521,0.853148,52.0548,39.89,1.252439,0.801927,0.9574,si,3,44.779969,11.118208
1,0001v,5494.716556,267.7438,0.765975,108.6886,78.1359,1.367897,0.729508,0.9424,si,5,94.04524,55.354949
2,0001w,4604.334693,208.6327,0.835749,79.215,78.5711,1.007505,0.992155,0.9544,si,3,74.80203,24.886113
3,0001x,3293.43306,184.9685,0.888199,59.5354,55.6766,1.039236,0.955215,0.9792,si,3,57.879742,19.426588
4,000a0,12597.351184,393.5486,0.74141,115.4147,98.4842,1.126705,0.888984,0.9568,si,3,108.587775,412.66966


In [17]:
df_sinteticos_3_5.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55 entries, 0 to 54
Data columns (total 13 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   labels     55 non-null     object 
 1   Area       55 non-null     float64
 2   Perim.     55 non-null     float64
 3   Circ.      55 non-null     float64
 4   Feret      55 non-null     float64
 5   MinFeret   55 non-null     float64
 6   AR         55 non-null     float64
 7   Round      55 non-null     float64
 8   Solidity   55 non-null     float64
 9   Esferoide  55 non-null     object 
 10  dia        55 non-null     int64  
 11  Diameter   55 non-null     float64
 12  n_diam     55 non-null     float64
dtypes: float64(10), int64(1), object(2)
memory usage: 5.7+ KB


In [18]:
df_sinteticos_3_5.shape

(55, 13)

In [20]:
df_sinteticos_3_5.columns

Index(['labels', 'Area', 'Perim.', 'Circ.', 'Feret', 'MinFeret', 'AR', 'Round',
       'Solidity', 'Esferoide', 'dia', 'Diameter', 'n_diam'],
      dtype='object')

In [21]:
df_sinteticos_3_5.dtypes

labels        object
Area         float64
Perim.       float64
Circ.        float64
Feret        float64
MinFeret     float64
AR           float64
Round        float64
Solidity     float64
Esferoide     object
dia            int64
Diameter     float64
n_diam       float64
dtype: object

In [23]:
# creamos arrays para cada tipo de variable
variable_categorica = []
variable_numerica = []

# Iteramos a través de las columnas
for columna in df_sinteticos_3_5.columns:
    if df_sinteticos_3_5[columna].dtypes == 'object':
        variable_categorica.append(columna)
    else:
        variable_numerica.append(columna)

print('Variable Categórica:', variable_categorica,'\n')
print('Variable Numérica:', variable_numerica)


Variable Categórica: ['labels', 'Esferoide'] 

Variable Numérica: ['Area', 'Perim.', 'Circ.', 'Feret', 'MinFeret', 'AR', 'Round', 'Solidity', 'dia', 'Diameter', 'n_diam']


In [25]:
df_sinteticos_3_5["labels"].unique()

array(['0001u', '0001v', '0001w', '0001x', '000a0', '000aa', '000ab',
       '000ac', '000ad', '000ae', '000ag', '000ah', '000aj', '000ak',
       '000am', '000an', '000aq', '000ar', '000au', '000aw', '000ax',
       '000ay', '000b0', '000bb', '000bc', '000bd', '000bg', '000bj',
       '000bk', '000bm', '000bo', '000br', '000bs', '000bt', '000bx',
       '000by', '000c0', '000ca', '000cc', '000cd', '000ce', '000cg',
       '000ch', '000cl', '000co', '000cs', '000ct', '000cu', '000cv',
       '000cx', '000d0', '000da', '000dd', '000de', '000dg'], dtype=object)

In [26]:
# A diferencia del dataset con el que venimos trabajando, la variable label toma otras etiquetas

In [28]:
df_sinteticos_3_5["Esferoide"].unique()

array(['si'], dtype=object)

In [30]:
df_sinteticos_3_5["Esferoide"].value_counts()

si    55
Name: Esferoide, dtype: int64

In [None]:
#Todos los datos tienen la etiqueta esferoide "si"

In [None]:
#Cantidad de datos etiquetados como "si" segun el dia 

In [73]:
df_sinteticos_3_5.groupby(["dia","Esferoide"]).size()

dia  Esferoide
3    si           29
5    si           26
dtype: int64

## Análisis y exploración de los datos sinteticos dia 3 y 5 bis

In [None]:
df_sinteticos_3_5_bis=pd.read_csv("data/datos_sinteticos/synthetic_3y5_sint2.csv")

In [6]:
df_sinteticos_3_5_bis.head()

Unnamed: 0,labels,Area,Perim.,Circ.,Feret,MinFeret,AR,Round,Solidity,Esferoide,dia,Diameter,n_diam
0,000iu,122.7626,41.1188,0.846634,14.1852,12.1525,1.366386,0.753013,0.935293,no,5,13.2978,0.38104
1,000ix,122.7626,101.52717,0.900619,32.269328,30.219156,1.27768,0.843124,0.974896,no,5,31.236372,0.38104
2,000j1,9070.046335,424.616078,0.563269,139.12526,95.758206,1.62625,0.618293,0.90347,no,3,117.423404,277.627847
3,000ji,6586.914719,270.750961,0.764905,89.727394,73.635156,1.465515,0.746293,0.947981,no,3,81.66837,215.923591
4,000ju,6796.984868,284.745161,0.788278,85.622933,72.65707,1.0012,0.955847,0.931386,si,3,79.115739,205.487181


In [31]:
df_sinteticos_3_5_bis.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 161 entries, 0 to 160
Data columns (total 13 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   labels     161 non-null    object 
 1   Area       161 non-null    float64
 2   Perim.     161 non-null    float64
 3   Circ.      161 non-null    float64
 4   Feret      161 non-null    float64
 5   MinFeret   161 non-null    float64
 6   AR         161 non-null    float64
 7   Round      161 non-null    float64
 8   Solidity   161 non-null    float64
 9   Esferoide  161 non-null    object 
 10  dia        161 non-null    int64  
 11  Diameter   161 non-null    float64
 12  n_diam     161 non-null    float64
dtypes: float64(10), int64(1), object(2)
memory usage: 16.5+ KB


In [32]:
df_sinteticos_3_5_bis.shape

(161, 13)

In [33]:
df_sinteticos_3_5_bis.columns

Index(['labels', 'Area', 'Perim.', 'Circ.', 'Feret', 'MinFeret', 'AR', 'Round',
       'Solidity', 'Esferoide', 'dia', 'Diameter', 'n_diam'],
      dtype='object')

In [38]:
df_sinteticos_3_5_bis.dtypes

labels        object
Area         float64
Perim.       float64
Circ.        float64
Feret        float64
MinFeret     float64
AR           float64
Round        float64
Solidity     float64
Esferoide     object
dia            int64
Diameter     float64
n_diam       float64
dtype: object

In [50]:
# creamos arrays para cada tipo de variable
variable_categorica = []
variable_numerica = []

# Iteramos a través de las columnas
for columna in df_sinteticos_3_5_bis.columns:
    if df_sinteticos_3_5_bis[columna].dtypes == 'object':
        variable_categorica.append(columna)
    else:
        variable_numerica.append(columna)

print('Variable Categórica:', variable_categorica,'\n')
print('Variable Numérica:', variable_numerica)


Variable Categórica: ['labels', 'Esferoide'] 

Variable Numérica: ['Area', 'Perim.', 'Circ.', 'Feret', 'MinFeret', 'AR', 'Round', 'Solidity', 'dia', 'Diameter', 'n_diam']


In [54]:
df_sinteticos_3_5_bis["labels"].unique()

array(['000iu', '000ix', '000j1', '000ji', '000ju', '000jv', '000k0',
       '000kc', '000kf', '000kh', '000kn', '000kr', '000ku', '000kv',
       '000ky', '000l0', '000ld', '000lh', '000lj', '000lm', '000lo',
       '000lu', '000ly', '000mg', '000mi', '000mm', '000mo', '000ms',
       '000mu', '000my', '000n0', '000nh', '000ni', '000nk', '000nn',
       '000no', '000nr', '000ns', '000nx', '000o0', '000od', '000og',
       '000oj', '000om', '000op', '000ou', '000oy', '000oz', '000p0',
       '000pa', '000pf', '000pr', '000pu', '000px', '000pz', '000q1',
       '000qf', '000qg', '000qh', '000qi', '000qk', '000qp', '000qt',
       '000qu', '000qv', '000qy', '000qz', '000ra', '000rd', '000rf',
       '000rh', '000ri', '000rk', '000rn', '000rs', '000rt', '000rw',
       '000s0', '000s1', '000sb', '000sh', '000sl', '000sm', '000sq',
       '000sr', '000su', '000sy', '000t0', '000ta', '000tf', '000th',
       '000tj', '000tm', '000tn', '000to', '000tr', '000tv', '000ty',
       '000u1', '000

In [59]:
df_sinteticos_3_5_bis["Esferoide"].unique()

array(['no', 'si'], dtype=object)

In [62]:
df_sinteticos_3_5_bis["Esferoide"].value_counts()

no    124
si     37
Name: Esferoide, dtype: int64

In [74]:
df_sinteticos_3_5_bis.groupby(["dia","Esferoide"]).size()

dia  Esferoide
3    no           60
     si           17
5    no           64
     si           20
dtype: int64

## Análisis y exploración de los datos sinteticos dia 3 

In [None]:
df_sinteticos_3=pd.read_csv("data/datos_sinteticos/synthetic_data_dia_3_si.csv")

In [7]:
df_sinteticos_3.head()

Unnamed: 0,labels,Area,Perim.,Circ.,Feret,MinFeret,AR,Round,Solidity,Esferoide,dia,Diameter,n_diam
0,000px,2668.2568,228.4795,0.893241,66.215303,68.0477,1.069481,0.934437,0.9684,si,3,66.365583,17.311897
1,000q0,6009.2465,312.8576,0.784415,101.099708,76.88,1.37458,0.727304,0.9515,si,3,90.270453,109.653369
2,000qh,16358.7381,490.8434,0.85164,147.480859,144.2174,1.078255,0.927538,0.983,si,3,144.196627,634.141009
3,000qj,15335.9539,524.2858,0.756606,164.937428,134.5306,1.092246,0.915844,0.9693,si,3,141.551969,597.638251
4,000qq,14441.1167,454.2536,0.798609,99.781799,131.3265,1.028377,0.972036,0.9753,si,3,124.884592,501.757463


In [35]:
df_sinteticos_3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56 entries, 0 to 55
Data columns (total 13 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   labels     56 non-null     object 
 1   Area       56 non-null     float64
 2   Perim.     56 non-null     float64
 3   Circ.      56 non-null     float64
 4   Feret      56 non-null     float64
 5   MinFeret   56 non-null     float64
 6   AR         56 non-null     float64
 7   Round      56 non-null     float64
 8   Solidity   56 non-null     float64
 9   Esferoide  56 non-null     object 
 10  dia        56 non-null     int64  
 11  Diameter   56 non-null     float64
 12  n_diam     56 non-null     float64
dtypes: float64(10), int64(1), object(2)
memory usage: 5.8+ KB


In [36]:
df_sinteticos_3.shape

(56, 13)

In [37]:
df_sinteticos_3.columns

Index(['labels', 'Area', 'Perim.', 'Circ.', 'Feret', 'MinFeret', 'AR', 'Round',
       'Solidity', 'Esferoide', 'dia', 'Diameter', 'n_diam'],
      dtype='object')

In [39]:
df_sinteticos_3.dtypes

labels        object
Area         float64
Perim.       float64
Circ.        float64
Feret        float64
MinFeret     float64
AR           float64
Round        float64
Solidity     float64
Esferoide     object
dia            int64
Diameter     float64
n_diam       float64
dtype: object

In [51]:
# creamos arrays para cada tipo de variable
variable_categorica = []
variable_numerica = []

# Iteramos a través de las columnas
for columna in df_sinteticos_3.columns:
    if df_sinteticos_3[columna].dtypes == 'object':
        variable_categorica.append(columna)
    else:
        variable_numerica.append(columna)

print('Variable Categórica:', variable_categorica,'\n')
print('Variable Numérica:', variable_numerica)


Variable Categórica: ['labels', 'Esferoide'] 

Variable Numérica: ['Area', 'Perim.', 'Circ.', 'Feret', 'MinFeret', 'AR', 'Round', 'Solidity', 'dia', 'Diameter', 'n_diam']


In [57]:
df_sinteticos_3["labels"].unique()

array(['000px', '000q0', '000qh', '000qj', '000qq', '000qu', '000qz',
       '000r1', '000re', '000rf', '000rh', '000ri', '000rq', '000rx',
       '000rz', '000s1', '000sh', '000si', '000sp', '000sq', '000sr',
       '000st', '000sy', '000ta', '000tg', '000tl', '000tn', '000tq',
       '000tv', '000tz', '000ub', '000ud', '000ug', '000ul', '000um',
       '000up', '000ur', '000us', '000uu', '000vb', '000vc', '000vg',
       '000vj', '000vt', '000vu', '000vw', '000vy', '000wa', '000wd',
       '000we', '000wf', '000wk', '000wn', '000wo', '000wv', '000wz'],
      dtype=object)

In [63]:
df_sinteticos_3["Esferoide"].unique()

array(['si'], dtype=object)

In [64]:
df_sinteticos_3["Esferoide"].value_counts()

si    56
Name: Esferoide, dtype: int64

## Análisis y exploración de los datos sinteticos dia 4

In [None]:
df_sinteticos_4=pd.read_csv("data/datos_sinteticos/synthetic_data_dia_4_si.csv")

In [8]:
df_sinteticos_4.head()

Unnamed: 0,labels,Area,Perim.,Circ.,Feret,MinFeret,AR,Round,Solidity,Esferoide,dia,Diameter,n_diam
0,00000,15848.2676,474.1609,0.77885,145.709319,127.7317,1.153,0.8674,0.974,si,4,127.770345,521.426321
1,00001,10181.0707,433.6711,0.784262,131.55914,128.6091,1.0748,0.931,0.9739,si,4,121.307057,216.107458
2,0000a,20784.2025,611.4172,0.739942,181.800275,174.8398,1.0715,0.9323,0.9726,si,4,175.153921,779.371247
3,0000b,1083.9499,190.94,0.909804,55.199988,53.5693,1.0905,0.9164,0.9768,si,4,52.983315,8.749749
4,0000c,1254.7693,191.1564,0.869752,56.385742,50.7836,1.1148,0.8978,0.9725,si,4,51.059912,9.517034


In [40]:
df_sinteticos_4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 13 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   labels     50 non-null     object 
 1   Area       50 non-null     float64
 2   Perim.     50 non-null     float64
 3   Circ.      50 non-null     float64
 4   Feret      50 non-null     float64
 5   MinFeret   50 non-null     float64
 6   AR         50 non-null     float64
 7   Round      50 non-null     float64
 8   Solidity   50 non-null     float64
 9   Esferoide  50 non-null     object 
 10  dia        50 non-null     int64  
 11  Diameter   50 non-null     float64
 12  n_diam     50 non-null     float64
dtypes: float64(10), int64(1), object(2)
memory usage: 5.2+ KB


In [47]:
df_sinteticos_4.shape

(50, 13)

In [48]:
df_sinteticos_4.columns

Index(['labels', 'Area', 'Perim.', 'Circ.', 'Feret', 'MinFeret', 'AR', 'Round',
       'Solidity', 'Esferoide', 'dia', 'Diameter', 'n_diam'],
      dtype='object')

In [49]:
df_sinteticos_4.dtypes

labels        object
Area         float64
Perim.       float64
Circ.        float64
Feret        float64
MinFeret     float64
AR           float64
Round        float64
Solidity     float64
Esferoide     object
dia            int64
Diameter     float64
n_diam       float64
dtype: object

In [52]:
# creamos arrays para cada tipo de variable
variable_categorica = []
variable_numerica = []

# Iteramos a través de las columnas
for columna in df_sinteticos_4.columns:
    if df_sinteticos_4[columna].dtypes == 'object':
        variable_categorica.append(columna)
    else:
        variable_numerica.append(columna)

print('Variable Categórica:', variable_categorica,'\n')
print('Variable Numérica:', variable_numerica)


Variable Categórica: ['labels', 'Esferoide'] 

Variable Numérica: ['Area', 'Perim.', 'Circ.', 'Feret', 'MinFeret', 'AR', 'Round', 'Solidity', 'dia', 'Diameter', 'n_diam']


In [56]:
df_sinteticos_4["labels"].unique()

array(['00000', '00001', '0000a', '0000b', '0000c', '0000d', '0000e',
       '0000f', '0000g', '0000h', '0000i', '0000j', '0000k', '0000l',
       '0000m', '0000n', '0000o', '0000p', '0000q', '0000r', '0000s',
       '0000t', '0000u', '0000v', '0000w', '0000x', '0000y', '0000z',
       '00010', '00011', '0001a', '0001b', '0001c', '0001d', '0001e',
       '0001f', '0001g', '0001h', '0001i', '0001j', '0001k', '0001l',
       '0001m', '0001n', '0001o', '0001p', '0001q', '0001r', '0001s',
       '0001t'], dtype=object)

In [65]:
df_sinteticos_4["Esferoide"].unique()

array(['si'], dtype=object)

In [66]:
df_sinteticos_4["Esferoide"].value_counts()

si    50
Name: Esferoide, dtype: int64

## Análisis y exploración de los datos sinteticos dia 5

In [None]:
df_sinteticos_5=pd.read_csv("data/datos_sinteticos/synthetic_data_dia_5_si.csv")

In [9]:
df_sinteticos_5.head()

Unnamed: 0,labels,Area,Perim.,Circ.,Feret,MinFeret,AR,Round,Solidity,Esferoide,dia,Diameter,n_diam
0,000fh,1940.893421,161.8442,-0.627087,53.714871,62.5798,1.026814,0.973772,0.9833,si,5,46.077939,12.629334
1,000gl,17291.426981,524.0741,-0.627087,163.107805,155.1193,1.05376,0.949237,0.9835,si,5,149.262276,635.094169
2,000gq,1191.282905,155.5651,-0.627087,47.768183,41.5086,1.071748,0.933102,0.9508,si,5,40.808361,8.924818
3,000gx,3586.49506,269.9627,-0.627087,119.774515,73.2305,1.13487,0.881704,0.9813,si,5,60.997945,34.82699
4,000hl,11298.641901,402.5262,-0.627087,135.064444,120.0183,1.185762,0.84327,0.9812,si,5,106.108531,294.398135


In [41]:
df_sinteticos_5.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41 entries, 0 to 40
Data columns (total 13 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   labels     41 non-null     object 
 1   Area       41 non-null     float64
 2   Perim.     41 non-null     float64
 3   Circ.      41 non-null     float64
 4   Feret      41 non-null     float64
 5   MinFeret   41 non-null     float64
 6   AR         41 non-null     float64
 7   Round      41 non-null     float64
 8   Solidity   41 non-null     float64
 9   Esferoide  41 non-null     object 
 10  dia        41 non-null     int64  
 11  Diameter   41 non-null     float64
 12  n_diam     41 non-null     float64
dtypes: float64(10), int64(1), object(2)
memory usage: 4.3+ KB


In [42]:
df_sinteticos_5.shape

(41, 13)

In [44]:
df_sinteticos_5.columns

Index(['labels', 'Area', 'Perim.', 'Circ.', 'Feret', 'MinFeret', 'AR', 'Round',
       'Solidity', 'Esferoide', 'dia', 'Diameter', 'n_diam'],
      dtype='object')

In [46]:
df_sinteticos_5.dtypes

labels        object
Area         float64
Perim.       float64
Circ.        float64
Feret        float64
MinFeret     float64
AR           float64
Round        float64
Solidity     float64
Esferoide     object
dia            int64
Diameter     float64
n_diam       float64
dtype: object

In [53]:
# creamos arrays para cada tipo de variable
variable_categorica = []
variable_numerica = []

# Iteramos a través de las columnas
for columna in df_sinteticos_5.columns:
    if df_sinteticos_5[columna].dtypes == 'object':
        variable_categorica.append(columna)
    else:
        variable_numerica.append(columna)

print('Variable Categórica:', variable_categorica,'\n')
print('Variable Numérica:', variable_numerica)


Variable Categórica: ['labels', 'Esferoide'] 

Variable Numérica: ['Area', 'Perim.', 'Circ.', 'Feret', 'MinFeret', 'AR', 'Round', 'Solidity', 'dia', 'Diameter', 'n_diam']


In [58]:
df_sinteticos_5["labels"].unique()

array(['000fh', '000gl', '000gq', '000gx', '000hl', '000hw', '000hy',
       '000ib', '000j1', '000jv', '000kf', '000kx', '000lf', '000lj',
       '000lo', '000mb', '000mf', '000mw', '000ni', '000no', '000nr',
       '000ny', '000nz', '000o0', '000oi', '000op', '000pr', '000qd',
       '000qg', '000qp', '000rk', '000rn', '000sm', '000t0', '000tj',
       '000tr', '000uv', '000vr', '000vv', '000wt', '000ww'], dtype=object)

In [69]:
df_sinteticos_5["Esferoide"].unique()

array(['si'], dtype=object)

In [70]:
df_sinteticos_5["Esferoide"].value_counts()

si    41
Name: Esferoide, dtype: int64

## Comparación con los datos de los dias 3, 4 y 5 

In [77]:
raw_df = pd.read_csv("data/raw/fiji_datos_0al7mo_labels.csv")

In [78]:
raw_df.head()

Unnamed: 0,labels,Area,X,Y,XM,YM,Perim.,BX,BY,Width,...,FeretY,FeretAngle,MinFeret,AR,Round,Solidity,Esferoide,dia,Diameter,n_diam
0,Esferas_BT474_dia_0_well_1_100X_1_blob_1,324.444,1129.2718,102.2051,1129.2718,102.2051,67.3603,1118.8859,92.3913,21.0598,...,157,23.4287,19.7011,1.0653,0.9387,0.95,si,0,20.9564,1.491357
1,Esferas_BT474_dia_0_well_1_100X_1_blob_2,497.5115,1517.5528,126.1022,1517.5528,126.1022,82.8004,1504.7554,113.4511,25.8152,...,192,16.1443,25.1289,1.0475,0.9546,0.9569,si,0,26.002,2.848733
2,Esferas_BT474_dia_0_well_1_100X_1_blob_3,282.9078,1314.2815,126.0988,1314.2815,126.0988,62.1586,1304.3478,116.8478,19.7011,...,189,15.4222,18.3424,1.0395,0.962,0.9519,si,0,19.3897,1.181258
3,Esferas_BT474_dia_0_well_1_100X_1_blob_5,500.7421,1189.2841,212.6155,1189.2841,212.6155,84.5571,1175.9511,200.4076,26.4946,...,309,174.144,25.1359,1.0278,0.973,0.9493,si,0,25.88475,2.81037
4,Esferas_BT474_dia_0_well_1_100X_1_blob_6,492.8964,1043.0782,247.7667,1043.0782,247.7667,82.0728,1030.5707,235.7337,25.1359,...,353,143.427,24.4565,1.0186,0.9817,0.9643,si,0,25.33985,2.636596


In [None]:
#Cantidad de datos clasificados como Esferoide "si" y "no" en los dias 3, 4 y 5

In [89]:
#En el dia 3 

raw_df[raw_df['dia'] == 3]['Esferoide'].value_counts()

no    25
si    18
Name: Esferoide, dtype: int64

In [90]:
#En el dia 4 

raw_df[raw_df['dia'] == 4]['Esferoide'].value_counts()

no    206
si     48
Name: Esferoide, dtype: int64

In [91]:
#En el dia 5

raw_df[raw_df['dia'] == 5]['Esferoide'].value_counts()

si    28
no    24
Name: Esferoide, dtype: int64

Lo que se puede notar en un principio es que en los dias 3 y 5 hay menor cantidad de datos que en el dia 4. En el dia 4 hay mayor cantidad de datos etiquetados como esferoides "no" que esferoides "si". Al contar con datos del dia 3 y 5 sinteticos se podrian agragar al dataset para contar con mas cantidad

In [103]:
df_sinteticos_3_5.groupby(["dia","Esferoide"]).size()

dia  Esferoide
3    si           29
5    si           26
dtype: int64

In [104]:
df_sinteticos_3_5_bis.groupby(["dia","Esferoide"]).size()

dia  Esferoide
3    no           60
     si           17
5    no           64
     si           20
dtype: int64

In [144]:
df_sinteticos_3["Esferoide"].unique()

array(['si'], dtype=object)

In [154]:
df_sinteticos_3["Esferoide"].value_counts()

si    56
Name: Esferoide, dtype: int64

In [152]:
df_sinteticos_5["Esferoide"].unique()

array(['si'], dtype=object)

In [153]:
df_sinteticos_5["Esferoide"].value_counts()

si    41
Name: Esferoide, dtype: int64

In [159]:
#Cantidad de datos sinteticos esferoide "si", "no" dia 3

si_sintetico_3_5 = len(df_sinteticos_3_5[(df_sinteticos_3_5['dia'] == 3) & (df_sinteticos_3_5['Esferoide'] == 'si')])
si_sintetico_3_5_bis= len(df_sinteticos_3_5_bis[(df_sinteticos_3_5_bis['dia'] == 3) & (df_sinteticos_3_5_bis['Esferoide'] == 'si')])
si_sintetico_3=len(df_sinteticos_3[df_sinteticos_3['Esferoide'] == 'si'])


count_dia_3_si= si_sintetico_3 + si_sintetico_3_bis + si_sintetico_3
print(f"La cantidad de datos sinteticos clasificados el dia 3 como esferoide  es: {count_dia_3_si}")

no_sintetico_3 = len(df_sinteticos_3_5[(df_sinteticos_3_5['dia'] == 3) & (df_sinteticos_3_5['Esferoide'] == 'no')])
no_sintetico_3_bis= len(df_sinteticos_3_5_bis[(df_sinteticos_3_5_bis['dia'] == 3) & (df_sinteticos_3_5_bis['Esferoide'] == 'no')])

count_dia_3_no= no_sintetico_3 + no_sintetico_3_bis
print(f"La cantidad de datos sinteticos clasificados el dia 3 como  no esferoide  es: {count_dia_3_no}")


La cantidad de datos sinteticos clasificados el dia 3 como esferoide  es: 129
La cantidad de datos sinteticos clasificados el dia 3 como  no esferoide  es: 60


In [160]:
#Cantidad de datos sinteticos esferoide "si", "no" dia 5

si_sintetico_5 = len(df_sinteticos_3_5[(df_sinteticos_3_5['dia'] == 5) & (df_sinteticos_3_5['Esferoide'] == 'si')])
si_sintetico_5_bis= len(df_sinteticos_3_5_bis[(df_sinteticos_3_5_bis['dia'] == 5) & (df_sinteticos_3_5_bis['Esferoide'] == 'si')])
si_sintetico_5=len(df_sinteticos_5[df_sinteticos_5['Esferoide'] == 'si'])

count_dia_5_si= si_sintetico_5 + si_sintetico_5_bis + si_sintetico_5
print(f"La cantidad de datos sinteticos clasificados el dia 5 como esferoide  es: {count_dia_5_si}")

no_sintetico_5 = len(df_sinteticos_3_5[(df_sinteticos_3_5['dia'] == 5) & (df_sinteticos_3_5['Esferoide'] == 'no')])
no_sintetico_5_bis= len(df_sinteticos_3_5_bis[(df_sinteticos_3_5_bis['dia'] == 5) & (df_sinteticos_3_5_bis['Esferoide'] == 'no')])

count_dia_5_no= no_sintetico_5+ no_sintetico_5_bis
print(f"La cantidad de datos sinteticos clasificados el dia 5 como  no esferoide  es: {count_dia_5_no}")


La cantidad de datos sinteticos clasificados el dia 5 como esferoide  es: 102
La cantidad de datos sinteticos clasificados el dia 5 como  no esferoide  es: 64


In [161]:
#Para el dia 4 serviria agregar datos sinteticos que esten clasificados como esferoide "si"

df_sinteticos_4["Esferoide"].unique()

array(['si'], dtype=object)

In [162]:
df_sinteticos_4["Esferoide"].value_counts()

si    50
Name: Esferoide, dtype: int64

In [None]:
#se podria decidir agregar estos 50 datos para sumar esferoides si al dia 4 