# Cargamos el DataFrame

In [206]:
import pandas as pd
import sys, os

# Add the sibling ../utils directory to the module search path so that
# modules stored there can be imported from this notebook.
utils_dir = os.path.abspath(os.path.join('..', 'utils'))
sys.path.append(utils_dir)

# Read the raw dataset and preview its first rows.
df = pd.read_csv("../data/data.csv")
df.head()

Unnamed: 0,id,author,geometry,pressure [MPa],mass_flux [kg/m2-s],x_e_out [-],D_e [mm],D_h [mm],length [mm],chf_exp [MW/m2]
0,0,Thompson,tube,7.0,3770.0,0.1754,,10.8,432.0,3.6
1,1,Thompson,tube,,6049.0,-0.0416,10.3,10.3,762.0,6.2
2,2,Thompson,,13.79,2034.0,0.0335,7.7,7.7,457.0,2.5
3,3,Beus,annulus,13.79,3679.0,-0.0279,5.6,15.2,2134.0,3.0
4,4,,tube,13.79,686.0,,11.1,11.1,457.0,2.8


## Separamos todas las filas que tengan el 'y' nulo para que después podamos predecir:

In [207]:
# Rows whose target 'x_e_out [-]' is missing: these are the ones to predict later.
target_missing = df['x_e_out [-]'].isna()
df_pred = df.loc[target_missing]

Dejamos solo 'id' y 'x_e_out [-]' como columnas para enviarlas a Kaggle después:

In [208]:
# Keep only the two columns wanted in the prediction file: 'id' and the
# (still empty) target 'x_e_out [-]'.
# Selecting them by name, instead of dropping every other column, keeps the
# cell re-runnable: the previous drop(...) raised KeyError on a second run
# because the listed columns were already gone from df_pred.
df_pred = df_pred[['id', 'x_e_out [-]']]

In [209]:
# Preview the rows that still need a prediction ('id' plus the empty target).
df_pred

Unnamed: 0,id,x_e_out [-]
4,4,
7,7,
10,10,
12,12,
23,23,
...,...,...
31633,31633,
31634,31634,
31637,31637,
31640,31640,


Guardamos eso en un '.csv' para después:

In [210]:
# Persist the rows to predict. index=False keeps the file to the frame's own
# columns; by default to_csv also writes the row index as an extra unnamed
# first column.
# NOTE(review): any reader that relied on that extra column (for example
# read_csv(..., index_col=0)) has to be adjusted accordingly.
df_pred.to_csv('../data/data_pred.csv', index=False)

# Analizamos y limpiamos los datos

In [211]:
# (rows, columns) of the raw frame, before any cleaning.
df.shape

(31644, 10)

In [212]:
# Per-column non-null counts and dtypes of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31644 entries, 0 to 31643
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   31644 non-null  int64  
 1   author               26620 non-null  object 
 2   geometry             26144 non-null  object 
 3   pressure [MPa]       27192 non-null  float64
 4   mass_flux [kg/m2-s]  26853 non-null  float64
 5   x_e_out [-]          21229 non-null  float64
 6   D_e [mm]             26156 non-null  float64
 7   D_h [mm]             27055 non-null  float64
 8   length [mm]          26885 non-null  float64
 9   chf_exp [MW/m2]      31644 non-null  float64
dtypes: float64(7), int64(1), object(2)
memory usage: 2.4+ MB


Eliminamos las columnas que no nos interesan

In [213]:
# Columns that are not used as model inputs in this notebook.
# Note: df is reassigned, so re-running this cell alone raises KeyError
# because the columns are already gone.
non_feature_cols = ['id', 'author', 'geometry']
df = df.drop(columns=non_feature_cols)

Nombres de las columnas iniciales:

In [214]:
# Column labels that remain in df at this point.
df.columns

Index(['pressure [MPa]', 'mass_flux [kg/m2-s]', 'x_e_out [-]', 'D_e [mm]',
       'D_h [mm]', 'length [mm]', 'chf_exp [MW/m2]'],
      dtype='object')

In [215]:
# Number of missing values in each remaining column.
df.isna().sum()

pressure [MPa]          4452
mass_flux [kg/m2-s]     4791
x_e_out [-]            10415
D_e [mm]                5488
D_h [mm]                4589
length [mm]             4759
chf_exp [MW/m2]            0
dtype: int64

Eliminamos los valores nulos de nuestro target:

In [216]:
# Keep only the rows whose target 'x_e_out [-]' is present.
has_target = df['x_e_out [-]'].notna()
df = df[has_target]

Rellenamos los valores nulos de X con la media de cada columna:

In [217]:
# Replace every remaining NaN with the mean of its own column.
# NOTE(review): the means are computed over all rows of df, i.e. before the
# train/test split done further down, so rows that later land in the test set
# contribute to the imputed values. Consider computing the means on the
# training rows only.
column_means = df.mean()
df = df.fillna(column_means)

Miramos los valores de nuestro target:

In [218]:
# Target column as a NumPy array.
df['x_e_out [-]'].to_numpy()

array([ 0.1754, -0.0416,  0.0335, ...,  0.0886, -0.1224,  0.0603])

## Preparamos los datos para entrenar

In [219]:
from sklearn.model_selection import train_test_split

# Model inputs and the column to predict.
feature_cols = [
    'pressure [MPa]',
    'mass_flux [kg/m2-s]',
    'D_e [mm]',
    'D_h [mm]',
    'length [mm]',
    'chf_exp [MW/m2]',
]
target_col = "x_e_out [-]"

X = df[feature_cols]
y = df[target_col]

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [220]:
# Preview the training features.
X_train

Unnamed: 0,pressure [MPa],mass_flux [kg/m2-s],D_e [mm],D_h [mm],length [mm],chf_exp [MW/m2]
4147,13.79,2034.000000,7.700000,7.700000,457.0,3.7
23341,6.89,6157.000000,8.589305,3.600000,76.0,6.6
5858,6.89,4286.000000,10.300000,10.300000,794.0,5.9
7624,6.89,5859.000000,10.300000,10.300000,794.0,3.6
3039,6.89,5479.000000,23.600000,14.215446,1972.0,4.9
...,...,...,...,...,...,...
16916,13.79,1316.000000,4.700000,4.700000,318.0,3.9
17899,12.00,3070.487779,10.000000,10.000000,1000.0,3.1
8029,10.34,7432.000000,10.300000,10.300000,762.0,2.6
1288,13.79,963.000000,7.700000,7.700000,457.0,13.3


In [221]:
# Preview the held-out test features.
X_test

Unnamed: 0,pressure [MPa],mass_flux [kg/m2-s],D_e [mm],D_h [mm],length [mm],chf_exp [MW/m2]
16131,10.635066,1356.000000,9.500000,9.5,1836.000000,1.1
19645,10.635066,694.000000,8.589305,7.7,457.000000,2.9
6068,6.890000,6049.000000,10.300000,10.3,762.000000,4.5
14276,13.790000,1356.000000,7.700000,7.7,457.000000,5.5
25860,15.510000,3070.487779,8.589305,1.9,152.000000,3.7
...,...,...,...,...,...,...
3651,13.790000,3070.487779,8.589305,7.7,457.000000,3.7
14324,10.635066,2278.000000,12.700000,42.3,1778.000000,3.8
19813,17.240000,2007.000000,8.589305,1.9,830.564962,2.5
25162,12.070000,3296.000000,1.900000,1.9,696.000000,2.5


In [222]:
# Preview the training target values.
y_train

4147    -0.1356
23341   -0.1332
5858    -0.0092
7624    -0.0574
3039    -0.0069
          ...  
16916   -0.1046
17899   -0.0466
8029    -0.1403
1288    -0.1341
23568    0.0839
Name: x_e_out [-], Length: 16983, dtype: float64

In [223]:
# Preview the held-out test target values.
y_test

16131    0.1517
19645   -0.0202
6068    -0.0416
14276   -0.3469
25860   -0.0571
          ...  
3651    -0.1756
14324    0.0063
19813   -0.1977
25162   -0.0408
23110    0.0783
Name: x_e_out [-], Length: 4246, dtype: float64

Hacemos una copia y la exportamos a CSV para que tengamos los datos originales y los limpios

In [224]:
# Independent (deep) copy of the cleaned frame, kept under its own name for export.
df_final = df.copy(deep=True)

In [225]:
# Preview the cleaned copy that is about to be exported.
df_final

Unnamed: 0,pressure [MPa],mass_flux [kg/m2-s],x_e_out [-],D_e [mm],D_h [mm],length [mm],chf_exp [MW/m2]
0,7.000000,3770.000000,0.1754,8.589305,10.8,432.0,3.6
1,10.635066,6049.000000,-0.0416,10.300000,10.3,762.0,6.2
2,13.790000,2034.000000,0.0335,7.700000,7.7,457.0,2.5
3,13.790000,3679.000000,-0.0279,5.600000,15.2,2134.0,3.0
5,17.240000,3648.000000,-0.0711,8.589305,1.9,696.0,3.6
...,...,...,...,...,...,...,...
31636,12.070000,3070.487779,-0.0195,8.589305,1.9,152.0,5.4
31638,10.635066,3648.000000,-0.0487,4.700000,4.7,318.0,9.0
31639,10.635066,1736.000000,0.0886,8.589305,7.8,591.0,2.3
31641,18.270000,658.000000,-0.1224,3.000000,3.0,150.0,2.3


In [226]:
# Export the cleaned data. The row index is written as well (to_csv default),
# so the file gets a leading unnamed column holding the original row labels;
# reading it back with index_col=0 restores them.
df_final.to_csv('../data/data_limpio.csv')