In [44]:
import numpy as np
import torch
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split

#Funcion para abrir los datos .npz y extraer las variables que elegimos, y lo guardamos en un diccionario
def get_data(input_data_file, var_input, var_target, train_ratio, val_ratio, test_ratio):
    """Load two arrays from an ``.npz`` archive and split them sequentially.

    The samples (first axis) are split without shuffling: the first block
    goes to training, the following block to validation and the last block
    to testing.

    Parameters
    ----------
    input_data_file : str or path-like
        ``.npz`` archive readable by ``np.load``.
    var_input, var_target : str
        Keys of the input and target arrays inside the archive. The input
        must be 3-D; its shape is stored as ``(len_total, nx, ny)``. The
        target is indexed with the same sample indices, so it is assumed to
        have the same shape as the input (not checked here).
    train_ratio, val_ratio, test_ratio : float
        Fractions of the samples for each subset. ``1 - train_ratio`` is
        held out, and that remainder is divided between validation and
        testing in the proportion ``val_ratio : test_ratio``.

    Returns
    -------
    tuple
        ``(Data, train_x, train_y, val_x, val_y, test_x, test_y)`` where
        ``Data`` is a dict with the keys ``len_total``, ``nx``, ``ny``,
        ``len_train``, ``len_val``, ``len_test``, ``xmin``, ``xmax``,
        ``ymin`` and ``ymax``. The min/max values are computed over the
        training and validation samples only (testing is excluded).
    """
    # np.load keeps the archive open until the NpzFile is closed; indexing a
    # key reads that array fully into memory, so both arrays stay valid after
    # the ``with`` block releases the file handle.
    with np.load(input_data_file) as data_from_file:
        Input = data_from_file[var_input]
        Target = data_from_file[var_target]

    Data = dict()
    Data["len_total"], Data["nx"], Data["ny"] = Input.shape

    # One index per sample along the first axis.
    indices = range(Data["len_total"])

    # First cut: training vs. the remainder (validation + testing).
    train_ids, rest_ids = train_test_split(indices, test_size=1 - train_ratio, shuffle=False)
    # Second cut: divide the remainder into validation and testing.
    val_ids, test_ids = train_test_split(
        rest_ids, test_size=test_ratio / (test_ratio + val_ratio), shuffle=False
    )

    # Store and report the size and index range of every subset.
    Data["len_train"], Data["len_val"], Data["len_test"] = len(train_ids), len(val_ids), len(test_ids)
    print('Training set starts at :', str( np.min( train_ids) ) , ' and ends at: ', str( np.max( train_ids ) ) )
    print('Validation set starts at :', str( np.min( val_ids ) ) , ' and ends at: ', str( np.max( val_ids ) ) )
    print('Testing set starts at: ', str( np.min( test_ids ) ) , ' and ends at: ', str( np.max( test_ids ) ) )

    train_x_data = Input[train_ids, :, :]
    train_y_data = Target[train_ids, :, :]

    val_x_data = Input[val_ids, :, :]
    val_y_data = Target[val_ids, :, :]

    test_x_data = Input[test_ids, :, :]
    test_y_data = Target[test_ids, :, :]

    # Extremes over training + validation, combined from the per-subset
    # extremes so that no concatenated copy of the data has to be built.
    # np.minimum / np.maximum propagate NaN just like ndarray.min / max do.
    Data["xmin"] = np.minimum(train_x_data.min(), val_x_data.min())
    Data["xmax"] = np.maximum(train_x_data.max(), val_x_data.max())
    Data["ymin"] = np.minimum(train_y_data.min(), val_y_data.min())
    Data["ymax"] = np.maximum(train_y_data.max(), val_y_data.max())

    return Data, train_x_data, train_y_data, val_x_data, val_y_data, test_x_data, test_y_data


In [45]:
# Keys of the arrays inside the .npz archive used as model input and target.
Input_name = "gfs_input"
Target_name = "gsmap_target"
# Experiment label built from both variable names.
Experimento = f"{Input_name}_vs_{Target_name}"

In [48]:
# Fractions of the dataset assigned to Train / Validation / Testing.
train_ratio = .8
val_ratio = .1
test_ratio = .1

directorio = '/home/fernando.huaranca/datosmunin/subset/enero2018.npz'

# Read the archive and split it; the input and target arrays are selected
# through Input_name and Target_name.
Data, x_train, y_train, x_val, y_val, x_test, y_test = get_data(
    input_data_file=directorio,
    var_input=Input_name,
    var_target=Target_name,
    train_ratio=train_ratio,
    val_ratio=val_ratio,
    test_ratio=test_ratio,
)
nx = Data["nx"]
ny = Data["ny"]

# Number of samples in each subset, in Train / Valid / Test order.
split_sizes = tuple(Data[key] for key in ("len_train", "len_val", "len_test"))
print("Muestras de Train / Valid / Test: ", split_sizes)

Training set starts at : 0  and ends at:  23
Validation set starts at : 24  and ends at:  26
Testing set starts at:  27  and ends at:  30
Muestras de Train / Valid / Test:  (24, 3, 4)


(24, 41, 41)

In [43]:
# Training and validation inputs joined along the sample axis.
np.concatenate((x_train, x_val), axis=0)

array([[[8.00000012e-01, 3.00000012e-01, 3.00000012e-01, ...,
         1.00000001e-01, 1.00000001e-01, 8.00000012e-01],
        [1.70000005e+00, 8.00000012e-01, 6.00000024e-01, ...,
         0.00000000e+00, 0.00000000e+00, 6.00000024e-01],
        [2.00000000e+00, 1.10000002e+00, 8.00000012e-01, ...,
         1.00000001e-01, 1.00000001e-01, 1.00000000e+00],
        ...,
        [4.69999981e+00, 5.50000000e+00, 5.70000029e+00, ...,
         7.09999990e+00, 4.09999990e+00, 1.60000002e+00],
        [1.60000002e+00, 2.59999990e+00, 3.59999990e+00, ...,
         9.19999981e+00, 7.09999990e+00, 3.50000000e+00],
        [1.00000001e-01, 6.99999988e-01, 6.99999988e-01, ...,
         1.15000000e+01, 1.26999998e+01, 6.19999981e+00]],

       [[1.91000004e+01, 1.50000000e+01, 1.15000000e+01, ...,
         9.19999981e+00, 1.04000006e+01, 1.03000011e+01],
        [2.05999985e+01, 2.35999985e+01, 1.66999989e+01, ...,
         9.00000000e+00, 1.08000002e+01, 1.10000000e+01],
        [1.51000004e+01, 

In [32]:
# Global minimum and maximum of the input over training + validation.
# The cell previously referenced names that do not exist at notebook level
# (`x_tra`, and get_data's locals `train_x_data` / `val_x_data`) and joined
# along axis 2, which fails because the subsets differ in their number of
# samples (axis 0). The subsets are therefore joined along axis 0.
x_train_val = np.append(x_train, x_val, axis=0)
x_train_val.min(), x_train_val.max()

{'len_total': 31,
 'nx': 41,
 'ny': 41,
 'len_train': 24,
 'len_val': 3,
 'len_test': 4}

In [23]:
# Display the metadata dictionary returned by get_data.
Data

{'len_total': 31,
 'nx': 41,
 'ny': 41,
 'len_train': 24,
 'len_val': 3,
 'len_test': 4,
 'xmin': 0.0,
 'xmax': 195.9,
 'ymin': 0.0,
 'ymax': 195.9}

# Crear un npz para usar la función

In [None]:
#41

In [7]:
import numpy as np
import os
import glob

In [4]:
# Both data folders live under the same subset directory.
subset_root = '/home/fernando.huaranca/datosmunin/subset'
folder_gfs_subset = f'{subset_root}/gfs'
folder_gsmap_subset = f'{subset_root}/gsmap'

In [5]:
# List the files in the GFS subset folder.
# NOTE: os.listdir returns the names in arbitrary order (the listing below is
# not chronological), so sort before relying on the order of the files.
Files_gfs = os.listdir(folder_gfs_subset)
Files_gfs

['2018-01-15.npz',
 '2018-01-05.npz',
 '2018-01-18.npz',
 '2018-01-22.npz',
 '2018-01-11.npz',
 '2018-01-30.npz',
 '2018-01-21.npz',
 '2018-01-17.npz',
 '2018-01-09.npz',
 '2018-01-28.npz',
 '2018-01-31.npz',
 '2018-01-12.npz',
 '2018-01-25.npz',
 '2018-01-26.npz',
 '2018-01-13.npz',
 '2018-01-24.npz',
 '2018-01-14.npz',
 '2018-01-19.npz',
 '2018-01-04.npz',
 '2018-01-01.npz',
 '2018-01-10.npz',
 '2018-01-29.npz',
 '2018-01-27.npz',
 '2018-01-08.npz',
 '2018-01-07.npz',
 '2018-01-16.npz',
 '2018-01-03.npz',
 '2018-01-23.npz',
 '2018-01-20.npz',
 '2018-01-02.npz',
 '2018-01-06.npz']

In [35]:
# Stack the daily 'pp_daily' fields of every file into a single array whose
# first axis runs over the files.
lista_npz = []

# The file names are ISO dates (YYYY-MM-DD.npz), so lexicographic sorting
# puts them in chronological order; os.listdir gives no ordering guarantee.
for file in sorted(Files_gfs):

    # Full path of the daily archive.
    path = os.path.join(folder_gfs_subset, file)

    # Read the field and close the archive right away (np.load keeps the
    # file handle open until the NpzFile is closed).
    with np.load(path) as data:
        pp_daily = data['pp_daily']

    lista_npz.append(pp_daily)

conjunto = np.stack(lista_npz, axis=0)
    

In [36]:
conjunto.shape #Resulting array is 31 time steps x 41 latitudes x 41 longitudes

(31, 41, 41)

In [29]:
# Display the array stacked from the per-file 'pp_daily' fields.
conjunto

array([['pp_daily', 'latitudes', 'longitudes'],
       ['pp_daily', 'latitudes', 'longitudes'],
       ['pp_daily', 'latitudes', 'longitudes'],
       ['pp_daily', 'latitudes', 'longitudes'],
       ['pp_daily', 'latitudes', 'longitudes'],
       ['pp_daily', 'latitudes', 'longitudes'],
       ['pp_daily', 'latitudes', 'longitudes'],
       ['pp_daily', 'latitudes', 'longitudes'],
       ['pp_daily', 'latitudes', 'longitudes'],
       ['pp_daily', 'latitudes', 'longitudes'],
       ['pp_daily', 'latitudes', 'longitudes'],
       ['pp_daily', 'latitudes', 'longitudes'],
       ['pp_daily', 'latitudes', 'longitudes'],
       ['pp_daily', 'latitudes', 'longitudes'],
       ['pp_daily', 'latitudes', 'longitudes'],
       ['pp_daily', 'latitudes', 'longitudes'],
       ['pp_daily', 'latitudes', 'longitudes'],
       ['pp_daily', 'latitudes', 'longitudes'],
       ['pp_daily', 'latitudes', 'longitudes'],
       ['pp_daily', 'latitudes', 'longitudes'],
       ['pp_daily', 'latitudes', 'longit

In [9]:
# Open a single daily archive from the gfs folder to inspect its contents.
kk = '/home/fernando.huaranca/datosmunin/subset/gfs/2018-01-01.npz'
ll = np.load(kk)

In [10]:
# Names of the arrays stored in the archive.
ll.files

['pp_daily', 'latitudes', 'longitudes']

In [15]:
# NOTE(review): np.savez needs at least the output `file` argument, so this
# call raises TypeError as written. Presumably it was meant to store the
# stacked arrays under the keys that get_data reads — TODO complete.
np.savez()

[array([[ 0.8      ,  0.3      ,  0.3      , ...,  0.1      ,  0.1      ,
          0.8      ],
        [ 1.7      ,  0.8      ,  0.6      , ...,  0.       ,  0.       ,
          0.6      ],
        [ 2.       ,  1.1      ,  0.8      , ...,  0.1      ,  0.1      ,
          1.       ],
        ...,
        [ 4.7      ,  5.5      ,  5.7000003, ...,  7.1      ,  4.1      ,
          1.6      ],
        [ 1.6      ,  2.6      ,  3.6      , ...,  9.2      ,  7.1      ,
          3.5      ],
        [ 0.1      ,  0.7      ,  0.7      , ..., 11.5      , 12.7      ,
          6.2      ]], dtype=float32),
 array([[19.1      , 15.       , 11.5      , ...,  9.2      , 10.400001 ,
         10.300001 ],
        [20.599998 , 23.599998 , 16.699999 , ...,  9.       , 10.8      ,
         11.       ],
        [15.1      , 23.1      , 21.900002 , ...,  7.3999996,  9.400001 ,
          9.4      ],
        ...,
        [ 0.       ,  0.       ,  0.       , ..., 26.400002 , 28.400002 ,
         27.699999 

# Tratamos de realizar un código sin la función

In [None]:
#Librerias
import os
import numpy as np

def obtener_data(path_input,path_target,var_input,var_target,train_ratio, val_ratio, test_ratio):
    """List the files available in the input and target folders.

    Work in progress: only the two directory listings are implemented. The
    remaining arguments are accepted but not used yet, and the function
    returns ``None``.

    Parameters
    ----------
    path_input, path_target : str or path-like
        Directories holding the input and the target files.
    var_input, var_target : str
        Not used yet.
    train_ratio, val_ratio, test_ratio : float
        Not used yet.

    Raises
    ------
    OSError
        If either directory cannot be listed (e.g. ``FileNotFoundError``).
    """
    # --- This section can later be extended to handle other file
    # extensions (e.g. csv, npz).
    # os.listdir returns names in arbitrary order; sort both listings so the
    # order is deterministic.

    # Files in the input folder.
    Files_Input = sorted(os.listdir(path_input))

    # Files in the target folder (this previously listed path_input again).
    Files_Target = sorted(os.listdir(path_target))

    # --------------------
    # TODO: load the files, stack them and split into train / val / test.



    

In [7]:
# Total / used / free bytes of the filesystem that holds the data subset.
import shutil  # no visible cell imports shutil, so bring it into scope here

shutil.disk_usage('/home/fernando.huaranca/datosmunin/subset')

usage(total=99600858873856, used=89516300828672, free=5084541878272)