In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell executes, so edits to project modules are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

Используются функции из ```pzph1dot1```:

In [1]:
from pzph1dot1 import *

  from tqdm.autonotebook import tqdm


# Как выглядит класс Augmentation

In [17]:
class Augmentation:
    """Neighbour-based augmentation of flux columns and their error columns.

    ``train`` fits, for every flux column listed in ``flux_err`` that is present
    in the training table, a flux -> error neighbour model and pickles it into
    ``path_to_model``. ``predict`` rescales fluxes by ``10 ** (0.4 * dm)``,
    re-draws errors from the neighbours of the shifted flux (only when
    ``dm != 0``) and optionally perturbs the fluxes with Gaussian noise.

    The class relies on names expected at module level: ``os``, ``re``,
    ``pickle``, ``np``, ``Catalog``, ``AugmType``, ``KNeighborsRegressor`` and
    ``RadiusNeighborsRegressor``.

    NOTE(review): the code assumes ``AugmType`` member values are such that odd
    values need a KNN model and values > 1 need a radius model (presumably
    KNN=1, RNN=2, MIX=3) -- confirm against the enum definition.
    """

    # Fallback directory used when no usable ``path_to_models`` is given.
    _DEFAULT_MODEL_DIR = './AugModel'
    # NOTE(review): presumably a unit-conversion factor applied to columns that
    # are neither inverse-variance nor matched by the regex in _preprocessing;
    # its physical meaning is not visible in this file -- confirm.
    _FLUX_SCALE = 3621e-9

    def __init__(self, type_model=AugmType.KNN, n_neighbors=5, path_to_models=None, debug=False):
        """Configure the augmenter and make sure a model directory exists.

        Parameters
        ----------
        type_model : AugmType
            Which neighbour model(s) to fit and use.
        n_neighbors : int
            Number of neighbours for the KNN model / KNN fallback.
        path_to_models : str or None
            Directory for pickled models. ``None`` or an uncreatable path falls
            back to ``./AugModel``.
        debug : bool
            Print progress information and save diagnostic plots.
        """
        # Imported locally: an ``import`` placed in the class body only creates a
        # class attribute and is not visible as a bare name inside methods.
        import warnings

        self.path_to_data = None
        self.type = type_model
        self.n_neighbors = n_neighbors
        self.debug = debug

        if path_to_models is None:
            warnings.warn('path_to_models is not define. Will be used ./AugModel')
            self.path_to_model = self._ensure_default_dir()
        elif not os.path.exists(path_to_models):
            warnings.warn(f'{path_to_models} didn\'t exist! Try to create...')
            try:
                # makedirs also handles nested paths; exist_ok avoids a race
                # between the existence check above and the creation.
                os.makedirs(path_to_models, exist_ok=True)
            except OSError:
                print (f"Failed to create directory {path_to_models}! Will be used ./AugModel")
                self.path_to_model = self._ensure_default_dir()
            else:
                print ("The directory has been successfully created")
                self.path_to_model = path_to_models
        else:
            self.path_to_model = path_to_models

        # Flux columns for which a model has been fitted by ``train``.
        self.used_flux = set()
        # Mapping: flux column name -> name of the column holding its error
        # (either an inverse variance or a plain error, see _preprocessing).
        self.flux_err = {
            'sdss_psfFlux_u': 'sdss_psfFluxIvar_u',
            'sdss_psfFlux_g': 'sdss_psfFluxIvar_g',
            'sdss_psfFlux_r': 'sdss_psfFluxIvar_r',
            'sdss_psfFlux_i': 'sdss_psfFluxIvar_i',
            'sdss_psfFlux_z': 'sdss_psfFluxIvar_z',
            'sdss_cModelFlux_u': 'sdss_cModelFluxIvar_u',
            'sdss_cModelFlux_g': 'sdss_cModelFluxIvar_g',
            'sdss_cModelFlux_r': 'sdss_cModelFluxIvar_r',
            'sdss_cModelFlux_i': 'sdss_cModelFluxIvar_i',
            'sdss_cModelFlux_z': 'sdss_cModelFluxIvar_z',
            'ps_gKronFlux': 'ps_gKronFluxErr',
            'ps_rKronFlux': 'ps_rKronFluxErr',
            'ps_iKronFlux': 'ps_iKronFluxErr',
            'ps_zKronFlux': 'ps_zKronFluxErr',
            'ps_yKronFlux': 'ps_yKronFluxErr',
            'ps_gPSFFlux': 'ps_gPSFFluxErr',
            'ps_rPSFFlux': 'ps_rPSFFluxErr',
            'ps_iPSFFlux': 'ps_iPSFFluxErr',
            'ps_zPSFFlux': 'ps_zPSFFluxErr',
            'ps_yPSFFlux': 'ps_yPSFFluxErr',
            'ls_flux_g_ebv': 'ls_flux_ivar_g',
            'ls_flux_r_ebv': 'ls_flux_ivar_r',
            'ls_flux_z_ebv': 'ls_flux_ivar_z',
            'ls_flux_w1_ebv': 'ls_flux_ivar_w1',
            'ls_flux_w2_ebv': 'ls_flux_ivar_w2',
            'ls_flux_w3_ebv': 'ls_flux_ivar_w3',
            'ls_flux_w4_ebv': 'ls_flux_ivar_w4'
        }

    @classmethod
    def _ensure_default_dir(cls):
        """Create the fallback model directory if needed and return its path."""
        os.makedirs(cls._DEFAULT_MODEL_DIR, exist_ok=True)
        return cls._DEFAULT_MODEL_DIR

    def _model_file(self, flux_name):
        """Return the pickle path of the per-flux model for the current model type."""
        return os.path.join(self.path_to_model, self.type.name+'_'+flux_name)

    def train(self, path_to_data):
        """Fit and store one model per known flux column found in the table.

        The table is read with ``Catalog.read_table``; afterwards the whole
        instance is pickled via ``save``.
        """
        self.path_to_data = path_to_data
        self.train_data = Catalog.read_table(self.path_to_data)
        try:
            if self.debug:
                print('Catalog was been reading. Shape: ', self.train_data.shape)
            for column in self.flux_err:
                if column in self.train_data:
                    if self.debug:
                        print(f'Column {column}')
                    self.used_flux.add(column)
                    self._fit_for_flux(column)
        finally:
            # Drop the table even if fitting fails, so it is never kept alive on
            # (or pickled together with) the instance.
            del self.train_data
        self.save()

    def _fit_for_flux(self, flux_name):
        """Fit the neighbour model(s) for one flux column and pickle them.

        The stored dict holds ``'knn'`` and/or ``'rnn'`` regressors,
        ``'normalize'`` (radius scale, only with the radius model) and ``'y'``
        (the training errors that ``neighbors`` samples from).
        """
        err_name = self.flux_err[flux_name]
        if self.debug:
            print(f'In _fit {flux_name}, {err_name}')

        assert err_name in self.train_data, f'Beeeeedaaaaa! {err_name} not in train table'
        flux, err = self._preprocessing(self.train_data[flux_name], self.train_data[err_name])

        model = {}
        if self.type.value % 2:
            if self.debug:
                print('Fit KNN')
                print(flux, flux.shape, err, err.shape)
            model['knn'] = KNeighborsRegressor(self.n_neighbors).fit(flux, err)
        if self.type.value > 1:
            if self.debug:
                print('Fir RNR')
            model['rnn'] = RadiusNeighborsRegressor().fit(flux, err)
            model['normalize'] = 0.2 * np.max(flux) / 100
        model['y'] = err
        self._save_model(model, flux_name)
        return self

    def predict(self, data, dm=0, gauss_augm=True):
        """Return an augmented copy of ``data``.

        Parameters
        ----------
        data
            Anything accepted by ``Catalog.read_table``.
        dm : float
            Fluxes are multiplied by ``10 ** (0.4 * dm)``. When ``dm != 0`` the
            errors are re-drawn from the fitted neighbour model.
        gauss_augm : bool
            If true, each flux is replaced by a Gaussian draw (see ``gauss_flux``).

        Rows whose flux or error is not finite after preprocessing come back as
        NaN in both columns.
        """
        df = Catalog.read_table(data).copy()
        scale = np.power(10, 0.4 * dm)
        # Sorted iteration keeps the sequence of random draws reproducible for a
        # fixed seed; plain set iteration order can differ between runs.
        for flux_name in sorted(self.used_flux):
            if flux_name not in df:
                continue
            err_name = self.flux_err[flux_name]
            assert err_name in df, f'Beeeeedaaaaa! {err_name} not in predict table'
            if self.debug:
                print('before', df[flux_name], np.sum(df[flux_name]))
            flux, err = self._preprocessing(df[flux_name], df[err_name])
            if self.debug:
                print('preprocess', flux, np.sum(flux))
            flux = flux * scale

            if dm != 0:
                if self.debug:
                    self.plotik(flux / scale, err, flux_name + 'before')
                model = self._read_model(flux_name)
                err = self.neighbors(model, flux, sigma=err)
                if self.debug:
                    self.plotik(flux, err, flux_name + 'after_tmp')

            if gauss_augm:
                flux = np.array([self.gauss_flux(pair) for pair in zip(flux, err)])
                if self.debug:
                    self.plotik(flux, err, flux_name + 'after')
            if self.debug:
                print('after', flux, np.sum(flux))

            new_flux, new_err = self._postrocessing(flux, err)
            # _postrocessing returns (n, 1) columns; flatten them so the
            # assignment to a single DataFrame column is unambiguous.
            df.loc[:, flux_name] = new_flux.ravel()
            df.loc[:, err_name] = new_err.ravel()
            if self.debug:
                print('postprocess', df[flux_name], np.sum(df[flux_name]))

        return df

    def neighbors(self, model, x_input, n_neighbors=None, radius=None, sigma=1e-9):
        """Draw, for every query flux, a random training error among its neighbours.

        Parameters
        ----------
        model : dict
            Model dict produced by ``_fit_for_flux``.
        x_input : numpy.ndarray
            Query fluxes as a 2-D array (one row per object).
        n_neighbors : int or None
            If given, overwrites ``self.n_neighbors`` (the change persists).
        radius : float or None
            Fixed search radius for MIX mode; by default it is derived per row
            from the flux and ``sigma``.
        sigma : float or array-like
            Per-row (or scalar) error used to derive the MIX radius.

        Returns
        -------
        numpy.ndarray
            Sampled errors: 1-D in KNN mode, shaped like ``x_input`` otherwise.
        """
        if self.debug:
            print(f'Hi! I\'m neighbors of {x_input}')
        x = x_input
        res = np.zeros(x.shape)
        if n_neighbors is not None:
            self.n_neighbors = n_neighbors

        # Nothing to query: avoid handing an empty array to scikit-learn.
        if x.shape[0] == 0:
            return res

        if self.type == AugmType.KNN:
            assert self.n_neighbors > 0
            knb = model['knn'].kneighbors(x, n_neighbors=self.n_neighbors, return_distance=False)
            if self.debug:
                print('knn')
            res = np.array([np.random.choice(model['y'][idx][:, 0]) for idx in knb])

        elif self.type == AugmType.RNN:
            # Not implemented: the zero-filled array is returned unchanged, as
            # before, but no longer silently.
            import warnings
            warnings.warn('AugmType.RNN prediction is not implemented; returning zero errors')

        elif self.type == AugmType.MIX:
            assert self.n_neighbors > 0
            sigmas = np.asarray(sigma)
            if sigmas.ndim == 0:
                # Allow the scalar default by repeating it for every row.
                sigmas = np.full(len(x), float(sigmas))
            # Slowest part: one radius query per row -- the reason plain
            # k-neighbours may be the better choice.
            for i, (x_loc, s) in enumerate(zip(x, sigmas)):
                if radius is None:
                    r = np.max(np.maximum(0.2 * x_loc, 3 * s)) / model['normalize']
                else:
                    r = radius
                query = x_loc.reshape(-1, 1)
                rnb = model['rnn'].radius_neighbors(query, radius=r, return_distance=False)[0]
                if len(rnb) < self.n_neighbors:
                    # Too few neighbours inside the radius: fall back to KNN.
                    knb = model['knn'].kneighbors(query, n_neighbors=self.n_neighbors, return_distance=False)[0]
                    res[i] = np.random.choice(model['y'][knb][:, 0])
                else:
                    res[i] = np.random.choice(model['y'][rnb][:, 0])
        return res

    def _preprocessing(self, flux, err):
        """Convert a flux/error column pair to finite ``(n, 1)`` float arrays.

        Three cases, chosen from the error column name and remembered in
        ``self.tmp_case`` for ``_postrocessing``:
        1. name contains ``ivar``/``Ivar``: error becomes ``err ** -0.5``;
        2. name matches ``^ps_dw\\dflux_ab$``: ``-999`` is replaced by NaN;
        3. otherwise: ``-999`` is replaced by NaN and both columns are divided
           by ``_FLUX_SCALE``.

        ``self.tmp_index`` stores the boolean mask of rows kept (finite flux and
        finite error).
        """
        if self.debug:
            print('In _preprocessing')
        if 'ivar' in err.name or 'Ivar' in err.name:
            err = np.power(err, -0.5)
            self.tmp_case = 1
        elif re.search(r'^ps_dw\dflux_ab$', err.name):
            flux, err = flux.replace(-999, np.nan), err.replace(-999, np.nan)
            self.tmp_case = 2
        else:
            flux = flux.replace(-999, np.nan) / self._FLUX_SCALE
            err = err.replace(-999, np.nan) / self._FLUX_SCALE
            self.tmp_case = 3

        flux, err = np.array(flux.values, dtype=float), np.array(err.values, dtype=float)
        self.tmp_index = np.isfinite(flux) & np.isfinite(err)

        return flux[self.tmp_index].reshape(-1, 1), err[self.tmp_index].reshape(-1, 1)

    def _postrocessing(self, flux, err):
        """Undo ``_preprocessing`` and scatter values back to full-length columns.

        Uses ``self.tmp_case`` and ``self.tmp_index`` set by the most recent
        ``_preprocessing`` call. Returns two ``(len(mask), 1)`` arrays with NaN
        in the rows that were filtered out.
        """
        if self.debug:
            print('In _postprocessing')
        if self.tmp_case == 1:
            err = np.power(err, -2)
        elif self.tmp_case == 3:
            flux, err = flux * self._FLUX_SCALE, err * self._FLUX_SCALE

        if self.debug:
            print(self.tmp_index, self.tmp_index.shape)

        n_rows = len(self.tmp_index)
        flux_res = np.full((n_rows, 1), np.nan)
        err_res = np.full((n_rows, 1), np.nan)
        flux_res[self.tmp_index] = flux.reshape(-1, 1)
        err_res[self.tmp_index] = err.reshape(-1, 1)
        return flux_res, err_res

    @staticmethod
    def gauss_flux(inputs):
        """Draw a flux from ``N(mu, sigma)`` for ``inputs = (mu, err)``.

        ``sigma`` combines ``err`` in quadrature with 3% of ``mu``.
        """
        mu, err = inputs
        sigma = np.sqrt(err ** 2 + (mu * 0.03) ** 2)
        return np.random.normal(mu, sigma)

    def plotik(self, x, y, name=''):
        """Save a diagnostic scatter plot to ``<path_to_model>/plt/<type>_<name>.png``.

        The figure is closed after saving, so repeated debug calls do not
        accumulate open figures; the plot is available on disk only.
        """
        import matplotlib.pyplot as plt
        f, ax = plt.subplots(nrows=1, ncols=1, figsize=(12, 8), sharex=True, sharey=True)
        try:
            ax.set_title(name)
            ax.set_xlim([-1000, 30000])
            ax.set_ylim([-10, 400])
            ax.scatter(x, y, c='r')
            plot_dir = os.path.join(self.path_to_model, 'plt')
            os.makedirs(plot_dir, exist_ok=True)
            f.savefig(os.path.join(plot_dir, self.type.name+'_'+name+'.png'))
        finally:
            plt.close(f)

    def _save_model(self, model, flux_name):
        """Pickle the model dict of one flux column into the model directory."""
        if self.debug:
            print('Self mimi model')
        with open(self._model_file(flux_name), 'wb') as file:
            pickle.dump(model, file)

    def _read_model(self, flux_name):
        """Load the pickled model dict of one flux column.

        SECURITY: ``pickle.load`` can execute arbitrary code; only point
        ``path_to_model`` at directories you trust.
        """
        if self.debug:
            print('Read mimi model')
        with open(self._model_file(flux_name), 'rb') as file:
            return pickle.load(file)

    def save(self, path=None):
        """Pickle this instance to ``<path or path_to_model>/<type>_CLASS``."""
        if self.debug:
            print('Self me :)')
        with open(os.path.join(path or self.path_to_model, self.type.name+'_CLASS'), 'wb') as file:
            pickle.dump(self, file)

    @staticmethod
    def read(path, type_m=AugmType.KNN):
        """Load an instance previously written by ``save`` from ``path``.

        SECURITY: ``pickle.load`` can execute arbitrary code; only read files
        from trusted locations.
        """
        with open(os.path.join(path, type_m.name+'_CLASS'), 'rb') as file:
            return pickle.load(file)

Основные функции: 

```Augmentation(type_model=<Тип модели>, n_neighbors=5, path_to_models=<Путь для сохранения модели>, debug=False)``` - инициализация модели \
```Augmentation.train(path_to_data)``` - обучение модели на данных из ```path_to_data```\
```Augmentation.read(path, type_m=<Тип модели>)``` - чтение ранее обученной модели из ```path```\
```Augmentation.predict(data, dm=0, gauss_augm=True)``` - аугментация данных ```data```, смещённых на ```dm```

# Обучение модели аугментации

Для начала необходимо определиться, какая таблица будет использована в качестве тренировочных данных для аугментации ошибки при заданном потоке.

In [4]:
!ls /data/SRGz/LH_20200514_results_new3

buf   part-00000.best.x1.gz_pkl   part-00000.predictions.x1.gz_pkl
data  part-00000.features.gz_pkl


In [102]:
# Load the features table (gzip-compressed pickle) into memory for inspection.
# The training call further down reads the same file by path.
train_data = pd.read_pickle('/data/SRGz/LH_20200514_results_new3/part-00000.features.gz_pkl', compression='gzip')

In [103]:
train_data

Unnamed: 0,ID_SRC,RA,DEC,RADEC_ERR,EXT,DET_LIKE_0,ML_FLUX_0,ML_FLUX_ERR_0,EXT_LIKE,EXT_ERR,...,phot_is_train_gal,phot_is_test_xxln_m16,phot_is_test_s82x_l19,phot_is_test_s82x_a17,phot_is_test_qso,phot_is_test_star,phot_is_test_gal,phot_is_spec_sdss,phot_test_field,__nrow__
0,1,159.435510,57.198784,0.000000,0.0,66750.085938,6.003547,2.523368,0.0,0.0,...,False,False,False,False,False,False,False,False,LH,1
1,1,159.435510,57.198784,0.000000,0.0,66750.085938,6.003547,2.523368,0.0,0.0,...,False,False,False,False,False,False,False,False,LH,2
2,1,159.435510,57.198784,0.000000,0.0,66750.085938,6.003547,2.523368,0.0,0.0,...,False,False,False,False,False,False,False,False,LH,3
3,1,159.435510,57.198784,0.000000,0.0,66750.085938,6.003547,2.523368,0.0,0.0,...,False,False,False,False,False,False,False,False,LH,4
4,1,159.435510,57.198784,0.000000,0.0,66750.085938,6.003547,2.523368,0.0,0.0,...,False,False,False,False,False,False,False,False,LH,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122442,7099,161.403945,54.771317,3.497884,0.0,6.535053,0.001697,0.000623,0.0,0.0,...,False,False,False,False,False,False,False,False,LH,122443
122443,7099,161.403945,54.771317,3.497884,0.0,6.535053,0.001697,0.000623,0.0,0.0,...,False,False,False,False,False,False,False,False,LH,122444
122444,7099,161.403945,54.771317,3.497884,0.0,6.535053,0.001697,0.000623,0.0,0.0,...,False,False,False,False,False,False,False,False,LH,122445
122445,7099,161.403945,54.771317,3.497884,0.0,6.535053,0.001697,0.000623,0.0,0.0,...,False,False,False,False,False,False,False,False,LH,122446


In [18]:
model = Augmentation(debug=True) # The model will be saved to ./AugModel

  if __name__ == '__main__':


In [19]:
# Fit one flux -> error model per known flux column present in the table,
# pickle each model and finally pickle the Augmentation instance itself.
model.train('/data/SRGz/LH_20200514_results_new3/part-00000.features.gz_pkl')

Catalog was been reading. Shape:  (122447, 368)
Column sdss_psfFlux_u
In _fit sdss_psfFlux_u, sdss_psfFluxIvar_u
In _preprocessing
Fit KNN
[[1.95891964]
 [0.07986455]
 [0.44271901]
 ...
 [2.19622612]
 [0.12734416]
 [0.03700907]] (48058, 1) [[0.22642964]
 [0.19628328]
 [0.20243537]
 ...
 [0.25341411]
 [0.22713669]
 [0.23570331]] (48058, 1)
Self mimi model
Column sdss_psfFlux_g
In _fit sdss_psfFlux_g, sdss_psfFluxIvar_g
In _preprocessing
Fit KNN
[[4.24975395]
 [0.14327973]
 [0.48652768]
 ...
 [5.04315376]
 [0.66581106]
 [0.0917399 ]] (48057, 1) [[0.11377726]
 [0.07552425]
 [0.0782892 ]
 ...
 [0.14577981]
 [0.09048657]
 [0.08485833]] (48057, 1)
Self mimi model
Column sdss_psfFlux_r
In _fit sdss_psfFlux_r, sdss_psfFluxIvar_r
In _preprocessing
Fit KNN
[[ 6.22934246]
 [ 0.79992878]
 [ 0.77996922]
 ...
 [11.20584965]
 [ 1.75984085]
 [ 0.7426852 ]] (48058, 1) [[0.1991438 ]
 [0.15161575]
 [0.15179704]
 ...
 [0.25586743]
 [0.16597577]
 [0.26842281]] (48058, 1)
Self mimi model
Column sdss_psfFlux

Self mimi model
Self me :)


# Аугментация таблицы

In [6]:
# Load the table to be augmented (gzip-compressed pickle) and display it.
df = pd.read_pickle('../data/3weak.pkl_gz', compression='gzip')
df

Unnamed: 0,nrow,objID,ra,dec,zspec,zspec_conf,zspec_source,class,subclass,class1,...,zoo_best-x1_ci1a_90_Lx_err,zoo_best-x1_ci1b_90_Lx_err,zoo_best-x1_ci1a_95,zoo_best-x1_ci1b_95,zoo_best-x1_ci1a_95_DL_cm,zoo_best-x1_ci1b_95_DL_cm,zoo_best-x1_ci1a_95_Lx,zoo_best-x1_ci1b_95_Lx,zoo_best-x1_ci1a_95_Lx_err,zoo_best-x1_ci1b_95_Lx_err
9328,198482,131932190057259165,219.005907,19.948889,0.045404,-999.0,SDSS,1,STARFORMING,,...,,,-5.822927,0.364719,,,,,,
9479,368943,119241304119489055,130.411915,9.373807,0.000239,-999.0,SDSS,1,,,...,,,-3.251049,0.344858,,,,,,
9681,610236,131851704557834825,170.455790,19.878579,-0.009112,-999.0,SDSS,1,,,...,,,-3.134196,0.553093,,,,,,
9755,692755,113791676021961937,167.602380,4.826100,0.030167,-999.0,SDSS,1,STARFORMING,,...,,,-3.250823,0.332563,,,,,,
24242,52943,172432048282463503,204.828265,53.694016,0.106581,-999.0,SDSS,1,STARFORMING,,...,,,-3.212785,0.486123,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60318,960263,1237678617420628015,6.372177,1.363077,,,,1,,,...,,,-2.730689,1.53531,,,,,,
60340,960285,1237657628450750810,167.748040,50.564964,,,,1,,,...,,,-2.96547,0.383744,,,,,,
60362,960307,1237665583253881608,251.800550,50.801796,,,,1,,,...,,,-0.679388,0.211186,,,,,,
60381,960326,1237679579471282689,22.535753,30.106945,,,,1,,,...,,,-3.302376,0.350595,,,,,,


In [None]:
# Restore the Augmentation instance that train() pickled into ./AugModel.
model = Augmentation.read('./AugModel')
model.debug = False  # turn off debug prints and diagnostic plots for prediction
# Shift passed to predict(), which multiplies fluxes by 10 ** (0.4 * dm).
dm=-5

In [98]:
# Augment every flux/error column the model was trained on; predict() works on
# a copy of the table it reads, and that copy is what it returns.
df_aug = model.predict(df, dm)

In [99]:
# Create the output directory first: to_pickle does not create missing
# directories and would fail on a fresh checkout.
import os
os.makedirs('./output_augm', exist_ok=True)
df_aug.to_pickle(f'./output_augm/df_aug{dm}.gz_pkl', compression='gzip')

Получили таблицу с аугментированными данными.