### Refiner class settings <a name="refiner-class-settings"></a>

In [21]:
import os 
import sys 
import numpy as np
import pandas as pd
import logging
sys.path.append(os.path.dirname(sys.path[0])) 
from refineryframe.refiner import Refiner

In [22]:
df = pd.DataFrame({
    'num_id' : [1, 2, 3, 4, 5],
    'NumericColumn': [1, -np.inf, np.inf,np.nan, None],
    'NumericColumn_exepted': [1, -996, np.inf,np.nan, None],
    'NumericColumn2': [None, None, 1,None, None],
    'NumericColumn3': [1, 2, 3, 4, 5],
    'DateColumn': pd.date_range(start='2022-01-01', periods=5),
    'DateColumn2': [pd.NaT,pd.to_datetime('2022-01-01'),pd.NaT,pd.NaT,pd.NaT],
    'DateColumn3': ['2122-05-01',
                    '2022-01-01',
                    '2021-01-01',
                    '1000-01-09',
                    '1850-01-09'],
    'CharColumn': ['Fół', None, np.nan, 'nót eXpęćTęd', '']
})

df

Unnamed: 0,num_id,NumericColumn,NumericColumn_exepted,NumericColumn2,NumericColumn3,DateColumn,DateColumn2,DateColumn3,CharColumn
0,1,1.0,1.0,,1,2022-01-01,NaT,2122-05-01,Fół
1,2,-inf,-996.0,,2,2022-01-02,2022-01-01,2022-01-01,
2,3,inf,inf,1.0,3,2022-01-03,NaT,2021-01-01,
3,4,,,,4,2022-01-04,NaT,1000-01-09,nót eXpęćTęd
4,5,,,,5,2022-01-05,NaT,1850-01-09,


#### Defining specification for the dataframe 

In [3]:
MISSING_TYPES = {'date_not_delivered': '1850-01-09',
                 'date_other_missing_type': '1850-01-08',
                 'numeric_not_delivered': -999,
                 'character_not_delivered': 'missing'}

In [4]:
unexpected_exceptions = {
    "col_names_types": "NONE",
    "missing_values": "ALL",
    "missing_types": "ALL",
    "inf_values": "NONE",
    "date_format": "NONE",
    "duplicates": "ALL",
    "date_range": "NONE",
    "numeric_range": "ALL"
}

In [5]:
replace_dict = {-996 : -999,
                "1000-01-09": "1850-01-09"}

### Initializing Refiner class

In [6]:
tns = Refiner(dataframe = df,
              replace_dict = replace_dict,
              loggerLvl = logging.DEBUG,
              unexpected_exceptions_duv = unexpected_exceptions)

#### using the main function to detect unexpected values

In [7]:
tns.detect_unexpected_values()

DEBUG:Refiner:=== checking column names and types
DEBUG:Refiner:=== checking propper date format
DEBUG:Refiner:=== checking expected date range
DEBUG:Refiner:=== checking for presense of inf values in numeric colums


#### extracting Refiner settings <a name="extracting-refiner-class-settings"></a>

In [16]:
refiner_settings = tns.get_refiner_settings()
refiner_settings

{'replace_dict': {-996: -999, '1000-01-09': '1850-01-09'},
 'MISSING_TYPES': {'date_not_delivered': '1850-01-09',
  'numeric_not_delivered': -999,
  'character_not_delivered': 'missing'},
 'expected_date_format': '%Y-%m-%d',
 'mess': 'INITIAL PREPROCESSING',
 'shout_type': 'HEAD2',
 'logger': <Logger Refiner (DEBUG)>,
 'logger_name': 'Refiner',
 'loggerLvl': 10,
 'dotline_length': 50,
 'lower_bound': -inf,
 'upper_bound': inf,
 'earliest_date': '1900-08-25',
 'latest_date': '2100-01-01',
 'unexpected_exceptions_duv': {'col_names_types': 'NONE',
  'missing_values': 'ALL',
  'missing_types': 'ALL',
  'inf_values': 'NONE',
  'date_format': 'NONE',
  'duplicates': 'ALL',
  'date_range': 'NONE',
  'numeric_range': 'ALL'},
 'unexpected_exceptions_ruv': {'irregular_values': 'NONE',
  'date_range': 'NONE',
  'numeric_range': 'NONE',
  'capitalization': 'NONE',
  'unicode_character': 'NONE'},
 'unexpected_conditions': None,
 'ignore_values': [],
 'ignore_dates': [],
 'type_dict': {}}

### Initializing new clean Refiner

In [9]:
tns2 = Refiner(dataframe = df)

#### detection before applying settings

In [10]:
tns2.detect_unexpected_values()



#### using saved refiner settings for new instance <a name="recreating-refiner-class-settings"></a> 

In [11]:
tns2.set_refiner_settings(refiner_settings)

In [12]:
tns2.detect_unexpected_values()

DEBUG:Refiner:=== checking column names and types
DEBUG:Refiner:=== checking propper date format
DEBUG:Refiner:=== checking expected date range
DEBUG:Refiner:=== checking for presense of inf values in numeric colums
