# Imports

In [136]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn import set_config; set_config(display='diagram')


In [137]:
# Single source of truth for the columns used by the pipeline.
# Each entry maps a column name to [scaler code, dtype code]:
#   scaler code: ns = not scaled / ss = standard scaler / rs = robust scaler / ms = minmax scaler
#   dtype code:  nt = not typed / dat = datetime / int = integer / flt = float
column_dict = {
    'date':['ns','dat'],
    'numer_sta':['ns','nt'],
    'Latitude':['ns','nt'],
    'Longitude':['ns','nt'],
    'Altitude':['ns','int'],
    'pmer':['ss','int'],
    'dd':['ss','int'],
    'ff':['ss','flt'],
    't':['ss','flt'],
    'u':['ss','int'],
    'ssfrai':['rs','flt'],
    'rr3':['ms','flt']
}


def _columns_with(code, position):
    """Return the column names of ``column_dict`` whose code at ``position`` equals ``code``.

    ``position`` is 0 for the scaler code and 1 for the dtype code. Names are
    returned in the insertion order of ``column_dict``.
    """
    return [name for name, codes in column_dict.items() if codes[position] == code]


# Columns grouped by target dtype; columns typed 'nt' appear in no group.
Dtype_col = {dtype: _columns_with(dtype, 1) for dtype in ('int', 'flt', 'dat')}

# Columns grouped by scaler. They are derived from column_dict instead of being
# retyped by hand, which is how 'Longitude' had previously drifted to 'Lontitude'.
Not_encoded = _columns_with('ns', 0)
Num_col_standard = _columns_with('ss', 0)
Num_col_robust = _columns_with('rs', 0)
Num_col_minmax = _columns_with('ms', 0)
Cat_col = []
Col_select = Not_encoded + Num_col_standard + Num_col_robust + Num_col_minmax + Cat_col
Stations = ['7481','7650','7661','7690','7591','7577','7643']

In [42]:
#df_synop = deva_class
#df_synop = df_synop[df_synop.numer_sta.isin(stations.ID)]

# Selector

In [139]:
# First stage: keep only the columns listed in Col_select and drop everything else.
Selector = Pipeline(
    steps=[
        (
            "selector",
            ColumnTransformer(
                transformers=[("selector", "passthrough", Col_select)],
                remainder="drop",
            ),
        ),
    ]
)

In [140]:
# Show the selector stage as the cell output.
Selector

# Dtype transformer

In [141]:
def to_int(x):
    """Wrap ``x`` in a DataFrame and cast every column to ``int``.

    NOTE(review): this cast runs in ``Dtyper``, i.e. before the imputers of
    ``encoder_scaler``; a plain ``int`` cast cannot hold missing values, so
    confirm the integer columns are complete at this point.
    """
    frame = pd.DataFrame(x)
    return frame.astype(int)
def to_float(x):
    """Wrap ``x`` in a DataFrame and cast every column to ``float``."""
    frame = pd.DataFrame(x)
    return frame.astype(float)
def to_date(x):
    """Parse ``%Y%m%d%H%M%S`` timestamps into datetimes; unparsable values become ``NaT``.

    Parameters
    ----------
    x : 1-D or 2-D array-like
        A Series/list of timestamps, or a DataFrame / 2-D array whose columns
        all hold timestamps (this is what ``ColumnTransformer`` passes when the
        columns are given as a list).

    Returns
    -------
    For 2-D input, a DataFrame with every column converted to datetime.
    For 1-D input, whatever ``pd.to_datetime`` returns for that input.
    """
    # pd.to_datetime treats a DataFrame as year/month/day component columns and
    # rejects 2-D arrays, so 2-D input has to be converted column by column.
    if getattr(x, 'ndim', 1) == 2:
        frame = pd.DataFrame(x)
        return frame.apply(
            lambda column: pd.to_datetime(column, format='%Y%m%d%H%M%S', errors='coerce')
        )
    return pd.to_datetime(x,format='%Y%m%d%H%M%S',errors='coerce')

In [142]:
# One stateless transformer per dtype-casting helper.
tr_int, tr_float, tr_date = (
    FunctionTransformer(func=caster) for caster in (to_int, to_float, to_date)
)

In [144]:
# Second stage: cast each dtype group from Dtype_col with its matching caster.
# Columns that belong to no group are dropped (ColumnTransformer's default remainder).
Dtyper = ColumnTransformer(
    transformers=[
        ('to int', tr_int, Dtype_col['int']),
        ('to float', tr_float, Dtype_col['flt']),
        ('to date', tr_date, Dtype_col['dat']),
    ],
    remainder='drop',
)

In [145]:
# Show the dtype-casting stage as the cell output.
Dtyper

# Encoders

## Pipeline

In [146]:
# Numeric branch, standard scaling: mean-impute missing values, then StandardScaler.
num_transformer_standard = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]
)

In [147]:
# Numeric branch, robust scaling: mean-impute missing values, then RobustScaler.
num_transformer_robust = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', RobustScaler()),
    ]
)

In [148]:
# Numeric branch, min-max scaling: mean-impute missing values, then MinMaxScaler.
num_transformer_minmax = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', MinMaxScaler()),
    ]
)

In [149]:
# Categorical branch: one-hot encode with OneHotEncoder's default settings.
cat_ohe = Pipeline(steps=[('ohe', OneHotEncoder())])

## Column transformer

In [150]:
# Third stage: send each numeric column group to the pipeline with its scaler.
# Columns outside the three groups are dropped (ColumnTransformer's default remainder).
encoder_scaler = ColumnTransformer(
    transformers=[
        ('standard', num_transformer_standard, Num_col_standard),
        ('robust', num_transformer_robust, Num_col_robust),
        ('minmax', num_transformer_minmax, Num_col_minmax),
    ],
    remainder='drop',
)

In [151]:
# Show the scaling stage as the cell output.
encoder_scaler

# Full pipeline

In [152]:
# End-to-end preprocessing: select columns -> cast dtypes -> impute and scale.
Skypipe = Pipeline([
    ('Selector', Selector),
    ('dtyper', Dtyper),
    ('encoder_scaler', encoder_scaler)
    ])

# 'dtyper' and 'encoder_scaler' pick their columns by name, which only works on
# DataFrame input, while a ColumnTransformer returns a plain array by default.
# Make every stage emit a DataFrame and keep the original column names (no
# "<transformer>__" prefix) so the next stage can still find its columns.
# Requires scikit-learn >= 1.2 (set_output API). These calls configure the
# shared Selector / Dtyper / encoder_scaler objects in place.
Skypipe.set_params(
    Selector__selector__verbose_feature_names_out=False,
    dtyper__verbose_feature_names_out=False,
    encoder_scaler__verbose_feature_names_out=False,
)
Skypipe.set_output(transform='pandas')
    

In [153]:
# Show the assembled pipeline as the cell output.
Skypipe

In [None]:
# NOTE(review): scratch usage reminder, kept as a comment so "Run All" is not blocked.
# As written it could not run: the last two lines were indented (IndentationError),
# `X` is not defined in the visible notebook and `pipe` is only defined further down.
# TODO: replace with a real call on the project data, or delete this cell.
# pipe.fit(X[['age']])
# pipe.transform(X[['age']])
# pipe.fit_transform(X[['age']])

In [79]:
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector as selector
from sklearn.linear_model import LogisticRegression

# Continuous features: fill gaps with the median, then standardise.
cont_prepro = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode, ignoring categories that were not seen during fit.
cat_prepro = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route columns by dtype: object columns go to the categorical branch,
# everything else to the continuous branch.
preprocessing = make_column_transformer(
    (cont_prepro, selector(dtype_exclude="object")),
    (cat_prepro, selector(dtype_include="object")),
)

# Preprocessing followed by a logistic-regression model.
pipe1 = Pipeline(
    steps=[
        ("preprocessing", preprocessing),
        ("model", LogisticRegression()),
    ]
)

In [80]:
# Show the dtype-routed model pipeline as the cell output.
pipe1

In [81]:
# Median-impute then standardise; applied only to feat_1 and feat_2 below.
pipe_one = Pipeline(
    steps=[
        ("num_impute", SimpleImputer(strategy='median')),
        ('Std', StandardScaler()),
    ]
)

# Every column other than feat_1 / feat_2 is passed through untouched.
# Note: this rebinds the name `preprocessing` used by the earlier pipe1 cell.
preprocessing = make_column_transformer(
    (pipe_one, ["feat_1", "feat_2"]),
    remainder='passthrough',
)

# Preprocessing followed by a logistic-regression model.
pipe = Pipeline(
    steps=[
        ("preprocessing", preprocessing),
        ("model", LogisticRegression()),
    ]
)

In [82]:
# Show the explicit-column model pipeline as the cell output.
pipe