In [7]:
import sys
import pathlib
src_path = pathlib.Path().absolute().parent / "src"
sys.path.append(str(src_path))
import data_functions
import custom_transformers as ct
import pandas as pd
import numpy as np

import pickle

from sklearn.compose import  make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from sklearn.model_selection import cross_val_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [2]:
x_train, x_test, y_train = data_functions.get_dataframes()

In [18]:
simple_impute_strict = make_column_transformer((SimpleImputer(), data_functions.get_numeric_features(data_functions.get_strict_features())), remainder = 'passthrough')
ss_strict = make_column_transformer((StandardScaler(), data_functions.get_numeric_features(data_functions.get_strict_features())), remainder = 'passthrough')
ohe_strict = make_column_transformer((OneHotEncoder(handle_unknown = 'ignore', sparse = False), data_functions.get_categorical_features(data_functions.get_strict_features())), remainder = 'passthrough')

simple_impute_cy = make_column_transformer((SimpleImputer(missing_values = 0.0), ['construction_year']), remainder = 'passthrough')
transform_construction = make_column_transformer((ct.TransformConstructionYear(), ['construction_year']), remainder = 'passthrough')

simple_impute_loose = make_column_transformer((SimpleImputer(), data_functions.get_numeric_features(data_functions.get_loose_features())), remainder = 'passthrough')
ss_loose = make_column_transformer((StandardScaler(), data_functions.get_numeric_features(data_functions.get_loose_features())), remainder = 'passthrough')
ohe_loose = make_column_transformer((OneHotEncoder(catgories = 'auto', handle_unknown = 'ignore', sparse = False), data_functions.get_categorical_features(data_functions.get_loose_features())), remainder = 'passthrough')

strict_preprocessing = make_pipeline(ct.ChooseStrictFeatures(),
                                    ct.BinInstaller(),
                                    simple_impute_cy,
                                    simple_impute_strict,
                                    transform_construction,
                                    ss_strict,
                                    ohe_strict)
                                    

loose_preprocessing = make_pipeline(ct.ChooseLooseFeatures(),
                                    ct.BinInstaller(),
                                    simple_impute_cy,
                                    transform_construction,
                                    simple_impute_loose,
                                    ss_loose,
                                    ohe_loose)

In [26]:
strict_preprocessing = make_pipeline(ct.ChooseStrictFeatures(),
                                    ct.BinInstaller(),
                                    make_column_transformer((SimpleImputer(missing_values = 0.0), ['construction_year']),
                                                            (SimpleImputer(), data_functions.get_numeric_features(data_functions.get_strict_features())),
                                                            (ct.TransformConstructionYear(), ['construction_year']),
                                                            (StandardScaler(), data_functions.get_numeric_features(data_functions.get_strict_features())),
                                                            (OneHotEncoder(categories = 'auto', handle_unknown = 'ignore', sparse = False), data_functions.get_categorical_features(data_functions.get_strict_features())),
                                                            remainder = 'drop'))
                                    

loose_preprocessing = make_pipeline(ct.ChooseLooseFeatures(),
                                    ct.BinInstaller(),
                                    make_column_transformer((SimpleImputer(missing_values = 0.0), ['construction_year']),
                                                            (SimpleImputer(), data_functions.get_numeric_features(data_functions.get_loose_features())),
                                                            (ct.TransformConstructionYear(), ['construction_year']),
                                                            (StandardScaler(), data_functions.get_numeric_features(data_functions.get_loose_features())),
                                                            (OneHotEncoder(categories = 'auto', handle_unknown = 'ignore', sparse = False), data_functions.get_categorical_features(data_functions.get_loose_features())),
                                                            remainder = 'drop'))

In [27]:
strict_model = make_pipeline(strict_preprocessing, DecisionTreeClassifier())

In [28]:
loose_model = make_pipeline(loose_preprocessing, DecisionTreeClassifier())

In [29]:
cross_val_score(strict_model, x_train, y_train, cv = 5)

array([0.75465028, 0.7538086 , 0.76010101, 0.75151515, 0.74793736])

In [30]:
cross_val_score(loose_model, x_train, y_train, cv = 5)



ValueError: Input contains NaN

In [31]:
strict_pickle_path = src_path / 'strict_pre_pipeline_v2.pkl'
#loose_pickle_path = src_path / 'loose_pre_pipeline_v2.pkl'
file = open(strict_pickle_path, 'wb')
pickle.dump(strict_preprocessing, file)
file.close()
# file = open(loose_pickle_path, 'wb')
# pickle.dump(loose_preprocessing, file)
# file.close()

0       20709
2010     2645
2008     2613
2009     2533
2000     2091
2007     1587
2006     1471
2003     1286
2011     1256
2004     1123
2012     1084
2002     1075
1978     1037
1995     1014
2005     1011
1999      979
1998      966
1990      954
1985      945
1980      811
1996      811
1984      779
1982      744
1994      738
1972      708
1974      676
1997      644
1992      640
1993      608
2001      540
1988      521
1983      488
1975      437
1986      434
1976      414
1970      411
1991      324
1989      316
1987      302
1981      238
1977      202
1979      192
1973      184
2013      176
1971      145
1960      102
1967       88
1963       85
1968       77
1969       59
1964       40
1962       30
1961       21
1965       19
1966       17
Name: construction_year, dtype: int64