# Preprocess Pipeline

In [29]:
import os
import gc
import warnings
import json

In [30]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import seaborn as sns
import statsmodels as sm
from prettytable import PrettyTable

In [31]:
from sklearn.preprocessing import (
    OrdinalEncoder,
    StandardScaler,
    RobustScaler,
    OneHotEncoder,
)
from sklearn.preprocessing import TargetEncoder
from sklearn.decomposition import (
    PCA,
    SparsePCA
)
from sklearn.pipeline import (
    Pipeline,
    make_pipeline,
)
from sklearn.compose import (
    ColumnTransformer
)
from sklearn.model_selection import (
    train_test_split
)

In [32]:
# Project root = the directory two levels above the notebook's working
# directory. os.path is used instead of manual '/' splitting so the result is
# also correct with Windows separators, and PATH stays a str throughout.
PATH = os.path.dirname(os.path.dirname(os.getcwd()))
# Append a trailing separator (joining with '' does exactly that) so the
# printed value keeps the "<root>/" form.
PATH = os.path.join(PATH, '')
print(PATH)

/Users/school/Documents/repositories/Datasets_EDA/src/Allstate Claims Severity/


In [33]:
# Load the v2 training set from <PATH>/code/v2/train_v2.csv.
train_df = pd.read_csv(os.path.join(PATH, 'code', 'v2', 'train_v2.csv'))

In [34]:
# Target: the 'loss' column.
y = train_df['loss']
# Features: every column except 'id' and the target. `columns=` already names
# the axis, so no `axis` argument is passed: pandas ignores `axis` when
# `columns` is given, and `axis=0` would misleadingly suggest a row drop.
X = train_df.drop(columns=['id', 'loss'])

In [35]:
# Preview the target Series (pandas truncates the middle of long output).
print(y)

0         2213.18
1         1283.60
2         3005.09
3          939.85
4         2763.85
           ...   
194587    1198.62
194588    1108.34
194589    5762.64
194590    1562.87
194591    4751.72
Name: loss, Length: 194592, dtype: float64


In [36]:
# Preview the feature frame (pandas truncates rows and columns of wide output).
print(X)

       cat1 cat2 cat3 cat4 cat5 cat6 cat7 cat8 cat9 cat10  ...     cont5  \
0         A    B    A    B    A    A    A    A    B     A  ...  0.310061   
1         A    B    A    A    A    A    A    A    B     B  ...  0.885834   
2         A    B    A    A    B    A    A    A    B     B  ...  0.397069   
3         B    B    A    B    A    A    A    A    B     A  ...  0.422268   
4         A    B    A    B    A    A    A    A    B     B  ...  0.704268   
...     ...  ...  ...  ...  ...  ...  ...  ...  ...   ...  ...       ...   
194587    A    B    A    A    A    A    A    A    B     A  ...  0.939556   
194588    A    A    A    A    A    B    A    A    A     A  ...  0.704268   
194589    A    B    A    A    A    A    A    B    B     A  ...  0.482436   
194590    A    B    A    A    A    A    A    A    B     B  ...  0.340543   
194591    B    A    A    B    A    A    A    A    A     A  ...  0.281143   

           cont6     cont7    cont8    cont9   cont10    cont11    cont12  \
0       0.

In [37]:
# Run a full garbage-collection pass; the cell displays the value it returns.
gc.collect()

0

In [38]:
# Hold out 25% of the rows for testing; the fixed seed makes the split
# repeatable between runs.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, test_size=0.25, random_state=4095)

# The unsplit frame and target are not needed past this point.
del X
del y

In [39]:
# Show the training features followed by their (rows, columns) shape.
print(X_train, X_train.shape, sep='\n')

       cat1 cat2 cat3 cat4 cat5 cat6 cat7 cat8 cat9 cat10  ...     cont5  \
133017    A    B    A    A    B    A    A    A    B     B  ...  0.845727   
8498      A    B    A    B    A    A    A    A    B     A  ...  0.858841   
38546     A    A    A    B    B    A    A    A    A     A  ...  0.288217   
10917     B    A    A    B    A    A    A    A    A     A  ...  0.281143   
83256     A    A    A    A    A    B    A    A    A     A  ...  0.281143   
...     ...  ...  ...  ...  ...  ...  ...  ...  ...   ...  ...       ...   
100201    A    A    A    A    A    A    A    A    A     A  ...  0.783230   
24465     A    B    A    A    A    A    A    A    B     A  ...  0.397069   
35429     B    A    A    B    A    A    A    A    A     A  ...  0.499798   
26602     A    A    A    B    A    A    A    A    A     A  ...  0.783230   
158976    B    A    A    A    A    B    A    A    A     A  ...  0.317541   

           cont6     cont7    cont8    cont9   cont10    cont11    cont12  \
133017  0.

In [40]:
# Column-name lists used to route features: cat1..cat116 and cont1..cont14.
CAT_X = [f'cat{idx}' for idx in range(1, 117)]
CONT_X = [f'cont{idx}' for idx in range(1, 15)]

### B.2 Pipeline

#### B.2.1 Continuous

In [41]:
# Continuous columns as they look before scaling / PCA.
print('Before', X_train[CONT_X], sep='\n')

Before
           cont1     cont2     cont3     cont4     cont5     cont6     cont7  \
133017  0.626630  0.785784  0.762059  0.873871  0.845727  0.805849  0.638454   
8498    0.462786  0.620805  0.484196  0.833240  0.858841  0.349350  0.362171   
38546   0.067520  0.620805  0.944251  0.189137  0.288217  0.204165  0.348605   
10917   0.493164  0.422197  0.462347  0.833240  0.281143  0.331398  0.361715   
83256   0.351358  0.737068  0.777587  0.182950  0.281143  0.484775  0.729795   
...          ...       ...       ...       ...       ...       ...       ...   
100201  0.475784  0.785784  0.592681  0.614134  0.783230  0.373500  0.381883   
24465   0.961159  0.620805  0.246911  0.821574  0.397069  0.966438  0.640047   
35429   0.520698  0.422197  0.549770  0.452887  0.499798  0.314683  0.370419   
26602   0.475784  0.422197  0.484196  0.534409  0.783230  0.331919  0.336160   
158976  0.372785  0.785784  0.692825  0.284048  0.317541  0.464835  0.800337   

          cont8    cont9   cont1

In [42]:
# Continuous-feature branch: robust scaling followed by PCA.
# References:
#   Pipeline          https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html#sklearn.pipeline.Pipeline
#   RobustScaler      https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html#sklearn.preprocessing.RobustScaler
#   PCA               https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html#sklearn.decomposition.PCA
#   ColumnTransformer https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html#sklearn.compose.ColumnTransformer
cont_pipeline = Pipeline(steps=[
    # Centre and scale each column; quantile_range is left at its default.
    ('scaler', RobustScaler(with_centering=True, with_scaling=True, unit_variance=True)),
    # 'mle' lets PCA choose the number of components itself.
    ('pca', PCA(n_components='mle', svd_solver="auto", random_state=4095)),
])

# Route only the continuous columns through the branch above.
cont_preprocessor = ColumnTransformer(
    transformers=[('continuous', cont_pipeline, CONT_X)],
)

In [43]:
# Display the (still unfitted) continuous preprocessor as the cell output.
cont_preprocessor

In [44]:
# Fit the continuous branch on the training split and preview its output.
after_data = pd.DataFrame(cont_preprocessor.fit_transform(X_train))
print('After', after_data, sep='\n')
# Preview only: drop the transformed copy once it has been printed.
del after_data

After
              0         1         2         3         4         5         6   \
0       2.704851  0.725858  2.392243 -0.253668  0.721206  0.400600  0.557179   
1      -1.656761 -1.206162  1.371314 -0.563968  0.833353 -1.032388  0.056500   
2      -3.982972  1.829617  0.002258  0.214165 -0.753605 -0.491473 -0.638822   
3      -1.416029 -1.425447  0.209302  1.230888  0.432375 -0.035363 -0.904777   
4      -0.981403  2.445807 -0.266940  0.221293 -0.388720  0.890537 -0.338565   
...          ...       ...       ...       ...       ...       ...       ...   
145939 -1.183862 -0.207105  1.412361 -1.038941  0.240196 -0.291685 -0.257080   
145940  5.608953 -0.646191  0.235439 -0.385660  0.009462  0.540336 -0.630884   
145941 -1.123915 -0.625371  0.067032  0.180900  0.250826  0.749077 -0.348201   
145942 -1.636377 -0.933232  0.217480 -0.853192  0.586777 -0.481148  0.198168   
145943 -0.346881  2.256650  0.548880  1.165106 -0.292567 -0.793239 -0.324340   

              7         8        

#### B.2.2 Categorical

In [45]:
# Categorical columns as they look before encoding.
print("Before", X_train[CAT_X], sep='\n')

Before
       cat1 cat2 cat3 cat4 cat5 cat6 cat7 cat8 cat9 cat10  ... cat107 cat108  \
133017    A    B    A    A    B    A    A    A    B     B  ...      J      K   
8498      A    B    A    B    A    A    A    A    B     A  ...      F      B   
38546     A    A    A    B    B    A    A    A    A     A  ...      K      A   
10917     B    A    A    B    A    A    A    A    A     A  ...      F      B   
83256     A    A    A    A    A    B    A    A    A     A  ...      G      D   
...     ...  ...  ...  ...  ...  ...  ...  ...  ...   ...  ...    ...    ...   
100201    A    A    A    A    A    A    A    A    A     A  ...      F      B   
24465     A    B    A    A    A    A    A    A    B     A  ...      H      G   
35429     B    A    A    B    A    A    A    A    A     A  ...      F      B   
26602     A    A    A    B    A    A    A    A    A     A  ...      G      B   
158976    B    A    A    A    A    B    A    A    A     A  ...      F      D   

       cat109 cat110 cat111 cat1

In [46]:
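# Optional shortcut (disabled): uncomment to restrict CAT_X to three columns,
# presumably for quick experiments — the full list is used when this stays commented.
# CAT_X = ['cat1', 'cat2', 'cat3']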

In [47]:
# Categorical-feature branch.
#
# This cell previously also built a "v1" pipeline (a default OneHotEncoder)
# under the same name and overwrote it straight away, so only the target
# encoding pipeline below ever took effect. The dead assignment and an unused
# `from matplotlib import category` import have been removed.
cat_pipeline = Pipeline(
    steps=[
        (
            'target',
            TargetEncoder(  # https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.TargetEncoder.html
                categories="auto",
                target_type='continuous',
                # 5 shuffled folds with a fixed seed, so fit_transform is repeatable.
                cv=5,
                shuffle=True,
                random_state=4095,
            ),
        ),
    ]
)

# Route only the categorical columns through the encoder.
cat_preprocessor = ColumnTransformer(
    transformers=[
        ('categorical', cat_pipeline, CAT_X),
    ]
)

In [48]:
# Display the (still unfitted) categorical preprocessor as the cell output.
cat_preprocessor

In [49]:
# Fit the categorical branch on the training split (the encoder needs the
# target, hence y_train) and preview its output.
after_data = pd.DataFrame(cat_preprocessor.fit_transform(X_train, y_train))
print('After', after_data, sep='\n')
# Preview only: drop the transformed copy once it has been printed.
del after_data

After
                0            1            2            3            4    \
0       3400.726290  3790.325086  2901.265857  2827.047257  3462.210643   
1       3405.193546  3801.333089  2905.191945  3485.199088  2817.362924   
2       3400.726290  2456.145293  2901.265857  3475.385958  3462.210643   
3       1916.708036  2452.696878  2900.580188  3500.463069  2802.788523   
4       3400.726290  2456.145293  2901.265857  2827.047257  2807.412150   
...             ...          ...          ...          ...          ...   
145939  3405.193546  2457.803096  2905.191945  2829.609138  2817.362924   
145940  3400.726290  3790.325086  2901.265857  2827.047257  2807.412150   
145941  1923.390033  2455.811652  2907.987372  3487.302572  2815.878058   
145942  3400.726290  2456.145293  2901.265857  3475.385958  2807.412150   
145943  1916.708036  2452.696878  2900.580188  2821.286150  2802.788523   

                5            6            7            8            9    ...  \
0       3257.

### B.3 Combine Pipeline & Transform '*X_test*'

In [50]:
# One preprocessor that applies both branches. Output columns follow the order
# of this list: categorical encodings first, then the continuous components.
final_pipeline = ColumnTransformer(transformers=[
    ('categorical', cat_pipeline, CAT_X),
    ('continuous', cont_pipeline, CONT_X),
])
print(final_pipeline)

ColumnTransformer(transformers=[('categorical',
                                 Pipeline(steps=[('target',
                                                  TargetEncoder(random_state=4095,
                                                                target_type='continuous'))]),
                                 ['cat1', 'cat2', 'cat3', 'cat4', 'cat5',
                                  'cat6', 'cat7', 'cat8', 'cat9', 'cat10',
                                  'cat11', 'cat12', 'cat13', 'cat14', 'cat15',
                                  'cat16', 'cat17', 'cat18', 'cat19', 'cat20',
                                  'cat21', 'cat22', 'cat23', 'cat24', 'cat25',
                                  'cat26', 'cat27', 'cat28', 'cat29', 'cat30', ...]),
                                ('continuous',
                                 Pipeline(steps=[('scaler',
                                                  RobustScaler(unit_variance=True)),
                                                 ('

In [51]:
# Display the combined preprocessor as the cell output.
final_pipeline

In [52]:
# Training features before the combined preprocessor is applied.
print('before', X_train, sep='\n')

before
       cat1 cat2 cat3 cat4 cat5 cat6 cat7 cat8 cat9 cat10  ...     cont5  \
133017    A    B    A    A    B    A    A    A    B     B  ...  0.845727   
8498      A    B    A    B    A    A    A    A    B     A  ...  0.858841   
38546     A    A    A    B    B    A    A    A    A     A  ...  0.288217   
10917     B    A    A    B    A    A    A    A    A     A  ...  0.281143   
83256     A    A    A    A    A    B    A    A    A     A  ...  0.281143   
...     ...  ...  ...  ...  ...  ...  ...  ...  ...   ...  ...       ...   
100201    A    A    A    A    A    A    A    A    A     A  ...  0.783230   
24465     A    B    A    A    A    A    A    A    B     A  ...  0.397069   
35429     B    A    A    B    A    A    A    A    A     A  ...  0.499798   
26602     A    A    A    B    A    A    A    A    A     A  ...  0.783230   
158976    B    A    A    A    A    B    A    A    A     A  ...  0.317541   

           cont6     cont7    cont8    cont9   cont10    cont11    cont12  \
133

In [53]:
# Fit the combined preprocessor on the training split and transform it.
# NOTE: X_train is rebound from a DataFrame to the transformed array here, so
# this cell can only run once per split; re-run the split cell before repeating it.
X_train = final_pipeline.fit_transform(X_train, y_train)
print("After", X_train, X_train.shape, sep='\n')

After
[[ 3.40072629e+03  3.79032509e+03  2.90126586e+03 ...  8.42135570e-01
  -9.08156673e-02  2.35976578e-01]
 [ 3.40519355e+03  3.80133309e+03  2.90519194e+03 ...  4.43632957e-01
  -1.83723950e-01  4.49024831e-02]
 [ 3.40072629e+03  2.45614529e+03  2.90126586e+03 ...  4.53133492e-01
   3.32069315e-01  1.77241626e-01]
 ...
 [ 1.92339003e+03  2.45581165e+03  2.90798737e+03 ... -3.02648357e-01
  -1.66287840e-01 -1.03185412e-01]
 [ 3.40072629e+03  2.45614529e+03  2.90126586e+03 ...  2.76361714e-02
  -1.14070698e-01 -1.66153545e-02]
 [ 1.91670804e+03  2.45269688e+03  2.90058019e+03 ... -4.50485345e-01
   1.60024167e-01  6.28488127e-02]]
(145944, 129)


In [54]:
# Hold-out features before the fitted preprocessor is applied.
print("Before", X_test, sep='\n')

Before
       cat1 cat2 cat3 cat4 cat5 cat6 cat7 cat8 cat9 cat10  ...     cont5  \
117152    A    A    A    A    B    A    A    A    A     A  ...  0.921707   
18997     B    A    A    B    A    B    A    A    A     A  ...  0.718531   
36633     A    A    A    B    A    B    A    A    A     A  ...  0.745764   
57554     A    B    A    A    B    A    A    A    B     A  ...  0.858841   
96317     A    B    A    A    B    A    A    A    B     A  ...  0.281143   
...     ...  ...  ...  ...  ...  ...  ...  ...  ...   ...  ...       ...   
117688    A    A    A    A    A    B    A    A    A     A  ...  0.758711   
44847     A    B    A    A    B    A    A    A    B     A  ...  0.372405   
180721    B    A    A    A    A    B    A    A    A     A  ...  0.310061   
14840     A    B    B    A    B    A    A    A    B     B  ...  0.281143   
24378     A    B    B    A    A    A    A    A    B     A  ...  0.422268   

           cont6     cont7    cont8    cont9   cont10    cont11    cont12  \
117

In [55]:
# Apply the already-fitted preprocessor to the hold-out split: transform only,
# no refit, so the test rows do not influence the fitted statistics.
# NOTE: X_test is rebound to the transformed array, so this cell runs once per split.
X_test = final_pipeline.transform(X_test)
print(X_test)

[[ 3.40450155e+03  2.45363036e+03  2.90270113e+03 ... -3.96182679e-02
  -1.38322659e-01 -2.24431107e-01]
 [ 1.92118913e+03  2.45363036e+03  2.90270113e+03 ...  2.02457553e-01
  -7.08670794e-02  2.20722409e-02]
 [ 3.40450155e+03  2.45363036e+03  2.90270113e+03 ... -6.68829216e-02
   8.48612884e-02  1.18185560e-01]
 ...
 [ 1.92118913e+03  2.45363036e+03  2.90270113e+03 ... -5.26364865e-02
  -1.25855181e-01 -1.90493802e-01]
 [ 3.40450155e+03  3.80047903e+03  5.36007393e+03 ...  4.55991958e-01
  -1.38542066e-01  4.39579952e-01]
 [ 3.40450155e+03  3.80047903e+03  5.36007393e+03 ...  1.31589521e-01
   2.39383149e-01  8.54584483e-02]]


## (C) Save Train & Test as NumPy arrays for import

In [57]:
# Bundle the four splits into one compressed archive under <PATH>/code/v2/.
# The file name has no extension, so NumPy appends '.npz' itself. Each keyword
# becomes the key used to read the array back after np.load.
np.savez_compressed(
    os.path.join(PATH, 'code', 'v2', 'Preprocessed_Pipeline_Data'),
    X_train=X_train, y_train=y_train,
    X_test=X_test, y_test=y_test,
)