# All improts necessary

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import sys
sys.path.append('..')

In [3]:
import pandas as pd
import numpy as np

In [4]:
from sklearn.impute import SimpleImputer

In [5]:
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler

In [6]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import QuantileTransformer

In [7]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

In [9]:
from sklearn.compose import ColumnTransformer

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import ClusterCentroids
from imblearn.under_sampling import NearMiss

In [12]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN

In [13]:
import matplotlib.pyplot as plt
import seaborn as sns

In [14]:
%matplotlib inline

sns.set(font_scale=2)

# Auxiliary functions and classes

# A few intro-words

# Read the data and add some gaps

In [15]:
data = pd.read_csv('../data/datasets/TS_Summer_2018/data.csv', sep=';')

In [16]:
data.head().T

Unnamed: 0,0,1,2,3,4
age,56,57,37,40,56
job,housemaid,services,services,admin.,services
marital,married,married,married,married,married
education,basic.4y,high.school,high.school,basic.6y,high.school
default,no,unknown,no,no,no
housing,no,no,yes,no,no
loan,no,no,no,no,yes
contact,telephone,telephone,telephone,telephone,telephone
month,may,may,may,may,may
day_of_week,mon,mon,mon,mon,mon


In [17]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
age               41188 non-null int64
job               41188 non-null object
marital           41188 non-null object
education         41188 non-null object
default           41188 non-null object
housing           41188 non-null object
loan              41188 non-null object
contact           41188 non-null object
month             41188 non-null object
day_of_week       41188 non-null object
duration          41188 non-null int64
campaign          41188 non-null int64
pdays             41188 non-null int64
previous          41188 non-null int64
poutcome          41188 non-null object
emp.var.rate      41188 non-null float64
cons.price.idx    41188 non-null float64
cons.conf.idx     41188 non-null float64
euribor3m         41188 non-null float64
nr.employed       41188 non-null float64
y                 41188 non-null object
dtypes: float64(5), int64(5), object(11)
memory usa

In [18]:
columns_with_gaps = data.columns[:-1]

In [19]:
minimum = 0
maximum = 0.3

In [20]:
columns_with_gaps_dict = dict(
    zip(
        columns_with_gaps,
        np.random.uniform(
            minimum,
            maximum,
            len(columns_with_gaps)
        )
    )
)

In [21]:
columns_with_gaps_dict

{'age': 0.2837523366623057,
 'job': 0.2278960483378746,
 'marital': 0.04496849260661576,
 'education': 0.004064781135297857,
 'default': 0.09963245911421251,
 'housing': 0.18518689460396306,
 'loan': 0.14415100269370015,
 'contact': 0.05706744291297195,
 'month': 0.028730303370341846,
 'day_of_week': 0.04711721015234821,
 'duration': 0.23401645762406098,
 'campaign': 0.20266816082397063,
 'pdays': 0.28571368407873504,
 'previous': 0.09867045897017096,
 'poutcome': 0.29691770345090474,
 'emp.var.rate': 0.2924668787433146,
 'cons.price.idx': 0.03892581306150573,
 'cons.conf.idx': 0.021061799946499527,
 'euribor3m': 0.07849767892918204,
 'nr.employed': 0.16984923506996435}

In [22]:
data_with_gaps = data.copy()

In [23]:
for column in columns_with_gaps:
    if columns_with_gaps_dict[column] > 0:
        gaps_count = int(
            len(data_with_gaps) * columns_with_gaps_dict[column]
        )
        data_with_gaps[column].iloc[
            np.random.randint(
                0,
                len(data_with_gaps),
                gaps_count
            )
        ] = np.nan

In [24]:
data_with_gaps.info(verbose=True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
age               30990 non-null float64
job               32721 non-null object
marital           39377 non-null object
education         41021 non-null object
default           37288 non-null object
housing           34216 non-null object
loan              35647 non-null object
contact           38915 non-null object
month             40016 non-null object
day_of_week       39289 non-null object
duration          32584 non-null float64
campaign          33657 non-null float64
pdays             30963 non-null float64
previous          37333 non-null float64
poutcome          30588 non-null object
emp.var.rate      30740 non-null float64
cons.price.idx    39613 non-null float64
cons.conf.idx     40327 non-null float64
euribor3m         38072 non-null float64
nr.employed       34746 non-null float64
y                 41188 non-null object
dtypes: float64(10), object(11)
memory us

In [25]:
numerical_features = [
    'age',
    'campaign',
    'cons.conf.idx',
    'cons.price.idx',
    'duration',
    'emp.var.rate',
    'euribor3m',
    'nr.employed',
    'pdays',
    'previous'
]

In [26]:
categorical_features = [
    'contact',
    'day_of_week',
    'default',
    'education',
    'housing',
    'job',
    'loan',
    'marital',
    'month',
    'poutcome'
]

In [27]:
target = 'y'

# Split the data into train and test

In [28]:
X, y = data_with_gaps[numerical_features + categorical_features], data_with_gaps[target]

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=0)

In [38]:
type(X_train)

pandas.core.frame.DataFrame

In [39]:
type(X_test)

pandas.core.frame.DataFrame

In [40]:
type(y_train)

pandas.core.series.Series

In [41]:
type(y_test)

pandas.core.series.Series

In [42]:
X_train.shape

(28831, 20)

In [43]:
X_test.shape

(12357, 20)

In [44]:
y_train.shape

(28831,)

In [45]:
y_test.shape

(12357,)

# Pipeline

## Separated pipelines

### Numerical features

In [30]:
num_features_pipeline = Pipeline([
    ('impute', SimpleImputer(missing_values=np.nan, strategy='mean')),
    ('scale', MinMaxScaler()),
    ('transform', QuantileTransformer(output_distribution='normal'))
])

In [33]:
num_features_transformed = num_features_pipeline.fit_transform(X_train[numerical_features], y_train)

In [34]:
num_features_pipeline.named_steps

{'impute': SimpleImputer(copy=True, fill_value=None, missing_values=nan, strategy='mean',
        verbose=0),
 'scale': MinMaxScaler(copy=True, feature_range=(0, 1)),
 'transform': QuantileTransformer(copy=True, ignore_implicit_zeros=False, n_quantiles=1000,
           output_distribution='normal', random_state=None,
           subsample=100000)}

In [35]:
num_features_pipeline.get_params()

{'memory': None,
 'steps': [('impute',
   SimpleImputer(copy=True, fill_value=None, missing_values=nan, strategy='mean',
          verbose=0)),
  ('scale', MinMaxScaler(copy=True, feature_range=(0, 1))),
  ('transform',
   QuantileTransformer(copy=True, ignore_implicit_zeros=False, n_quantiles=1000,
             output_distribution='normal', random_state=None,
             subsample=100000))],
 'impute': SimpleImputer(copy=True, fill_value=None, missing_values=nan, strategy='mean',
        verbose=0),
 'scale': MinMaxScaler(copy=True, feature_range=(0, 1)),
 'transform': QuantileTransformer(copy=True, ignore_implicit_zeros=False, n_quantiles=1000,
           output_distribution='normal', random_state=None,
           subsample=100000),
 'impute__copy': True,
 'impute__fill_value': None,
 'impute__missing_values': nan,
 'impute__strategy': 'mean',
 'impute__verbose': 0,
 'scale__copy': True,
 'scale__feature_range': (0, 1),
 'transform__copy': True,
 'transform__ignore_implicit_zeros': 

In [36]:
type(num_features_transformed)

numpy.ndarray

In [37]:
num_features_transformed.shape

(28831, 10)

### Categorical features

In [55]:
cat_features_pipeline = Pipeline([
    ('impute', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse=False))
])

In [56]:
cat_features_transformed = cat_features_pipeline.fit_transform(X_train[categorical_features], y_train)

In [57]:
cat_features_pipeline.named_steps

{'impute': SimpleImputer(copy=True, fill_value='missing', missing_values=nan,
        strategy='constant', verbose=0),
 'onehot': OneHotEncoder(categorical_features=None, categories=None,
        dtype=<class 'numpy.float64'>, handle_unknown='ignore',
        n_values=None, sparse=False)}

In [58]:
cat_features_pipeline.get_params()

{'memory': None,
 'steps': [('impute',
   SimpleImputer(copy=True, fill_value='missing', missing_values=nan,
          strategy='constant', verbose=0)),
  ('onehot', OneHotEncoder(categorical_features=None, categories=None,
          dtype=<class 'numpy.float64'>, handle_unknown='ignore',
          n_values=None, sparse=False))],
 'impute': SimpleImputer(copy=True, fill_value='missing', missing_values=nan,
        strategy='constant', verbose=0),
 'onehot': OneHotEncoder(categorical_features=None, categories=None,
        dtype=<class 'numpy.float64'>, handle_unknown='ignore',
        n_values=None, sparse=False),
 'impute__copy': True,
 'impute__fill_value': 'missing',
 'impute__missing_values': nan,
 'impute__strategy': 'constant',
 'impute__verbose': 0,
 'onehot__categorical_features': None,
 'onehot__categories': None,
 'onehot__dtype': numpy.float64,
 'onehot__handle_unknown': 'ignore',
 'onehot__n_values': None,
 'onehot__sparse': False}

In [70]:
cat_features_pipeline.get_params()['onehot'].get_feature_names()

array(['x0_cellular', 'x0_missing', 'x0_telephone', 'x1_fri',
       'x1_missing', 'x1_mon', 'x1_thu', 'x1_tue', 'x1_wed', 'x2_missing',
       'x2_no', 'x2_unknown', 'x2_yes', 'x3_basic.4y', 'x3_basic.6y',
       'x3_basic.9y', 'x3_high.school', 'x3_illiterate', 'x3_missing',
       'x3_professional.course', 'x3_university.degree', 'x3_unknown',
       'x4_missing', 'x4_no', 'x4_unknown', 'x4_yes', 'x5_admin.',
       'x5_blue-collar', 'x5_entrepreneur', 'x5_housemaid',
       'x5_management', 'x5_missing', 'x5_retired', 'x5_self-employed',
       'x5_services', 'x5_student', 'x5_technician', 'x5_unemployed',
       'x5_unknown', 'x6_missing', 'x6_no', 'x6_unknown', 'x6_yes',
       'x7_divorced', 'x7_married', 'x7_missing', 'x7_single',
       'x7_unknown', 'x8_apr', 'x8_aug', 'x8_dec', 'x8_jul', 'x8_jun',
       'x8_mar', 'x8_may', 'x8_missing', 'x8_nov', 'x8_oct', 'x8_sep',
       'x9_failure', 'x9_missing', 'x9_nonexistent', 'x9_success'],
      dtype=object)

In [59]:
type(cat_features_transformed)

numpy.ndarray

In [60]:
cat_features_transformed.shape

(28831, 63)

## Unified pipeline

In [61]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_features_pipeline, numerical_features),
        ('cat', cat_features_pipeline, categorical_features)
    ]
)

In [62]:
unified_pipeline = Pipeline(
    steps=[
        ('preprocessing', preprocessor)
    ]
)

In [63]:
all_features_transformed = unified_pipeline.fit_transform(X_train, y_train)

In [66]:
unified_pipeline.named_steps

{'preprocessing': ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
          transformer_weights=None,
          transformers=[('num', Pipeline(memory=None,
      steps=[('impute', SimpleImputer(copy=True, fill_value=None, missing_values=nan, strategy='mean',
        verbose=0)), ('scale', MinMaxScaler(copy=True, feature_range=(0, 1))), ('transform', QuantileTransformer(copy=True, ignore_implicit_zeros=False, n_..., 'day_of_week', 'default', 'education', 'housing', 'job', 'loan', 'marital', 'month', 'poutcome'])])}

In [67]:
unified_pipeline.get_params()

{'memory': None,
 'steps': [('preprocessing',
   ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
            transformer_weights=None,
            transformers=[('num', Pipeline(memory=None,
        steps=[('impute', SimpleImputer(copy=True, fill_value=None, missing_values=nan, strategy='mean',
          verbose=0)), ('scale', MinMaxScaler(copy=True, feature_range=(0, 1))), ('transform', QuantileTransformer(copy=True, ignore_implicit_zeros=False, n_..., 'day_of_week', 'default', 'education', 'housing', 'job', 'loan', 'marital', 'month', 'poutcome'])]))],
 'preprocessing': ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
          transformer_weights=None,
          transformers=[('num', Pipeline(memory=None,
      steps=[('impute', SimpleImputer(copy=True, fill_value=None, missing_values=nan, strategy='mean',
        verbose=0)), ('scale', MinMaxScaler(copy=True, feature_range=(0, 1))), ('transform', QuantileTransformer(copy=True, ignore_imp

In [64]:
type(all_features_transformed)

numpy.ndarray

In [65]:
all_features_transformed.shape

(28831, 73)

# Pipeline + GridSearch

# Conclusion