In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Global plot styling: seaborn dark-grid theme and a large default figure size.
sns.set_style(style='darkgrid')
plt.rcParams['figure.figsize'] = [24, 16]

### Review of pipelines using sklearn
- Takes a list of named 2-tuples (name, pipeline_step) as input 
- Tuples can contain any arbitrary scikit-learn compatible estimator or transformer object
- Pipeline implements fit/predict methods
- Can be used as input estimator into grid/randomized search and cross_val_score methods

In [3]:
# Load the Boston housing regression dataset bundled with scikit-learn.
# NOTE(review): `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell only runs on older releases — confirm the pinned version.
from sklearn.datasets import load_boston
data = load_boston()
# Feature matrix and target vector; their shapes are printed as a sanity check.
X, y = data.data, data.target
print(X.shape, y.shape)

(506, 13) (506,)


In [4]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

In [5]:
# Standardize the features, then fit a random forest regressor.
# The forest is seeded so the cross-validated RMSE computed from this pipeline
# is reproducible across kernel restarts (an unseeded forest changes every run).
rf_pipeline = Pipeline([('st_scaler', StandardScaler()),
                        ('rf_model', RandomForestRegressor(random_state=42))])

In [6]:
# 10-fold cross-validation of the pipeline, scored with negated mean squared error.
scores = cross_val_score(
    rf_pipeline,
    X,
    y,
    cv=10,
    scoring='neg_mean_squared_error',
)

In [7]:
# Drop the sign of each fold's negated MSE, take the square root to get the
# per-fold RMSE, then average over the folds.
final_avg_rmse = np.sqrt(np.abs(scores)).mean()
print("Final RMSE:", final_avg_rmse)

Final RMSE: 4.1800508595672525


### Preprocessing I: LabelEncoder and OneHotEncoder
- `LabelEncoder`: Converts a categorical column of strings into integers
- `OneHotEncoder`: Takes the column of integers and encodes them as dummy variables
- Cannot be done within a pipeline.

### Preprocessing II: DictVectorizer
- Traditionally used in text processing
- Converts lists of feature mappings into vectors
- Need to convert the DataFrame into a list of dictionary entries

In [8]:
# Read the unprocessed Ames housing data; the trailing expression shows (rows, columns).
df = pd.read_csv(filepath_or_buffer='data/ames_unprocessed_data.csv')
df.shape

(1460, 21)

In [9]:
# Import LabelEncoder
from sklearn.preprocessing import LabelEncoder

# Treat a missing lot frontage as 0
df['LotFrontage'] = df['LotFrontage'].fillna(0)

# Columns with object dtype are taken to be the categorical ones
categorical_mask = (df.dtypes == object)
categorical_columns = df.columns[categorical_mask].tolist()

# Show the categorical columns before encoding
print(df[categorical_columns].head())

# A single encoder instance is reused: it is refit on each column in turn, so
# afterwards `le` only holds the fit from the last column it processed.
le = LabelEncoder()
df[categorical_columns] = df[categorical_columns].apply(le.fit_transform)

# Show the same rows after the columns have been replaced by integer codes
print(df[categorical_columns].head())

  MSZoning Neighborhood BldgType HouseStyle PavedDrive
0       RL      CollgCr     1Fam     2Story          Y
1       RL      Veenker     1Fam     1Story          Y
2       RL      CollgCr     1Fam     2Story          Y
3       RL      Crawfor     1Fam     2Story          Y
4       RL      NoRidge     1Fam     2Story          Y
   MSZoning  Neighborhood  BldgType  HouseStyle  PavedDrive
0         3             5         0           5           2
1         3            24         0           2           2
2         3             5         0           5           2
3         3             6         0           5           2
4         3            15         0           5           2


In [10]:
# Non-categorical columns as a NumPy array
df_processed = df.drop(columns=categorical_columns).to_numpy()

# Label-encoded categorical columns, kept as a DataFrame for one-hot encoding
df_ohe = df[categorical_columns]
df_ohe.head()

Unnamed: 0,MSZoning,Neighborhood,BldgType,HouseStyle,PavedDrive
0,3,5,0,5,2
1,3,24,0,2,2
2,3,5,0,5,2
3,3,6,0,5,2
4,3,15,0,5,2


In [11]:
# Import OneHotEncoder
from sklearn.preprocessing import OneHotEncoder

# Create OneHotEncoder: ohe
# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output` — confirm against the installed version
ohe = OneHotEncoder(sparse=False)

# Apply OneHotEncoder to the label-encoded categorical columns - output is no longer a dataframe: df_encoded_temp
df_encoded_temp = ohe.fit_transform(df_ohe)

# Append the one-hot columns to the right of the non-categorical columns: df_encoded
df_encoded = np.concatenate((df_processed, df_encoded_temp), axis=1)

# Print the first row of the resulting dataset - again, this will no longer be a pandas dataframe
print(df_encoded[:1, :])

# Print the shape of the original DataFrame
print(df.shape)

# Print the shape of the transformed array
print(df_encoded.shape)

[[6.000e+01 6.500e+01 8.450e+03 7.000e+00 5.000e+00 2.003e+03 0.000e+00
  1.710e+03 1.000e+00 0.000e+00 2.000e+00 1.000e+00 3.000e+00 0.000e+00
  5.480e+02 2.085e+05 0.000e+00 0.000e+00 0.000e+00 1.000e+00 0.000e+00
  0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 1.000e+00 0.000e+00
  0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
  0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
  0.000e+00 0.000e+00 0.000e+00 0.000e+00 1.000e+00 0.000e+00 0.000e+00
  0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
  1.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 1.000e+00]]
(1460, 21)
(1460, 62)


In [12]:
# Import DictVectorizer
from sklearn.feature_extraction import DictVectorizer

# One dict per row, built from the current state of df.
# Note: the categorical columns of `df` were overwritten with integer codes by
# the LabelEncoder cell, so every value here is numeric and the encoded array
# keeps the same number of columns as df.
df_dict = df.to_dict(orient='records')

# Dense output instead of a sparse matrix
dv = DictVectorizer(sparse=False)
df_encoded = dv.fit_transform(df_dict)

# Print the resulting first five rows
print(df_encoded[:5])

# Print the vocabulary
print(dv.vocabulary_)

[[3.000e+00 0.000e+00 1.000e+00 0.000e+00 0.000e+00 2.000e+00 5.480e+02
  1.710e+03 1.000e+00 5.000e+00 8.450e+03 6.500e+01 6.000e+01 3.000e+00
  5.000e+00 5.000e+00 7.000e+00 2.000e+00 0.000e+00 2.085e+05 2.003e+03]
 [3.000e+00 0.000e+00 0.000e+00 1.000e+00 1.000e+00 2.000e+00 4.600e+02
  1.262e+03 0.000e+00 2.000e+00 9.600e+03 8.000e+01 2.000e+01 3.000e+00
  2.400e+01 8.000e+00 6.000e+00 2.000e+00 0.000e+00 1.815e+05 1.976e+03]
 [3.000e+00 0.000e+00 1.000e+00 0.000e+00 1.000e+00 2.000e+00 6.080e+02
  1.786e+03 1.000e+00 5.000e+00 1.125e+04 6.800e+01 6.000e+01 3.000e+00
  5.000e+00 5.000e+00 7.000e+00 2.000e+00 1.000e+00 2.235e+05 2.001e+03]
 [3.000e+00 0.000e+00 1.000e+00 0.000e+00 1.000e+00 1.000e+00 6.420e+02
  1.717e+03 0.000e+00 5.000e+00 9.550e+03 6.000e+01 7.000e+01 3.000e+00
  6.000e+00 5.000e+00 7.000e+00 2.000e+00 1.000e+00 1.400e+05 1.915e+03]
 [4.000e+00 0.000e+00 1.000e+00 0.000e+00 1.000e+00 2.000e+00 8.360e+02
  2.198e+03 1.000e+00 5.000e+00 1.426e+04 8.400e+01 6.000e+0

In [13]:
# Target is SalePrice; every other column is a feature
y = df['SalePrice']
X = df.drop(columns='SalePrice')

In [14]:
import xgboost as xgb
# Import necessary modules
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline

# Treat a missing lot frontage as 0
X['LotFrontage'] = X['LotFrontage'].fillna(0)

# Two steps: vectorize the row dicts, then fit an XGBoost regressor
steps = [
    ("ohe_onestep", DictVectorizer(sparse=False)),
    ("xgb_model", xgb.XGBRegressor(objective='reg:squarederror')),
]
xgb_pipeline = Pipeline(steps=steps)

# Fit on one dict per row; the fitted pipeline is the cell's displayed value
xgb_pipeline.fit(X.to_dict(orient='records'), y)

Pipeline(memory=None,
         steps=[('ohe_onestep',
                 DictVectorizer(dtype=<class 'numpy.float64'>, separator='=',
                                sort=True, sparse=False)),
                ('xgb_model',
                 XGBRegressor(base_score=0.5, booster='gbtree',
                              colsample_bylevel=1, colsample_bynode=1,
                              colsample_bytree=1, gamma=0,
                              importance_type='gain', learning_rate=0.1,
                              max_delta_step=0, max_depth=3, min_child_weight=1,
                              missing=None, n_estimators=100, n_jobs=1,
                              nthread=None, objective='reg:squarederror',
                              random_state=0, reg_alpha=0, reg_lambda=1,
                              scale_pos_weight=1, seed=None, silent=None,
                              subsample=1, verbosity=1))],
         verbose=False)

### Incorporating xgboost into pipelines

In [15]:
# Switch back to the Boston housing data for the scaler + XGBoost pipeline.
# NOTE(review): `load_boston` is not available in scikit-learn >= 1.2 — confirm the pinned version.
data = load_boston()
X = data.data
y = data.target
print(X.shape, y.shape)

(506, 13) (506,)


In [16]:
# Standardize the features, then fit an XGBoost regressor
xgb_pipeline = Pipeline(steps=[
    ('st_scaler', StandardScaler()),
    ('xgb_boost', xgb.XGBRegressor(objective='reg:squarederror')),
])

# 10-fold cross-validation scored with negated mean squared error
scores = cross_val_score(
    xgb_pipeline,
    X,
    y,
    cv=10,
    scoring='neg_mean_squared_error',
)

# Per-fold RMSE, averaged over the folds
final_avg_rmse = np.sqrt(np.abs(scores)).mean()
print("Final XGB RMSE:", final_avg_rmse)

Final XGB RMSE: 4.027195933230151


### Additional components introduced for pipelines
- `sklearn_pandas`
    - `DataFrameMapper` - interoperability between `pandas` and `scikit-learn` 
    - `CategoricalImputer` - Allow for imputation of categorical variables before conversion to integers
- `sklearn.impute` 
    - `SimpleImputer` - Native imputation of numerical columns in scikit-learn (called `Imputer` in `sklearn.preprocessing` in older releases)
- `sklearn.pipeline` 
    - `FeatureUnion` - combine multiple pipelines of features into a single pipeline of features

In [17]:
# Re-read the Ames data from disk and split it into features and target
df = pd.read_csv('data/ames_unprocessed_data.csv')
y = df['SalePrice']
X = df.drop(columns='SalePrice')
X.shape

(1460, 20)

In [18]:
# Treat a missing lot frontage as 0
X['LotFrontage'] = X['LotFrontage'].fillna(0)

# Two steps: vectorize the row dicts, then fit a shallow XGBoost regressor
steps = [
    ("ohe_onestep", DictVectorizer(sparse=False)),
    ("xgb_model", xgb.XGBRegressor(max_depth=2, objective="reg:squarederror")),
]
xgb_pipeline = Pipeline(steps=steps)

# 10-fold cross-validation on one dict per row, scored with negated MSE
cross_val_scores = cross_val_score(
    xgb_pipeline,
    X.to_dict(orient='records'),
    y,
    scoring='neg_mean_squared_error',
    cv=10,
)

# Per-fold RMSE, averaged over the folds
print("10-fold RMSE: ", np.sqrt(np.abs(cross_val_scores)).mean())

10-fold RMSE:  29867.603720688923


In [35]:
# Column names for the headerless chronic kidney disease file; the last entry
# ('class') is the label column. The trailing expression shows the count.
cols = [
    'age', 'bp', 'sg', 'al', 'su',
    'bgr', 'bu', 'sc', 'sod', 'pot',
    'hemo', 'pcv', 'wc', 'rc', 'rbc',
    'pc', 'pcc', 'ba', 'htn', 'dm',
    'cad', 'appet', 'pe', 'ane', 'class',
]
len(cols)

25

In [36]:
# Read the headerless CKD file and attach the column names from `cols`
df = pd.read_csv('data/chronic_kidney_disease.csv', header=None)
df.columns = cols

# "?" cells become NaN; columns that then parse as numbers are converted and
# the rest are left unchanged.
# NOTE(review): `errors='ignore'` is deprecated in recent pandas — confirm the pinned version.
df = df.replace("?", np.nan)
df = df.apply(pd.to_numeric, errors='ignore')

# Features are every column except the label
X = df.drop(columns='class')
y = df['class']

age      float64
bp       float64
sg       float64
al       float64
su       float64
bgr       object
bu        object
sc        object
sod       object
pot      float64
hemo     float64
pcv      float64
wc       float64
rc       float64
rbc      float64
pc       float64
pcc      float64
ba       float64
htn       object
dm        object
cad       object
appet     object
pe        object
ane       object
dtype: object

In [37]:
# Import necessary modules
from sklearn_pandas import CategoricalImputer, DataFrameMapper

# Number of missing values in each feature column
nulls_per_column = X.isna().sum()

In [38]:
from sklearn.impute import SimpleImputer

In [41]:
# Object-dtype columns are treated as categorical, all others as numeric
categorical_feature_mask = X.dtypes == object
categorical_columns = X.columns[categorical_feature_mask].tolist()
non_categorical_columns = X.columns[~categorical_feature_mask].tolist()

# One (column selector, imputer) pair per feature, collected in a single list
# rather than in separate numeric and categorical DataFrameMappers:
#   - numeric columns are selected as a one-element list and median-imputed
#   - categorical columns are selected by bare name and use CategoricalImputer
transformers = [
    *(([column], SimpleImputer(strategy="median")) for column in non_categorical_columns),
    *((column, CategoricalImputer()) for column in categorical_columns),
]

In [42]:
# Import FeatureUnion
from sklearn.pipeline import FeatureUnion

# A single DataFrameMapper applies every imputer in `transformers`; it takes a
# DataFrame as input and returns a DataFrame. No FeatureUnion of separate
# numeric and categorical mappers is built here.
numeric_categorical_union = DataFrameMapper(
    transformers,
    df_out=True,
    input_df=True,
)

In [43]:
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin

class Dictifier(BaseEstimator, TransformerMixin):
    """Stateless transformer that turns a DataFrame into a list of row dicts."""

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` as a list with one ``{column: value}`` dict per row."""
        return X.to_dict(orient='records')

In [44]:
# Re-read the headerless CKD file (label column included) and name its columns
X = pd.read_csv(filepath_or_buffer='data/chronic_kidney_disease.csv', header=None)
X.columns = cols

In [45]:
# "?" cells become NaN; columns that then parse as numbers are converted
X = X.replace("?", np.nan)
X = X.apply(pd.to_numeric, errors='ignore')

# Integer labels from the 'class' column: ckd -> 0, notckd -> 1
y = X['class'].map({
    'ckd': 0,
    'notckd': 1,
}).values
print(type(X), type(y))

<class 'pandas.core.frame.DataFrame'> <class 'numpy.ndarray'>


In [46]:
# Full pipeline: impute -> row dicts -> vectorize -> XGBoost classifier
pipeline = Pipeline(steps=[
    ("featureunion", numeric_categorical_union),
    ("dictifier", Dictifier()),
    ("vectorizer", DictVectorizer(sort=False)),
    ("clf", xgb.XGBClassifier(max_depth=3)),
])

# 3-fold cross-validated ROC AUC; the array of fold scores is displayed
cross_val_scores = cross_val_score(
    pipeline,
    X,
    y,
    cv=3,
    scoring="roc_auc",
)
cross_val_scores

array([0.99928571, 0.99686747, 0.99975904])

In [47]:
# Average the per-fold AUC scores and print the result
print("3-fold AUC: ", cross_val_scores.mean())

3-fold AUC:  0.998637406769937


In [22]:
# IPython help lookup for cross_val_score (interactive syntax, not plain Python)
? cross_val_score

[0;31mSignature:[0m
 [0mcross_val_score[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mestimator[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mX[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0my[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mgroups[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mscoring[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcv[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mn_jobs[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mverbose[0m[0;34m=[0m[0;36m0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mfit_params[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mpre_dispatch[0m[0;34m=[0m[0;34m'2*n_jobs'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0merror_score[0m[0;34m=[0m[0mnan[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Evaluate a score by cross-v