# The Machine Learning Process

* determine problem and constraints
* determine characteristics the problem / scenario dictates on the solution
	- model family
	- acceptable methods of dimensionality reduction and regularization
	- primary / secondary evaluation score (ie. accuracy) and methods (ie. confusion matrix, roc)
	
* integration and cleaning	
* exploration
* address balance (classification, anova, etc.)
* create Train, Validate, Test with split (above)

* configure Feature Extraction with feature_union
* configure Preprocess and choose model-families with pipeline

* use k-folds CV and grid search with Training set on multiple model-families and hyper-parameters
* evaluate with afore-mentioned confusion matrix, scoring, classifier-threshold, and tests
* select the best model-family / hyper-parameters

* apply to Validation set or all of Training set to model-family to parameterize it set final model
* use Testing set to evaluate final model characteristics

_Note:_ Train, Validate, and Test should be from different (independent) data sets, if possible.

<br>
<br>
<hr>

# Example

In [31]:
#dependencies
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [2]:
# make data (both numeric and categorical)
from sklearn.datasets.samples_generator import make_blobs
factor, resp = make_blobs(n_samples=100, centers=2, n_features=50, random_state=1)

data = ["paris", "barcelona", "kolkata", "new york"]
import random
[random.sample(data, 2) for _ in range(100)]

In [3]:
#begin process with response encoding
from sklearn.preprocessing import LabelEncoder
enc = LabelEncoder()
y = enc.fit_transform(resp)

In [4]:
X = factor
y = y

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [43]:
# create feature union of numeric data
features = []
features.append(('pca', PCA()))			#<<<-grid
features.append(('select_best', SelectKBest(k=6))) #<<<-grid
num_feature_eng = FeatureUnion(features)

# We create the preprocessing pipelines for both numeric and categorical data.
#columns_numeric = (X_train.loc[:, X_train.dtypes == np.float64]).columns
columns_numeric = list(range(50))
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
	('num_feature_eng', num_feature_eng)
	])

#columns_categoric = (X_train.loc[:, X_train.dtypes != np.float64]).columns
columns_categoric = []
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
	('select_best', SelectKBest(k=6))
	])
	
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, columns_numeric),
        ('cat', categorical_transformer, columns_categoric)])

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression(penalty='l1', solver='liblinear'))])

In [44]:
grid = {'preprocessor__num__num_feature_eng__pca__n_components':[.75, .80, .85, .90, .95],
        'classifier__C': np.logspace(-4, 4, 5)
       }
gridClf = GridSearchCV(clf, grid, cv=5)

clf.steps

In [45]:
gridClf.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('preprocessor', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('num', Pipeline(memory=None,
     steps=[('imputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan,
       strategy='median', verbo...ty='l1', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'preprocessor__num__num_feature_eng__pca__n_components': [0.75, 0.8, 0.85, 0.9, 0.95], 'classifier__C': array([1.e-04, 1.e-02, 1.e+00, 1.e+02, 1.e+04])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [46]:
#print( gridClf.best_estimator_ )
print( gridClf.best_params_ )
print("Best parameter (CV score=%0.3f):" % gridClf.best_score_)

{'classifier__C': 0.01, 'preprocessor__num__num_feature_eng__pca__n_components': 0.75}
Best parameter (CV score=1.000):


In [236]:
tmp1 = gridClf.best_estimator_.named_steps['preprocessor']
tmp2 = (tmp1.transformers_[0][1].named_steps['num_feature_eng'])
tmp3 = dict(tmp2.transformer_list).get('pca')

In [254]:
print("Amount of variance explained by components chosen: %0.3f" % tmp3.explained_variance_ratio_ )
print("Number of components chosen: %i" % tmp3.n_components_ )

Amount of variance explained by components chosen: 0.766
Number of components chosen: 1


In [256]:
from sklearn.metrics import confusion_matrix
y_test_pred = gridClf.predict(X_test)
print( "Confusion matrix: Test")
confusion_matrix(y_test, y_test_pred)

Confusion matrix: Test


array([[12,  0],
       [ 0,  8]])

In [257]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00         8

   micro avg       1.00      1.00      1.00        20
   macro avg       1.00      1.00      1.00        20
weighted avg       1.00      1.00      1.00        20

