### Import Packages and Load Data

In [246]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [247]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing

In [248]:
from sklearn import set_config
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder, FunctionTransformer, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score,cross_validate

In [249]:
import xgboost as xgb

In [250]:
titanic_training = pd.read_csv('/Users/jpmcelyea/GIT_Code/Data-Science/Kaggle-Titanic/titanic/train.csv')

In [251]:
titanic_test = pd.read_csv('/Users/jpmcelyea/GIT_Code/Data-Science/Kaggle-Titanic/titanic/test.csv')

### Initial Dataframe Exploration

In [252]:
titanic_training.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [253]:
titanic_training.set_index("PassengerId", inplace=True)

In [254]:
titanic_training[['Cabin1', 'Cabin2', 'Cabin3']] = titanic_training['Cabin'].\
                str.extract(r'(?P<Cabin1>[A-Z]+\d+)\s?(?P<Cabin2>[A-Z]+\d+)?\s?(?P<Cabin3>[A-Z]+\d+)?')

In [255]:
titanic_training.drop(columns=['Ticket', 'Name','Cabin'], inplace = True)

In [256]:
titanic_training['CabinLetter'] = titanic_training['Cabin1'].str.extract(r'(?P<Cabin1_Code>[A-Z]+)')

In [257]:
titanic_training['Cabin1'] = np.where(titanic_training['Cabin1'].isnull(), 0, 1)
titanic_training['Cabin2'] = np.where(titanic_training['Cabin2'].isnull(), 0, 1)
titanic_training['Cabin3'] = np.where(titanic_training['Cabin3'].isnull(), 0, 1)
titanic_training['CabinNum'] = titanic_training['Cabin1'] + titanic_training['Cabin2'] + titanic_training['Cabin3']

In [258]:
titanic_training['MultipleCabins']=np.where(titanic_training['CabinNum'] > 1, 1, 0)

In [259]:
titanic_training.drop(columns=['Cabin1', 'Cabin2','Cabin3','CabinNum'], inplace = True)

In [260]:
titanic_training.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,CabinLetter,MultipleCabins
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0,3,male,22.0,1,0,7.25,S,,0
2,1,1,female,38.0,1,0,71.2833,C,C,0
3,1,3,female,26.0,0,0,7.925,S,,0
4,1,1,female,35.0,1,0,53.1,S,C,0
5,0,3,male,35.0,0,0,8.05,S,,0


In [261]:
nullseries = titanic_training.isnull().sum()
nullseries[nullseries > 0]

Age            177
Embarked         2
CabinLetter    691
dtype: int64

In [262]:
nullseries = titanic_training.isnull().sum()/len(titanic_training)
nullseries[nullseries > 0]

Age            0.198653
Embarked       0.002245
CabinLetter    0.775533
dtype: float64

In [263]:
titanic_training.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,MultipleCabins
count,891.0,891.0,714.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208,0.022447
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429,0.148214
min,0.0,1.0,0.42,0.0,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104,0.0
50%,0.0,3.0,28.0,0.0,0.0,14.4542,0.0
75%,1.0,3.0,38.0,1.0,0.0,31.0,0.0
max,1.0,3.0,80.0,8.0,6.0,512.3292,1.0


In [264]:
for column in ('Sex', 'Embarked','CabinLetter','Pclass'):
    titanic_training[column] = titanic_training[column].astype('category')

In [265]:
titanic_training.dtypes

Survived             int64
Pclass            category
Sex               category
Age                float64
SibSp                int64
Parch                int64
Fare               float64
Embarked          category
CabinLetter       category
MultipleCabins       int64
dtype: object

In [266]:
X = titanic_training.drop('Survived', axis=1)
y = titanic_training['Survived']

In [267]:
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.3, random_state = 202)

In [23]:
knn_pipeline = Pipeline(steps =[
    ['preprocessing', 
     ColumnTransformer(remainder='passthrough',
     transformers=[(
             'dummies', OneHotEncoder(), X.select_dtypes(include=['category']).columns)
              ])]
    ,('imputer', SimpleImputer(strategy='mean'))
    ,('scaler', StandardScaler())
    ,('model', KNeighborsClassifier())
    ])

In [24]:
scores = cross_val_score(knn_pipeline, train_X, train_y, 
                        scoring=('accuracy'), cv=5)

scores.mean()

0.7944774193548387

In [25]:
knn_pipeline.fit(train_X, train_y)
knn_pipeline.score(test_X, test_y)

0.7798507462686567

In [26]:
rf_pipeline = Pipeline(steps =[
    ['preprocessing', 
     ColumnTransformer(remainder='passthrough',
     transformers=[(
             'dummies', OneHotEncoder(), X.select_dtypes(include=['category']).columns)
              ])]
    ,('imputer', SimpleImputer(strategy='mean'))
    ,('model', RandomForestClassifier(random_state =123))
    ])

In [28]:
scores = cross_val_score(rf_pipeline, train_X, train_y, 
                        scoring=('accuracy'), cv=10)
scores.mean()

0.7849462365591398

In [27]:
rf_pipeline.fit(train_X, train_y)
rf_pipeline.score(test_X, test_y)

0.8022388059701493

In [38]:
logit_pipeline = Pipeline(steps =[
    ['preprocessing', 
     ColumnTransformer(remainder='passthrough',
     transformers=[(
             'dummies', OneHotEncoder(drop='first'), X.select_dtypes(include=['category']).columns)
              ])]
    ,('imputer', SimpleImputer(strategy='mean'))
    ,('model', LogisticRegression(max_iter=1000,random_state=123))
    ])

In [39]:
scores = cross_val_score(logit_pipeline, train_X, train_y, 
                        scoring=('accuracy'), cv=10)
scores.mean()

0.8202764976958525

In [40]:
logit_pipeline.fit(train_X, train_y)
logit_pipeline.score(test_X, test_y)

0.7761194029850746

In [41]:
xgb_pipeline = Pipeline(steps =[
    ['preprocessing', 
     ColumnTransformer(remainder='passthrough',
     transformers=[(
             'dummies', OneHotEncoder(drop='first'), X.select_dtypes(include=['category']).columns)
              ])]
    ,('model', xgb.XGBClassifier(use_label_encoder=False, verbosity=0, random_state=123))
    ])

In [43]:
scores = cross_val_score(xgb_pipeline, train_X, train_y, 
                        scoring=('accuracy'), cv=10)
scores.mean()

0.8090373783922171

In [42]:
xgb_pipeline.fit(train_X, train_y)
xgb_pipeline.score(test_X, test_y)

0.7910447761194029

In [None]:
data_mean, data_std = np.mean(train_X.Age), np.std(train_X.Age)
cut_off = data_std * 3
lower, upper = data_mean - cut_off, data_mean + cut_off

print('Identified outliers: %d' % len(outliers))
outliers_removed = [x for x in train_X.Age if x >= lower and x <= upper]
print('Non-outlier observations: %d' % len(outliers_removed))

In [127]:
def OutlierRemover(data, column):
    
    outlier_index = []
    
    for col in column:
        feature_mean, feature_std = np.mean(data[col]), np.std(data[col])
        cut_off = feature_std * 3
        lower, upper = feature_mean - cut_off, feature_mean + cut_off
        outliers = data[col][(data[col] < lower) | (data[col] > upper)].index
        outlier_index.append(outliers)
    
    return data.drop(outlier_index)

In [182]:
    column = ['Age','Fare']
    outlier_list = []
    data = train_X.copy()
    
    for col in column:
        feature_mean, feature_std = np.mean(data[col]), np.std(data[col])
        cut_off = feature_std * 3
        lower, upper = feature_mean - cut_off, feature_mean + cut_off
        outliers = list(data[col][(data[col] < lower) | (data[col] > upper)].index)
        outlier_list = outlier_list + outliers
    

In [221]:
def OutlierRemover(data_input, column):
    
    column = column
    outlier_list = []
    data = data_input.copy()
    
    for col in column:
        feature_mean, feature_std = np.mean(data[col]), np.std(data[col])
        cut_off = feature_std * 3
        lower, upper = feature_mean - cut_off, feature_mean + cut_off
        outliers = list(data[col][(data[col] < lower) | (data[col] > upper)].index)
        outlier_list = outlier_list + outliers
    
    data_clean = data.drop(outlier_list)
    
    return data_clean

In [236]:
def OutlierRemover(data_input):
    
    data = train_X.copy()
    column = data_input.columns
    outlier_list = []
    
    for col in column:
        feature_mean, feature_std = np.mean(data[col]), np.std(data[col])
        cut_off = feature_std * 3
        lower, upper = feature_mean - cut_off, feature_mean + cut_off
        outliers = list(data[col][(data[col] < lower) | (data[col] > upper)].index)
        outlier_list = outlier_list + outliers
    
    data_clean = data.drop(outlier_list)
    
    return data_clean

In [235]:
OutlierRemover(train_X[['Age']])

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
312,313,2,"Lahtinen, Mrs. William (Anna Sylfven)",female,26.0,1,1,250651,26.0000,,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
276,277,3,"Lindblom, Miss. Augusta Charlotta",female,45.0,0,0,347073,7.7500,,S
875,876,3,"Najib, Miss. Adele Kiamie ""Jane""",female,15.0,0,0,2667,7.2250,,C
376,377,3,"Landergren, Miss. Aurora Adelia",female,22.0,0,0,C 7077,7.2500,,S
...,...,...,...,...,...,...,...,...,...,...,...
185,186,1,"Rood, Mr. Hugh Roscoe",male,,0,0,113767,50.0000,A32,S
808,809,2,"Meyer, Mr. August",male,39.0,0,0,248723,13.0000,,S
460,461,1,"Anderson, Mr. Harry",male,48.0,0,0,19952,26.5500,E12,S
60,61,3,"Sirayanian, Mr. Orsen",male,22.0,0,0,2669,7.2292,,C


In [216]:
outlier_list

[630, 851]

In [205]:
train_X[train_X.index == 630]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
630,631,1,"Barkworth, Mr. Algernon Henry Wilson",male,80.0,0,0,27042,30.0,A23,S


In [214]:
check = train_X.drop(outlier_list)

In [215]:
check.shape

(621, 11)

In [217]:
check[check.index == 630]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked


In [129]:

feature_mean, feature_std = np.mean(train_X['Fare']), np.std(train_X['Fare'])
cut_off = feature_std * 3
lower, upper = feature_mean - cut_off, feature_mean + cut_off
outlier_index = train_X[(train_X['Fare'] < lower) | (train_X['Fare'] > upper)].index
train_X.drop(outlier_index)


Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,CabinLetter,MultipleCabins
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
313,2,female,26.0,1,1,26.0000,S,,0
5,3,male,35.0,0,0,8.0500,S,,0
277,3,female,45.0,0,0,7.7500,S,,0
876,3,female,15.0,0,0,7.2250,C,,0
377,3,female,22.0,0,0,7.2500,S,,0
...,...,...,...,...,...,...,...,...,...
186,1,male,,0,0,50.0000,S,A,0
809,2,male,39.0,0,0,13.0000,S,,0
461,1,male,48.0,0,0,26.5500,S,E,0
61,3,male,22.0,0,0,7.2292,C,,0


In [239]:
knn_pipeline = Pipeline(steps =[
    ['preprocessing', 
     ColumnTransformer(remainder='passthrough',
     transformers=[
         ('Outlier_removal', FunctionTransformer(OutlierRemover, validate = False), ['Age', 'Fare']),
         ('dummies', OneHotEncoder(), ['Pclass', 'Sex', 'Embarked', 'CabinLetter'])
              ])]
    ,('imputer', SimpleImputer(strategy='mean'))
    ,('scaler', StandardScaler())
    ,('model', KNeighborsClassifier())
    ])

In [274]:
knn_preprocess = Pipeline(steps =[
    ['preprocessing', 
     ColumnTransformer(remainder='passthrough',
     transformers=[
         ('Outlier_removal', FunctionTransformer(OutlierRemover, validate = False), ['Age', 'Fare'])
              ])]
    ])

In [275]:
train_X.head()

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,CabinLetter,MultipleCabins
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
313,2,female,26.0,1,1,26.0,S,,0
5,3,male,35.0,0,0,8.05,S,,0
277,3,female,45.0,0,0,7.75,S,,0
876,3,female,15.0,0,0,7.225,C,,0
377,3,female,22.0,0,0,7.25,S,,0


In [276]:
knn_preprocess.fit_transform(train_X)

ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 607 and the array at index 1 has size 623

In [223]:
OutlierRemover(train_X, ['Age', 'Fare'])

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
312,313,2,"Lahtinen, Mrs. William (Anna Sylfven)",female,26.0,1,1,250651,26.0000,,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
276,277,3,"Lindblom, Miss. Augusta Charlotta",female,45.0,0,0,347073,7.7500,,S
875,876,3,"Najib, Miss. Adele Kiamie ""Jane""",female,15.0,0,0,2667,7.2250,,C
376,377,3,"Landergren, Miss. Aurora Adelia",female,22.0,0,0,C 7077,7.2500,,S
...,...,...,...,...,...,...,...,...,...,...,...
185,186,1,"Rood, Mr. Hugh Roscoe",male,,0,0,113767,50.0000,A32,S
808,809,2,"Meyer, Mr. August",male,39.0,0,0,248723,13.0000,,S
460,461,1,"Anderson, Mr. Harry",male,48.0,0,0,19952,26.5500,E12,S
60,61,3,"Sirayanian, Mr. Orsen",male,22.0,0,0,2669,7.2292,,C


In [56]:
knn_pipeline.fit(train_X, train_y)

ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 617 and the array at index 1 has size 623

In [None]:
X.select_dtypes(include=['category']).columns

In [None]:
param_grid = {'model__leaf_size': list(range(1, 100)),
              'model__n_neighbors': list(range(1, 10)),
              'model__weights': ['uniform', 'distance'],
              'model__algorithm': ['ball_tree', 'kd_tree', 'brute'],
              'model__p': [1,2]
             }

In [None]:
random_knn = RandomizedSearchCV(
        estimator=knn_pipeline,
        param_distributions = param_grid, 
        n_iter = 100,
        n_jobs= -1, 
        scoring='accuracy', 
        cv = 10)

In [None]:
random_knn.fit(train_X, train_y);

In [None]:
random_knn.best_params_

In [None]:
random_knn.best_score_

In [None]:
knn_model_best_random = grid_knn.best_estimator_

In [None]:
knn_model_best_random.score(test_X, test_y)

In [None]:
param_grid = {
              'model__n_neighbors': list(range(1, 10))
             }

In [None]:
grid_knn = GridSearchCV(
    estimator=knn_pipeline,
    param_grid=param_grid,
    n_jobs = -1,
    scoring = 'accuracy',
    cv=10)

In [None]:
grid_knn.fit(train_X, train_y);

In [None]:
grid_knn.best_score_

In [None]:
grid_knn.best_params_

In [None]:
knn_model_best = grid_knn.best_estimator_

In [None]:
knn_model_best.score(test_X, test_y)

In [None]:
rf_pipeline = Pipeline(steps =[
    ['preprocessing', 
     ColumnTransformer(remainder='passthrough',
     transformers=[(
             'dummies', OneHotEncoder(), X.select_dtypes(include=['category']).columns)
              ])]
    ,('imputer', SimpleImputer(strategy='mean'))
    ])

In [None]:
rf_model = RandomForestClassifier(random_state =123)

In [None]:
x_processed = rf_pipeline.fit_transform(X)

In [None]:
param_grid = {'max_depth': [2, 4, 8, 15],
              'max_features': ['auto', 'sqrt']
             }

In [None]:
grid_rf_class = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
    refit=True, return_train_score=True)

In [None]:
grid_rf_class.fit(x_processed, y)

In [None]:
grid_rf_class.best_score_

In [None]:
grid_rf_class.best_params_

In [None]:
grid_search = GridSearchCV(estimator = rf_pipeline, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [None]:
grid_rf_class = GridSearchCV(
    estimator=rf_class,
    param_grid=param_grid,
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
    refit=True, return_train_score=True)
print(grid_rf_class)

In [None]:
# Read the cv_results property into a dataframe & print it out
cv_results_df = pd.DataFrame(grid_rf_class.cv_results_)
print(cv_results_df)

# Extract and print the column with a dictionary of hyperparameters used
column = cv_results_df.loc[:, ['params']]
print(column)

# Extract and print the row that had the best mean test score
best_row = cv_results_df[cv_results_df['rank_test_score'] == 1 ]
print(best_row)

# Print out the ROC_AUC score from the best-performing square
best_score = grid_rf_class.best_score_
print(best_score)

# Create a variable from the row related to the best-performing square
cv_results_df = pd.DataFrame(grid_rf_class.cv_results_)
best_row = cv_results_df.loc[[grid_rf_class.best_index_]]
print(best_row)

# Get the n_estimators parameter from the best-performing square and print
best_n_estimators = grid_rf_class.best_params_["n_estimators"]
print(best_n_estimators)

In [None]:
grid_search = GridSearchCV(estimator = logit_pipeline, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [None]:
param_grid = {
    "pca__n_components": [5, 15, 30, 45, 64],
    "logistic__C": np.logspace(-4, 4, 4),
}
search = GridSearchCV(rf_pipeline, param_grid, n_jobs=-1)
search.fit(X_digits, y_digits)

In [None]:
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}
# Create a based model
rf = RandomForestRegressor()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [None]:
rf_pipeline.predict(valid_X)

In [None]:
penalty="l2", C=1e42, solver="liblinear", 

In [None]:
pipeline = Pipeline([
    ['preprocessing', ColumnTransformer(remainder='passthrough',
        transformers=[
            ('dummies', OneHotEncoder(drop='first'), ['Category', 'currency', 'endDay']),
        ]
    )],
    ['model', LogisticRegression(penalty="l2", C=1e42, solver='liblinear')]
])

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline

preprocessing = ColumnTransformer(remainder='passthrough',
    transformers=[
        ('vacation', OrdinalEncoder(categories=[['No', 'Yes'], ['No', 'Yes']]), ['VACATION', 'SW']),
        ('slot', OrdinalEncoder(categories=[['Free', 'Controlled']]), ['SLOT']),
        ('gate', OrdinalEncoder(categories=[['Free', 'Constrained']]), ['GATE']),
    ]
)

model = LinearRegression(normalize=True)

pipeline = Pipeline([
    ['preprocessing', preprocessing],
    ['model', model],
])

In [None]:
scaled_pipeline = Pipeline([
    ['preprocessing', ColumnTransformer(remainder='passthrough', transformers=[
        ('vacation', OrdinalEncoder(categories=[['No', 'Yes'], ['No', 'Yes']]), ['VACATION', 'SW']),
        ('slot', OrdinalEncoder(categories=[['Free', 'Controlled']]), ['SLOT']),
        ('gate', OrdinalEncoder(categories=[['Free', 'Constrained']]), ['GATE']),
    ])],
    ['scaler', StandardScaler()],
    ['model', LinearRegression(normalize=True)],
])

In [None]:
import numpy as np
from sklearn.model_selection import cross_validate

scores = cross_validate(scaled_pipeline, X, y, 
                        scoring=('r2', 'neg_mean_absolute_error'), cv=5)

In [None]:
classifier = Pipeline([
    ['model', VotingClassifier([
        ('logit', LogisticRegression(penalty="l2", C=1e42, solver='liblinear')),
        ('dtree', DecisionTreeClassifier()),
        ('bagging', BaggingClassifier(DecisionTreeClassifier(), max_samples=0.5, max_features=0.5)),
        ('boosted', AdaBoostClassifier(n_estimators=100, base_estimator=DecisionTreeClassifier())),
        ('rf', RandomForestClassifier(max_features=3, min_samples_split=300,
                                      random_state=0, n_estimators=100, criterion='entropy')),
        ('lda', LinearDiscriminantAnalysis()),
        ('nn', Pipeline([
            ('scaler', MinMaxScaler()),
            ('nn', MLPClassifier(hidden_layer_sizes=(10), activation='logistic', solver='lbfgs', 
                                 random_state=12, max_iter=5000)),
        ]))
    ], voting='soft')]
])

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn import set_config
set_config(display='diagram')   

pipeline = Pipeline([
    ['preprocessing', ColumnTransformer(remainder='passthrough',
        transformers=[
            ('dummies', OneHotEncoder(drop='first'), ['column1', 'column2', 'column3']),
        ]
    )],
    ['model', LogisticRegression(penalty="l2", C=1e42, solver='liblinear')]
])

X = df.drop(columns=[outcome])
y = df[outcome]

# partition data
train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.4, random_state=1)

pipeline.fit(train_X, train_y)

In [None]:
scaled_pipeline = Pipeline([
    ['preprocessing', ColumnTransformer(remainder='passthrough', transformers=[
        ..... 
    ])],
    ['scaler', StandardScaler()],
    ['model', LinearRegression(normalize=True)],
])

In [None]:
xg_model = xgb.XGBClassifier(objective='binary:logistic', 
                             n_estimators=20, 
                             seed=123,
                             use_label_encoder=False,
                             verbosity=0)

In [None]:
xg_model.fit(train_X_imp, train_y)

In [None]:
xg_model.score(valid_X_imp, valid_y)