### Import Packages and Load Data

In [1]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [2]:
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [3]:
import xgboost as xgb

In [4]:
titanic_training = pd.read_csv('/Users/jpmcelyea/GIT_Code/Data-Science/Kaggle-Titanic/titanic/train.csv')

In [5]:
titanic_test = pd.read_csv('/Users/jpmcelyea/GIT_Code/Data-Science/Kaggle-Titanic/titanic/test.csv')

### Initial Dataframe Exploration

In [6]:
titanic_training.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
titanic_training.set_index("PassengerId", inplace=True)

In [8]:
titanic_training[['Cabin1', 'Cabin2', 'Cabin3']] = titanic_training['Cabin'].\
                str.extract(r'(?P<Cabin1>[A-Z]+\d+)\s?(?P<Cabin2>[A-Z]+\d+)?\s?(?P<Cabin3>[A-Z]+\d+)?')

In [9]:
titanic_training.drop(columns=['Ticket', 'Name','Cabin'], inplace = True)

In [10]:
titanic_training['CabinLetter'] = titanic_training['Cabin1'].str.extract(r'(?P<Cabin1_Code>[A-Z]+)')

In [11]:
titanic_training['Cabin1'] = np.where(titanic_training['Cabin1'].isnull(), 0, 1)
titanic_training['Cabin2'] = np.where(titanic_training['Cabin2'].isnull(), 0, 1)
titanic_training['Cabin3'] = np.where(titanic_training['Cabin3'].isnull(), 0, 1)
titanic_training['CabinNum'] = titanic_training['Cabin1'] + titanic_training['Cabin2'] + titanic_training['Cabin3']

In [12]:
titanic_training['MultipleCabins']=np.where(titanic_training['CabinNum'] > 1, 1, 0)

In [13]:
titanic_training.drop(columns=['Cabin1', 'Cabin2','Cabin3','CabinNum'], inplace = True)

In [14]:
titanic_training.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,CabinLetter,MultipleCabins
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0,3,male,22.0,1,0,7.25,S,,0
2,1,1,female,38.0,1,0,71.2833,C,C,0
3,1,3,female,26.0,0,0,7.925,S,,0
4,1,1,female,35.0,1,0,53.1,S,C,0
5,0,3,male,35.0,0,0,8.05,S,,0


In [15]:
nullseries = titanic_training.isnull().sum()
nullseries[nullseries > 0]

Age            177
Embarked         2
CabinLetter    691
dtype: int64

In [16]:
nullseries = titanic_training.isnull().sum()/len(titanic_training)
nullseries[nullseries > 0]

Age            0.198653
Embarked       0.002245
CabinLetter    0.775533
dtype: float64

In [17]:
titanic_training.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,MultipleCabins
count,891.0,891.0,714.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208,0.022447
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429,0.148214
min,0.0,1.0,0.42,0.0,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104,0.0
50%,0.0,3.0,28.0,0.0,0.0,14.4542,0.0
75%,1.0,3.0,38.0,1.0,0.0,31.0,0.0
max,1.0,3.0,80.0,8.0,6.0,512.3292,1.0


In [18]:
for column in ('Sex', 'Embarked','CabinLetter','Pclass'):
    titanic_training[column] = titanic_training[column].astype('category')

In [19]:
titanic_training.dtypes

Survived             int64
Pclass            category
Sex               category
Age                float64
SibSp                int64
Parch                int64
Fare               float64
Embarked          category
CabinLetter       category
MultipleCabins       int64
dtype: object

In [20]:
X = titanic_training.drop('Survived', axis=1)
y = titanic_training['Survived']

In [21]:
train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.4, random_state = 202)

In [22]:
train_X.head()

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,CabinLetter,MultipleCabins
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
567,3,male,19.0,0,0,7.8958,S,,0
325,3,male,,8,2,69.55,S,,0
757,3,male,28.0,0,0,7.7958,S,,0
193,3,female,19.0,1,0,7.8542,S,,0
222,2,male,27.0,0,0,13.0,S,,0


In [27]:
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn import set_config
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

In [28]:
train_X.dtypes

Pclass            category
Sex               category
Age                float64
SibSp                int64
Parch                int64
Fare               float64
Embarked          category
CabinLetter       category
MultipleCabins       int64
dtype: object

In [60]:
knn_pipeline = Pipeline(steps =[
    ['preprocessing', 
     ColumnTransformer(remainder='passthrough',
     transformers=[(
             'dummies', OneHotEncoder(), X.select_dtypes(include=['category']).columns)
              ])]
    ,('imputer', SimpleImputer(strategy='mean'))
    ,('scaler', StandardScaler())
    ,('model', KNeighborsClassifier())
    ])

In [61]:
knn_pipeline.fit(train_X, train_y)
knn_pipeline.score(valid_X, valid_y)

0.7899159663865546

In [62]:
scores = cross_val_score(knn_pipeline, X, y, 
                        scoring=('accuracy'), cv=5)

scores.mean()

0.8013746783001695

In [114]:
rf_pipeline = Pipeline(steps =[
    ['preprocessing', 
     ColumnTransformer(remainder='passthrough',
     transformers=[(
             'dummies', OneHotEncoder(), X.select_dtypes(include=['category']).columns)
              ])]
    ,('imputer', SimpleImputer(strategy='mean'))
    ,('model', RandomForestClassifier(random_state =123))
    ])

In [87]:
rf_pipeline.fit(train_X, train_y)
rf_pipeline.score(valid_X, valid_y)

0.803921568627451

In [65]:
scores = cross_val_score(rf_pipeline, X, y, 
                        scoring=('accuracy'), cv=5)
scores.mean()

0.8047266336074319

In [66]:
logit_pipeline = Pipeline(steps =[
    ['preprocessing', 
     ColumnTransformer(remainder='passthrough',
     transformers=[(
             'dummies', OneHotEncoder(drop='first'), X.select_dtypes(include=['category']).columns)
              ])]
    ,('imputer', SimpleImputer(strategy='mean'))
    ,('model', LogisticRegression(penalty="l2", C=1e42, solver="liblinear", random_state=123))
    ])

In [67]:
logit_pipeline.fit(train_X, train_y)
logit_pipeline.score(valid_X, valid_y)

0.7703081232492998

In [68]:
scores = cross_val_score(logit_pipeline, X, y, 
                        scoring=('accuracy'), cv=5)
scores.mean()

0.7957253154227607

In [92]:
rf_pipeline = Pipeline(steps =[
    ['preprocessing', 
     ColumnTransformer(remainder='passthrough',
     transformers=[(
             'dummies', OneHotEncoder(), X.select_dtypes(include=['category']).columns)
              ])]
    ,('imputer', SimpleImputer(strategy='mean'))
    ])

In [108]:
rf_model = RandomForestClassifier(random_state =123)

In [106]:
x_processed = rf_pipeline.fit_transform(X)

In [109]:
param_grid = {'max_depth': [2, 4, 8, 15],
              'max_features': ['auto', 'sqrt']
             }

In [110]:
grid_rf_class = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
    refit=True, return_train_score=True)

In [111]:
grid_rf_class.fit(x_processed, y)

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=123), n_jobs=4,
             param_grid={'max_depth': [2, 4, 8, 15],
                         'max_features': ['auto', 'sqrt']},
             return_train_score=True, scoring='roc_auc')

In [112]:
grid_rf_class.best_score_

0.8684152732420524

In [113]:
grid_rf_class.best_params_

{'max_depth': 8, 'max_features': 'auto'}

In [125]:
rf_pipeline = Pipeline(steps =[
    ['preprocessing', 
     ColumnTransformer(remainder='passthrough',
     transformers=[(
             'dummies', OneHotEncoder(), X.select_dtypes(include=['category']).columns)
              ])]
    ,('imputer', SimpleImputer(strategy='mean'))
    ,('model', RandomForestClassifier())
    ])

In [126]:
grid_rf_class = GridSearchCV(
    estimator=rf_pipeline,
    param_grid=param_grid)

In [127]:
grid_rf_class.fit(X, y)

ValueError: Invalid parameter max_depth for estimator Pipeline(steps=[['preprocessing',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('dummies', OneHotEncoder(),
                                                  Index(['Pclass', 'Sex', 'Embarked', 'CabinLetter'], dtype='object'))])],
                ('imputer', SimpleImputer()),
                ('model', RandomForestClassifier())]). Check the list of available parameters with `estimator.get_params().keys()`.

In [None]:
grid_search = GridSearchCV(estimator = rf_pipeline, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [None]:
grid_rf_class = GridSearchCV(
    estimator=rf_class,
    param_grid=param_grid,
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
    refit=True, return_train_score=True)
print(grid_rf_class)

In [None]:
# Read the cv_results property into a dataframe & print it out
cv_results_df = pd.DataFrame(grid_rf_class.cv_results_)
print(cv_results_df)

# Extract and print the column with a dictionary of hyperparameters used
column = cv_results_df.loc[:, ['params']]
print(column)

# Extract and print the row that had the best mean test score
best_row = cv_results_df[cv_results_df['rank_test_score'] == 1 ]
print(best_row)

# Print out the ROC_AUC score from the best-performing square
best_score = grid_rf_class.best_score_
print(best_score)

# Create a variable from the row related to the best-performing square
cv_results_df = pd.DataFrame(grid_rf_class.cv_results_)
best_row = cv_results_df.loc[[grid_rf_class.best_index_]]
print(best_row)

# Get the n_estimators parameter from the best-performing square and print
best_n_estimators = grid_rf_class.best_params_["n_estimators"]
print(best_n_estimators)

In [None]:
grid_search = GridSearchCV(estimator = logit_pipeline, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [None]:
param_grid = {
    "pca__n_components": [5, 15, 30, 45, 64],
    "logistic__C": np.logspace(-4, 4, 4),
}
search = GridSearchCV(rf_pipeline, param_grid, n_jobs=-1)
search.fit(X_digits, y_digits)

In [None]:
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}
# Create a based model
rf = RandomForestRegressor()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [49]:
rf_pipeline.predict(valid_X)

array([1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,

In [None]:
pipeline = Pipeline([
    ['preprocessing', ColumnTransformer(remainder='passthrough',
        transformers=[
            ('dummies', OneHotEncoder(drop='first'), ['Category', 'currency', 'endDay']),
        ]
    )],
    ['model', LogisticRegression(penalty="l2", C=1e42, solver='liblinear')]
])

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline

preprocessing = ColumnTransformer(remainder='passthrough',
    transformers=[
        ('vacation', OrdinalEncoder(categories=[['No', 'Yes'], ['No', 'Yes']]), ['VACATION', 'SW']),
        ('slot', OrdinalEncoder(categories=[['Free', 'Controlled']]), ['SLOT']),
        ('gate', OrdinalEncoder(categories=[['Free', 'Constrained']]), ['GATE']),
    ]
)

model = LinearRegression(normalize=True)

pipeline = Pipeline([
    ['preprocessing', preprocessing],
    ['model', model],
])

In [None]:
scaled_pipeline = Pipeline([
    ['preprocessing', ColumnTransformer(remainder='passthrough', transformers=[
        ('vacation', OrdinalEncoder(categories=[['No', 'Yes'], ['No', 'Yes']]), ['VACATION', 'SW']),
        ('slot', OrdinalEncoder(categories=[['Free', 'Controlled']]), ['SLOT']),
        ('gate', OrdinalEncoder(categories=[['Free', 'Constrained']]), ['GATE']),
    ])],
    ['scaler', StandardScaler()],
    ['model', LinearRegression(normalize=True)],
])

In [None]:
import numpy as np
from sklearn.model_selection import cross_validate

scores = cross_validate(scaled_pipeline, X, y, 
                        scoring=('r2', 'neg_mean_absolute_error'), cv=5)

In [None]:
classifier = Pipeline([
    ['model', VotingClassifier([
        ('logit', LogisticRegression(penalty="l2", C=1e42, solver='liblinear')),
        ('dtree', DecisionTreeClassifier()),
        ('bagging', BaggingClassifier(DecisionTreeClassifier(), max_samples=0.5, max_features=0.5)),
        ('boosted', AdaBoostClassifier(n_estimators=100, base_estimator=DecisionTreeClassifier())),
        ('rf', RandomForestClassifier(max_features=3, min_samples_split=300,
                                      random_state=0, n_estimators=100, criterion='entropy')),
        ('lda', LinearDiscriminantAnalysis()),
        ('nn', Pipeline([
            ('scaler', MinMaxScaler()),
            ('nn', MLPClassifier(hidden_layer_sizes=(10), activation='logistic', solver='lbfgs', 
                                 random_state=12, max_iter=5000)),
        ]))
    ], voting='soft')]
])

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn import set_config
set_config(display='diagram')   

pipeline = Pipeline([
    ['preprocessing', ColumnTransformer(remainder='passthrough',
        transformers=[
            ('dummies', OneHotEncoder(drop='first'), ['column1', 'column2', 'column3']),
        ]
    )],
    ['model', LogisticRegression(penalty="l2", C=1e42, solver='liblinear')]
])

X = df.drop(columns=[outcome])
y = df[outcome]

# partition data
train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.4, random_state=1)

pipeline.fit(train_X, train_y)

In [None]:
scaled_pipeline = Pipeline([
    ['preprocessing', ColumnTransformer(remainder='passthrough', transformers=[
        ..... 
    ])],
    ['scaler', StandardScaler()],
    ['model', LinearRegression(normalize=True)],
])

In [51]:
xg_model = xgb.XGBClassifier(objective='binary:logistic', 
                             n_estimators=20, 
                             seed=123,
                             use_label_encoder=False,
                             verbosity=0)

In [53]:
xg_model.fit(train_X_imp, train_y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=20, n_jobs=8, num_parallel_tree=1, random_state=123,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=123,
              subsample=1, tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=0)

In [57]:
xg_model.score(valid_X_imp, valid_y)

0.8179271708683473