### Import Packages and Load Data

In [None]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Raise pandas' display limits so long/wide frames are shown without truncation.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 500)

In [None]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
from sklearn import set_config
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder, FunctionTransformer, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score,cross_validate
import xgboost as xgb

In [None]:
# Load the labelled Kaggle Titanic training set.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
titanic_training = pd.read_csv(
    filepath_or_buffer='/Users/jpmcelyea/GIT_Code/Data-Science/Kaggle-Titanic/titanic/train.csv'
)

In [None]:
# Load the Kaggle Titanic test set.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
titanic_test = pd.read_csv(
    filepath_or_buffer='/Users/jpmcelyea/GIT_Code/Data-Science/Kaggle-Titanic/titanic/test.csv'
)

### Data Preprocessing + Feature Engineering

In [None]:
# Peek at the first five raw training rows.
titanic_training.head(n=5)

In [None]:
# Use PassengerId as the row index so it is not treated as a feature.
titanic_training = titanic_training.set_index("PassengerId")

In [None]:
# Split the 'Cabin' string into up to three cabin codes (letters followed by digits).
cabin_pattern = r'(?P<Cabin1>[A-Z]+\d+)\s?(?P<Cabin2>[A-Z]+\d+)?\s?(?P<Cabin3>[A-Z]+\d+)?'
titanic_training[['Cabin1', 'Cabin2', 'Cabin3']] = titanic_training['Cabin'].str.extract(cabin_pattern)

In [None]:
# Keep only the leading letter(s) of the first cabin code.
first_cabin_code = titanic_training['Cabin1']
titanic_training['CabinLetter'] = first_cabin_code.str.extract(r'(?P<Cabin1_Code>[A-Z]+)')

In [None]:
# Turn each cabin slot into a 0/1 "present" flag, then count the cabins per passenger.
cabin_slots = ['Cabin1', 'Cabin2', 'Cabin3']
for cabin_slot in cabin_slots:
    titanic_training[cabin_slot] = np.where(titanic_training[cabin_slot].isnull(), 0, 1)
titanic_training['CabinNum'] = titanic_training[cabin_slots].sum(axis=1)

In [None]:
# The per-slot flags are no longer needed once CabinNum exists.
titanic_training = titanic_training.drop(columns=['Cabin1', 'Cabin2', 'Cabin3'])

In [None]:
# Passengers without a cabin get the placeholder letter 'N'.
cabin_letters = titanic_training['CabinLetter']
titanic_training['CabinLetter'] = cabin_letters.fillna('N')

In [None]:
# Split each ticket on its first space into a prefix and the remainder.
# `n` is passed by keyword: pandas 2.x made every argument after `pat`
# keyword-only, so the positional form `split(" ", 1, ...)` raises TypeError.
# NOTE(review): the next cell reassigns both columns using a regex extract,
# so this split is effectively superseded.
titanic_training[['TicketLetter', 'TicketNum']] = titanic_training['Ticket'].str.split(" ", n=1, expand=True)

In [None]:
# Capture "<prefix> <digits>" tickets; rows that do not match get NaN in both columns.
ticket_pattern = r'([\w\/\.]+)\s(\d+)'
titanic_training[['TicketLetter', 'TicketNum']] = titanic_training['Ticket'].str.extract(ticket_pattern)

In [None]:
# 1 when the ticket carried a prefix (TicketLetter was extracted), else 0.
has_ticket_prefix = titanic_training['TicketLetter'].notna()
titanic_training['TicketSpecial'] = np.where(has_ticket_prefix, 1, 0)

In [None]:
# Grab the first word after ", " in the name, which is the passenger's title.
title_pattern = r'((?<=\,\s)[\w]+)'
titanic_training[['NameTitle']] = titanic_training['Name'].str.extract(title_pattern)

In [None]:
# Remove the raw text columns and the intermediate ticket parts.
consumed_columns = ['Ticket', 'Name', 'Cabin', 'TicketLetter', 'TicketNum']
titanic_training = titanic_training.drop(columns=consumed_columns)

In [None]:
# Number of missing values per column; show only columns that have any.
nullseries = titanic_training.isna().sum()
nullseries.loc[nullseries.gt(0)]

In [None]:
# Fraction of missing values per column; show only columns that have any.
row_count = len(titanic_training)
nullseries = titanic_training.isna().sum().div(row_count)
nullseries.loc[nullseries.gt(0)]

In [None]:
# Summary statistics of the training frame after the feature engineering above.
titanic_training.describe()

In [None]:
# Collapse titles into two buckets: Mr/Mrs/Miss are 'Standard', everything else is 'Elite'.
standard_titles = ['Mr', 'Mrs', 'Miss']
has_standard_title = titanic_training['NameTitle'].isin(standard_titles)
titanic_training['NameTitle'] = np.where(has_standard_title, 'Standard', 'Elite')

In [None]:
# Mark the discrete features as pandas categoricals so the pipelines can select them by dtype.
category_dtypes = {
    column: 'category'
    for column in ('Sex', 'Embarked', 'CabinLetter', 'Pclass', 'NameTitle')
}
titanic_training = titanic_training.astype(category_dtypes)

In [None]:
# Check the column dtypes after the category conversion.
titanic_training.dtypes

In [None]:
# First five rows of the engineered training frame.
titanic_training.head(n=5)

In [None]:
# Draft KNN pipeline with per-column preprocessing: mean-impute 'Age',
# one-hot encode the categoricals, ordinal-encode 'NameTitle', pass the
# remaining columns through, then standardise and classify.
#
# The categorical columns are read from `titanic_training` instead of `X`:
# `X` is only created in the "Model Selection" section, so referencing it here
# raised NameError on a fresh kernel. 'Survived' is not cast to category, so
# the selected columns are exactly the categorical feature columns.
# (OrdinalEncoder is already imported in the import cell at the top.)
#
# NOTE(review): 'NameTitle' is category-typed, so it is encoded twice here
# ('dummies' and 'name'); the "Model Selection" section rebinds `knn_pipeline`.
categorical_columns = titanic_training.select_dtypes(include=['category']).columns
knn_pipeline = Pipeline(steps=[
    ('preprocessing',
     ColumnTransformer(
         remainder='passthrough',
         transformers=[
             ('imputer', SimpleImputer(strategy='mean'), ['Age']),
             ('dummies', OneHotEncoder(), categorical_columns),
             ('name', OrdinalEncoder(categories=[['Standard', 'Elite']]), ['NameTitle']),
         ])),
    ('scaler', StandardScaler()),
    ('model', KNeighborsClassifier()),
])

### Model Selection

In [None]:
# Separate the target ('Survived') from the feature columns.
y = titanic_training['Survived']
X = titanic_training.drop(columns=['Survived'])

In [None]:
# Hold out 30% of the labelled data for a final check; the seed fixes the split.
holdout_split = train_test_split(X, y, test_size=0.3, random_state=202)
train_X, test_X, train_y, test_y = holdout_split

In [None]:
# KNN: one-hot encode the categoricals, mean-impute, standardise, then classify.
knn_encoder = ColumnTransformer(
    transformers=[
        ('dummies', OneHotEncoder(), X.select_dtypes(include=['category']).columns),
    ],
    remainder='passthrough',
)
knn_pipeline = Pipeline(steps=[
    ('preprocessing', knn_encoder),
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('model', KNeighborsClassifier()),
])

In [None]:
# Mean 5-fold cross-validated accuracy of the KNN pipeline on the training split.
scores = cross_val_score(estimator=knn_pipeline, X=train_X, y=train_y,
                         cv=5, scoring='accuracy')

np.mean(scores)

In [None]:
# Fit on the training split and report accuracy on the hold-out split.
knn_pipeline.fit(train_X, train_y).score(test_X, test_y)

In [None]:
# Random forest: one-hot encode the categoricals, mean-impute, then classify (no scaling step).
rf_encoder = ColumnTransformer(
    transformers=[
        ('dummies', OneHotEncoder(), X.select_dtypes(include=['category']).columns),
    ],
    remainder='passthrough',
)
rf_pipeline = Pipeline(steps=[
    ('preprocessing', rf_encoder),
    ('imputer', SimpleImputer(strategy='mean')),
    ('model', RandomForestClassifier(random_state=123)),
])

In [None]:
# Mean 10-fold cross-validated accuracy of the random-forest pipeline.
scores = cross_val_score(estimator=rf_pipeline, X=train_X, y=train_y,
                         cv=10, scoring='accuracy')
np.mean(scores)

In [None]:
# Fit on the training split and report accuracy on the hold-out split.
rf_pipeline.fit(train_X, train_y).score(test_X, test_y)

In [None]:
# Logistic regression: one-hot encode with the first level dropped, mean-impute, then classify.
logit_encoder = ColumnTransformer(
    transformers=[
        ('dummies', OneHotEncoder(drop='first'), X.select_dtypes(include=['category']).columns),
    ],
    remainder='passthrough',
)
logit_pipeline = Pipeline(steps=[
    ('preprocessing', logit_encoder),
    ('imputer', SimpleImputer(strategy='mean')),
    ('model', LogisticRegression(max_iter=1000, random_state=123)),
])

In [None]:
# Mean 10-fold cross-validated accuracy of the logistic-regression pipeline.
scores = cross_val_score(estimator=logit_pipeline, X=train_X, y=train_y,
                         cv=10, scoring='accuracy')
np.mean(scores)

In [None]:
# Fit on the training split and report accuracy on the hold-out split.
logit_pipeline.fit(train_X, train_y).score(test_X, test_y)

In [None]:
# XGBoost: one-hot encode with the first level dropped, then classify.
# There is no imputer step in this pipeline, unlike the other three.
xgb_encoder = ColumnTransformer(
    transformers=[
        ('dummies', OneHotEncoder(drop='first'), X.select_dtypes(include=['category']).columns),
    ],
    remainder='passthrough',
)
xgb_classifier = xgb.XGBClassifier(use_label_encoder=False, verbosity=0, random_state=123)
xgb_pipeline = Pipeline(steps=[
    ('preprocessing', xgb_encoder),
    ('model', xgb_classifier),
])

In [None]:
# Mean 10-fold cross-validated accuracy of the XGBoost pipeline.
scores = cross_val_score(estimator=xgb_pipeline, X=train_X, y=train_y,
                         cv=10, scoring='accuracy')
np.mean(scores)

In [None]:
# Fit on the training split and report accuracy on the hold-out split.
xgb_pipeline.fit(train_X, train_y).score(test_X, test_y)

### Tune the Model

In [None]:
# KNN search space: neighbour counts 1 through 9 for the pipeline's 'model' step.
param_grid = dict(model__n_neighbors=[k for k in range(1, 10)])

In [None]:
# Exhaustive 10-fold search over the KNN grid, scored by accuracy, using all cores.
knn_search_options = dict(scoring='accuracy', cv=10, n_jobs=-1)
grid_knn = GridSearchCV(estimator=knn_pipeline, param_grid=param_grid, **knn_search_options)

In [None]:
# Run the search and keep the refit best pipeline.
knn_model_best = grid_knn.fit(train_X, train_y).best_estimator_;

In [None]:
# Best CV accuracy, the winning parameters, and accuracy on the hold-out split.
print(grid_knn.best_score_,
      grid_knn.best_params_,
      knn_model_best.score(test_X, test_y),
      sep='\n')

In [None]:
# Random-forest search space for the randomized search.
# 'auto' was removed from `max_features`: for classifiers it meant the same as
# 'sqrt' (so 'sqrt' was sampled twice as often), and current scikit-learn
# releases reject it as an invalid value.
param_grid = {
    'model__n_estimators': [100, 350, 500],
    'model__max_features': ['log2', 'sqrt'],
    'model__min_samples_leaf': list(range(2, 30)),
    'model__max_depth': list(range(1, 10)),
}

In [None]:
# Randomized 10-fold search over the random-forest space (150 sampled settings).
# `random_state` is fixed so the sampled candidates, and therefore the reported
# best score/parameters, are reproducible between runs; 123 matches the seed
# used for the models in this notebook.
random_rf = RandomizedSearchCV(
        estimator=rf_pipeline,
        param_distributions=param_grid,
        n_iter=150,
        n_jobs=-1,
        scoring='accuracy',
        cv=10,
        random_state=123)

random_rf.fit(train_X, train_y);

In [None]:
# Best CV accuracy, the winning parameters, and accuracy on the hold-out split.
print(random_rf.best_score_,
      random_rf.best_params_,
      random_rf.score(test_X, test_y),
      sep='\n')

In [None]:
# Narrowed random-forest grid for the exhaustive search.
param_grid = dict(
    model__n_estimators=[25, 50, 100],
    model__max_features=['sqrt'],
    model__min_samples_leaf=list(range(2, 9)),
    model__max_depth=list(range(4, 7)),
)

In [None]:
# Exhaustive 10-fold search over the narrowed random-forest grid, then fit it.
rf_search_options = dict(scoring='accuracy', cv=10, n_jobs=-1)
grid_rf = GridSearchCV(estimator=rf_pipeline, param_grid=param_grid, **rf_search_options)

grid_rf.fit(train_X, train_y);

In [None]:
# Best CV accuracy, the winning parameters, and accuracy on the hold-out split.
print(grid_rf.best_score_,
      grid_rf.best_params_,
      grid_rf.score(test_X, test_y),
      sep='\n')

In [None]:
# Logistic-regression grid: L1/L2 penalty, 20 log-spaced C values from 1e-4 to 1e4, liblinear solver.
param_grid = {
    'model__penalty': ['l1', 'l2'],
    'model__C': np.logspace(start=-4, stop=4, num=20),
    'model__solver': ['liblinear'],
}

In [None]:
# Exhaustive 10-fold search over the logistic-regression grid, scored by accuracy.
logit_search_options = dict(scoring='accuracy', cv=10, n_jobs=-1)
grid_logit = GridSearchCV(estimator=logit_pipeline, param_grid=param_grid, **logit_search_options)

In [None]:
# Run the search and keep the refit best pipeline.
logit_model_best = grid_logit.fit(train_X, train_y).best_estimator_;

In [None]:
# Best CV accuracy, the winning parameters, and accuracy on the hold-out split.
print(grid_logit.best_score_,
      grid_logit.best_params_,
      logit_model_best.score(test_X, test_y),
      sep='\n')

In [None]:
# XGBoost search space for the randomized search.
param_grid = {
    'model__learning_rate': np.arange(0.05, 1, 0.05),
    'model__max_depth': np.arange(3, 10),
    'model__n_estimators': np.arange(50, 200, 50),
}

In [None]:
# Randomized 10-fold search over the XGBoost space (25 sampled settings).
# `random_state` is fixed so the sampled candidates, and therefore the reported
# best score/parameters, are reproducible between runs; 123 matches the seed
# used for the models in this notebook.
random_xgb = RandomizedSearchCV(
        estimator=xgb_pipeline,
        param_distributions=param_grid,
        n_iter=25,
        n_jobs=-1,
        scoring='accuracy',
        cv=10,
        random_state=123)

random_xgb.fit(train_X, train_y);

In [None]:
# Best CV accuracy, the winning parameters, and accuracy on the hold-out split.
print(random_xgb.best_score_,
      random_xgb.best_params_,
      random_xgb.score(test_X, test_y),
      sep='\n')

In [None]:
# Narrowed XGBoost grid for the exhaustive search.
# The learning-rate list previously read `[0,15,0.2, 0.25]` — a comma typed in
# place of the decimal point — which searched the rates 0 and 15 instead of 0.15.
param_grid = {
    'model__learning_rate': [0.15, 0.2, 0.25],
    'model__max_depth': [2, 3, 4, 5],
    'model__n_estimators': [20, 30, 50],
}

In [None]:
# Exhaustive 10-fold search over the narrowed XGBoost grid, scored by accuracy.
xgb_search_options = dict(scoring='accuracy', cv=10, n_jobs=-1)
grid_xgb = GridSearchCV(estimator=xgb_pipeline, param_grid=param_grid, **xgb_search_options)

In [None]:
# Run the search and keep the refit best pipeline.
xgb_model_best = grid_xgb.fit(train_X, train_y).best_estimator_;

In [None]:
# Best CV accuracy, the winning parameters, and accuracy on the hold-out split.
print(grid_xgb.best_score_,
      grid_xgb.best_params_,
      grid_xgb.score(test_X, test_y),
      sep='\n')

### Fit Model To Test Data

In [None]:
# NOTE(review): this cell looks like a leftover from a housing-price regression
# exercise (see `housing_dmatrix` and the RMSE metric). In this notebook `X` / `y`
# are the Titanic features and the binary 'Survived' label — confirm whether the
# cell should be removed.
# Create your housing DMatrix: housing_dmatrix
housing_dmatrix = xgb.DMatrix(data=X, label=y)

# Create the parameter dictionary for each tree: params
# NOTE(review): "reg:linear" is a regression objective; presumably a deprecated
# alias of "reg:squarederror" in current XGBoost — verify.
params = {"objective":"reg:linear", "max_depth":4}

# Perform cross-validation with early stopping: cv_results
# 3 folds, at most 50 boosting rounds, stopping after 10 rounds without improvement.
cv_results = xgb.cv(dtrain=housing_dmatrix, params=params, nfold=3, num_boost_round=50, early_stopping_rounds=10,metrics="rmse", as_pandas=True, seed=123)

In [None]:
# NOTE(review): regression grid search (XGBRegressor scored by negative MSE) run
# on the Titanic `X` / `y`; looks like a leftover from another exercise — confirm
# whether it belongs in this notebook.
# Create the parameter grid: gbm_param_grid
gbm_param_grid = {
    'colsample_bytree': [0.3, 0.7],
    'n_estimators': [50],
    'max_depth': [2, 5]
}

# Instantiate the regressor: gbm
gbm = xgb.XGBRegressor()

# Perform grid search: grid_mse (4-fold CV)
grid_mse = GridSearchCV(estimator=gbm,param_grid=gbm_param_grid,            
                        scoring='neg_mean_squared_error', cv=4, verbose=1)


# Fit grid_mse to the data
grid_mse.fit(X, y)

# Print the best parameters and lowest RMSE
# (best_score_ is a negative MSE, hence abs() before the square root)
print("Best parameters found: ", grid_mse.best_params_)
print("Lowest RMSE found: ", np.sqrt(np.abs(grid_mse.best_score_)))

In [None]:
# NOTE(review): regression randomized search run on the Titanic `X` / `y`; looks
# like a leftover from another exercise — confirm whether it belongs here.
# Create the parameter grid: gbm_param_grid
gbm_param_grid = {
    'n_estimators': [25],
    'max_depth': range(2, 12)
}

# Instantiate the regressor: gbm
# The search grid sets 'n_estimators' to 25, which replaces the 10 given here.
gbm = xgb.XGBRegressor(n_estimators=10)

# Perform random search: randomized_mse (5 sampled settings, 4-fold CV)
randomized_mse =  RandomizedSearchCV(param_distributions=gbm_param_grid, estimator=gbm,scoring='neg_mean_squared_error', n_iter=5, cv=4, verbose=1)


# Fit randomized_mse to the data
randomized_mse.fit(X,y)

# Print the best parameters and lowest RMSE
# (best_score_ is a negative MSE, hence abs() before the square root)
print("Best parameters found: ", randomized_mse.best_params_)
print("Lowest RMSE found: ", np.sqrt(np.abs(randomized_mse.best_score_)))

In [None]:
# TODO: check label encoder / dict vectorizer

In [None]:
# NOTE(review): leftover from another exercise — no 'LotFrontage' column is created
# anywhere in the visible notebook, so the fillna line is expected to fail here.
# NOTE(review): this cell rebinds `xgb_pipeline`, replacing the Titanic
# XGBClassifier pipeline defined in the "Model Selection" section.
# Import necessary modules
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

# Fill LotFrontage missing values with 0
X.LotFrontage = X.LotFrontage.fillna(0)

# Setup the pipeline steps: steps
# DictVectorizer vectorizes the record dicts; the model is an XGBoost regressor.
steps = [("ohe_onestep", DictVectorizer(sparse=False)),
         ("xgb_model", xgb.XGBRegressor(max_depth=2, objective="reg:linear"))]

# Create the pipeline: xgb_pipeline
xgb_pipeline = Pipeline(steps)

# Cross-validate the model (10-fold, negative MSE) on the rows converted to dicts
cross_val_scores = cross_val_score(xgb_pipeline, X.to_dict("records"), y,
scoring = "neg_mean_squared_error", cv = 10)


In [None]:
# NOTE(review): leftover from another exercise — no 'LotFrontage' column is created
# anywhere in the visible notebook, so the fillna line is expected to fail here.
# NOTE(review): this cell rebinds `xgb_pipeline` to a DictVectorizer + XGBRegressor
# pipeline, replacing the Titanic classifier pipeline of the same name.
# Import necessary modules
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline

# Fill LotFrontage missing values with 0
X.LotFrontage = X.LotFrontage.fillna(0)

# Setup the pipeline steps: steps
steps = [("ohe_onestep", DictVectorizer(sparse=False)),
         ("xgb_model", xgb.XGBRegressor())]

# Create the pipeline: xgb_pipeline
xgb_pipeline = Pipeline(steps)

# Fit the pipeline on the rows converted to a list of dicts
xgb_pipeline.fit(X.to_dict("records"), y)

In [None]:
# NOTE(review): leftover from another exercise. `Imputer` is not imported anywhere
# in the visible notebook, and `sklearn_pandas` is a third-party package that is
# used only in this cell — confirm whether the cell should be removed.
# Import necessary modules
from sklearn_pandas import DataFrameMapper
from sklearn_pandas import CategoricalImputer

# Check number of nulls in each feature column
nulls_per_column = X.isnull().sum()
print(nulls_per_column)

# Create a boolean mask for categorical columns
# NOTE(review): this selects object-dtype columns only; the categorical features in
# this notebook were cast to 'category' earlier, so they would not match — verify.
categorical_feature_mask = X.dtypes == object

# Get list of categorical column names
categorical_columns = X.columns[categorical_feature_mask].tolist()

# Get list of non-categorical column names
non_categorical_columns = X.columns[~categorical_feature_mask].tolist()

# Apply numeric imputer (one median imputer per non-categorical column)
numeric_imputation_mapper = DataFrameMapper(
                                            [([numeric_feature], Imputer(strategy="median")) for numeric_feature in non_categorical_columns],
                                            input_df=True,
                                            df_out=True
                                           )

# Apply categorical imputer (one CategoricalImputer per categorical column)
categorical_imputation_mapper = DataFrameMapper(
                                                [(category_feature, CategoricalImputer()) for category_feature in categorical_columns],
                                                input_df=True,
                                                df_out=True
                                               )

In [None]:
# NOTE(review): leftover from another exercise. `numeric_categorical_union`,
# `Dictifier` and `kidney_data` are not defined anywhere in the visible notebook,
# so this cell is expected to raise NameError — confirm whether it should be removed.
# Create full pipeline
pipeline = Pipeline([
                     ("featureunion", numeric_categorical_union),
                     ("dictifier", Dictifier()),
                     ("vectorizer", DictVectorizer(sort=False)),
                     ("clf", xgb.XGBClassifier(max_depth=3))
                    ])

# Perform cross-validation (3-fold, ROC AUC)
cross_val_scores = cross_val_score(pipeline, kidney_data, y, scoring="roc_auc", cv=3)

# Print avg. AUC
print("3-fold AUC: ", np.mean(cross_val_scores))

In [None]:
# NOTE(review): depends on `pipeline` from the preceding cell, which references
# names not defined in the visible notebook — confirm whether this cell should be
# removed together with it.
# Create the parameter grid (keys target the pipeline's 'clf' step)
gbm_param_grid = {
    'clf__learning_rate': np.arange(0.05, 1, 0.05),
    'clf__max_depth': np.arange(3, 10, 1),
    'clf__n_estimators': np.arange(50, 200, 50)
}

# Perform RandomizedSearchCV (2 sampled settings, 2-fold CV, ROC AUC)
randomized_roc_auc = RandomizedSearchCV(estimator=pipeline,param_distributions=gbm_param_grid, n_iter = 2, scoring='roc_auc', cv=2, verbose = 1 )

# Fit the estimator
randomized_roc_auc.fit(X,y)

# Compute metrics
print(randomized_roc_auc.best_score_)
print(randomized_roc_auc.best_estimator_)

In [None]:
# First five rows of the raw test set.
titanic_test.head(n=5)

In [None]:
# Apply the same feature engineering to the test set that was applied to the
# training set, so the fitted pipelines receive the columns they were trained on.
# Previously this section dropped 'Ticket' and 'Name' before 'TicketSpecial' and
# 'NameTitle' could be derived, produced 'MultipleCabins' instead of 'CabinNum',
# and left missing cabin letters as NaN instead of 'N', so the test frame did not
# match the training features.
# 'PassengerId' is kept as a regular column: the submission cells read it and
# drop it before predicting.

# Cabin: split into up to three cabin codes, keep the letter(s) of the first
# code ('N' when there is no cabin) and count how many cabins are listed.
test_cabins = titanic_test['Cabin'].\
                str.extract(r'(?P<Cabin1>[A-Z]+\d+)\s?(?P<Cabin2>[A-Z]+\d+)?\s?(?P<Cabin3>[A-Z]+\d+)?')
titanic_test['CabinLetter'] = test_cabins['Cabin1'].\
                str.extract(r'(?P<Cabin1_Code>[A-Z]+)', expand=False).fillna(value='N')
titanic_test['CabinNum'] = test_cabins.notnull().sum(axis=1)

# Ticket: 1 when the ticket has a "<prefix> <digits>" shape, else 0.
test_ticket_parts = titanic_test['Ticket'].str.extract(r'([\w\/\.]+)\s(\d+)')
titanic_test['TicketSpecial'] = np.where(test_ticket_parts[0].isnull(), 0, 1)

# Name: first word after ", " is the title; Mr/Mrs/Miss are 'Standard', the rest 'Elite'.
test_titles = titanic_test['Name'].str.extract(r'((?<=\,\s)[\w]+)', expand=False)
titanic_test['NameTitle'] = np.where(test_titles.isin(['Mr', 'Mrs', 'Miss']), 'Standard', 'Elite')

# The raw text columns are only dropped after every derived feature exists.
titanic_test.drop(columns=['Ticket', 'Name', 'Cabin'], inplace=True)

# Same categorical columns as the training frame.
for column in ('Sex', 'Embarked', 'CabinLetter', 'Pclass', 'NameTitle'):
    titanic_test[column] = titanic_test[column].astype('category')

In [None]:
# First five rows of the processed test set.
titanic_test.head(n=5)

In [None]:
#convert output to dataframe 
final_data = {'PassengerId': titanic_test.PassengerId, 'Survived': xgb_pipeline.predict(titanic_test.drop('PassengerId', axis = 1))}
submission = pd.DataFrame(data=final_data)

In [None]:
# First five rows of the submission frame.
submission.head(n=5)

In [None]:
# Write the first submission to CSV without the DataFrame index.
submission.to_csv('submission_1.csv', index =False)

In [None]:
# Share of each 'Survived' class in the training data.
training_labels = titanic_training['Survived']
training_labels.value_counts().div(len(training_labels))

In [None]:
# Share of each predicted class in the submission, for comparison with the training data.
predicted_labels = submission['Survived']
predicted_labels.value_counts().div(len(predicted_labels))

In [None]:
# (rows, columns) of the submission frame.
submission.shape

In [None]:
#convert output to dataframe 
final_data = {'PassengerId': titanic_test.PassengerId, 'Survived': logit_pipeline.predict(titanic_test.drop('PassengerId', axis = 1))}
submission = pd.DataFrame(data=final_data)

In [None]:
# Write the second submission to CSV without the DataFrame index.
submission.to_csv('submission_2.csv', index =False)

In [None]:
# Share of each predicted class in the second submission.
predicted_labels = submission['Survived']
predicted_labels.value_counts().div(len(predicted_labels))

In [None]:
#titanic_training['MultipleCabins']=np.where(titanic_training['CabinNum'] > 1, 1, 0)
#titanic_training.drop(columns=['Cabin1', 'Cabin2','Cabin3','CabinNum'], inplace = True)
#titanic_training[['Cabin', 'TicketNum']] = titanic_training['Ticket'].str.split(" ", 1, expand=True)
#'neg_mean_squared_error'
#'roc_auc'

In [None]:
##Example Submission
#example_submission = pd.read_csv('/Users/jpmcelyea/GIT_Code/Data-Science/Kaggle-Titanic/titanic/gender_submission.csv')
#example_submission.head()

In [None]:
# NOTE(review): looks like a leftover from another project — the columns
# 'VACATION', 'SW', 'SLOT' and 'GATE' are not created anywhere in the visible
# notebook, and `LinearRegression` is not imported in the visible import cells.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline

# NOTE(review): this rebinds `preprocessing`, which the import cell bound to the
# `sklearn.preprocessing` module.
# Ordinal-encode the listed two-level columns; all other columns pass through.
preprocessing = ColumnTransformer(remainder='passthrough',
    transformers=[
        ('vacation', OrdinalEncoder(categories=[['No', 'Yes'], ['No', 'Yes']]), ['VACATION', 'SW']),
        ('slot', OrdinalEncoder(categories=[['Free', 'Controlled']]), ['SLOT']),
        ('gate', OrdinalEncoder(categories=[['Free', 'Constrained']]), ['GATE']),
    ]
)

# NOTE(review): presumably `normalize` is no longer accepted by current
# scikit-learn releases — verify against the installed version.
model = LinearRegression(normalize=True)

# NOTE(review): rebinds `pipeline`, also assigned by the kidney-data cell above.
pipeline = Pipeline([
    ['preprocessing', preprocessing],
    ['model', model],
])

In [None]:
# Same ordinal encoding as the preceding cell, followed by standard scaling and a
# linear regression.
# NOTE(review): looks like a leftover from another project — the encoded columns
# are not created anywhere in the visible notebook and `LinearRegression` is not
# imported in the visible import cells.
scaled_pipeline = Pipeline([
    ['preprocessing', ColumnTransformer(remainder='passthrough', transformers=[
        ('vacation', OrdinalEncoder(categories=[['No', 'Yes'], ['No', 'Yes']]), ['VACATION', 'SW']),
        ('slot', OrdinalEncoder(categories=[['Free', 'Controlled']]), ['SLOT']),
        ('gate', OrdinalEncoder(categories=[['Free', 'Constrained']]), ['GATE']),
    ])],
    ['scaler', StandardScaler()],
    ['model', LinearRegression(normalize=True)],
])

In [None]:
# Soft-voting ensemble of seven classifiers wrapped in a single-step pipeline.
# NOTE(review): `VotingClassifier`, `DecisionTreeClassifier`, `BaggingClassifier`,
# `AdaBoostClassifier`, `LinearDiscriminantAnalysis`, `MinMaxScaler` and
# `MLPClassifier` are not imported in the visible import cells, and `classifier`
# is not used anywhere else in the visible notebook — confirm whether this cell
# should be removed.
classifier = Pipeline([
    ['model', VotingClassifier([
        ('logit', LogisticRegression(penalty="l2", C=1e42, solver='liblinear')),
        ('dtree', DecisionTreeClassifier()),
        ('bagging', BaggingClassifier(DecisionTreeClassifier(), max_samples=0.5, max_features=0.5)),
        ('boosted', AdaBoostClassifier(n_estimators=100, base_estimator=DecisionTreeClassifier())),
        ('rf', RandomForestClassifier(max_features=3, min_samples_split=300,
                                      random_state=0, n_estimators=100, criterion='entropy')),
        ('lda', LinearDiscriminantAnalysis()),
        # The network gets its own min-max scaling step.
        # Note: `(10)` is the integer 10, not a one-element tuple.
        ('nn', Pipeline([
            ('scaler', MinMaxScaler()),
            ('nn', MLPClassifier(hidden_layer_sizes=(10), activation='logistic', solver='lbfgs', 
                                 random_state=12, max_iter=5000)),
        ]))
    ], voting='soft')]
])

In [None]:
# NOTE(review): cross-validates `scaled_pipeline` (a regression pipeline whose
# encoded columns are not created in the visible notebook) on the Titanic `X` / `y`
# with regression metrics — looks like a leftover; confirm whether it should be removed.
# NOTE(review): rebinds `scores`, used earlier for the classifier CV accuracies.
import numpy as np
from sklearn.model_selection import cross_validate

# 5-fold cross-validation reporting R^2 and negative mean absolute error.
scores = cross_validate(scaled_pipeline, X, y, 
                        scoring=('r2', 'neg_mean_absolute_error'), cv=5)

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Alternative KNN pipeline with per-column preprocessing: mean-impute 'Age',
# one-hot encode the categoricals, ordinal-encode 'NameTitle'; the remaining
# columns pass through before scaling and classification.
# NOTE(review): this rebinds `knn_pipeline` to a new, unfitted pipeline and
# repeats the draft definition from the preprocessing section.
knn_column_steps = [
    ('imputer', SimpleImputer(strategy='mean'), ['Age']),
    ('dummies', OneHotEncoder(), X.select_dtypes(include=['category']).columns),
    ('name', OrdinalEncoder(categories=[['Standard', 'Elite']]), ['NameTitle']),
]
knn_pipeline = Pipeline(steps=[
    ('preprocessing', ColumnTransformer(transformers=knn_column_steps, remainder='passthrough')),
    ('scaler', StandardScaler()),
    ('model', KNeighborsClassifier()),
])