# Import Libraries

In [106]:
import numpy as np
import pandas as pd
import os
import matplotlib as mpl
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
import winsound
frequency = 2500  # Set Frequency To 2500 Hertz
duration = 1000  # Set Duration To 1000 ms == 1 second

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import VotingClassifier

### Load Data

In [107]:
TITANIC_PATH =  "C:\\Users\\Josias\\Desktop\\ADA\\TitanicDataset\\titanic_data"
def load_titanic_data(titanic_path=TITANIC_PATH):
    csv_path = os.path.join(titanic_path, "train.csv")
    csv_path
    return pd.read_csv(csv_path)

def load_titanic_test_data(titanic_path=TITANIC_PATH):
    csv_path = os.path.join(titanic_path, "test.csv")
    csv_path
    return pd.read_csv(csv_path)

titanic_base = load_titanic_data()
titanic_test = load_titanic_test_data()

titanic_base.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### Data Clean etc.

In [108]:
y = titanic_base.Survived
X = titanic_base.drop("Survived", axis=1).drop("Ticket", axis=1).drop(
    "Name", axis=1).drop("PassengerId", axis=1).drop("Cabin", axis=1)
X["Embarked"].fillna('S',inplace=True)

#Removed Cabin feature
#X["Cabin"].fillna('Z',inplace=True)
#X["Cabin"] = X["Cabin"].str[0]
#X["Cabin"] = X["Cabin"].str.replace('T','A', regex=True)
#.str.replace('f.', 'ba', regex=True)
X_f = titanic_test
X_Pid_f = X_f.PassengerId
X_f = X_f.drop("Ticket", axis=1).drop("Name", axis=1).drop("PassengerId", axis=1).drop("Cabin", axis=1)
X_f["Embarked"].fillna('S',inplace=True)
#X_f["Cabin"].fillna('Z',inplace=True)
#X_f["Cabin"] = X_f["Cabin"].str[0]

X_num = X.drop("Sex", axis=1).drop("Embarked", axis=1).drop("Pclass", axis=1)#.drop("Cabin", axis=1)#.

### PipeLine

In [109]:
num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ])

num_attribs = list(X_num)
cat_attribs = ["Sex","Embarked","Pclass"]

full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])

X_prep = full_pipeline.fit_transform(X)

In [110]:
X_train, X_test, y_train, y_test = train_test_split(X_prep,y,test_size=0.2, stratify=y)

# Train Classifiers

## SGD Classifier

In [111]:
sgd_clf = SGDClassifier(max_iter=10000,tol=1e-6)
sgd_clf.fit(X_train,y_train)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge',
       max_iter=10000, n_iter=None, n_iter_no_change=5, n_jobs=None,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       tol=1e-06, validation_fraction=0.1, verbose=0, warm_start=False)

#### Determine Accuracy

In [112]:
sgd_scores = cross_val_score(sgd_clf,X_train,y_train, cv=10, scoring="accuracy")
sgd_scores.mean()

0.7793734630002236

In [113]:
y_train_pred = cross_val_predict(sgd_clf,X_train,y_train, cv=10)
confusion_matrix(y_train, y_train_pred)

array([[356,  83],
       [ 91, 182]], dtype=int64)

## Forest Classifier

In [114]:
forest_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16,n_jobs=-1)
forest_clf.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=16,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [115]:
forest_scores = cross_val_score(forest_clf, X_train, y_train, cv=10)
forest_scores.mean()

0.8285513078470826

In [174]:
param = { 
    'n_estimators': [1, 10, 100, 1000],
    'max_depth'         : [11, 12, 13, 14, 15,16,17,18],
    'max_leaf_nodes'         : [14, 16, 18, 20, 22,24,26,],
    'max_features': ['auto', 'sqrt', 'log2']
}
rand_grid_search = RandomizedSearchCV(forest_clf, param, cv=10, scoring="accuracy",n_iter=45)
forestGridS = GridSearchCV(forest_clf, param, cv=5, scoring="accuracy")

In [175]:
rand_grid_search.fit(X_train,y_train)



RandomizedSearchCV(cv=10, error_score='raise-deprecating',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=16,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid='warn', n_iter=45, n_jobs=None,
          param_distributions={'n_estimators': [1, 10, 100, 1000], 'max_depth': [11, 12, 13, 14, 15, 16, 17, 18], 'max_leaf_nodes': [14, 16, 18, 20, 22, 24, 26], 'max_features': ['auto', 'sqrt', 'log2']},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring='accuracy', verbose=0)

In [176]:
#forestGridS.fit(X_train,y_train)
#winsound.Beep(frequency, duration)
rand_grid_search.best_params_

{'n_estimators': 1000,
 'max_leaf_nodes': 22,
 'max_features': 'sqrt',
 'max_depth': 14}

In [177]:
y_pred = rand_grid_search.predict(X_train)
accuracy_score(y_train, y_pred)

0.8637640449438202

In [169]:
y_pred = rand_grid_search.predict(X_train)
accuracy_score(y_train, y_pred)

0.8609550561797753

## Train Set Run forest_clf

In [182]:
#rand_grid_search.best_score_
y_pred = rand_grid_search.predict(X_train)
#y_pred = forestGridS.predict(X_train)
accuracy_score(y_train, y_pred)

0.8637640449438202

## Test Set Run forest_clf

In [178]:
y_pred = rand_grid_search.predict(X_test)
accuracy_score(y_test, y_pred)

0.8156424581005587

## Final Test Data forest_clf

In [179]:
X_f_preped = full_pipeline.fit_transform(X_f)

In [180]:
#y_pred_final = forestGridS.predict(X_f_preped)
y_pred_final = rand_grid_search.predict(X_f_preped)

In [181]:
df = pd.DataFrame({'PassengerId':X_Pid_f.values,'Survived':y_pred_final})
df.to_csv(r'C:\\Users\\Josias\\Desktop\\ADA\\TitanicDataset\\titanic_data\\MyResultsForest_clf.csv', index = None, header=True)

## MLP

In [126]:
mlp_clf = MLPClassifier()
mlp_clf.fit(X_train,y_train)



MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [127]:
mlp_scores = cross_val_score(mlp_clf,X_train,y_train, cv=10, scoring="accuracy")
mlp_scores.mean()



0.8116890230270511

In [128]:
y_pred = mlp_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.8100558659217877

In [129]:
y_pred_final = mlp_clf.predict(X_f_preped)
df = pd.DataFrame({'PassengerId':X_Pid_f.values,'Survived':y_pred_final})
df.to_csv(r'C:\\Users\\Josias\\Desktop\\ADA\\TitanicDataset\\titanic_data\\MLP_clf.csv', index = None, header=True)

## Extra Trees

In [130]:
extra_trees_clf = ExtraTreesClassifier(n_estimators=1000)
extra_trees_clf.fit(X_train,y_train)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [131]:
extra_trees_scores = cross_val_score(extra_trees_clf,X_train,y_train, cv=10, scoring="accuracy")
extra_trees_scores.mean()

0.7963928012519561

In [132]:
y_pred = extra_trees_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.8044692737430168

In [134]:
y_pred_final = extra_trees_clf.predict(X_f_preped)
df = pd.DataFrame({'PassengerId':X_Pid_f.values,'Survived':y_pred_final})
df.to_csv(r'C:\\Users\\Josias\\Desktop\\ADA\\TitanicDataset\\titanic_data\\ExtraTrees_clf.csv', index = None, header=True)

## SVM

In [135]:
svm_clf = LinearSVC(C=1)
svm_clf.fit(X_train, y_train)



LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [136]:
svm_scores = cross_val_score(extra_trees_clf,X_train,y_train, cv=10, scoring="accuracy")
svm_scores.mean()

0.7963732394366196

In [137]:
y_pred = svm_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.8044692737430168

In [138]:
y_pred_final = svm_clf.predict(X_f_preped)
df = pd.DataFrame({'PassengerId':X_Pid_f.values,'Survived':y_pred_final})
df.to_csv(r'C:\\Users\\Josias\\Desktop\\ADA\\TitanicDataset\\titanic_data\\SVM_clf.csv', index = None, header=True)

## Ensemble

In [157]:
named_estimators = [
    ("forest_clf", forest_clf),
    ("extra_trees_clf", extra_trees_clf),
    ("svm_clf", svm_clf),
    ("mlp_clf", mlp_clf),
   # ("sgd_clf", sgd_clf)
]

In [158]:
voting_clf = VotingClassifier(named_estimators)

In [159]:
voting_clf.fit(X_train, y_train)



VotingClassifier(estimators=[('forest_clf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=16,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            ...=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False))],
         flatten_transform=None, n_jobs=None, voting='hard', weights=None)

In [160]:
voting_clf.score(X_test, y_test)

0.8100558659217877

In [162]:
[estimator.score(X_test, y_test) for estimator in voting_clf.estimators_]

[0.8044692737430168,
 0.8100558659217877,
 0.8044692737430168,
 0.8100558659217877]

In [163]:
y_pred_final = voting_clf.predict(X_f_preped)
df = pd.DataFrame({'PassengerId':X_Pid_f.values,'Survived':y_pred_final})
df.to_csv(r'C:\\Users\\Josias\\Desktop\\ADA\\TitanicDataset\\titanic_data\\Ensamble_clf.csv', index = None, header=True)

In [164]:
del voting_clf.estimators_[2]
voting_clf.voting = "soft"
voting_clf.score(X_test, y_test)

0.8044692737430168

In [152]:
y_pred_final = voting_clf.predict(X_f_preped)
df = pd.DataFrame({'PassengerId':X_Pid_f.values,'Survived':y_pred_final})
df.to_csv(r'C:\\Users\\Josias\\Desktop\\ADA\\TitanicDataset\\titanic_data\\EnsambleSoft_clf.csv', index = None, header=True)