In [29]:
import pandas as pd
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelBinarizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion
import numpy as np

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values

class CabinValuizer(BaseEstimator, TransformerMixin):
    def __init__(self, simple):
        self.simple = simple
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        print("Wait")
        for i in range(len(X)):
            if type(X[i][0]) == float:
                X[i][0] = 'Z'

        place = np.zeros(len(X))
        for i in range(len(X)):
            if 'Z' in X[i][0]:
                continue
            if 'A' in X[i][0]:
                place[i] = 11
            if 'B' in X[i][0]:
                place[i] = 12
            if 'C' in X[i][0]:
                place[i] = 13
            if 'D' in X[i][0]:
                place[i] = 14
            if 'E' in X[i][0]:
                place[i] = 15
            if 'F' in X[i][0]:
                place[i] = 16
            if 'G' in X[i][0]:
                place[i] = 17

        X = place.reshape(len(place),1)

        return X

class MyLabelBinarizer(TransformerMixin):
    def __init__(self, *args, **kwargs):
        self.encoder = LabelBinarizer(*args, **kwargs)
    def fit(self, x, y=0):
        self.encoder.fit(x)
        return self
    def transform(self, x, y=0):
        return self.encoder.transform(x)

train = pd.read_csv("/home/guillaume/Projects/DataAnalytics/Titanic/train.csv")
test = pd.read_csv("/home/guillaume/Projects/DataAnalytics/Titanic/test.csv")

encodeAtt = ["Sex"] #Binary encode sex(male/female)
changeAtt = ["Cabin"] #Attributes to be manually changed (place) Cabin to be changed
numAtt = ["Age", "Fare", "Pclass", "SibSp", "Parch"] #Attributes to be imputed and scaled, etc, etc
y_labels = ["Survived"]

num_pipeline = Pipeline([
    ('selector', DataFrameSelector(numAtt)),
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])

enc_pipeline = Pipeline([
    ('selector', DataFrameSelector(encodeAtt)),
    ('label_binarizer', MyLabelBinarizer()),
])

chg_pipeline = Pipeline([
    ('selector', DataFrameSelector(changeAtt)),
    ('changer', CabinValuizer(False)),
    ('std_scaler', StandardScaler()),
])

full_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("enc_pipeline", enc_pipeline),
    ("chg_pipeline", chg_pipeline),
])

num_train_prepared = num_pipeline.fit_transform(train)
enc_train_prepared = enc_pipeline.fit_transform(train)
chg_train_prepared = chg_pipeline.fit_transform(train)

Wait


In [30]:
X_train = full_pipeline.fit_transform(train)
X_test = full_pipeline.fit_transform(test)

Wait
Wait


In [3]:
np.shape(X_train)

(891, 7)

In [4]:
y_train = (train.Survived.values).reshape(len(train.Survived.values),1)

In [5]:
np.shape(y_train)

(891, 1)

In [6]:
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier

from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

In [7]:
sgd_clf = SGDClassifier()
knn_clf = KNeighborsClassifier()
dct_clf = DecisionTreeClassifier()
nbs_clf = GaussianNB()
svc_clf = SVC()
log_clf = LogisticRegression()

rnd_clf = RandomForestClassifier()
vot_clf = VotingClassifier(
    estimators=[('lr', log_clf),
                ('gd', sgd_clf),
                ('kn', knn_clf),
                ('nb', nbs_clf),
                ('sv', svc_clf),
                ('rf', rnd_clf)]
)

In [8]:
for clf in (sgd_clf, knn_clf, dct_clf, nbs_clf, svc_clf, log_clf, rnd_clf, vot_clf):
    clf.fit(X_train, y_train.ravel())
    y_pred = clf.predict(X_train)
    print(clf.__class__.__name__, accuracy_score(y_train, y_pred))



SGDClassifier 0.7407407407407407
KNeighborsClassifier 0.8529741863075196
DecisionTreeClassifier 0.9831649831649831
GaussianNB 0.7811447811447811
SVC 0.8327721661054994
LogisticRegression 0.8058361391694725
RandomForestClassifier 0.9696969696969697
VotingClassifier 0.8439955106621774


In [9]:
vot_clf = VotingClassifier(
    estimators=[('lr', log_clf),
                ('kn', knn_clf),
                ('sv', svc_clf),
                ('rf', rnd_clf)]
)

In [13]:
for clf in (sgd_clf, knn_clf, dct_clf, nbs_clf, svc_clf, log_clf, rnd_clf, vot_clf):
    clf.fit(X_train, y_train.ravel())
    y_pred = cross_val_score(clf, X_train, y_train.ravel(), scoring='accuracy', cv=20)
    print(clf.__class__.__name__, y_pred.mean())



SGDClassifier 0.7391007905138339
KNeighborsClassifier 0.805
DecisionTreeClassifier 0.7926273605621431
GaussianNB 0.7689163372859026




SVC 0.8182125603864735
LogisticRegression 0.8035177865612647
RandomForestClassifier 0.8151504172156345






VotingClassifier 0.8273254281949936




In [16]:
from sklearn.model_selection import RandomizedSearchCV

rnd_param = {
     'n_estimators':[2,5,10,20,50],
     'max_features':[2,4,6,7],
}

knn_param = {
    'n_neighbors':[1,2,3,5,7,10,12,15]
}

svc_param = {
    'C':[1,10,100,1000],
    'gamma':[1,0.1,0.001,0.0001], 
    'kernel':['linear','rbf'],
    'degree':[3,4]
}

log_param = {
    'penalty':['l1','l2'],
    'C':np.linspace(-4,4,20)
}

In [17]:
rnd_grid = GridSearchCV(rnd_clf, rnd_param, cv=5)
knn_grid = GridSearchCV(knn_clf, knn_param, cv=5)
svc_grid = GridSearchCV(svc_clf, svc_param, cv=5)
log_grid = GridSearchCV(log_clf, log_param, cv=5)

In [63]:
svc_param = {
    'C':[1,10,100],
    'gamma':[1,0.1,0.01],
    'kernel':['linear','rbf'],
    'degree':[3,4]
}

svc_grid = RandomizedSearchCV(svc_clf, svc_param, cv=10)

svc_grid.fit(X_train, y_train.ravel())

RandomizedSearchCV(cv=10, error_score='raise-deprecating',
          estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
          fit_params=None, iid='warn', n_iter=10, n_jobs=None,
          param_distributions={'C': [1, 10, 100], 'gamma': [1, 0.1, 0.01], 'kernel': ['linear', 'rbf'], 'degree': [3, 4]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=0)

In [60]:
log_clf = LogisticRegression(solver='lbfgs')
log_param = {
    'penalty':['l2'],
    'C':np.linspace(0,4,20)[1:]
}

log_grid = RandomizedSearchCV(log_clf, log_param, cv=10)

log_grid.fit(X_train, y_train.ravel())

RandomizedSearchCV(cv=10, error_score='raise-deprecating',
          estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=10, n_jobs=None,
          param_distributions={'penalty': ['l2'], 'C': array([0.21053, 0.42105, 0.63158, 0.84211, 1.05263, 1.26316, 1.47368,
       1.68421, 1.89474, 2.10526, 2.31579, 2.52632, 2.73684, 2.94737,
       3.15789, 3.36842, 3.57895, 3.78947, 4.     ])},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=0)

In [67]:
rnd_clf = RandomForestClassifier(n_estimators=100)
rnd_param = {
     'n_estimators':[2,5,10,20,50],
     'max_features':[2,4,6,7],
     'bootstrap':[True, False]
}

rnd_grid = RandomizedSearchCV(rnd_clf, rnd_param, cv=10)

rnd_grid.fit(X_train, y_train.ravel())



RandomizedSearchCV(cv=10, error_score='raise-deprecating',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid='warn', n_iter=10, n_jobs=None,
          param_distributions={'n_estimators': [2, 5, 10, 20, 50], 'max_features': [2, 4, 6, 7], 'bootstrap': [True, False]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=0)

In [72]:
for clf in (svc_grid, log_grid, rnd_grid):
    y_pred = cross_val_score(clf.best_estimator_, X_train, y_train.ravel(), scoring='accuracy', cv=20)
    print(clf.best_estimator_.__class__.__name__, y_pred.mean())
    
y_pred = cross_val_score(optim_vot_clf, X_train, y_train.ravel(), scoring='accuracy', cv=20)
print(optim_vot_clf.__class__.__name__, y_pred.mean())

SVC 0.8204106280193239
LogisticRegression 0.8035430390865173
RandomForestClassifier 0.8138416776460253
VotingClassifier 0.825993631971893


In [69]:
print(svc_grid.best_estimator_)
print(svc_clf)
print("\n")
print(log_grid.best_estimator_)
print(log_clf)
print("\n")
print(rnd_grid.best_estimator_)
print(rnd_clf)

SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=4, gamma=0.01, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)


LogisticRegression(C=0.21052631578947367, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='warn', n_jobs=None, penalty='l2', random_state=None,
          solver='lbfgs', tol=0.0001, verbose=0, warm_start=False)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=

In [74]:
optim_vot_clf = VotingClassifier(
    estimators=[('lr', log_grid.best_estimator_),
                ('sv', svc_grid.best_estimator_),
                ('rf', rnd_grid.best_estimator_)]
)
optim_vot_clf.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


VotingClassifier(estimators=[('lr', LogisticRegression(C=0.21052631578947367, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='warn', n_jobs=None, penalty='l2', random_state=None,
          solver='lbfgs', tol=0.0001, verbose=0, warm_start=False))...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))],
         flatten_transform=None, n_jobs=None, voting='hard', weights=None)

In [31]:
svc_grid
final_pred = svc_grid.best_estimator_.predict(X_test)

In [32]:

fname = 'final.csv'
subm = pd.DataFrame({"PassengerId":test.PassengerId, "Survived":final_pred})

In [33]:
subm.to_csv(fname, index=False)

In [41]:
X_train[0][5]

1.0

In [75]:
log_pred = log_grid.best_estimator_.predict(X_test)
log_sumb = pd.DataFrame({"PassengerId":test.PassengerId, "Survived":log_pred})
log_sumb.to_csv("optim_log_sumb.csv", index=False)

svc_pred = svc_grid.best_estimator_.predict(X_test)
svc_sumb = pd.DataFrame({"PassengerId":test.PassengerId, "Survived":svc_pred})
svc_sumb.to_csv("optim_svc_sumb.csv", index=False)

rnd_pred = rnd_grid.best_estimator_.predict(X_test)
rnd_sumb = pd.DataFrame({"PassengerId":test.PassengerId, "Survived":rnd_pred})
rnd_sumb.to_csv("optim_rnd_sumb.csv", index=False)

vot_pred = optim_vot_clf.predict(X_test)
vot_sumb = pd.DataFrame({"PassengerId":test.PassengerId, "Survived":vot_pred})
vot_sumb.to_csv("optim_vot_sumb.csv", index=False)

print("Done writing.")

Done writing.
