In [1]:
import os
# Directory that holds the Kaggle Titanic CSV files.
# A raw string is used so that every backslash is taken literally; the previous
# literal contained the invalid escape sequence "\K", which newer Python
# versions warn about and which left a doubled backslash in the path.
TITANIC_PATH = os.path.join(r"C:\Users\barak\Documents\GitHub\Kaggle_Titanic_Competition", "Dataset")

In [2]:
import pandas as pd

def load_titanic_data(filename, titanic_path=TITANIC_PATH):
    """Read a CSV file from the dataset directory into a DataFrame.

    Parameters
    ----------
    filename : name of the CSV file, joined onto ``titanic_path``.
    titanic_path : directory to look in; defaults to ``TITANIC_PATH``.

    Returns
    -------
    The result of ``pd.read_csv`` on the joined path.
    """
    return pd.read_csv(os.path.join(titanic_path, filename))

In [3]:
# Load the labelled training set and the unlabelled test set in one go.
train_data, test_data = (
    load_titanic_data(csv_name) for csv_name in ("train.csv", "test.csv")
)

In [4]:
from sklearn.model_selection import train_test_split

# Separate the target column from the feature columns.
y = train_data.Survived
X = train_data.drop(['Survived'], axis=1)

# Hold out 20% of the labelled rows for validation.
# random_state pins the shuffle so that the split (and every score computed
# from it) is reproducible after a kernel restart; 123 is the same value the
# notebook assigns to `seed` further down.
# stratify=y keeps the Survived class ratio the same in both parts.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.20, random_state=123, stratify=y
)


In [5]:
from sklearn.base import BaseEstimator, TransformerMixin

# Column-selection helper: picks a fixed list of DataFrame columns so that
# numerical and categorical attributes can be routed to separate pipelines
# (written because Scikit-Learn did not handle DataFrames directly).
# The selector itself performs no conversion; turning categorical columns
# into numbers is left to a later step such as one-hot encoding.

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Transformer that returns only the columns named in ``attribute_names``."""

    def __init__(self, attribute_names):
        # Kept under the same name as the constructor argument.
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by the configured column names."""
        wanted_columns = self.attribute_names
        return X[wanted_columns]

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Numeric branch: select Age and Pclass, standardise them, then let the
# imputer fill the remaining missing values using its "median" strategy.
numeric_steps = [
    ("select_numeric", DataFrameSelector(["Age", 'Pclass'])),
    ("scaler", StandardScaler()),
    ("imputer", SimpleImputer(strategy="median")),
]
num_pipeline = Pipeline(steps=numeric_steps)


In [7]:

# Sanity check: run the numeric pipeline on the training features and display
# the resulting array. Note that fit_transform also fits num_pipeline in place.
num_pipeline.fit_transform(X_train)

array([[-0.10520516,  0.83827931],
       [ 0.02967173, -0.36022444],
       [ 1.44587899,  0.83827931],
       ...,
       [-0.10520516,  0.83827931],
       [-1.92604307, -0.36022444],
       [ 0.50174082, -1.55872819]])

In [8]:
# Inspired from stackoverflow.com/questions/25239958

# Small helper transformer that finds the most frequently occurring value of
# each column and uses it to fill that column's missing entries.
class MostFrequentImputer(BaseEstimator, TransformerMixin):
    """Fill missing values in every column with that column's most frequent value."""

    def fit(self, X, y=None):
        """Store, for each column of ``X``, the first entry of ``value_counts()``."""
        top_values = []
        for column in X:
            counts = X[column].value_counts()
            top_values.append(counts.index[0])
        self.most_frequent_ = pd.Series(top_values, index=X.columns)
        return self

    def transform(self, X, y=None):
        """Return ``X`` with missing entries replaced by the stored per-column values."""
        return X.fillna(self.most_frequent_)

In [9]:
# from future_encoders import OneHotEncoder
from sklearn.preprocessing import OneHotEncoder

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name. Try the new spelling first and fall back to
# the old one so the notebook runs on either side of that change.
try:
    cat_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    cat_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

# Categorical branch: select the text columns, fill missing entries with the
# most frequent value of each column, then one-hot encode into a dense array.
# Categories unseen during fit are encoded as all zeros (handle_unknown='ignore').
# NOTE(review): Name, Ticket and Cabin presumably have very many distinct
# values, so this produces a very wide matrix - confirm that is intended.
cat_pipeline = Pipeline([
        ("select_cat", DataFrameSelector(['Embarked','Name','Sex','Ticket','Cabin'])),
        ("imputer", MostFrequentImputer()),
        ("cat_encoder", cat_encoder),
    ])



In [10]:
# Sanity check: run the categorical pipeline on the training features and
# display the encoded array. Note that fit_transform also fits cat_pipeline in place.
cat_pipeline.fit_transform(X_train)

array([[0., 0., 1., ..., 0., 0., 1.],
       [0., 0., 1., ..., 0., 0., 1.],
       [0., 0., 1., ..., 0., 0., 1.],
       ...,
       [0., 1., 0., ..., 0., 0., 1.],
       [1., 0., 0., ..., 0., 0., 1.],
       [0., 0., 1., ..., 0., 0., 0.]])

In [11]:
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import StandardScaler

# FeatureUnion runs both sub-pipelines on the same input and concatenates
# their outputs side by side: numeric columns first, then the encoded
# categorical columns.
feature_branches = [
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
]
preprocess_pipeline = FeatureUnion(transformer_list=feature_branches)

In [12]:
from sklearn.model_selection import StratifiedKFold

# Seed value kept for reproducibility settings.
seed = 123

# Five stratified folds for cross-validation; shuffling stays off (the
# default), so the fold assignment depends only on the order of the rows.
kfold = StratifiedKFold(n_splits=5, shuffle=False)

In [13]:
# Model selection: grid-search three SVC kernels and a logistic regression,
# each placed behind the shared preprocessing step.

from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression


def _make_model_pipeline(classifier):
    """Chain the shared preprocessing step with the given classifier."""
    return Pipeline([
        ('preprocessing', preprocess_pipeline),
        ('classifier', classifier)])


pipe_1 = _make_model_pipeline(SVC(kernel='rbf'))
pipe_2 = _make_model_pipeline(SVC(kernel='poly'))
pipe_3 = _make_model_pipeline(SVC(kernel='linear'))
pipe_4 = _make_model_pipeline(LogisticRegression())

# Candidate values for the kernels that use gamma (rbf, poly).
param_grid = {
            'classifier__gamma': [0.001,0.003, 0.01,0.03, 0.1,0.3, 1, 10, 100,200],
            'classifier__C': [0.001,0.003, 0.01, 0.03, 0.1,0.3, 1, 10, 100,200]
}
# Candidate values for logistic regression, which only has C.
param_grid_2 = {
            'classifier__C': [0.001, 0.01 ,0.03, 0.1,0.3, 1, 10, 100]
}
# The linear kernel ignores gamma, so searching over it only repeats the same
# fit ten times per C value. Search the same C candidates as before, alone.
param_grid_linear = {
            'classifier__C': param_grid['classifier__C']
}

grid_1 = GridSearchCV(pipe_1, param_grid, cv=kfold)
grid_2 = GridSearchCV(pipe_2, param_grid, cv=kfold)
grid_3 = GridSearchCV(pipe_3, param_grid_linear, cv=kfold)
grid_4 = GridSearchCV(pipe_4, param_grid_2, cv=kfold)

# Fit every search on the training split only; the validation split is kept
# for scoring the selected models afterwards.
for search in (grid_1, grid_2, grid_3, grid_4):
    search.fit(X_train, y_train)



KeyboardInterrupt: 

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

# Best estimator of each grid search paired with a label for the report.
# grid_3 wraps the linear-kernel SVC, so it is labelled accordingly.
preds = [
    (grid_1.best_estimator_, 'SVC_rbf'),
    (grid_2.best_estimator_, 'SVC_poly'),
    (grid_3.best_estimator_, 'SVC_lin'),
    (grid_4.best_estimator_, 'LogReg'),
]

# Score each selected model on the held-out validation split.
# The estimators are deliberately NOT refitted here: fitting on X_val and then
# predicting X_val would report an inflated score, and would also replace the
# models trained on X_train, because these are the very objects stored in
# grid_N.best_estimator_.
for clf, name in preds:
    y_pred = clf.predict(X_val)
    print(name, accuracy_score(y_val, y_pred))


In [None]:
# Predict the test set with the best estimator of every grid search,
# in the order rbf SVC, poly SVC, linear SVC, logistic regression.
prediction_1, prediction_2, prediction_3, prediction_4 = (
    search.best_estimator_.predict(test_data)
    for search in (grid_1, grid_2, grid_3, grid_4)
)

In [None]:
import numpy as np

# Output directory for the submission files. A raw string is used because the
# previous literal contained the invalid escape sequence "\K".
RESULTS_DIR = r"C:\Users\barak\Documents\GitHub\Kaggle_Titanic_Competition\Results"

# One column per model, one row per test passenger.
data = np.vstack([prediction_1, prediction_2, prediction_3, prediction_4]).T


def _to_submission(predicted):
    """Pair every test passenger's id with the label predicted for them.

    The ids are read from test_data rather than computed from len(X_train):
    X_train is only the training part of the train/validation split, so
    counting from its length yields ids that do not match the test rows.
    """
    return pd.DataFrame(
        {'PassengerId': test_data['PassengerId'].values, 'Survived': predicted},
        columns=['PassengerId', 'Survived'],
    )


# prediction_1..prediction_4 are left untouched so this cell can be re-run.
df_prediction_1 = _to_submission(prediction_1)
df_prediction_2 = _to_submission(prediction_2)
df_prediction_3 = _to_submission(prediction_3)
df_prediction_4 = _to_submission(prediction_4)


df_prediction_1.to_csv(os.path.join(RESULTS_DIR, 'SVC_rbf.csv'), index=False)  # BEST ONE SO FAR
df_prediction_2.to_csv(os.path.join(RESULTS_DIR, 'SVC_poly.csv'), index=False)
df_prediction_3.to_csv(os.path.join(RESULTS_DIR, 'SVC_lin.csv'), index=False)
df_prediction_4.to_csv(os.path.join(RESULTS_DIR, 'LogReg.csv'), index=False)


Best one:  **SVM**:
+ kernel: rbf
+ C = 100
+ gamma = 0.01
![image.png](attachment:image.png)

In [None]:
# Display the pipeline that the first grid search (rbf-kernel SVC) selected.
grid_1.best_estimator_

In [None]:
# Display the validation features for inspection.
X_val

In [None]:
# Display the training features for inspection.
X_train

In [14]:
# Turn both splits into numeric feature matrices for the neural network.
# The preprocessing is fitted explicitly here, on the training split only,
# instead of relying on the sub-pipelines having been fitted as a side effect
# of earlier cells; the validation split is then transformed with that fit.
# NOTE: X_train / X_val are overwritten, so this cell must be run only once
# per kernel session (re-run the split cell first to start over).
X_train = preprocess_pipeline.fit_transform(X_train)
X_val = preprocess_pipeline.transform(X_val)

In [None]:
# Wrap both feature sets in DataFrames (default integer labels).
X_val, X_train = pd.DataFrame(X_val), pd.DataFrame(X_train)

In [15]:
# Display the training features after preprocessing.
X_train

array([[-0.10520516,  0.83827931,  0.        , ...,  0.        ,
         0.        ,  1.        ],
       [ 0.02967173, -0.36022444,  0.        , ...,  0.        ,
         0.        ,  1.        ],
       [ 1.44587899,  0.83827931,  0.        , ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [-0.10520516,  0.83827931,  0.        , ...,  0.        ,
         0.        ,  1.        ],
       [-1.92604307, -0.36022444,  1.        , ...,  0.        ,
         0.        ,  1.        ],
       [ 0.50174082, -1.55872819,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [16]:
# Display the validation features after preprocessing.
X_val

array([[ 0.16454861,  0.83827931,  0.        , ...,  0.        ,
         0.        ,  1.        ],
       [-0.37495892, -1.55872819,  1.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.71215113, -1.55872819,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.02967173, -0.36022444,  0.        , ...,  0.        ,
         0.        ,  1.        ],
       [-0.40867814,  0.83827931,  1.        , ...,  0.        ,
         0.        ,  1.        ],
       [-0.44239736, -0.36022444,  0.        , ...,  0.        ,
         0.        ,  1.        ]])

## Deep Learning

In [None]:
# Next model: a fully connected network trained on the preprocessed features.

from keras.models import Sequential
from keras.layers import BatchNormalization
from keras.layers import Dense

from keras.callbacks import History

from sklearn import  metrics
import matplotlib.pyplot as plt 

import pandas as pd


# Three ReLU hidden layers (4000 -> 2000 -> 1000 units), each followed by
# batch normalisation, and a single sigmoid unit for the binary target.
model = Sequential()
model.add(Dense(4000, activation="relu", input_shape=(X_train.shape[1],)))
model.add(BatchNormalization())
for units in (2000, 1000):
    model.add(Dense(units, activation="relu"))
    model.add(BatchNormalization())
model.add(Dense(1, activation="sigmoid"))
model.summary()

model.compile(loss="binary_crossentropy", optimizer="Adam", metrics=["accuracy"])

# Train for 300 epochs, tracking loss and accuracy on the validation split.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    batch_size=128,
    epochs=300,
)

model.evaluate(X_val, y_val)

# Plot every metric recorded during training on a common 0..1 axis.
history_ax = pd.DataFrame(history.history).plot(figsize=(8, 5))
history_ax.grid(True)
history_ax.set_ylim(0, 1)
plt.show()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_8 (Dense)             (None, 4000)              5596000   
                                                                 
 batch_normalization_4 (Batc  (None, 4000)             16000     
 hNormalization)                                                 
                                                                 
 dense_9 (Dense)             (None, 2000)              8002000   
                                                                 
 batch_normalization_5 (Batc  (None, 2000)             8000      
 hNormalization)                                                 
                                                                 
 dense_10 (Dense)            (None, 1000)              2001000   
                                                                 
 batch_normalization_6 (Batc  (None, 1000)            

Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78/300
Epoch 79/300
Epoch 80/300
Epoch 81/300
Epoch 82/300
Epoch 83/300
Epoch 84/300
Epoch 85/300
Epoch 86/300
Epoch 87/300
Epoch 88/300
Epoch 89/300
Epoch 90/300
Epoch 91/300
Epoch 92/300
Epoch 93/300
Epoch 94/300
Epoch 95/300
Epoch 96/300
Epoch 97/300
Epoch 98/300
Epoch 99/300
Epoch 100/300
Epoch 101/300
Epoch 102/300
Epoch 103/300


Epoch 104/300
Epoch 105/300
Epoch 106/300
Epoch 107/300
Epoch 108/300
Epoch 109/300
Epoch 110/300
Epoch 111/300
Epoch 112/300
Epoch 113/300
Epoch 114/300
Epoch 115/300
Epoch 116/300
Epoch 117/300
Epoch 118/300
Epoch 119/300
Epoch 120/300
Epoch 121/300
Epoch 122/300
Epoch 123/300
Epoch 124/300
Epoch 125/300
Epoch 126/300
Epoch 127/300
Epoch 128/300
Epoch 129/300
Epoch 130/300
Epoch 131/300
Epoch 132/300
Epoch 133/300
Epoch 134/300
Epoch 135/300
Epoch 136/300
Epoch 137/300
Epoch 138/300
Epoch 139/300
Epoch 140/300
Epoch 141/300
Epoch 142/300
Epoch 143/300
Epoch 144/300
Epoch 145/300
Epoch 146/300
Epoch 147/300
Epoch 148/300
Epoch 149/300
Epoch 150/300
Epoch 151/300
Epoch 152/300
Epoch 153/300
Epoch 154/300
Epoch 155/300
Epoch 156/300
Epoch 157/300
Epoch 158/300
Epoch 159/300


Epoch 160/300
Epoch 161/300
Epoch 162/300
Epoch 163/300
Epoch 164/300
Epoch 165/300
Epoch 166/300
Epoch 167/300
Epoch 168/300
Epoch 169/300
Epoch 170/300
Epoch 171/300
Epoch 172/300
Epoch 173/300
Epoch 174/300
Epoch 175/300
Epoch 176/300
Epoch 177/300
Epoch 178/300
Epoch 179/300
Epoch 180/300
Epoch 181/300
Epoch 182/300
Epoch 183/300
Epoch 184/300
Epoch 185/300
Epoch 186/300
Epoch 187/300
Epoch 188/300
Epoch 189/300
Epoch 190/300
Epoch 191/300
Epoch 192/300
Epoch 193/300
Epoch 194/300
Epoch 195/300

In [21]:
# Threshold the sigmoid outputs at 0.5 to obtain class decisions, then
# compare them with the validation labels.
val_decisions = model.predict(X_val) > 0.5
metrics.accuracy_score(y_true=y_val, y_pred=val_decisions)


0.8435754189944135