### Importing and getting dataset

In [1]:
import numpy as np
import pandas as pd

In [2]:
train_dataset = pd.read_csv('./data/train.csv')

In [3]:
X = train_dataset.copy().drop('Survived', axis=1)
y = train_dataset['Survived']

In [4]:
X = X.drop(['Fare', 'Ticket'], axis=1)
X

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,,S
...,...,...,...,...,...,...,...,...,...
886,887,2,"Montvila, Rev. Juozas",male,27.0,0,0,,S
887,888,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,B42,S
888,889,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,,S
889,890,1,"Behr, Mr. Karl Howell",male,26.0,0,0,C148,C


---
### Replacing NANs

In [5]:
girls_df = X.copy()[X['Name'].str.contains('Miss')]
girls_na = np.median(girls_df['Age'].value_counts()[girls_df.copy()['Age'].value_counts() > 3].index.sort_values())
girls_df['Age'] = girls_df['Age'].fillna(value=girls_na)
girls_na

18.0

In [6]:
boys_df = X.copy()[X['Name'].str.contains('Master')]
boys_na = np.median(boys_df['Age'].value_counts()[boys_df['Age'].value_counts() > 3].index.sort_values())
boys_df['Age'] = boys_df['Age'].fillna(value=boys_na)
boys_na

3.0

In [7]:
women_df = X.copy()[~X['Name'].str.contains('Miss')]
women_df = women_df.copy()[women_df['Sex'] == 'female']
women_na = np.median(women_df['Age'].value_counts()[women_df['Age'].value_counts() > 3].index.sort_values())
women_df['Age'] = women_df['Age'].fillna(value=women_na)
women_na

35.0

In [8]:
men_df = X.copy()[~X['Name'].str.contains('Master')]
men_df = men_df.copy()[men_df['Sex'] == 'male']
men_na = np.median(men_df['Age'].value_counts()[men_df['Age'].value_counts() > 3].index.sort_values())
men_df['Age'] = men_df['Age'].fillna(value=men_na)
men_na

33.0

In [9]:
new_dataset = pd.concat([girls_df, boys_df, women_df, men_df])

In [10]:
embarked_na = new_dataset['Embarked'].value_counts().index[0]
new_dataset['Embarked'] = new_dataset['Embarked'].fillna(embarked_na)

In [11]:
pclass_1 = new_dataset.copy()[new_dataset['Pclass'] == 1]
pclass_other = new_dataset.copy()[new_dataset['Pclass'] != 1]

In [12]:
to_na_1 = pclass_1['Cabin'].str[0].value_counts().index[0]
to_na_1

'C'

In [13]:
to_na_other = 'Unknown'

In [14]:
pclass_1['Cabin'] = pclass_1['Cabin'].fillna(to_na_1)
pclass_other['Cabin'] = pclass_other['Cabin'].fillna(to_na_other)

In [15]:
df_ = pd.concat([pclass_1, pclass_other])
df_

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Cabin,Embarked
11,12,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,C103,S
61,62,1,"Icard, Miss. Amelie",female,38.0,0,0,B28,S
88,89,1,"Fortune, Miss. Mabel Helen",female,23.0,3,2,C23 C25 C27,S
136,137,1,"Newsom, Miss. Helen Monypeny",female,19.0,0,2,D47,S
177,178,1,"Isham, Miss. Ann Elizabeth",female,50.0,0,0,C49,C
...,...,...,...,...,...,...,...,...,...
881,882,3,"Markun, Mr. Johann",male,33.0,0,0,Unknown,S
883,884,2,"Banfield, Mr. Frederick James",male,28.0,0,0,Unknown,S
884,885,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,Unknown,S
886,887,2,"Montvila, Rev. Juozas",male,27.0,0,0,Unknown,S


In [16]:
cabin_dataset = df_.copy()[df_['Cabin'] != 'Unknown']
unknown_dataset = df_.copy()[df_['Cabin'] == 'Unknown']

str_cabin = cabin_dataset.copy()['Cabin'].str[0]
num_of_cabins = cabin_dataset.copy()['Cabin'].str.split().str.len()

cabin_dataset['Cabin Class'] = str_cabin
cabin_dataset['Num Of Cabins'] = num_of_cabins

unknown_dataset['Cabin Class'] = 'none'
unknown_dataset['Num Of Cabins'] = 0

In [17]:
cabin_data = pd.concat([cabin_dataset, unknown_dataset])
cabin_data = cabin_data.sort_index()

In [18]:
new_dataset = cabin_data.copy()
new_dataset

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Cabin,Embarked,Cabin Class,Num Of Cabins
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,Unknown,S,none,0
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,C85,C,C,1
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,Unknown,S,none,0
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,C123,S,C,1
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,Unknown,S,none,0
...,...,...,...,...,...,...,...,...,...,...,...
886,887,2,"Montvila, Rev. Juozas",male,27.0,0,0,Unknown,S,none,0
887,888,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,B42,S,B,1
888,889,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,18.0,1,2,Unknown,S,none,0
889,890,1,"Behr, Mr. Karl Howell",male,26.0,0,0,C148,C,C,1


In [19]:
def name_extract(word):
    return word.split(',')[1].split('.')[0].strip()

In [20]:
new_dataset['Salutation'] = new_dataset.copy()['Name'].apply(name_extract)
new_dataset.loc[~new_dataset.copy()['Salutation'].isin(['Mr', 'Miss', 'Mrs', 'Master']), 'Salutation'] = 'Other'
new_dataset.count()

PassengerId      891
Pclass           891
Name             891
Sex              891
Age              891
SibSp            891
Parch            891
Cabin            891
Embarked         891
Cabin Class      891
Num Of Cabins    891
Salutation       891
dtype: int64

In [21]:
new_dataset['SibSp'].value_counts()

0    608
1    209
2     28
4     18
3     16
8      7
5      5
Name: SibSp, dtype: int64

---
### Encoding Features

In [22]:
from sklearn.preprocessing import OneHotEncoder

In [23]:
X_str = new_dataset.copy()[['Sex', 'Embarked', 'Salutation', 'Cabin Class']]
new_X = new_dataset.copy()[['Pclass', 'Age', 'SibSp', 'Parch', 'Num Of Cabins']]

In [24]:
encoder = OneHotEncoder(sparse=False)
X_encoded = encoder.fit_transform(X_str)

categories = list(encoder.categories_[0])
categories.extend(encoder.categories_[1])
categories.extend(encoder.categories_[2])
categories.extend(encoder.categories_[3])
categories

['female',
 'male',
 'C',
 'Q',
 'S',
 'Master',
 'Miss',
 'Mr',
 'Mrs',
 'Other',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'T',
 'none']

In [25]:
new_X[categories] = X_encoded
new_X = new_X.sort_index()
new_X

Unnamed: 0,Pclass,Age,SibSp,Parch,Num Of Cabins,female,male,C,Q,S,...,Mrs,Other,A,B,D,E,F,G,T,none
0,3,22.0,1,0,0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1,38.0,1,0,1,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,26.0,0,0,0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1,35.0,1,0,1,1.0,0.0,1.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3,35.0,0,0,0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,2,27.0,0,0,0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
887,1,19.0,0,0,1,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
888,3,18.0,1,2,0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
889,1,26.0,0,0,1,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


---
### Getting Polynomial Features

In [26]:
#from sklearn.preprocessing import PolynomialFeatures

In [27]:
#poly = PolynomialFeatures(2)

In [28]:
#X_poly = poly.fit_transform(new_X)
#columns = poly.get_feature_names()
#poly_df = pd.DataFrame(X_poly, columns=columns)

In [29]:
#new_X = poly_df.copy()
#new_X

---
### Scaling

In [30]:
from sklearn.preprocessing import MinMaxScaler
scaler1 = MinMaxScaler(feature_range=(0, 10))
X_scaled_1 = scaler1.fit_transform(new_X)
df_scaled = pd.DataFrame(X_scaled_1, columns=new_X.columns)
df_scaled

Unnamed: 0,Pclass,Age,SibSp,Parch,Num Of Cabins,female,male,C,Q,S,...,Mrs,Other,A,B,D,E,F,G,T,none
0,10.0,2.711737,1.25,0.000000,0.0,0.0,10.0,0.0,0.0,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0
1,0.0,4.722292,1.25,0.000000,2.5,10.0,0.0,10.0,0.0,0.0,...,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10.0,3.214375,0.00,0.000000,0.0,10.0,0.0,0.0,0.0,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0
3,0.0,4.345313,1.25,0.000000,2.5,10.0,0.0,10.0,0.0,10.0,...,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,10.0,4.345313,0.00,0.000000,0.0,0.0,10.0,0.0,0.0,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,5.0,3.340035,0.00,0.000000,0.0,0.0,10.0,0.0,0.0,10.0,...,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0
887,0.0,2.334757,0.00,0.000000,2.5,10.0,0.0,0.0,0.0,10.0,...,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0
888,10.0,2.209098,1.25,3.333333,0.0,10.0,0.0,0.0,0.0,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0
889,0.0,3.214375,0.00,0.000000,2.5,0.0,10.0,10.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


---
### Feature Selection

In [31]:
from sklearn.tree import DecisionTreeClassifier

In [32]:
'''xgb = XGBRegressor(n_estimators=1000)
xgb.fit(X_scaled, y)'''

'xgb = XGBRegressor(n_estimators=1000)\nxgb.fit(X_scaled, y)'

In [33]:
'''xgb.feature_importances_'''

'xgb.feature_importances_'

In [34]:
dt = DecisionTreeClassifier(random_state=9, criterion='entropy', max_depth=50)
dt.fit(X_scaled_1, y)

DecisionTreeClassifier(criterion='entropy', max_depth=50, random_state=9)

In [35]:
importance = dt.feature_importances_
#df = pd.DataFrame(importance, index=columns, columns=['importance'])
#columns_keep = df[df['importance'] > 0].sort_values(by='importance', ascending=False).index.tolist()

In [36]:
# Feature Importance
fi_col = []
fi = []

for i, column in enumerate(new_X):
    fi_col.append(column)
    fi.append(dt.feature_importances_[i])

fi_df = zip(fi_col, fi)
fi_df = pd.DataFrame(fi_df, columns=['feature', 'feature_importance'])

fi_df = fi_df.sort_values('feature_importance', ascending=False).reset_index()

columns_to_keep = fi_df[fi_df['feature_importance'] > 0.004]['feature']
columns_to_keep

0               Age
1                Mr
2            Pclass
3             SibSp
4             Other
5             Parch
6              none
7                 S
8               Mrs
9              male
10    Num Of Cabins
11                Q
12                A
13                E
14             Miss
15                G
16                F
17           Master
Name: feature, dtype: object

In [37]:
new_X_keep = new_X.copy()[columns_to_keep]
new_X_keep

Unnamed: 0,Age,Mr,Pclass,SibSp,Other,Parch,none,S,Mrs,male,Num Of Cabins,Q,A,E,Miss,G,F,Master
0,22.0,1.0,3,1,0.0,0,1.0,1.0,0.0,1.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,38.0,0.0,1,1,0.0,0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,26.0,0.0,3,0,0.0,0,1.0,1.0,0.0,0.0,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,35.0,0.0,1,1,0.0,0,0.0,1.0,1.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,35.0,1.0,3,0,0.0,0,1.0,1.0,0.0,1.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,27.0,0.0,2,0,1.0,0,1.0,1.0,0.0,1.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
887,19.0,0.0,1,0,0.0,0,0.0,1.0,0.0,0.0,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0
888,18.0,0.0,3,1,0.0,2,1.0,1.0,0.0,0.0,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
889,26.0,1.0,1,0,0.0,0,0.0,0.0,0.0,1.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0


---
### Model

In [38]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, classification_report

#### Scaling and train_test_split

In [39]:
scaler2 = MinMaxScaler(feature_range=(0, 10))
X_scaled_2 = scaler2.fit_transform(new_X_keep)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled_2, y, test_size=0.2, random_state=13)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.1, random_state=13)

#### LogisticRegression

In [42]:
from sklearn.linear_model import LogisticRegression

In [None]:
C_values = np.geomspace(1e-8, 1e3, num=50)
random_state = [x for x in range(0, 1100)]
CA = []
Logarithmic_Loss = []
Random_State = []
c_values = []

for c in C_values:
    for r_s in random_state:
        X_train, X_test, y_train, y_test = train_test_split(X_scaled_2, y, test_size=0.2, random_state=r_s)
        c_values.append(c)
        
        Random_State.append(r_s)
        
        log_reg2 = LogisticRegression(random_state=0, solver='lbfgs', C=c, max_iter=1e5, n_jobs=-1)
        log_reg2.fit(X_train, y_train)

        score = log_reg2.score(X_test, y_test)
        CA.append(score)
        pred_test_proba = log_reg2.predict_proba(X_test)
        log_loss2 = log_loss(y_test, pred_test_proba)
        Logarithmic_Loss.append(log_loss2)

In [None]:
CA2 = np.array(CA).reshape(len(CA),)
Logarithmic_Loss2 = np.array(Logarithmic_Loss).reshape(len(Logarithmic_Loss),)
C_values_array = np.array(c_values).reshape(len(c_values),)
R_S = np.array(Random_State).reshape(len(Random_State),)

outcomes = zip(R_S, C_values_array, CA2, Logarithmic_Loss2)
df_outcomes = pd.DataFrame(outcomes, columns=['R_S', 'C', 'CA2', 'Logarithmic_Loss'])
df_outcomes[df_outcomes['C'] < 1e-1].sort_values(['Logarithmic_Loss', 'C', 'CA2'], ascending=True).reset_index()

---
### Working?

In [43]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled_2, y, test_size=0.2, random_state=715)

In [44]:
model = LogisticRegression(random_state=0, solver='lbfgs', C=9.102982e-02, max_iter=1e5, n_jobs=-1)
model.fit(X_train, y_train)

LogisticRegression(C=0.09102982, max_iter=100000.0, n_jobs=-1, random_state=0)

#### SVM

In [None]:
from sklearn.svm import LinearSVC

#### Neural network

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
max_test_score, max_train_score, max_score, our_r_s = 0, 0, 0., -1
max_valid_score = 0.8671328671328671
for r_s in range(1, 100):
    dt = DecisionTreeClassifier(random_state=0, criterion='entropy', max_depth=100)
    dt.fit(X_scaled_1, y)

    importance = dt.feature_importances_
    df = pd.DataFrame(importance, index=columns, columns=['importance'])
    columns_to_keep = df[df['importance'] > 0.005].sort_values(by='importance', ascending=False).index.tolist()
    new_X_keep = new_X.copy()[columns_to_keep]
    
    scaler2 = MinMaxScaler(feature_range=(0, 10))
    X_scaled_2 = scaler2.fit_transform(new_X_keep)
    
    X_train, X_test, y_train, y_test = train_test_split(X_scaled_2, y, test_size=0.2, random_state=15)
    X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=15)
    
    mlp = MLPClassifier(hidden_layer_sizes=(7), activation='relu', alpha=1e-4, 
                        epsilon=1e-8, solver='adam', max_iter=100000000000, random_state=0)
    mlp.fit(X_train, y_train.values.ravel())

    #predictions_train = mlp.predict(X_train)
    #print(classification_report(y_train, predictions_train))
    
    #predictions_test = mlp.predict(X_test)
    #print(classification_report(y_test, predictions_test))
    
    train_score = mlp.score(X_train, y_train)
    test_score = mlp.score(X_test, y_test)
    score = (1 / 2) * (train_score + test_score)
    if score > max_score or max_test_score < test_score:
        valid_score = mlp.score(X_valid, y_valid)
        if max_test_score < test_score:
            max_test_score = test_score 
        our_r_s = r_s
        if max_score < score:
            max_score = score
        if max_valid_score < valid_score:
            max_valid_score = valid_score
            print('v', train_score, test_score, score, valid_score, our_r_s)
        print(train_score, test_score, score, our_r_s)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(new_X_keep, y, test_size=0.21, random_state=1054)
mlp = MLPClassifier(hidden_layer_sizes=(60), activation='logistic', alpha=0.85e-5, 
                    epsilon=1.2e-7, solver='adam', max_iter=100000000000, random_state=615)

mlp.fit(X_train, y_train)

predictions_train = mlp.predict(X_train)
y_train_predicted = mlp.predict(X_train)
print(classification_report(y_train, predictions_train))
print('Train Logistic Loss', log_loss(y_train, y_train_predicted))

predictions_test = mlp.predict(X_test)
y_test_predicted = mlp.predict(X_test)
print(classification_report(y_test, predictions_test))
print('Test Logistic Loss', log_loss(y_test, y_test_predicted))

---
### Final Model

In [None]:
# X_str = new_dataset.copy()[['Sex', 'Embarked', 'Salutation', 'Cabin Class']]
# new_X = new_dataset.copy()[['Pclass', 'Age', 'SibSp', 'Parch', 'Num Of Cabins']]
# polynomal features - 2 degree

dt = DecisionTreeClassifier(random_state=0, criterion='entropy', max_depth=100)
dt.fit(X_scaled_1, y)
importance = dt.feature_importances_
df = pd.DataFrame(importance, index=columns, columns=['importance'])
columns_to_keep = df[df['importance'] > 0.005].sort_values(by='importance', ascending=False).index.tolist()
new_X_keep = new_X.copy()[columns_to_keep]

scaler2 = MinMaxScaler(feature_range=(0, 10))
X_scaled_2 = scaler2.fit_transform(new_X_keep)

X_train, X_test, y_train, y_test = train_test_split(X_scaled_2, y, test_size=0.2, random_state=15)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=15)

mlp = MLPClassifier(hidden_layer_sizes=(7), activation='relu', alpha=1e-4, 
                    epsilon=1e-8, solver='adam', max_iter=100000000000, random_state=0)
mlp.fit(X_train, y_train.values.ravel())

'''importance = dt.feature_importances_
df = pd.DataFrame(importance, index=columns, columns=['importance'])
columns_to_keep = df[df['importance'] > 0.005].sort_values(by='importance', ascending=False).index.tolist()
new_X_keep = new_X.copy()[columns_to_keep]

scaler2 = MinMaxScaler(feature_range=(0, 10))
X_scaled_2 = scaler2.fit_transform(new_X_keep)

X_train, X_test, y_train, y_test = train_test_split(X_scaled_2, y, test_size=0.21, random_state=1054)
mlp = MLPClassifier(hidden_layer_sizes=(82), activation='logistic', alpha=3.7e-6, 
                    epsilon=1.2e-7, solver='adam', max_iter=100000000000, random_state=904)
mlp.fit(X_train, y_train.values.ravel())'''

predictions_train = mlp.predict(X_train)
print(classification_report(y_train, predictions_train))

predictions_test = mlp.predict(X_test)
print(classification_report(y_test, predictions_test))



---
### Getting survivors from test dataset

In [45]:
test_dataset = pd.read_csv('./data/test.csv')
X = test_dataset.copy()
X

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [46]:
girls_df = X.copy()[X['Name'].str.contains('Miss')]
girls_df['Age'] = girls_df['Age'].fillna(value=girls_na)
girls_na

18.0

In [47]:
boys_df = X.copy()[X['Name'].str.contains('Master')]
boys_df['Age'] = boys_df['Age'].fillna(value=boys_na)
boys_na

3.0

In [48]:
women_df = X.copy()[~X['Name'].str.contains('Miss')]
women_df = women_df.copy()[women_df['Sex'] == 'female']
women_df['Age'] = women_df['Age'].fillna(value=women_na)
women_na

35.0

In [49]:
men_df = X.copy()[~X['Name'].str.contains('Master')]
men_df = men_df.copy()[men_df['Sex'] == 'male']
men_df['Age'] = men_df['Age'].fillna(value=men_na)
men_na

33.0

In [50]:
new_dataset = pd.concat([girls_df, boys_df, women_df, men_df])
new_dataset

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
6,898,3,"Connolly, Miss. Kate",female,30.0,0,0,330972,7.6292,,Q
18,910,3,"Ilmakangas, Miss. Ida Livija",female,27.0,1,0,STON/O2. 3101270,7.9250,,S
26,918,1,"Ostby, Miss. Helene Ragnhild",female,22.0,0,1,113509,61.9792,B36,C
36,928,3,"Roth, Miss. Sarah A",female,18.0,0,0,342712,8.0500,,S
37,929,3,"Cacic, Miss. Manda",female,21.0,0,0,315087,8.6625,,S
...,...,...,...,...,...,...,...,...,...,...,...
406,1298,2,"Ware, Mr. William Jeffery",male,23.0,1,0,28666,10.5000,,S
407,1299,1,"Widener, Mr. George Dunton",male,50.0,1,1,113503,211.5000,C80,C
413,1305,3,"Spector, Mr. Woolf",male,33.0,0,0,A.5. 3236,8.0500,,S
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S


In [51]:
new_dataset['Embarked'] = new_dataset['Embarked'].fillna(embarked_na)
new_dataset.count()

PassengerId    418
Pclass         418
Name           418
Sex            418
Age            418
SibSp          418
Parch          418
Ticket         418
Fare           417
Cabin           91
Embarked       418
dtype: int64

In [52]:
pclass_1 = new_dataset.copy()[new_dataset['Pclass'] == 1]
pclass_other = new_dataset.copy()[new_dataset['Pclass'] != 1]

In [53]:
pclass_1['Cabin'] = pclass_1['Cabin'].fillna(to_na_1)
pclass_other['Cabin'] = pclass_other['Cabin'].fillna(to_na_other)

In [54]:
df_ = pd.concat([pclass_1, pclass_other])
df_.count()

PassengerId    418
Pclass         418
Name           418
Sex            418
Age            418
SibSp          418
Parch          418
Ticket         418
Fare           417
Cabin          418
Embarked       418
dtype: int64

In [55]:
cabin_dataset = df_.copy()[df_['Cabin'] != 'Unknown']
unknown_dataset = df_.copy()[df_['Cabin'] == 'Unknown']

str_cabin = cabin_dataset.copy()['Cabin'].str[0]
num_of_cabins = cabin_dataset.copy()['Cabin'].str.split().str.len()

cabin_dataset['Cabin Class'] = str_cabin
cabin_dataset['Num Of Cabins'] = num_of_cabins

unknown_dataset['Cabin Class'] = 'none'
unknown_dataset['Num Of Cabins'] = 0

In [56]:
cabin_data = pd.concat([cabin_dataset, unknown_dataset])
cabin_data = cabin_data.sort_index()

In [57]:
new_dataset = cabin_data.copy()
new_dataset

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Cabin Class,Num Of Cabins
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,Unknown,Q,none,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,Unknown,S,none,0
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,Unknown,Q,none,0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,Unknown,S,none,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,Unknown,S,none,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,33.0,0,0,A.5. 3236,8.0500,Unknown,S,none,0
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C,C,1
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,Unknown,S,none,0
416,1308,3,"Ware, Mr. Frederick",male,33.0,0,0,359309,8.0500,Unknown,S,none,0


In [58]:
def name_extract(word):
    return word.split(',')[1].split('.')[0].strip()

In [59]:
new_dataset['Salutation'] = new_dataset.copy()['Name'].apply(name_extract)
new_dataset.loc[~new_dataset.copy()['Salutation'].isin(['Mr', 'Miss', 'Mrs', 'Master']), 'Salutation'] = 'Other'
new_dataset.count()

PassengerId      418
Pclass           418
Name             418
Sex              418
Age              418
SibSp            418
Parch            418
Ticket           418
Fare             417
Cabin            418
Embarked         418
Cabin Class      418
Num Of Cabins    418
Salutation       418
dtype: int64

In [60]:
new_dataset

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Cabin Class,Num Of Cabins,Salutation
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,Unknown,Q,none,0,Mr
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,Unknown,S,none,0,Mrs
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,Unknown,Q,none,0,Mr
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,Unknown,S,none,0,Mr
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,Unknown,S,none,0,Mrs
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,33.0,0,0,A.5. 3236,8.0500,Unknown,S,none,0,Mr
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C,C,1,Other
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,Unknown,S,none,0,Mr
416,1308,3,"Ware, Mr. Frederick",male,33.0,0,0,359309,8.0500,Unknown,S,none,0,Mr


---
### Encoding Features

In [61]:
X_str = new_dataset.copy()[['Sex', 'Embarked', 'Salutation', 'Cabin Class']]
new_X = new_dataset.copy()[['Pclass', 'Age', 'SibSp', 'Parch', 'Num Of Cabins']]

In [62]:
X_encoded = encoder.transform(X_str)

categories = list(encoder.categories_[0])
categories.extend(encoder.categories_[1])
categories.extend(encoder.categories_[2])
categories.extend(encoder.categories_[3])
categories

['female',
 'male',
 'C',
 'Q',
 'S',
 'Master',
 'Miss',
 'Mr',
 'Mrs',
 'Other',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'T',
 'none']

In [63]:
new_X[categories] = X_encoded
new_X = new_X.sort_index()
new_X

Unnamed: 0,Pclass,Age,SibSp,Parch,Num Of Cabins,female,male,C,Q,S,...,Mrs,Other,A,B,D,E,F,G,T,none
0,3,34.5,0,0,0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,3,47.0,1,0,0,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2,62.0,0,0,0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,3,27.0,0,0,0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,3,22.0,1,1,0,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,3,33.0,0,0,0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
414,1,39.0,0,0,1,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
415,3,38.5,0,0,0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
416,3,33.0,0,0,0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


---
### Getting Polynomial Features

In [64]:
'''X_poly = poly.transform(new_X)
columns = poly.get_feature_names()
poly_df = pd.DataFrame(X_poly, columns=columns)'''

'X_poly = poly.transform(new_X)\ncolumns = poly.get_feature_names()\npoly_df = pd.DataFrame(X_poly, columns=columns)'

In [65]:
'''new_X = poly_df.copy()
new_X'''

'new_X = poly_df.copy()\nnew_X'

---
### Getting only important features and Scaling

In [66]:
new_X_keep = new_X.copy()[columns_to_keep]
X_scaled = scaler2.transform(new_X_keep)
X_scaled

array([[ 4.28248304, 10.        , 10.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 5.85322945,  0.        , 10.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 7.73812516, 10.        ,  5.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 4.78512189, 10.        , 10.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 4.09399347, 10.        , 10.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.32420206,  0.        , 10.        , ...,  0.        ,
         0.        , 10.        ]])

---
### Predicting

In [67]:
survived = model.predict(new_X_keep)
len(survived)

418

In [68]:
survived_df = pd.DataFrame(data={'PassengerId': X['PassengerId'], 'Survived': survived})
survived_df.to_csv('survived.csv', index=False)
survived_df

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,0
415,1307,0
416,1308,0
