In [156]:
import pandas as pd

test = pd.read_csv('./test.csv')
train = pd.read_csv('./train.csv')

In [157]:
def get_x_data(df):
    df['Embarked'] = df['Embarked'].replace({'S': 0, 'C': 1, 'Q': 2})
    df['Embarked'] = df['Embarked'].fillna(2)
    df['Sex'] = df['Sex'].replace({'male': 0, 'female': 1})
    df['Sex'] = df['Sex'].fillna(0)
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    return df


def get_y_data(df):
    return df['Survived']


def clean_data(df):
    df.Embarked.dropna(inplace=True)


def drop_x_data(df):
    data_to_use = df.drop(
        ['Cabin', 'Name', 'Ticket', 'PassengerId'], axis='columns')

    if 'Survived' in data_to_use.columns:
        data_to_use = data_to_use.drop(['Survived'], axis='columns')

    return data_to_use


def create_custom_columns(df):
    df['Age'].fillna(df['Age'].median(), inplace=True)

    for dataset in [df]:
        title = dataset['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
        title = title.replace(['Lady', 'Countess', 'Capt', 'Col', 'Don',
                              'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
        title = title.replace('Mlle', 'Miss').replace(
            'Ms', 'Miss').replace('Mme', 'Mrs')

        dataset['Title'] = title

    title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}

    df['Title'] = df['Title'].replace(title_mapping)

    return df

In [158]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import numpy as np

np.random.seed(20)

clean_data(train)

x = drop_x_data(create_custom_columns(get_x_data(train)))
y = get_y_data(train)

x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y, test_size=0.25)

print(x_train)
model = SVC()
model.fit(x_train, y_train)

predictions = model.predict(x_test)

score = accuracy_score(predictions, y_test)
print(score)



     Pclass  Sex   Age  SibSp  Parch     Fare  Embarked  FamilySize  Title
886       2    0  27.0      0      0  13.0000       0.0           1      5
241       3    1  28.0      1      0  15.5000       2.0           2      2
188       3    0  40.0      1      1  15.5000       2.0           3      1
23        1    0  28.0      0      0  35.5000       0.0           1      1
412       1    1  33.0      1      0  90.0000       2.0           2      2
..      ...  ...   ...    ...    ...      ...       ...         ...    ...
327       2    1  36.0      0      0  13.0000       0.0           1      3
671       1    0  31.0      1      0  52.0000       0.0           2      1
751       3    0   6.0      0      1  12.4750       0.0           2      4
421       3    0  21.0      0      0   7.7333       2.0           1      1
711       1    0  28.0      0      0  26.5500       0.0           1      1

[668 rows x 9 columns]
0.7174887892376681


In [159]:

print(train[['Survived', 'SibSp']].groupby(['SibSp']).mean())
print(train[['Survived', 'Parch']].groupby(['Parch']).mean())

       Survived
SibSp          
0      0.345395
1      0.535885
2      0.464286
3      0.250000
4      0.166667
5      0.000000
8      0.000000
       Survived
Parch          
0      0.343658
1      0.550847
2      0.500000
3      0.600000
4      0.000000
5      0.200000
6      0.000000
