In [1]:
import numpy as np
import pandas as pd
import numpy as np
import re as re

# Load the Titanic training and test splits from the local ./titanic folder.
train_set = pd.read_csv('./titanic/train.csv')
test_set = pd.read_csv('./titanic/test.csv')

# Both frames receive the same feature engineering, so they are kept in a
# list and transformed in lockstep.
full_data = [train_set, test_set]

# Hand-crafted features.
for dataset in full_data:
    # Number of characters in the passenger's name.
    dataset['Name_length'] = dataset['Name'].apply(len)
    # 1 when a cabin is recorded, 0 when the Cabin cell is missing (NaN).
    dataset['Has_Cabin'] = dataset['Cabin'].notnull().astype(int)

train_set.info()

# Family size = siblings/spouses + parents/children + the passenger.
for dataset in full_data:
    dataset['Family_Size'] = dataset['SibSp'] + dataset['Parch'] + 1
print(train_set[['Family_Size', 'Survived']].groupby(['Family_Size'], as_index=False).mean())

# Flag passengers travelling without any family member.
for dataset in full_data:
    dataset['isAlone'] = (dataset['Family_Size'] == 1).astype(int)
print(train_set[['isAlone', 'Survived']].groupby(['isAlone'], as_index=False).mean())

# Impute missing embarkation ports with 'S'. fillna() returns a new Series,
# so the result has to be assigned back for the imputation to take effect.
for dataset in full_data:
    dataset['Embarked'] = dataset['Embarked'].fillna('S')
print(train_set[['Embarked', 'Survived']].groupby(['Embarked'], as_index=False).mean())

# Impute missing fares in both frames with the training-set median, so that
# no statistic computed on the test split is used for imputation.
fare_median = train_set['Fare'].median()
for dataset in full_data:
    dataset['Fare'] = dataset['Fare'].fillna(fare_median)

# Quartile bins of the training fares, used only for the exploratory
# survival-rate table below.
train_set['Cat_Fare'] = pd.qcut(train_set['Fare'], 4)
print(train_set[['Cat_Fare', 'Survived']].groupby(['Cat_Fare'], as_index=False).mean())
# Fill missing ages with random integers drawn from [mean - std, mean + std)
# of the ages observed in the same frame, then store Age as integers.
for dataset in full_data:
    mean = dataset['Age'].mean()
    std = dataset['Age'].std()
    age_is_null = dataset['Age'].isnull()
    null_count = age_is_null.sum()

    # randint works on integer bounds; truncate the float bounds explicitly.
    age_null_list = np.random.randint(int(mean - std), int(mean + std), size=null_count)
    # Write through .loc on the frame itself. Chained indexing
    # (dataset['Age'][mask] = ...) may assign to a temporary copy and leave
    # the NaNs in place, which would make the integer cast below fail.
    dataset.loc[age_is_null, 'Age'] = age_null_list
    dataset['Age'] = dataset['Age'].astype(int)

# Quintile bins of the training ages, used only for the exploratory
# survival-rate table below.
train_set['CatAge'] = pd.qcut(train_set['Age'], 5)
print(train_set[['CatAge', 'Survived']].groupby(['CatAge'], as_index=False).mean())


# A title is a run of ASCII letters preceded by a space and followed by a
# period. Raw string so that "\." is a regex escape, not a string escape.
_TITLE_PATTERN = re.compile(r' ([A-Za-z]+)\.')


def get_title(name):
    """Return the honorific found in a passenger name.

    Args:
        name: Passenger name string, e.g. "Braund, Mr. Owen Harris".

    Returns:
        The first run of letters that follows a space and ends with a
        period (without the period), or "" when no such run exists.
    """
    search_t = _TITLE_PATTERN.search(name)
    if search_t:
        return search_t.group(1)
    return ""


# Extract the raw title from every name in both frames.
for frame in full_data:
    frame['Title'] = frame['Name'].apply(get_title)
print(pd.crosstab(train_set['Title'], train_set['Sex']))

# Collapse uncommon titles into a single 'Rare' bucket and fold the
# alternative spellings into 'Miss' / 'Mrs'.
rare_titles = ['Lady', 'Countess', 'Capt', 'Col',
               'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
title_synonyms = [('Mlle', 'Miss'), ('Ms', 'Miss'), ('Mme', 'Mrs')]
for frame in full_data:
    titles = frame['Title'].replace(rare_titles, 'Rare')
    for variant, canonical in title_synonyms:
        titles = titles.replace(variant, canonical)
    frame['Title'] = titles

print(train_set[['Title', 'Survived']].groupby(['Title'], as_index=False).mean())
# Lookup tables and bin edges do not depend on the frame, so build them once.
sex_mapping = {'female': 0, 'male': 1}
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
embarked_mapping = {'S': 0, 'C': 1, 'Q': 2}
# Right-closed intervals: (-inf, 7.91] -> 0, (7.91, 14.454] -> 1,
# (14.454, 31] -> 2, (31, inf) -> 3.
fare_bins = [-np.inf, 7.91, 14.454, 31, np.inf]
# Right-closed intervals: (-inf, 16] -> 0, (16, 32] -> 1, (32, 48] -> 2,
# (48, 64] -> 3, (64, inf) -> 4.
age_bins = [-np.inf, 16, 32, 48, 64, np.inf]

# Replace the categorical / continuous columns with small integer codes.
for dataset in full_data:
    # Mapping Sex
    dataset['Sex'] = dataset['Sex'].map(sex_mapping).astype(int)

    # Mapping Title; titles outside the table become 0. The cast keeps the
    # column integer even when unmapped titles produced NaN.
    dataset['Title'] = dataset['Title'].map(title_mapping).fillna(0).astype(int)

    # Mapping Embarked; missing or unknown ports fall back to code 0 ('S').
    dataset['Embarked'] = dataset['Embarked'].map(embarked_mapping).fillna(0).astype(int)

    # Mapping Fare. pd.cut computes every code from the original values, so
    # the result does not depend on already-written codes staying below the
    # next threshold. Missing fares fall back to code 0.
    dataset['Fare'] = pd.cut(dataset['Fare'], bins=fare_bins, labels=False).fillna(0).astype(int)

    # Mapping Age
    dataset['Age'] = pd.cut(dataset['Age'], bins=age_bins, labels=False).astype(int)

# Feature Selection
# Columns removed from both frames; the training frame additionally loses
# its identifier and the two exploratory bin columns.
drop_elements = ['Name', 'SibSp', 'Ticket', 'Cabin', 'Parch', 'Family_Size']
train_set = train_set.drop(drop_elements + ['PassengerId', 'CatAge', 'Cat_Fare'], axis=1)
test_set = test_set.drop(drop_elements, axis=1)

for frame in (train_set, test_set):
    print(frame.head(10))

# Plain arrays for scikit-learn. PassengerId stays in test_set (it is needed
# for the submission file) but is excluded from the feature matrix.
train = train_set.values
test = test_set.drop('PassengerId', axis=1).values
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression

# Column 0 of the training array is the label, the remaining columns are
# the features.
X = train[:, 1:]
y = train[:, 0]

# Candidate models, compared with default settings apart from the
# arguments given here.
classifiers = [
    KNeighborsClassifier(3),
    SVC(probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis(),
    LogisticRegression(),
]

# Ten stratified shuffles, each holding out 10% of the rows; the fixed seed
# makes the splits repeatable.
sss = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=0)

# Accumulated accuracy per classifier name, and the table that will hold
# the final per-classifier result.
acc_dict = {}
log_cols = ["Classifier", "Accuracy"]
log = pd.DataFrame(columns=log_cols)

# Fit every classifier on each split and accumulate its held-out accuracy
# under the classifier's class name.
for train_index, test_index in sss.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    for clf in classifiers:
        name = clf.__class__.__name__
        clf.fit(X_train, y_train)
        train_predictions = clf.predict(X_test)
        acc = accuracy_score(y_test, train_predictions)
        acc_dict[name] = acc_dict.get(name, 0) + acc

# Turn the accumulated sums into means. The divisor comes from the splitter
# so it stays correct if the number of splits is changed.
n_splits = sss.get_n_splits()
for name in acc_dict:
    acc_dict[name] = acc_dict[name] / n_splits

# Build the result table in one step; DataFrame.append no longer exists in
# pandas 2.x.
log = pd.DataFrame(list(acc_dict.items()), columns=log_cols)

# Horizontal bar chart of the mean accuracy per classifier.
plt.xlabel('Accuracy')
plt.title('Classifier Accuracy')

sns.set_color_codes("muted")
sns.barplot(x='Accuracy', y='Classifier', data=log, color="b")

# Refit the chosen model on the full training array (column 0 is the
# label), predict the test rows and write the submission file.
train_features = train[:, 1:]
train_labels = train[:, 0]
candidate_classifier = SVC(probability=True)
candidate_classifier.fit(train_features, train_labels)
y_result = candidate_classifier.predict(test)

submission_columns = {
    "PassengerId": test_set["PassengerId"],
    "Survived": y_result,
}
submission = pd.DataFrame(submission_columns)
submission.to_csv('titanic.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
Name_length    891 non-null int64
Has_Cabin      891 non-null int64
dtypes: float64(2), int64(7), object(5)
memory usage: 97.5+ KB
   Family_Size  Survived
0            1  0.303538
1            2  0.552795
2            3  0.578431
3            4  0.724138
4            5  0.200000
5            6  0.136364
6            7  0.333333
7            8  0.000000
8           11  0.000000
   isAlone  Survived
0        0  0.505650
1        1  0.303538
  Embarked  Survived
0        C  0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



    Title  Survived
0  Master  0.575000
1    Miss  0.702703
2      Mr  0.156673
3     Mrs  0.793651
4    Rare  0.347826
   Survived  Pclass  Sex  Age  Fare  Embarked  Name_length  Has_Cabin  \
0         0       3    1    1     0         0           23          0   
1         1       1    0    2     3         1           51          1   
2         1       3    0    1     1         0           22          0   
3         1       1    0    2     3         0           44          1   
4         0       3    1    2     1         0           24          0   
5         0       3    1    2     1         2           16          0   
6         0       1    1    3     3         0           23          1   
7         0       3    1    0     2         0           30          0   
8         1       3    0    1     1         0           49          0   
9         1       2    0    0     2         1           35          0   

   isAlone  Title  
0        0      1  
1        0      3  
2        1     