In [1]:
import re 
import pandas as pd 
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVC

In [2]:
# Train test
train = pd.read_table("/Users/mengxuan/Desktop/Titanic/train.csv", sep=',')
test = pd.read_table("/Users/mengxuan/Desktop/Titanic/test.csv", sep=',')
test['Survived'] = np.nan
train_test = pd.concat([train, test])
ids = test['PassengerId'].values

In [3]:
# Sex
train_test['Sex'] = train_test['Sex'].map({'female': 0, 'male': 1}).astype(int)

# Fare
train_test.loc[train_test.Fare.isnull(), 'Fare'] = np.median(train_test['Fare'].dropna())
train_test.loc[train_test.Fare < 7.91, 'Fare'] = 0
train_test.loc[(train_test.Fare >= 7.91) & (train_test.Fare < 14.45), 'Fare'] = 1
train_test.loc[(train_test.Fare >= 14.45) & (train_test.Fare < 31), 'Fare'] = 2
train_test.loc[train_test.Fare >= 31, 'Fare'] = 3
train_test['Fare'] = train_test['Fare'].astype(int)

# Name
def get_title(name):
    title_search = re.search(' ([a-zA-Z]+)\.', name)
    if title_search:
        return title_search.group(1)
    return ""
train_test['Title'] = train_test['Name'].apply(get_title)
train_test['Title'] = train_test['Title'].replace(['Lady', 'Countess','Capt', 'Col',\
 	'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
train_test['Title'] = train_test['Title'].replace('Mlle', 'Miss')
train_test['Title'] = train_test['Title'].replace('Ms', 'Miss')
train_test['Title'] = train_test['Title'].replace('Mme', 'Mrs')
train_test['Title'] = train_test['Title'].map({'Master': 0, 'Miss': 1, 'Mr': 2, 'Mrs': 3, 'Rare': 4})

# Set missing ages
age_df = train_test[['Age', 'Fare', 'Parch', 'SibSp', 'Title', 'Pclass']]
know_age = age_df[age_df.Age.notnull()]
unknow_age = age_df[age_df.Age.isnull()]
X = know_age.values[:, 1:]
y = know_age.values[:, 0]
rtf = RandomForestRegressor(n_estimators=2000, n_jobs=-1)
rtf.fit(X, y)
predictedAges = rtf.predict(unknow_age.values[:, 1:])
train_test.loc[train_test.Age.isnull(), 'Age'] = predictedAges
train_test.loc[train_test.Age < 16, 'Age'] = 0
train_test.loc[(train_test.Age >= 16) & (train_test.Age < 32), 'Age'] = 1
train_test.loc[(train_test.Age >= 32) & (train_test.Age < 48), 'Age'] = 2
train_test.loc[(train_test.Age >= 48) & (train_test.Age < 64), 'Age'] = 3
train_test.loc[train_test.Age >= 64, 'Age'] = 4
train_test['Age'] = train_test['Age'].astype(int)

In [4]:
# Feature Selection 
features = ['Age', 'Fare', 'Parch', 'SibSp', 'Title', 'Pclass', 'Sex', 'Survived']
drop_features = [f for f in train_test.columns if f not in features]

train_test = train_test.drop(drop_features, axis=1)
train = train_test[train_test.Survived.notnull()]
train['Survived'] = train['Survived'].astype(int)
test = train_test[train_test.Survived.isnull()]
print(train.head())

   Age  Fare  Parch  Pclass  Sex  SibSp  Survived  Title
0    1     0      0       3    1      1         0      2
1    2     3      0       1    0      1         1      3
2    1     1      0       3    0      0         1      1
3    2     3      0       1    0      1         1      3
4    2     1      0       3    1      0         0      2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [5]:
# Predict
target = 'Survived'
predictor = [c for c in train.columns if c not in [target]]

clf = SVC(probability=True)


classifiers = [
    SVC(probability=True),
    #DecisionTreeClassifier(),
    RandomForestClassifier(),
    #AdaBoostClassifier(),
    GradientBoostingClassifier()]

target = 'Survived'
predictor = [c for c in train.columns if c not in [target]]

results = []
for clf in classifiers:
    clf.fit(train[predictor], train[target])
    result = clf.predict(test[predictor])
    results.append(result)

results = np.array(results)
rsum = np.sum(results, axis=0)
result = map(lambda x: 1 if x >= 2 else 0, rsum)
result = list(map(int, result))
test_result = pd.DataFrame(columns=['PassengerId', 'Survived'])
test_result['PassengerId'] = ids
test_result['Survived'] = np.array(result)
test_result.to_csv('/Users/mengxuan/Desktop/Titanic/result.csv', index=False)