In [2465]:
import pandas as pd
import os

from sklearn.neighbors import KNeighborsClassifier

# Directory (relative to the working directory) that holds the Titanic CSV
# files; used as the default location by load_titanic_data.
TITANIC_PATH = os.path.join("datasets", "titanic")


def load_titanic_data(filename, titanic_path=TITANIC_PATH):
    """Load one Titanic CSV file as a pandas DataFrame.

    filename     -- name of the CSV file inside the dataset directory.
    titanic_path -- directory containing the file (defaults to TITANIC_PATH).

    Returns whatever ``pd.read_csv`` yields for the joined path.
    """
    return pd.read_csv(os.path.join(titanic_path, filename))
# Read both dataset files from the loader's default directory (TITANIC_PATH).
# Later cells rebind these two names to the preprocessed frames.
train_data = load_titanic_data("train.csv")
test_data = load_titanic_data("test.csv")

In [2466]:
# Preview the first rows of the raw training frame.
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [2467]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called bare; wrapping it in print() only appended a stray "None" line.
test_data.info()
# Row/column counts of both frames, to compare their sizes at a glance.
print(train_data.shape, test_data.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB
None
((891, 12), (418, 11))


In [2468]:
from sklearn import preprocessing
def preprocess(data):
    """Turn a raw Titanic frame into the numeric feature frame used for modelling.

    The input frame is not modified; a new frame is returned in which

    * ``Ticket`` and ``Cabin`` are dropped,
    * ``Sex`` is replaced by an integer code (female=0, male=1),
    * ``FamilySize`` is added as ``SibSp + Parch``,
    * ``Title`` is added: the first run of letters followed by a dot in
      ``Name``, with a few rare spellings merged, replaced by an integer code,
    * ``Name``, ``Age``, ``Fare``, ``SibSp``, ``Parch`` and ``Embarked`` are
      dropped.

    The codes come from fixed tables rather than from an encoder fitted on
    ``data``.  Fitting a fresh LabelEncoder per call numbers whatever values
    happen to be present in that frame, so the same title (or sex) could get
    different integers in the training and the test frame, and the model would
    then be applied to mis-coded features.

    Values that are missing or not in the tables are encoded as -1.

    Raises KeyError if one of the columns listed above is absent (for example
    when the function is applied a second time to its own output).
    """
    unknown_code = -1
    sex_codes = {'female': 0, 'male': 1}
    # Rare spellings folded into a more common title before encoding.
    title_aliases = {
        'Countess': 'Royal', 'Lady': 'Royal', 'Sir': 'Royal',
        'Mlle': 'Miss', 'Ms': 'Miss',
        'Mme': 'Mrs',
    }
    # Alphabetical numbering, i.e. what a LabelEncoder produces for this
    # vocabulary.  NOTE(review): the vocabulary is assumed to be the full set
    # of titles in train.csv -- confirm against train_data['Title'].unique().
    title_codes = {
        'Capt': 0, 'Col': 1, 'Don': 2, 'Dr': 3, 'Jonkheer': 4, 'Major': 5,
        'Master': 6, 'Miss': 7, 'Mr': 8, 'Mrs': 9, 'Rev': 10, 'Royal': 11,
    }

    data = data.drop(['Ticket', 'Cabin'], axis=1)
    data['Sex'] = data['Sex'].map(sex_codes).fillna(unknown_code).astype('int64')
    data['FamilySize'] = data['SibSp'] + data['Parch']

    # Raw string avoids the invalid "\." escape; expand=False guarantees a
    # Series regardless of the pandas version's default.
    titles = data['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)
    titles = titles.replace(title_aliases)
    data['Title'] = titles.map(title_codes).fillna(unknown_code).astype('int64')

    data = data.drop(['Name', 'Age', 'Fare', 'SibSp', 'Parch', 'Embarked'], axis=1)
    return data

# Replace both raw frames with their preprocessed versions and preview the
# training result.
# NOTE: this rebinds train_data/test_data, so the cell is not re-runnable on
# its own: a second run feeds preprocess() a frame whose 'Ticket'/'Cabin'
# columns are already gone. Re-run the loading cell first.
train_data = preprocess(train_data)
test_data = preprocess(test_data)
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,FamilySize,Title
0,1,0,3,1,1,8
1,2,1,1,0,1,9
2,3,1,3,0,0,7
3,4,1,1,0,1,9
4,5,0,3,1,0,8


In [2469]:
# Column, non-null count and dtype summary of the preprocessed training frame.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 6 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Sex            891 non-null int64
FamilySize     891 non-null int64
Title          891 non-null int64
dtypes: int64(6)
memory usage: 41.8 KB


In [2470]:
# Distinct integer codes present in the encoded Title column.
train_data['Title'].unique()

array([ 8,  9,  7,  6,  2, 10,  3,  5, 11,  1,  0,  4])

In [2471]:
# Separate the preprocessed frames into the training target and the model
# inputs. PassengerId is an identifier, not a feature, so it is removed from
# both input matrices.
y_train = train_data.Survived.values
X_train = train_data.drop(['PassengerId', 'Survived'], axis=1)
X_test = test_data.drop(['PassengerId'], axis=1)

In [2472]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
# Hold out 20% of the labelled rows for a first-round comparison. The split is
# seeded so the scores below are reproducible across kernel restarts.
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42)
all_features = X_train1.columns
all_transformer = Pipeline(steps = [('stdscaler', StandardScaler())])

# NOTE(review): this scaling transformer is constructed but not applied by the
# loop below, which fits the bare estimators on unscaled features. It is kept
# as-is so the evaluation matches how the final model is trained.
all_preprocess = ColumnTransformer(transformers = [('allfeatures', all_transformer, all_features)])

classifiers = [LogisticRegression(),RandomForestClassifier(random_state=42),KNeighborsClassifier(),SVC()]
# Hold-out accuracy per classifier, keyed by class name.
first_round_scores = {}
for classifier in classifiers:
    classifier.fit(X_train1, y_train1)
    label = classifier.__class__.__name__
    score = classifier.score(X_test1, y_test1)
    first_round_scores[label] = score
    print(label[:10])
    print("model score: %.3f" % score)

LogisticRe
model score: 0.810
RandomFore
model score: 0.849
KNeighbors
model score: 0.821
SVC
model score: 0.838


In [2473]:
# Choose the classifier with the highest hold-out accuracy instead of a
# hard-coded index: classifiers[0] is LogisticRegression, which scored lowest
# in the comparison above. Every entry of `classifiers` has already been
# fitted on X_train1 by the evaluation loop, so score() can be called directly.
BestClassifier = max(classifiers, key=lambda clf: clf.score(X_test1, y_test1))
# Refit the winner on all labelled rows before predicting the unlabelled set.
BestClassifier.fit(X_train,y_train)
y_pred = BestClassifier.predict(X_test)

In [2474]:
# Two-column submission table: passenger id and predicted survival.
submission = pd.DataFrame({
        "PassengerId": test_data["PassengerId"],
        "Survived": y_pred
    })
# to_csv does not create missing directories, so make sure the target folder
# exists first (isdir/makedirs rather than exist_ok, which Python 2 lacks).
submission_path = './output/submission.csv'
output_dir = os.path.dirname(submission_path)
if not os.path.isdir(output_dir):
    os.makedirs(output_dir)
submission.to_csv(submission_path, index=False)