In [1]:
import pandas as pd
train = pd.read_csv('../data/titanic/train.csv')
test = pd.read_csv('../data/titanic/test.csv')

In [2]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


## Baseline

In [4]:
from sklearn.tree import DecisionTreeClassifier

In [5]:
tree = DecisionTreeClassifier()

In [6]:
X = train.drop('Survived', axis=1)
y = train.Survived

In [7]:
X.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Preprocessing

In [8]:
train['PassengerId'] = train.PassengerId.astype(str)

In [9]:
train.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
PassengerId,891,891.0,235,1.0,,,,,,,
Survived,891,,,,0.383838,0.486592,0.0,0.0,0.0,1.0,1.0
Pclass,891,,,,2.30864,0.836071,1.0,2.0,3.0,3.0,3.0
Name,891,891.0,"Williams, Mr. Charles Duane",1.0,,,,,,,
Sex,891,2.0,male,577.0,,,,,,,
Age,714,,,,29.6991,14.5265,0.42,20.125,28.0,38.0,80.0
SibSp,891,,,,0.523008,1.10274,0.0,0.0,0.0,1.0,8.0
Parch,891,,,,0.381594,0.806057,0.0,0.0,0.0,0.0,6.0
Ticket,891,681.0,347082,7.0,,,,,,,
Fare,891,,,,32.2042,49.6934,0.0,7.9104,14.4542,31.0,512.329


In [10]:
drop_cols = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'Survived']
keep_cols = [c for c in train.columns if c not in drop_cols]

In [11]:
X = train[keep_cols].copy()
y = train.Survived

In [12]:
cat_cols = ['Sex', 'Embarked']

In [13]:
from sklearn.preprocessing import LabelEncoder

In [14]:
for c in cat_cols:
    X[c] = X[c].astype(str)

In [15]:
lencoders = {}
for c in cat_cols:
    lencoders[c] = LabelEncoder()
    X[c] = lencoders[c].fit_transform(X[c])

In [16]:
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,22.0,1,0,7.25,2
1,1,0,38.0,1,0,71.2833,0
2,3,0,26.0,0,0,7.925,2
3,1,0,35.0,1,0,53.1,2
4,3,1,35.0,0,0,8.05,2


In [17]:
mean_age = X.Age.mean()

In [18]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [19]:
X['AgeMissing'] = X.Age.isnull()
X['Age'] = X.Age.fillna(mean_age)

In [20]:
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,AgeMissing
0,3,1,22.0,1,0,7.25,2,False
1,1,0,38.0,1,0,71.2833,0,False
2,3,0,26.0,0,0,7.925,2,False
3,1,0,35.0,1,0,53.1,2,False
4,3,1,35.0,0,0,8.05,2,False


In [21]:
tree.fit(X, y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [22]:
y_pred = tree.predict(X)

In [23]:
len(y_pred), len(X)

(891, 891)

In [24]:
from sklearn.metrics import accuracy_score

In [25]:
train_acc = accuracy_score(y, y_pred)
print(f'Train accuracy: {train_acc:.2%}')

Train accuracy: 98.20%


# Test

In [100]:
test.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
PassengerId,418,,,,1100.5,120.81,892.0,996.25,1100.5,1204.75,1309.0
Pclass,418,,,,2.26555,0.841838,1.0,1.0,3.0,3.0,3.0
Name,418,418.0,"Fleming, Miss. Honora",1.0,,,,,,,
Sex,418,2.0,male,266.0,,,,,,,
Age,332,,,,30.2726,14.1812,0.17,21.0,27.0,39.0,76.0
SibSp,418,,,,0.447368,0.89676,0.0,0.0,0.0,1.0,8.0
Parch,418,,,,0.392344,0.981429,0.0,0.0,0.0,0.0,9.0
Ticket,418,363.0,PC 17608,5.0,,,,,,,
Fare,417,,,,35.6272,55.9076,0.0,7.8958,14.4542,31.5,512.329
Cabin,91,76.0,B57 B59 B63 B66,3.0,,,,,,,


In [93]:
def process_data(X, keep_cols, cat_cols, lencoders,
                 mean_age):
    X = X[keep_cols].copy()
    for c in cat_cols:
        X[c] = X[c].astype(str)
    for c in cat_cols:
        X[c] = lencoders[c].transform(X[c])
    X['AgeMissing'] = X.Age.isnull()
    X['Age'] = X.Age.fillna(mean_age)
    return X

In [94]:
X_test = process_data(test, keep_cols, cat_cols,
                      lencoders, mean_age)

In [102]:
X_test.isnull().sum()

Pclass        0
Sex           0
Age           0
SibSp         0
Parch         0
Fare          0
Embarked      0
AgeMissing    0
dtype: int64

In [101]:
X_test['Fare'] = X_test.Fare.fillna(X.Fare.mean())

In [103]:
y_test = tree.predict(X_test)

In [114]:
y_prob = tree.predict_proba(X_test)

In [104]:
len(y_test), len(X_test)

(418, 418)

In [105]:
submission = pd.read_csv('gender_submission.csv')

In [107]:
submission.PassengerId.tolist() == test.PassengerId.tolist()

True

In [109]:
submission['Survived'] = y_test

In [110]:
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,1
3,895,1
4,896,1


In [111]:
submission.to_csv('submission.csv', index=False)

## Lasso

In [116]:
from sklearn.linear_model import Lasso

In [117]:
model = Lasso()

In [118]:
model.fit(X, y)

Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [119]:
model.coef_

array([-0.        , -0.        , -0.        , -0.        ,  0.        ,
        0.00211411, -0.        , -0.        ])