In [91]:
import pandas as pd
import numpy as np
from __future__ import division
# Import the linear regression class
from sklearn.linear_model import LinearRegression
# Sklearn also has a helper that makes it easy to do cross validation
from sklearn.cross_validation import KFold
from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression

In [70]:
# Load the Kaggle Titanic training set. The path is relative, so train.csv
# must be in the notebook's working directory.
data = pd.read_csv("train.csv")

# EDA

In [71]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S


In [72]:
data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


# Data Munging

**Notice that the age count is less than the rest of the columns, which means that we are missing data** 


What should we do about this? We have a few options:
        1. List wise deletion: Delete observations where any of the variables is missing
        2. Pair Wise Deletion: Include all cases in data analysis in which the variables of interest are present
        3. Mean/Mode Substitution: Replace missing values with the mean or mode of the other values in that column


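We'll use a variant of option 3 here: we fill the missing ages with the column's median, which (unlike the mean) is not pulled around by a few very old or very young passengers.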


In [73]:
# Replace every missing age with the median of the ages that were recorded.
median_age = data["Age"].median()
data["Age"] = data["Age"].fillna(median_age)

**We can use describe again to verify that the Age count now matches the other columns**

In [74]:
data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.361582,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,13.019697,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,22.0,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,35.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


The numeric columns above now look ready to analyze, but what about the remaining columns in the data? We need to convert these to numeric equivalents so that our model can analyze them properly. We will not be using the ticket and name columns because those most likely will not have an impact on our model.

Let's start with sex. We first need to see how many unique values are in the column, and then come up with a numbering system to replace them.

In [75]:
data.Sex.unique()

array(['male', 'female'], dtype=object)

Now that we know the different sexes listed, we can replace them. It's easiest just to use 0 and 1 in this case.

In [76]:
# Encode Sex as a number: male -> 0, female -> 1.
# .loc selects the matching rows and the "Sex" column in a single assignment.
for sex_label, sex_code in (("male", 0), ("female", 1)):
    data.loc[data["Sex"] == sex_label, "Sex"] = sex_code

In [77]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,22,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",1,26,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",0,35,0,0,373450,8.05,,S


Now let's do the same thing to the embarked column.

In [78]:
#See if any of the values in the column are missing
data['Embarked'].isnull().values.any()

True

In [79]:
data["Embarked"].value_counts()

S    644
C    168
Q     77
dtype: int64

In [80]:
# "S" is by far the most frequent port in the counts above, so use it for the
# passengers whose port of embarkation is missing.
most_common_port = "S"
data["Embarked"] = data["Embarked"].fillna(most_common_port)

In [81]:
#Double check to be safe
data['Embarked'].isnull().values.any()

False

In [82]:
# Encode the port of embarkation as an integer: S -> 0, C -> 1, Q -> 2.
for port_label, port_code in (("S", 0), ("C", 1), ("Q", 2)):
    data.loc[data["Embarked"] == port_label, "Embarked"] = port_code

# Cross Validation

We'll use a helper from sklearn to split the data up into cross validation folds, and then train an algorithm for each fold, and make predictions.

In [83]:
# Feature columns fed to the model.
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

# Linear regression model; it is refit on each fold's training rows below.
alg = LinearRegression()

# Split the row positions into 3 cross-validation folds. Iterating over `kf`
# yields (train row indices, test row indices) pairs.
# NOTE: random_state is only consulted when shuffle=True. With the default
# shuffle=False used here the folds are consecutive row ranges, so the split
# is already the same on every run and the per-fold predictions come out in
# the original row order.
kf = KFold(data.shape[0], n_folds=3, random_state=1)

features = data[predictors]
target = data["Survived"]

predictions = []
for train, test in kf:
    # Fit using only the rows that belong to this fold's training part ...
    alg.fit(features.iloc[train, :], target.iloc[train])
    # ... then predict the held-out rows and keep the per-fold result.
    predictions.append(alg.predict(features.iloc[test, :]))

# Evaluating Error

We need to define some sort of error metric in order to see how accurate the model is. From the Kaggle competition description, the error metric is percentage of correct predictions. This generally involves finding the number of values in predictions that are the exact same as their counterparts in data["Survived"], and then dividing by the total number of passengers.

In [84]:
# The per-fold predictions are separate numpy arrays. Join them into a single
# 1-D array (axis 0 is their only axis).
predictions = np.concatenate(predictions, axis=0)

# Map the regression outputs to the two possible outcomes:
# above 0.5 -> 1 (survived), otherwise -> 0.
predictions = np.where(predictions > .5, 1.0, 0.0)

# Accuracy is the fraction of passengers whose predicted outcome equals the
# recorded one. The comparison is made element-wise against the raw values and
# then averaged. Summing `predictions[mask]` instead would add up the selected
# prediction *values*, so every correctly predicted 0 would contribute nothing.
accuracy = np.mean(predictions == data["Survived"].values)



In [88]:
accuracy

0.7833894500561167

# Logistic Regression

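Our accuracy isn't great. Linear regression can output values below 0 or above 1, which we then had to threshold by hand; logistic regression instead squeezes its output into the range 0 to 1, so it can be read as a probability of survival. Let's try it.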

In [92]:
# Logistic regression model with a fixed seed.
alg = LogisticRegression(random_state=1)

# Score the model with 3-fold cross validation; one score comes back per fold.
features, target = data[predictors], data["Survived"]
scores = cross_validation.cross_val_score(alg, features, target, cv=3)

# Report the average of the per-fold scores.
mean_score = scores.mean()
print(mean_score)

0.787878787879


This isn't great, but let's go ahead anyway so that we can get a first submission in. (We'll come back later and improve the algorithm). In order to get our submission, we have to read in the test data set from Kaggle and perform the same steps as we did on the training data.

In [95]:
# Load the Kaggle test set; test.csv must be in the notebook's working directory.
titanic_test = pd.read_csv("test.csv")

# Fill missing numeric values with medians taken from the TRAINING data, so
# that no statistic is derived from the test set itself and both columns are
# imputed the same way. (Fare has no missing values in the training data, so
# its median is well defined.)
for numeric_column in ("Age", "Fare"):
    titanic_test[numeric_column] = titanic_test[numeric_column].fillna(
        data[numeric_column].median()
    )

# Missing ports get "S", the same fill value used for the training data.
titanic_test["Embarked"] = titanic_test["Embarked"].fillna("S")

# Apply the same numeric encodings that were applied to the training data.
for sex_label, sex_code in (("male", 0), ("female", 1)):
    titanic_test.loc[titanic_test["Sex"] == sex_label, "Sex"] = sex_code

for port_label, port_code in (("S", 0), ("C", 1), ("Q", 2)):
    titanic_test.loc[titanic_test["Embarked"] == port_label, "Embarked"] = port_code

To generate our submission we have to train an algorithm on the training data. Then, we make predictions on the test set. Finally, we'll generate a csv file with the predictions and passenger ids.

In [98]:
# Fit a fresh logistic regression on all of the training rows.
alg = LogisticRegression(random_state=1)
alg.fit(data[predictors], data["Survived"])

# Predict an outcome for every passenger in the test set.
predictions = alg.predict(titanic_test[predictors])

# Build the submission frame from the two columns used for the Kaggle upload:
# the passenger id and the predicted outcome.
submission_columns = {
    "PassengerId": titanic_test["PassengerId"],
    "Survived": predictions,
}
submission = pd.DataFrame(submission_columns)

In [100]:
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [102]:
# Export the submission to CSV so it can be uploaded to Kaggle.
# index=False keeps the DataFrame's row index out of the file.
submission.to_csv("kaggle.csv", index=False)