In [4]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

try:
    # Legacy location; this module was removed in scikit-learn 0.20
    from sklearn.cross_validation import KFold
except ImportError:
    # Current location (this KFold takes n_splits and exposes .split())
    from sklearn.model_selection import KFold

### load

In [5]:
# Read the training CSV into a DataFrame and preview the first rows
TRAIN_CSV = "train.csv"
titanic = pd.read_csv(TRAIN_CSV)
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
# Preview the last rows of the loaded frame
titanic.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


### clean up 

In [7]:
# Summary statistics of the numeric columns; a count below the row total
# (e.g. Age) signals missing values that need handling
titanic.describe()



Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,,0.0,0.0,7.9104
50%,446.0,0.0,3.0,,0.0,0.0,14.4542
75%,668.5,1.0,3.0,,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [8]:
# Impute missing ages with the median of the known ages
median_age = titanic["Age"].median()
titanic["Age"] = titanic["Age"].fillna(median_age)

In [9]:
# Re-check the summary after imputation: Age should now have a full count
titanic.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.361582,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,13.019697,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,22.0,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,35.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [10]:
# Encode Sex numerically: male -> 0, female -> 1.
# Mapping builds a genuinely integer column instead of writing integers
# cell-by-cell into the original string column, which leaves an object-dtype
# column behind and is rejected by pandas' dedicated string dtype.
sex_codes = {"male": 0, "female": 1}

# Values that are already encoded pass through unchanged, so re-running this
# cell is harmless; astype(int) fails loudly if an unexpected label or a
# missing value is still present.
titanic["Sex"] = (
    titanic["Sex"]
    .map(lambda value: sex_codes.get(value, value))
    .astype(int)
)

# Confirm the encoding on the first rows
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,S


In [11]:
# List the distinct values present in the "Embarked" column
embarked_values = titanic["Embarked"].unique()
print(embarked_values)

['S' 'C' 'Q' nan]


In [12]:
# Encode Embarked numerically: S -> 0, C -> 1, Q -> 2.
# Missing ports are filled with "S" first. Mapping then yields a real integer
# column rather than integers written one group at a time into a string column.
embarked_codes = {"S": 0, "C": 1, "Q": 2}

# Already-encoded values pass through unchanged (safe to re-run); astype(int)
# raises if any unexpected port label is left over.
titanic["Embarked"] = (
    titanic["Embarked"]
    .fillna("S")
    .map(lambda port: embarked_codes.get(port, port))
    .astype(int)
)

print(titanic["Embarked"].unique())

[0 1 2]


### predictions

In [20]:
# Imported in this cell so the loop always uses the current KFold API
# (n_splits / .split()), whichever KFold the import cell managed to bind.
from sklearn.model_selection import KFold

# The columns we'll use to predict the target
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

# Cast the feature matrix to float once: the encoded Sex/Embarked columns may
# still carry object dtype, and the estimator needs purely numeric input.
features = titanic[predictors].astype(float)
target = titanic["Survived"]

# Initialize our algorithm class
alg = LinearRegression()

# Three contiguous, unshuffled folds over the rows. Without shuffling the
# splits are fully deterministic, so no random_state is passed (it only
# affects shuffle=True, and newer scikit-learn rejects it otherwise).
kf = KFold(n_splits=3, shuffle=False)

predictions = []
for train, test in kf.split(features):
    # Fit only on the rows of the training folds
    train_predictors = features.iloc[train, :]
    train_target = target.iloc[train]
    alg.fit(train_predictors, train_target)
    # Predict the held-out fold; folds are visited in row order, so the
    # collected arrays line up with the original row order once concatenated
    test_predictions = alg.predict(features.iloc[test, :])
    predictions.append(test_predictions)
predictions

[array([  8.99877810e-02,   9.60756206e-01,   5.92676278e-01,
          9.31138728e-01,   5.29343071e-02,   1.70275685e-01,
          3.69943590e-01,   1.03474847e-01,   5.21597906e-01,
          8.74491050e-01,   6.48883611e-01,   8.29742769e-01,
          1.34797198e-01,  -1.61126844e-01,   6.58141307e-01,
          6.39819748e-01,   1.51733875e-01,   2.95432718e-01,
          5.35377959e-01,   6.21007683e-01,   2.61872592e-01,
          2.62687561e-01,   7.31739160e-01,   5.05995897e-01,
          5.61398567e-01,   3.35039734e-01,   1.30338808e-01,
          4.68765767e-01,   6.60737753e-01,   9.10819218e-02,
          4.77223920e-01,   1.04220026e+00,   6.60691613e-01,
          8.71539273e-02,   5.28550732e-01,   4.01874338e-01,
          1.30340307e-01,   1.29339672e-01,   5.72717129e-01,
          6.65238822e-01,   4.83215779e-01,   7.60807408e-01,
          1.30578363e-01,   8.71867121e-01,   7.09855487e-01,
          9.11369897e-02,   1.39181745e-01,   6.60691613e-01,
        

In [22]:
# The cross-validation loop leaves one NumPy array per fold.
# Join them into a single 1-D array (they only have one axis).
predictions = np.concatenate(predictions, axis=0)

# Map the continuous regression outputs to the two possible outcomes:
# above 0.5 -> 1 (survived), otherwise -> 0
predictions = np.where(predictions > .5, 1.0, 0.0)

# Accuracy is the share of rows whose predicted outcome equals the recorded
# one. Averaging the element-wise matches counts correct 0-predictions as well
# as correct 1-predictions; summing the matching prediction values would only
# count the correctly predicted survivors.
accuracy = (predictions == titanic["Survived"].values).mean()
accuracy



In [27]:
# Sanity check: the concatenated array should hold one prediction per row
predictions.shape

(891,)

In [18]:
# Neither a list nor a NumPy array has .describe(); wrap the concatenated
# 1-D predictions in a pandas Series to get summary statistics.
pd.Series(predictions).describe()