In [58]:
#Dataset can be downloaded from kaggle website under titanic competition
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import KFold

In [59]:
import pandas
import numpy as np

In [60]:
# Read the Kaggle Titanic training data; "train.csv" is resolved relative to the
# current working directory.
titanic = pandas.read_csv("train.csv")

In [61]:
# Quick look at the raw data: the first rows, summary statistics, and the
# distinct values of the columns that are imputed in a later cell.
print(titanic.head(5))
print(titanic.describe())
for column in ("Age", "Embarked", "Fare"):
    print(titanic[column].unique())

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex  Age  SibSp  \
0                            Braund, Mr. Owen Harris    male   22      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female   38      1   
2                             Heikkinen, Miss. Laina  female   26      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female   35      1   
4                           Allen, Mr. William Henry    male   35      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
       P

In [62]:
# Make up for missing data, one column at a time:
#   Age      -> mean of the known ages
#   Embarked -> "S"
#   Fare     -> median of the known fares
fill_values = (
    ("Age", titanic["Age"].mean()),
    ("Embarked", "S"),
    ("Fare", titanic["Fare"].median()),
)
for column, value in fill_values:
    titanic[column] = titanic[column].fillna(value)

In [63]:
# Convert the non-numeric columns to integer codes.
#   Sex:      male -> 0, female -> 1
#   Embarked: S -> 0, C -> 1, Q -> 2
# Rows holding any other value are left untouched.
encodings = (
    ("Sex", (("male", 0), ("female", 1))),
    ("Embarked", (("S", 0), ("C", 1), ("Q", 2))),
)
for column, codes in encodings:
    for label, code in codes:
        titanic.loc[titanic[column] == label, column] = code


In [64]:
# Columns used as model inputs. "Sex" and "Embarked" are the integer-encoded
# versions produced by the encoding cell; "Survived" is the target and is
# deliberately not listed here.
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

In [65]:
# Algorithm initialization: a linear regression model. Its predictions are
# continuous numbers, which is why they are thresholded at 0.5 after
# cross-validation to obtain 0/1 outcomes.
alg = LinearRegression()

In [66]:
# Build 3-fold cross-validation over the rows of the training frame. Iterating
# over `kf` yields (train, test) pairs of row positions.
# NOTE(review): the printed repr shows shuffle=False, so random_state=1 most
# likely has no effect on the split (it should only matter when shuffling) —
# confirm against the KFold documentation.
kf = KFold(n=len(titanic), n_folds=3, random_state=1)
print(kf)

sklearn.cross_validation.KFold(n=891, n_folds=3, shuffle=False, random_state=1)


In [67]:
# Accumulator for the cross-validation loop: one array of test-fold predictions
# is appended per fold.
predictions = []

In [68]:
# Start from an empty accumulator inside this cell so that re-running it does
# not append a second set of fold predictions on top of the first (and does not
# fail if `predictions` has already been turned into a numpy array).
predictions = []
for train, test in kf:
    # The predictors we're using to train the algorithm. Only the rows in the
    # train folds are taken.
    train_predictors = titanic[predictors].iloc[train, :]
    # The target we're using to train the algorithm, restricted to the same rows.
    train_target = titanic["Survived"].iloc[train]
    # Train the algorithm using the predictors and target.
    alg.fit(train_predictors, train_target)
    # Predict on the held-out test fold and keep the result for later scoring.
    test_predictions = alg.predict(titanic[predictors].iloc[test, :])
    predictions.append(test_predictions)

In [69]:
# Concatenate the per-fold prediction arrays (one separate numpy array per fold)
# along axis 0 into a single array.
# NOTE: this rebinds `predictions` from a list of arrays to one array, so the
# cross-validation loop has to be re-run before this cell can be run again.
predictions = np.concatenate(predictions, axis = 0)

In [70]:
# Map predictions to outcomes (the only possible outcomes are 1 and 0).
predictions[predictions > .5] = 1
predictions[predictions <= .5] = 0
# Accuracy is the fraction of rows whose predicted outcome equals the actual one.
# Average the boolean match vector directly: indexing `predictions` with the
# comparison and summing the result adds up the *values* of the matching
# predictions, which counts only the correct 1-predictions and ignores every
# correctly predicted 0.
matches = predictions == titanic["Survived"].values
accuracy = matches.mean()
print(accuracy)

0.785634118967




In [73]:
# Clean the test set the same way as the training set: fill in missing values,
# then convert the non-numeric columns to numeric codes.
titanic_test = pandas.read_csv("test.csv")

# NOTE(review): Age is filled with the mean taken from the training frame, while
# Fare is filled with the test frame's own median — confirm this asymmetry is
# intended.
test_fill_values = (
    ("Age", titanic["Age"].mean()),
    ("Fare", titanic_test["Fare"].median()),
    ("Embarked", "S"),
)
for column, value in test_fill_values:
    titanic_test[column] = titanic_test[column].fillna(value)

# Sex: male -> 0, female -> 1.  Embarked: S -> 0, C -> 1, Q -> 2.
test_encodings = (
    ("Sex", (("male", 0), ("female", 1))),
    ("Embarked", (("S", 0), ("C", 1), ("Q", 2))),
)
for column, codes in test_encodings:
    for label, code in codes:
        titanic_test.loc[titanic_test[column] == label, column] = code


In [74]:
# Train the algorithm using all of the training data.
alg.fit(titanic[predictors], titanic["Survived"])

# Make predictions on the test set. The regression returns continuous scores,
# so apply the same 0.5 cut-off used when scoring the cross-validation folds and
# store the result as integer 0/1 outcomes rather than raw floats.
scores = alg.predict(titanic_test[predictors])
predictions = (scores > .5).astype(int)

# Generate the submission file: one row per test passenger with its id and the
# predicted 0/1 outcome.
submission = pandas.DataFrame({
        "PassengerId": titanic_test["PassengerId"],
        "Survived": predictions
    })
submission.to_csv("kaggle.csv", index=False)

