In [2]:
import pandas
from sklearn.linear_model import LogisticRegression
from sklearn import cross_validation

In [3]:
# Load the training set, then show its first five rows and summary statistics.
titanic = pandas.read_csv("train.csv")
print(titanic.head(n=5))
print(titanic.describe())

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex  Age  SibSp  \
0                            Braund, Mr. Owen Harris    male   22      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female   38      1   
2                             Heikkinen, Miss. Laina  female   26      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female   35      1   
4                           Allen, Mr. William Henry    male   35      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
       P

In [4]:
# Fill the gaps in the Age column with the column's mean value.
titanic["Age"] = titanic["Age"].fillna(
    value=titanic["Age"].mean(),
)


In [5]:
# Encode "Sex" as integers: male -> 0, female -> 1.
# The lookup passes values that are not keys (e.g. already-encoded 0/1) through
# unchanged, so re-running this cell is harmless. astype(int) gives the column
# a real integer dtype instead of leaving it as object, and raises if a value
# that cannot be converted to an integer (such as NaN) is left over.
sex_codes = {"male": 0, "female": 1}
titanic["Sex"] = (
    titanic["Sex"]
    .map(lambda label: sex_codes.get(label, label))
    .astype(int)
)

In [6]:
# List the distinct values of "Embarked"; a nan in the output signals missing entries.
embarked_values = titanic["Embarked"].unique()
print(embarked_values)


['S' 'C' 'Q' nan]


In [7]:
# Fill missing "Embarked" entries with "S", then convert the port letters to
# integers: S -> 0, C -> 1, Q -> 2.
# The lookup passes values that are not keys (e.g. already-encoded 0/1/2)
# through unchanged, so re-running this cell is harmless. astype(int) gives the
# column a real integer dtype instead of leaving it as object, and raises if a
# value that cannot be converted to an integer is left over.
embarked_codes = {"S": 0, "C": 1, "Q": 2}
titanic["Embarked"] = (
    titanic["Embarked"]
    .fillna("S")
    .map(lambda port: embarked_codes.get(port, port))
    .astype(int)
)

In [8]:
# Feature columns fed to the model.
predictors = [
    "Pclass",
    "Sex",
    "Age",
    "SibSp",
    "Parch",
    "Fare",
    "Embarked",
]


In [9]:
# Create the logistic-regression classifier; random_state is pinned to 1.
alg = LogisticRegression(random_state=1)

In [11]:
# Score the classifier with 3-fold cross-validation on the training data,
# then report the average of the three fold scores.
# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20;
# on newer releases cross_val_score is provided by sklearn.model_selection.
scores = cross_validation.cross_val_score(
    alg,
    titanic[predictors],
    titanic["Survived"],
    cv=3,
)
print(scores.mean())

0.79012345679


In [12]:
# Load the test set, then show its first five rows and the distinct Fare values.
titanic_test = pandas.read_csv("test.csv")
print(titanic_test.head(n=5))
fare_values = titanic_test["Fare"].unique()
print(fare_values)

   PassengerId  Pclass                                          Name     Sex  \
0          892       3                              Kelly, Mr. James    male   
1          893       3              Wilkes, Mrs. James (Ellen Needs)  female   
2          894       2                     Myles, Mr. Thomas Francis    male   
3          895       3                              Wirz, Mr. Albert    male   
4          896       3  Hirvonen, Mrs. Alexander (Helga E Lindqvist)  female   

    Age  SibSp  Parch   Ticket     Fare Cabin Embarked  
0  34.5      0      0   330911   7.8292   NaN        Q  
1  47.0      1      0   363272   7.0000   NaN        S  
2  62.0      0      0   240276   9.6875   NaN        Q  
3  27.0      0      0   315154   8.6625   NaN        S  
4  22.0      1      1  3101298  12.2875   NaN        S  
[   7.8292    7.        9.6875    8.6625   12.2875    9.225     7.6292
   29.        7.2292   24.15      7.8958   26.       82.2667   61.175
   27.7208   12.35      7.225     7.

In [13]:
# Clean the test set the same way as the training set: fill missing values and
# replace the non-numeric Sex / Embarked labels with integer codes.
# Missing Age and Fare are both filled from training-set statistics, so no
# information is taken from the test set when imputing.
titanic_test["Age"] = titanic_test["Age"].fillna(titanic["Age"].mean())
titanic_test["Fare"] = titanic_test["Fare"].fillna(titanic["Fare"].median())

# Same codes as used for the training data: male -> 0, female -> 1 and
# S -> 0, C -> 1, Q -> 2. Values that are not keys pass through the lookup
# unchanged (so the cell can be re-run), and astype(int) yields a real integer
# dtype and raises if a value that cannot be converted is left over.
sex_codes = {"male": 0, "female": 1}
embarked_codes = {"S": 0, "C": 1, "Q": 2}
titanic_test["Sex"] = (
    titanic_test["Sex"]
    .map(lambda label: sex_codes.get(label, label))
    .astype(int)
)
titanic_test["Embarked"] = (
    titanic_test["Embarked"]
    .fillna("S")
    .map(lambda port: embarked_codes.get(port, port))
    .astype(int)
)

In [14]:
# Fit the classifier on the complete training set.
alg.fit(X=titanic[predictors], y=titanic["Survived"])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [15]:
# Predict survival for every row of the test set.
predictions = alg.predict(X=titanic_test[predictors])

In [16]:
# Pair each test passenger id with its prediction and write the result
# to kaggle.csv without the index column.
submission = pandas.DataFrame(
    data={
        "PassengerId": titanic_test["PassengerId"],
        "Survived": predictions,
    }
)
submission.to_csv("kaggle.csv", index=False)