# Titanic: Machine Learning from Disaster

In [3]:
import pandas as pd

In [43]:
# Read the Titanic training set from disk and preview its first five rows.
train_csv_path = "C:/Users/Jyotsana/Desktop/Interview Prep/Data Science/MLcompetitions/titanic_kaggle/Dataset/train.csv"
titanic = pd.read_csv(train_csv_path)
print(titanic.head(5))

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


In [50]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns; a count below the number of rows signals missing values.
stats_summary = titanic.describe()
print(stats_summary)

       PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  891.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.361582    0.523008   
std     257.353842    0.486592    0.836071   13.019697    1.102743   
min       1.000000    0.000000    1.000000    0.420000    0.000000   
25%     223.500000    0.000000    2.000000   22.000000    0.000000   
50%     446.000000    0.000000    3.000000   28.000000    0.000000   
75%     668.500000    1.000000    3.000000   35.000000    1.000000   
max     891.000000    1.000000    3.000000   80.000000    8.000000   

            Parch        Fare  
count  891.000000  891.000000  
mean     0.381594   32.204208  
std      0.806057   49.693429  
min      0.000000    0.000000  
25%      0.000000    7.910400  
50%      0.000000   14.454200  
75%      0.000000   31.000000  
max      6.000000  512.329200  


In [45]:
# The describe() count for Age reveals missing entries.
# Fill those gaps with the median of the Age column, then re-check the column.
age_median = titanic["Age"].median()
titanic["Age"] = titanic["Age"].fillna(age_median)
print(titanic["Age"].describe())

count    891.000000
mean      29.361582
std       13.019697
min        0.420000
25%       22.000000
50%       28.000000
75%       35.000000
max       80.000000
Name: Age, dtype: float64


In [47]:
# The learning algorithms need numeric inputs, so the text column Sex is recoded.
# Show the distinct labels first, then map male -> 0 and female -> 1.
print(titanic["Sex"].unique())
for sex_label, sex_code in (("male", 0), ("female", 1)):
    titanic.loc[titanic["Sex"] == sex_label, "Sex"] = sex_code


['male' 'female']


In [49]:
# Embarked gets the same treatment as Sex: inspect its labels and their counts.
print(titanic["Embarked"].unique())
print(titanic["Embarked"].value_counts())

# 'S' is the most frequent value (the mode), so it stands in for missing entries.
most_common_port = 'S'
titanic["Embarked"] = titanic["Embarked"].fillna(most_common_port)

# Recode each category as an integer: S -> 0, C -> 1, Q -> 2.
for port_label, port_code in (('S', 0), ('C', 1), ('Q', 2)):
    titanic.loc[titanic["Embarked"] == port_label, "Embarked"] = port_code



['S' 'C' 'Q' nan]
S    644
C    168
Q     77
Name: Embarked, dtype: int64


In [52]:
# Linear regression evaluated with 3-fold cross-validation.
# NOTE: sklearn.cross_validation was removed in scikit-learn 0.20. On newer
# releases the equivalent helper lives in sklearn.model_selection and has a
# different call signature (KFold(n_splits=3).split(titanic)).
import numpy as np
# Import the linear regression class
from sklearn.linear_model import LinearRegression
# Sklearn also has a helper that makes it easy to do cross-validation
from sklearn.cross_validation import KFold

# The columns we'll use to predict the target
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

# Initialize our algorithm class
alg = LinearRegression()
# Generate cross-validation folds for the titanic data set.
# Each fold is a (train, test) pair of row-position arrays.
# Shuffling is left at its default (off), so the folds are consecutive slices
# in row order and are identical on every run; per the scikit-learn docs,
# random_state only takes effect when shuffle=True.
kf = KFold(titanic.shape[0], n_folds=3, random_state=1)
predictions = []
for train, test in kf:
    # The predictors we're using to train the algorithm
    # Note how we only take the rows in the train folds
    train_predictors = (titanic[predictors].iloc[train,:])
    # The target we're using to train the algorithm
    train_target = titanic["Survived"].iloc[train]
    # Training the algorithm using the predictors and target
    alg.fit(train_predictors, train_target)
    # We can now make predictions on the test fold
    test_predictions = alg.predict(titanic[predictors].iloc[test,:])
    predictions.append(test_predictions)


# Evaluation metric: accuracy, i.e. the fraction of passengers predicted correctly.
# First concatenate the per-fold predictions into a single array. Because the
# folds are unshuffled, this array is in the same order as the rows of titanic.
predictions = np.concatenate(predictions, axis=0)
# Map predictions to outcomes (the only possible outcomes are 1 and 0)
predictions[predictions > .5] = 1
predictions[predictions <=.5] = 0

# Count every match, whether the predicted class is 0 or 1. Summing the matched
# prediction *values* (as the previous version did) only counts correctly
# predicted survivors, because correct 0 predictions contribute nothing.
# Comparing against .values keeps the comparison purely positional.
accuracy = np.mean(predictions == titanic["Survived"].values)
print (accuracy)

0.783389450056




In [54]:
# Logistic regression, scored with scikit-learn's cross-validation helper.
from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression

# Set up the classifier.
alg = LogisticRegression(random_state=1)
# cross_val_score returns one score per fold (cv=3), which replaces the
# manual fold loop used for linear regression.
scores = cross_validation.cross_val_score(
    alg,
    titanic[predictors],
    titanic["Survived"],
    cv=3
)
# Average the per-fold scores into a single figure.
print(scores.mean())

0.787878787879


In [56]:
# Load the test set and apply the same cleaning steps used on the training data.
test_csv_path = "C:/Users/Jyotsana/Desktop/Interview Prep/Data Science/MLcompetitions/titanic_kaggle/Dataset/test.csv"
titanic_test = pd.read_csv(test_csv_path)
# Summary statistics of the numeric columns; the counts expose missing values.
print(titanic_test.describe())

# Age has fewer non-null values than rows, so it has gaps.
# Fill them with the median Age of the training data (not of the test data).
train_age_median = titanic["Age"].median()
titanic_test["Age"] = titanic_test["Age"].fillna(train_age_median)
print(titanic_test["Age"].describe())

# Recode the text column Sex as numbers for the learning algorithm.
# Show the distinct labels first, then map male -> 0 and female -> 1.
print(titanic_test["Sex"].unique())
for sex_label, sex_code in (("male", 0), ("female", 1)):
    titanic_test.loc[titanic_test["Sex"] == sex_label, "Sex"] = sex_code


# Embarked: fill missing entries with 'S', the most frequent value in the
# training data, then recode each category as S -> 0, C -> 1, Q -> 2.
titanic_test["Embarked"] = titanic_test["Embarked"].fillna('S')
for port_label, port_code in (('S', 0), ('C', 1), ('Q', 2)):
    titanic_test.loc[titanic_test["Embarked"] == port_label, "Embarked"] = port_code

# Fare's count is one short of the row count, so it has a single missing value.
# Replace it with the median Fare of the test set itself.
# The test set can need cleaning steps that the training data did not.
test_fare_median = titanic_test["Fare"].median()
titanic_test["Fare"] = titanic_test["Fare"].fillna(test_fare_median)




       PassengerId      Pclass         Age       SibSp       Parch        Fare
count   418.000000  418.000000  332.000000  418.000000  418.000000  417.000000
mean   1100.500000    2.265550   30.272590    0.447368    0.392344   35.627188
std     120.810458    0.841838   14.181209    0.896760    0.981429   55.907576
min     892.000000    1.000000    0.170000    0.000000    0.000000    0.000000
25%     996.250000    1.000000         NaN    0.000000    0.000000         NaN
50%    1100.500000    3.000000         NaN    0.000000    0.000000         NaN
75%    1204.750000    3.000000         NaN    1.000000    0.000000         NaN
max    1309.000000    3.000000   76.000000    8.000000    9.000000  512.329200
count    418.000000
mean      29.805024
std       12.667969
min        0.170000
25%       23.000000
50%       28.000000
75%       35.750000
max       76.000000
Name: Age, dtype: float64
['male' 'female']


In [None]:
# Create the submission file with logistic regression, since it gave the
# better cross-validated accuracy of the two models tried above.
# Initialize the algorithm class
from sklearn.linear_model import LogisticRegression
alg = LogisticRegression(random_state=1)

# Train the algorithm using all the training data
alg.fit(titanic[predictors], titanic["Survived"])

# Make predictions using the test set
predictions = alg.predict(titanic_test[predictors])

# Create a new dataframe with only the columns Kaggle wants from the data set
submission = pd.DataFrame({
        "PassengerId": titanic_test["PassengerId"],
        "Survived": predictions
    })

# Call form of print: valid on both Python 2 and Python 3, and consistent
# with the other cells (the bare `print x` statement is a SyntaxError on 3).
print(submission.head(5))

# index=False keeps the DataFrame's row index out of the CSV, so the file
# contains exactly the PassengerId and Survived columns.
submission.to_csv("C:/Users/Jyotsana/Desktop/Interview Prep/Data Science/MLcompetitions/titanic_kaggle/Dataset/submission_LR.csv", index=False)

This concludes my first submission, which achieved an accuracy of 75%. Next, I will try to improve this model.