# Titanic Prediction Attempt
## Geoff Pidcock | June 17th 2017

### Importing and wrangling data

In [108]:
# Import and wrangle data
import pandas as pd
titanic = pd.read_csv('train.csv',index_col='PassengerId')
# titanic.isnull().sum() # Age has 177 null values, Cabin 687, Embarked 2
titanic.columns.values

array(['Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'], dtype=object)

In [66]:
# Look at values and distributions of categorical features
# titanic.groupby('Pclass')['Name'].count() # values 1,2,3
# titanic.groupby('SibSp')['Name'].count() # values 0 to 8
# titanic.groupby('Parch')['Name'].count() # values 0 to 6
# titanic.groupby('Sex')['Name'].count() # values female and male
titanic.Embarked.unique()
# Age, Fare, continuous variables.
# titanic.Cabin.unique().shape #148 unique values for cabin. 687 nulls!

array(['S', 'C', 'Q', nan], dtype=object)

In [109]:
# Hypothesis - siblings and spouses, parents and children, gender, class,
# age should hold information towards survival.

# Assign dummies for Pclass, SibSp, Parch, Sex, Embarked

titanic_wdum = pd.get_dummies(data=titanic, columns = ['Sex', 'Embarked', 'Pclass', 'Parch', 'SibSp'], prefix = ['Sex', 'Embarked', 'Pclass', 'Parch', 'SibSp'] )
titanic_wdum.columns.values

array(['Survived', 'Name', 'Age', 'Ticket', 'Fare', 'Cabin', 'Sex_female',
       'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S', 'Pclass_1',
       'Pclass_2', 'Pclass_3', 'Parch_0', 'Parch_1', 'Parch_2', 'Parch_3',
       'Parch_4', 'Parch_5', 'Parch_6', 'SibSp_0', 'SibSp_1', 'SibSp_2',
       'SibSp_3', 'SibSp_4', 'SibSp_5', 'SibSp_8'], dtype=object)

In [110]:
# Fill 177 null values for Age

# titanic_wdum['Age'] = titanic_wdum[['Age','Parch_0','Parch_1','Parch_2','Parch_3','Parch_4','Parch_5','Sex_male','Pclass_1', 'Pclass_2','Embarked_C','Embarked_Q','SibSp_0','SibSp_1','SibSp_2','SibSp_3','SibSp_4','SibSp_5']].groupby(['Age','Parch_0','Parch_1','Parch_2','Parch_3','Parch_4','Parch_5','Sex_male','Pclass_1', 'Pclass_2','Embarked_C','Embarked_Q','SibSp_0','SibSp_1','SibSp_2','SibSp_3','SibSp_4','SibSp_5'])['Age'].transform(lambda x: x.fillna(x.mean()))
# That doesn't seem to work - too much grouping in applying the averages?
# Another alternative - just fill with global average?
titanic_wdum.Age.fillna(titanic.Age.mean(), inplace=True)
print(titanic_wdum.Age.isnull().sum(),titanic.Age.isnull().sum())

# If needed - can explore imputation.

0 177


### Feature Selection

In [None]:
# http://scikit-learn.org/stable/modules/feature_selection.html
# http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectFromModel.html#sklearn.feature_selection.SelectFromModel
# May need to do this in advance of model selection.

### Model Selection

In [113]:
# Trying Regularized logistic regression w/ CV
# http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegressionCV.html#sklearn.linear_model.LogisticRegressionCV

from sklearn.linear_model import LogisticRegressionCV
logregcv = LogisticRegressionCV(Cs = [1e-4,1e-2,1e0,1e2,1e4], cv = 5, penalty='l2')

### Model Training

In [115]:
feature_cols = ['Age', 'Sex_female', 'Embarked_C', 'Embarked_Q', 'Pclass_1', 'Pclass_2', 'Parch_0', 'Parch_1', 'Parch_2', 'Parch_3', 'Parch_4', 'Parch_5', 'SibSp_0', 'SibSp_1', 'SibSp_2', 'SibSp_3', 'SibSp_4', 'SibSp_5']
X = titanic_wdum[feature_cols]
y = titanic.Survived
# X.to_csv('titanicX_1.csv')
logregcv.fit(X,y)
print(logregcv.get_params,logregcv.predict_proba(X))


<bound method BaseEstimator.get_params of LogisticRegressionCV(Cs=[0.0001, 0.01, 1.0, 100.0, 10000.0],
           class_weight=None, cv=5, dual=False, fit_intercept=True,
           intercept_scaling=1.0, max_iter=100, multi_class='ovr',
           n_jobs=1, penalty='l2', random_state=None, refit=True,
           scoring=None, solver='lbfgs', tol=0.0001, verbose=0)> [[ 0.88453239  0.11546761]
 [ 0.0700559   0.9299441 ]
 [ 0.40180375  0.59819625]
 ..., 
 [ 0.38842028  0.61157972]
 [ 0.43382357  0.56617643]
 [ 0.88676966  0.11323034]]


In [126]:
# No idea which features are performing the best in this prediction.
# probs=logregcv.predict_proba(X) # sample level probabilities
# probs.shape
feature_coef=zip(feature_cols,logregcv.coef_[0])
list(feature_coef)

[('Age', -0.037615544514703961),
 ('Sex_female', 2.677718059360441),
 ('Embarked_C', 0.33836153468423374),
 ('Embarked_Q', 0.44729514323719277),
 ('Pclass_1', 2.2076692107010008),
 ('Pclass_2', 1.0887160326083032),
 ('Parch_0', 1.0068928576521314),
 ('Parch_1', 1.419981539598175),
 ('Parch_2', 1.1088077162791075),
 ('Parch_3', 1.3239120602565499),
 ('Parch_4', -2.8469557722627417),
 ('Parch_5', -0.095248345247983424),
 ('SibSp_0', 2.4452779448352091),
 ('SibSp_1', 2.5385096820034505),
 ('SibSp_2', 2.1777766056498957),
 ('SibSp_3', 0.3059722653788029),
 ('SibSp_4', 0.69312049871183767),
 ('SibSp_5', -2.1033212920774571)]

In [127]:
logregcv.score(X,y)
# Aaargh - 81% accuracy

0.81481481481481477

### Model Evaluation

In [128]:
# Prepare test data
titanictest = pd.read_csv('test.csv',index_col='PassengerId')
titanictest_wdum = pd.get_dummies(data=titanictest, columns = ['Sex', 'Embarked', 'Pclass', 'Parch', 'SibSp'], prefix = ['Sex', 'Embarked', 'Pclass', 'Parch', 'SibSp'] )
titanictest_wdum.Age.fillna(titanictest.Age.mean(), inplace=True)
feature_cols = ['Age', 'Sex_female', 'Embarked_C', 'Embarked_Q', 'Pclass_1', 'Pclass_2', 'Parch_0', 'Parch_1', 'Parch_2', 'Parch_3', 'Parch_4', 'Parch_5', 'SibSp_0', 'SibSp_1', 'SibSp_2', 'SibSp_3', 'SibSp_4', 'SibSp_5']
Xpred = titanictest_wdum[feature_cols]
titanictest_wdum['Survived'] = logregcv.predict(Xpred)
# ypred.to_csv('test_pred.csv')

In [130]:
# Export predictions as a csv
titanictest_wdum['Survived'].to_csv('testpred.csv',header=True,index=True)

**Submit to Kaggle and review according to the score**
https://www.kaggle.com/c/titanic/leaderboard
## Result: 0.77033

In [None]:
# Logistic Regression reference - from lab 4
# Select feature columns
feature_cols = ['Pclass', 'Parch', 'Age']
X = titanic[feature_cols]
# Training/testing data split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
zip(feature_cols, logreg.coef_[0])
y_pred_class = logreg.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred_class))


### Reference

In [None]:
"""
Other models to look at:
Logistic Regression, 
Support Vector Machines, 
Neural Networks, or 
Decision Tree-based methods such as Gradient Boosted Decision Trees (GBDT)
"""



In [None]:
# Accuracy Evaluation reference
y_pred_class = logreg.predict(X_test)
from sklearn import metrics
print(metrics.accuracy_score(y_test, y_pred_class))

# Confusion Matrix
from sklearn import metrics
prds = logreg.predict(X)
print(metrics.confusion_matrix(y_test, y_pred_class))

In [None]:
# Cross Validation reference - Week 3 Lab_B
from sklearn import cross_validation
X = bikeshare[['temp', 'hum']].join(weather[['weather_1', 'weather_2', 'weather_3']])
y = bikeshare.casual 
kf = cross_validation.KFold(len(X), n_folds=5, shuffle=True)
mse_values = []
scores = []
n= 0
print("~~~~ CROSS VALIDATION each fold ~~~~")
for train_index, test_index in kf:
    lm = linear_model.LinearRegression().fit(modeldata.iloc[train_index], y.iloc[train_index])
    mse_values.append(metrics.mean_squared_error(y.iloc[test_index], lm.predict(modeldata.iloc[test_index])))
    scores.append(lm.score(modeldata, y))
    n+=1
    print ('Model', n)
    print ('MSE:', mse_values[n-1])
    print ('R2:', scores[n-1])


print ("~~~~ SUMMARY OF CROSS VALIDATION ~~~~")
print ('Mean of MSE for all folds:', np.mean(mse_values))
print ('Mean of R2 for all folds:', np.mean(scores))

In [None]:
# Regularization reference - week 4 lab 1
# Setting up a CV version of the elastic net model
from sklearn.linear_model import ElasticNetCV
# Specifying parameter ranges
alpha_rangeEN = 10.**np.arange(-3, 3)
# l1_ratio_rangeEN = 10.**np.arange(-6,1) 
l1_ratio_rangeEN = [.1, .5, .7, .9, .95, .99, 1] # provides a better result.
enetcv = ElasticNetCV(alphas=alpha_rangeEN,l1_ratio=l1_ratio_rangeEN)
print(enetcv.get_params())
enetcv.fit(X_train,y_train)
print('Optimal Alpha Value: ',enetcv.alpha_)
print('Optimal l1_ratio Value: ',enetcv.l1_ratio_)
preds3 = enetcv.predict(X_test)
print('RMSE (enetcv reg.) =', np.sqrt(metrics.mean_squared_error(y_test, preds3)))