In [281]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn import svm, datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
from pprint import pprint  # pretty print

In [282]:
# import cleaned datasets: titanic_train, titanic_predict
titanic_train = pd.read_csv("C:\\Users\\Lillian\\Documents\\bootcamp\\titanic_train_clean.csv", index_col = 'PassengerId')
titanic_predict = pd.read_csv("C:\\Users\\Lillian\\Documents\\bootcamp\\titanic_predict_clean.csv", index_col = 'PassengerId')

In [283]:
# create dummies for categorical train data
Sex_dummy = pd.get_dummies(titanic_train["Sex"])
Embarked_dummy = pd.get_dummies(titanic_train["Embarked"], prefix="Embarked")
titanic_train_linear = titanic_train.join([Sex_dummy, Embarked_dummy])
titanic_train_linear = titanic_train_linear[['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'female', 'male', 'Embarked_C', 'Embarked_Q', 'Embarked_S']]
titanic_train_linear.head(2)

Unnamed: 0_level_0,Survived,Pclass,Age,SibSp,Parch,Fare,female,male,Embarked_C,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,22,1,0,7.25,0,1,0,0,1
2,1,1,38,1,0,71.2833,1,0,1,0,0


In [284]:
# create dummies for categorical predict data (do for Pclass?)
Sex_dummy = pd.get_dummies(titanic_predict["Sex"])
Embarked_dummy = pd.get_dummies(titanic_predict["Embarked"], prefix="Embarked")
titanic_predict_linear = titanic_predict.join([Sex_dummy, Embarked_dummy])
titanic_predict_linear = titanic_predict_linear[['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'female', 'male', 'Embarked_C', 'Embarked_Q', 'Embarked_S']]
titanic_predict_linear.head(2)

Unnamed: 0_level_0,Pclass,Age,SibSp,Parch,Fare,female,male,Embarked_C,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
892,3,34,0,0,7.8292,0,1,0,1,0
893,3,47,1,0,7.0,1,0,0,0,1


In [285]:
# X = features, y = labels
X = titanic_train.drop(['Survived'], 1)
y = titanic_train['Survived']
X_linear = titanic_train_linear.drop(['Survived'], 1)
y_linear = titanic_train_linear['Survived']

In [286]:
# split data into test and train, set seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_linear, X_test_linear, y_train_linear, y_test_linear = train_test_split(X_linear, y_linear, test_size=0.2, random_state=42)

In [287]:
# create subsets
X_train_linear2 = X_train_linear[["Pclass", "Age", "Fare", "female"]]
X_test_linear2 = X_test_linear[["Pclass", "Age", "Fare", "female"]]

X_train_linear3 = X_train_linear[["Age", "Fare", "female"]]
X_test_linear3 = X_test_linear[["Age", "Fare", "female"]]

X_train_linear4 = X_train_linear[["Pclass", "Age", "female"]]
X_test_linear4 = X_test_linear[["Pclass", "Age", "female"]]

X_train_linear5 = X_train_linear[["Age", "female"]]
X_test_linear5 = X_test_linear[["Age", "female"]]


In [288]:
print("X_train:", X_train.shape)
print("X_test:", X_test.shape)
print("y_train:", y_train.shape)
print("y_test:", y_test.shape)
print("X_train_linear:", X_train_linear.shape)
print("X_test_linear:", X_test_linear.shape)
print("y_train_linear:", y_train_linear.shape)
print("y_test_linear:", y_test_linear.shape)

X_train: (711, 7)
X_test: (178, 7)
y_train: (711,)
y_test: (178,)
X_train_linear: (711, 10)
X_test_linear: (178, 10)
y_train_linear: (711,)
y_test_linear: (178,)


In [289]:
# run logistic regresion
clf_log =LogisticRegression()  # regularization doesn't change answer
clf_log.fit(X_train_linear, y_train_linear)  # train classifier on train dataset
print("train score:", clf_log.score(X_train_linear, y_train_linear))  # 80.73% of train was correctly predicted
print("test score :", clf_log.score(X_test_linear, y_test_linear))  # 80.34% of test was correctly predicted

train score: 0.8073136427566807
test score : 0.8033707865168539


In [290]:
# Pclass, Age, Fare, female
clf_log2 = LogisticRegression(C=100)  # with regularization
clf_log2.fit(X_train_linear2, y_train_linear)  # train classifier on train dataset
print("train score:", clf_log2.score(X_train_linear2, y_train_linear))  # 79.89% of train was correctly predicted
print("test score :", clf_log2.score(X_test_linear2, y_test_linear))  # 81.46% of test was correctly predicted

train score: 0.7988748241912799
test score : 0.8146067415730337


In [291]:
# Age, Fare, female
clf_log3 = LogisticRegression() # regularization doesn't change answer
clf_log3.fit(X_train_linear3, y_train_linear)  # train classifier on train dataset
print("train score:", clf_log3.score(X_train_linear3, y_train_linear))  # 77.77% of train was correctly predicted
print("test score :", clf_log3.score(X_test_linear3, y_test_linear))  # 80.34% of test was correctly predicted

train score: 0.7777777777777778
test score : 0.8033707865168539


In [292]:
# Pclass, Age, female
clf_log4 = LogisticRegression(C=1000)  # with regularization
clf_log4.fit(X_train_linear4, y_train_linear)  # train classifier on train dataset
print("train score:", clf_log4.score(X_train_linear4, y_train_linear))  # 79.89% of train was correctly predicted
print("test score :", clf_log4.score(X_test_linear4, y_test_linear))  # 80.90% of test was correctly predicted

train score: 0.7988748241912799
test score : 0.8089887640449438


In [293]:
# Age, female
clf_log5 = LogisticRegression()  # regularization doesn't change answer
clf_log5.fit(X_train_linear5, y_train_linear)  # train classifier on train dataset
print("train score:", clf_log5.score(X_train_linear5, y_train_linear))  # 78.34% of train was correctly predicted
print("test score :", clf_log5.score(X_test_linear5, y_test_linear))  # 79.77% of test was correctly predicted

train score: 0.7834036568213784
test score : 0.797752808988764


In [294]:
# run previous results with cross-validation
clf_log_cv = LogisticRegression()
grid = GridSearchCV(clf_log_cv, 
                    param_grid = {'C': [1, 10, 100, 1000] }, 
                    cv = 5, # tested: 5, 10
                    n_jobs = 1,
                    refit=True)
grid.fit(X_train_linear, y_train_linear)
print("best train score:", grid.best_score_, grid.best_estimator_) 
# X_train_linear - 80.73% no improvement  #1
# X_train_linear2 - 80.45% better results with cv = 10 (C = 100 or 1000)
# X_train_linear3 - no improvement
# X_train_linear4 - 80.39% better results with cv = 5 (C = 100 or 1000)  
# X_train_linear5 - no improvement

best train score: 0.8073136427566807 LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)


In [295]:
# use lasso regularization
clf_log_lasso = LogisticRegression(penalty = 'l1') #l1 = lasso
grid = GridSearchCV(clf_log_lasso, 
                    param_grid = {'C': [1, 10, 100, 1000] }, 
                    cv=None, 
                    refit=True)
grid.fit(X_train_linear5, y_train_linear)
print("best train score:", grid.best_score_, grid.best_estimator_) 
# X_train_linear - no improvement, X_train_linear2,3,4 - worse results

best train score: 0.7834036568213784 LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)


In [296]:
# make logistic regression prediction
titanic_predict_linear["Survived"] = clf_log.predict(titanic_predict_linear)
titanic_predict_linear[['Survived']].to_csv('kaggle_titanic_log.csv')
titanic_predict_linear = titanic_predict_linear.drop(['Survived'], 1)

In [268]:
# run SVM - all data, hyperparameter search
clf_svm = svm.SVC() 
grid = GridSearchCV(clf_svm,
                    param_grid={'C': [10, 1.0, 0.1], 'kernel':['linear', 'rbf']},
                    scoring='accuracy',
                    n_jobs=1,
                    cv=None
                   )

grid.fit(X_train_linear5, y_train_linear)  # train classifier on train dataset using multiple hyperparams
print("best train score:", grid.best_score_, grid.best_estimator_)
# X_train_linear: 78.76%: best score with 'C': 10, 'kernel': 'linear', cv = None
# X_train_linear2: 78.34%: best score with 'C': 10, 'kernel': 'linear', cv = None
# X_train_linear3: 78.34%: best score with 'C': 10, 'kernel': 'linear', cv = None
# X_train_linear4: 78.34%: best score with 'C': 10, 'kernel': 'linear', cv = None
# X_train_linear5: 78.34%: best score with 'C': 10, 'kernel': 'linear', cv = None

best train score: 0.7834036568213784 SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)


In [269]:
# run SVM - all data, test score based on above "best results"
clf_svm = svm.SVC(C = 10, kernel = 'linear')  
clf_svm.fit(X_train_linear, y_train_linear)  # train classifier on train dataset
print("train score:", clf_svm.score(X_train_linear, y_train_linear))  # 78.9% of train was correctly predicted
print("test score :", clf_svm.score(X_test_linear, y_test_linear))  # 80.34% of test was correctly predicted

train score: 0.7890295358649789
test score : 0.8033707865168539


In [270]:
# tweaking SVM manually: Pclass, Age, female
clf_svm4 = svm.SVC(C=2, kernel= 'rbf')
clf_svm4.fit(X_train_linear4, y_train_linear)  # train classifier on train dataset
print("train score:", clf_svm4.score(X_train_linear4, y_train_linear))  # 85.65% of train was correctly predicted
print("test score :", clf_svm4.score(X_test_linear4, y_test_linear))  # 83.15% of test was correctly predicted

train score: 0.8565400843881856
test score : 0.8314606741573034


In [271]:
titanic_predict_linear4 = titanic_predict_linear[["Pclass", "Age", "female"]]
titanic_predict_linear4["Survived"] = clf_svm4.predict(titanic_predict_linear4)
titanic_predict_linear4[['Survived']].to_csv('kaggle_titanic_svm.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [272]:
# do cross-validation
accuracy = cross_val_score(clf_log4, X_train_linear4, y_train_linear, cv=30, scoring='accuracy')
print("clf_log4 scores", accuracy) 
accuracy = cross_val_score(clf_svm4, X_train_linear4, y_train_linear, cv=30, scoring='accuracy')
print("clf_svm4 scores", accuracy) 

clf_log4 scores [0.68       0.875      0.75       0.83333333 0.95833333 0.83333333
 0.79166667 0.70833333 0.83333333 0.91666667 0.875      0.875
 0.75       0.83333333 0.75       0.83333333 0.95833333 0.79166667
 0.70833333 0.66666667 0.73913043 0.82608696 0.7826087  0.86956522
 0.60869565 0.86956522 0.7826087  0.73913043 0.82608696 0.69565217]
clf_svm4 scores [0.72       0.91666667 0.79166667 0.66666667 0.91666667 0.83333333
 0.75       0.79166667 0.91666667 0.91666667 0.95833333 0.79166667
 0.75       0.70833333 0.70833333 0.79166667 1.         0.75
 0.75       0.83333333 0.73913043 0.7826087  0.73913043 0.86956522
 0.73913043 0.82608696 0.82608696 0.7826087  0.7826087  0.7826087 ]


In [273]:
# run decision tree model - all data
clv_DT = DecisionTreeClassifier(max_depth=4)
clv_DT.fit(X_train_linear, y_train_linear)
print("train score:", clv_DT.score(X_train_linear, y_train_linear))  # 84.53% of train was correctly predicted
print("test score :", clv_DT.score(X_test_linear, y_test_linear))  # 82.58% of test was correctly predicted

train score: 0.8452883263009845
test score : 0.8202247191011236


In [274]:
# run decision tree model - Pclass, Age, female
clv_DT4 = DecisionTreeClassifier(max_depth=3)
clv_DT4.fit(X_train_linear4, y_train_linear)
print("train score:", clv_DT4.score(X_train_linear4, y_train_linear))  # 81.43% of train was correctly predicted
print("test score :", clv_DT4.score(X_test_linear4, y_test_linear))  # 81.43% of test was correctly predicted

train score: 0.8143459915611815
test score : 0.8146067415730337


In [275]:
# run decision tree model - Pclass, Fare, Age, female
clv_DT2 = DecisionTreeClassifier(max_depth=3)
clv_DT2.fit(X_train_linear2, y_train_linear)
print("train score:", clv_DT2.score(X_train_linear2, y_train_linear))  # 82.84% of train was correctly predicted
print("test score :", clv_DT2.score(X_test_linear2, y_test_linear))  # 82.02% of test was correctly predicted

train score: 0.8284106891701828
test score : 0.8202247191011236


In [280]:
# make decision tree prediction
titanic_predict_linear["Survived"] = clv_DT.predict(titanic_predict_linear)
titanic_predict_linear[['Survived']].to_csv('kaggle_titanic_DT.csv')
titanic_predict_linear = titanic_predict_linear.drop(['Survived'], 1)

In [277]:
# medium
# bootstrapping
# make predictions for new data from Kaggle: log_predict = clf_log.predict(titanic_predict_linear)
# submit the final prediction to Kaggle
# Use git to publish your code on GitHub
# complete the Linux command line tutorial

In [278]:
# hard
# build a dummy model that always predicts “does not survive”
# implement a scikit-learn Estimator class
# create a bar plot with percentage values on top
# implement Logistic Regression from scratch

In [279]:
titanic_predict_linear.head()

Unnamed: 0_level_0,Pclass,Age,SibSp,Parch,Fare,female,male,Embarked_C,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
892,3,34,0,0,7.8292,0,1,0,1,0
893,3,47,1,0,7.0,1,0,0,0,1
894,2,62,0,0,9.6875,0,1,0,1,0
895,3,27,0,0,8.6625,0,1,0,0,1
896,3,22,1,1,12.2875,1,0,0,0,1
