In [22]:
# Import Dependencies
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import datasets
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
# NOTE(review): plot_roc_curve was removed in scikit-learn 1.2 (replaced by
# RocCurveDisplay); it is not used in the cells below - confirm before upgrading.
from sklearn.metrics import plot_roc_curve
from sklearn.model_selection import train_test_split


## Titanic
Keywords: multivariate, classification.

## Description
The sinking of the Titanic is one of the most infamous shipwrecks in history.

On April 15, 1912, during her maiden voyage, the widely considered “unsinkable” RMS Titanic sank after colliding with an iceberg. Unfortunately, there weren’t enough lifeboats for everyone onboard, resulting in the death of 1502 out of 2224 passengers and crew.

While there was some element of luck involved in surviving, it seems some groups of people were more likely to survive than others.

In this skill drill, you are being asked to build a predictive model that answers the question: “who is more likely to survive?” using passenger data (e.g., age, gender, socio-economic class). The variables included in the dataset are the following:

- PassengerId: Passenger identifier
- Pclass: Ticket class - 1 = 1st; 2 = 2nd; 3 = 3rd;
- Sex: 0 = Female & 1 = Male
- Age: age in years
- SibSp: # of siblings / spouses aboard the Titanic
- Parch: # of parents / children aboard the Titanic
- Fare: Passenger fare cost
- Embarked: Port of Embarkation - 0 = Cherbourg; 1 = Queenstown; 2 = Southampton;
- Survived: whether during the shipwreck the individual survived - 0 = did not survive; 1 = survived; (the predicted attribute)

## Source
https://www.kaggle.com/c/titanic/data

In [4]:
# Read the csv file into a pandas DataFrame.
# The path is relative, so it resolves against the kernel's working directory.
titanic = pd.read_csv('../Resources/titanic.csv')
# Bare last-line expression: renders the frame as the cell output.
titanic

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,1,22.0,1,0,7.2500,2
1,2,1,1,0,38.0,1,0,71.2833,0
2,3,1,3,0,26.0,0,0,7.9250,2
3,4,1,1,0,35.0,1,0,53.1000,2
4,5,0,3,1,35.0,0,0,8.0500,2
...,...,...,...,...,...,...,...,...,...
709,886,0,3,0,39.0,0,5,29.1250,1
710,887,0,2,1,27.0,0,0,13.0000,2
711,888,1,1,0,19.0,0,0,30.0000,2
712,890,1,1,1,26.0,0,0,30.0000,0


### Linear Regression

In [54]:
# Select our independent X variables, and our dependent y variable.
# Age is the regression target, so it is dropped from the feature matrix;
# every remaining column (including Survived) is kept as a predictor.
X = titanic.drop(columns=['Age'])
y = titanic['Age']

In [55]:
# Confirm independent variable data types are numeric:
# list the feature column names, then the distinct dtypes found across them.
print(X.columns)
print(X.dtypes.unique())

Index(['PassengerId', 'Survived', 'Pclass', 'Sex', 'SibSp', 'Parch', 'Fare',
       'Embarked'],
      dtype='object')
[dtype('int64') dtype('float64')]


In [56]:
# Split features and target into training and testing sets.
# 20% of the rows are held out for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [57]:
# Create the Linear Regression model object (default settings; fitted in the next cell)
model = LinearRegression()

In [58]:
# Fit the regression on the training split only; the fitted estimator is
# returned and shown as the cell output.
model.fit(X=X_train, y=y_train)

LinearRegression()

In [59]:
# Make predictions using the testing dataset
y_pred = model.predict(X_test)

# Report the fitted parameters: one weight per feature column, plus the intercept
print('Weight coefficients: ', model.coef_)
print('y-axis intercept: ', model.intercept_)

Weight coefficients:  [ 7.73910230e-04 -8.02428504e+00 -8.44199938e+00 -1.39317253e-01
 -4.03020350e+00 -6.56512446e-01 -1.36112533e-02  5.99260520e-01]
y-axis intercept:  53.57505996990342


In [60]:
# Score the model with the testing dataset.
# For a scikit-learn regressor, score() returns the coefficient of determination (R^2).
model.score(X_test, y_test)

0.2325344358533039

In [62]:
# Tabulate predicted vs. actual values with the signed error (predicted - actual).
# Rows keep the index of y_test, so each one traces back to its source row.
df = pd.DataFrame(
    {
        "Predicted": y_pred,
        "Actual": y_test,
        "Error": y_pred - y_test,
    },
    columns=["Predicted", "Actual", "Error"],
)
df.head(10)

Unnamed: 0,Predicted,Actual,Error
120,37.689405,42.0,-4.310595
329,25.099808,3.0,22.099808
39,25.522992,29.0,-3.477008
294,36.451862,24.0,12.451862
654,29.854306,43.0,-13.145694
436,24.964701,8.0,16.964701
65,9.183505,33.0,-23.816495
617,24.152278,54.0,-29.847722
78,29.418276,28.0,1.418276
652,29.971998,23.0,6.971998
