In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
import logistic_regression_util


from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import sklearn.preprocessing


import warnings
warnings.filterwarnings("ignore")

import acquire
import prepare

In [2]:
df = acquire.get_titanic_data()

In [3]:
df

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.2500,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.9250,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1000,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.0500,S,Third,,Southampton,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,886,0,2,male,27.0,0,0,13.0000,S,Second,,Southampton,1
887,887,1,1,female,19.0,0,0,30.0000,S,First,B,Southampton,1
888,888,0,3,female,,1,2,23.4500,S,Third,,Southampton,0
889,889,1,1,male,26.0,0,0,30.0000,C,First,C,Cherbourg,1


In [4]:
df.dropna(inplace=True)

In [5]:
df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
6,6,0,1,male,54.0,0,0,51.8625,S,First,E,Southampton,1
10,10,1,3,female,4.0,1,1,16.7,S,Third,G,Southampton,0
11,11,1,1,female,58.0,0,0,26.55,S,First,C,Southampton,1


In [6]:
df = df.rename(columns = {'sex':'gender'})

In [7]:
df.head(0)

Unnamed: 0,passenger_id,survived,pclass,gender,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone


In [8]:
test, train = prepare.prep_titanic(df)

In [9]:
train.head()

Unnamed: 0,survived,pclass,gender,age,sibsp,parch,fare,embark_town,alone,encoded_embark_town
806,0,1,male,0.543379,0,0,0.0,Southampton,1,2
516,1,2,female,0.472032,0,0,0.020495,Southampton,1,2
248,1,1,male,0.51484,1,1,0.102579,Southampton,0,2
11,1,1,female,0.814498,0,0,0.051822,Southampton,1,2
853,1,1,female,0.215183,0,1,0.076904,Southampton,0,2


In [10]:
X_train = train[['fare', 'pclass']]
y_train = train.survived
X_test = test[['fare', 'pclass']]
y_test = test.survived

In [11]:
logit = LogisticRegression(random_state = 123)

In [12]:
logit.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=123, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [13]:
y_pred = logit.predict(X_train)

In [14]:
model_df = train[['survived']]

In [15]:
model_df['yhat'] = logit.predict(X_train)
y = logit.predict_proba(X_train)[:, 1]
model_df['probs'] = y
model_df.head()

Unnamed: 0,survived,yhat,probs
806,0,1,0.772498
516,1,1,0.783849
248,1,1,0.775639
11,1,1,0.774089
853,1,1,0.774856


In [16]:
model_df.head()

Unnamed: 0,survived,yhat,probs
806,0,1,0.772498
516,1,1,0.783849
248,1,1,0.775639
11,1,1,0.774089
853,1,1,0.774856


In [17]:
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit.score(X_train, y_train)))

Accuracy of Logistic Regression classifier on training set: 0.81


In [18]:
X = train[['fare', 'pclass','age']]
y = train.survived

In [19]:
logit = LogisticRegression()

In [20]:
logit.fit(X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [21]:
y_pred = logit.predict(X)

In [22]:
model_df['model2'] = logit.predict(X)

In [23]:
model_df.head()

Unnamed: 0,survived,yhat,probs,model2
806,0,1,0.772498,1
516,1,1,0.783849,1
248,1,1,0.775639,1
11,1,1,0.774089,1
853,1,1,0.774856,1


In [24]:
# print('Accuracy of model 1 on training set: {:.2f}'
#      .format(logit.score(X_train, y_train)))
print('Accuracy of model 2 on training set: {:.2f}'
     .format(logit.score(X, y)))

Accuracy of model 2 on training set: 0.81


In [25]:
X1 = train[['fare', 'pclass','age','gender']]
y1 = train.survived

In [26]:
encoder = sklearn.preprocessing.OneHotEncoder()

encoder.fit(X1[['gender']])

OneHotEncoder(categorical_features=None, categories=None, drop=None,
              dtype=<class 'numpy.float64'>, handle_unknown='error',
              n_values=None, sparse=True)

In [27]:
m = encoder.transform(train[['gender']]).todense()

In [28]:
concat = pd.concat([
    X1.gender,
    pd.DataFrame(m, columns=encoder.categories_[0], index=X1.index)
], axis=1)

In [29]:
cols = ['gender_' + c for c in encoder.categories_[0]]

m = encoder.transform(X1[['gender']]).todense()
X1 = pd.concat([
    X1,
    pd.DataFrame(m, columns=cols, index=X1.index)
], axis=1).drop(columns='gender')

In [30]:
X1.head()

Unnamed: 0,fare,pclass,age,gender_female,gender_male
806,0.0,1,0.543379,0.0,1.0
516,0.020495,2,0.472032,1.0,0.0
248,0.102579,1,0.51484,0.0,1.0
11,0.051822,1,0.814498,1.0,0.0
853,0.076904,1,0.215183,1.0,0.0


In [31]:
logit.fit(X1, y1)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [32]:
y_pred = logit.predict(X1)

In [33]:
model_df['model3'] = logit.predict(X1)

In [34]:
model_df.head()

Unnamed: 0,survived,yhat,probs,model2,model3
806,0,1,0.772498,1,1
516,1,1,0.783849,1,1
248,1,1,0.775639,1,1
11,1,1,0.774089,1,1
853,1,1,0.774856,1,1


In [35]:
print('Accuracy of model 3 on training set: {:.2f}'
     .format(logit.score(X1, y1)))

Accuracy of model 3 on training set: 0.81
