In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf

from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression

from pydataset import data

In [2]:
df = data('titanic')

In [3]:
df = df.dropna()

In [4]:
df.head()

Unnamed: 0,class,age,sex,survived
1,1st class,adults,man,yes
2,1st class,adults,man,yes
3,1st class,adults,man,yes
4,1st class,adults,man,yes
5,1st class,adults,man,yes


In [5]:
df = pd.get_dummies(df,columns=['class','age','sex'], drop_first=True)

In [6]:
df['survived'] = df.survived == 'yes'

In [7]:
df.survived.value_counts()

False    817
True     499
Name: survived, dtype: int64

In [8]:
X = df.drop(columns=['survived'])
y = df[['survived']]

In [9]:
X_train_validate, X_test, y_train_validate, y_test = train_test_split(X, y, test_size = .20, random_state = 123)

X_train, X_validate, y_train, y_validate = train_test_split(X_train_validate, y_train_validate, test_size = .30, random_state = 123)

print("train: ", X_train.shape, ", validate: ", X_validate.shape, ", test: ", X_test.shape)
print("train: ", y_train.shape, ", validate: ", y_validate.shape, ", test: ", y_test.shape)

train:  (736, 4) , validate:  (316, 4) , test:  (264, 4)
train:  (736, 1) , validate:  (316, 1) , test:  (264, 1)


In [10]:
logit = LogisticRegression(C=1, class_weight={0:1, 1:99}, random_state=123, intercept_scaling=1, solver='lbfgs')

In [11]:
logit.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1, class_weight={0: 1, 1: 99}, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=123, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [14]:
print('Coefficient: \n', logit.coef_)
print('Intercept: \n', logit.intercept_)

Coefficient: 
 [[-1.39619199 -1.30469094  1.63046194  2.51371292]]
Intercept: 
 [3.90468114]


In [15]:
y_pred = logit.predict(X_train)
y_pred_proba = logit.predict_proba(X_train)

In [16]:
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit.score(X_train, y_train)))

Accuracy of Logistic Regression classifier on training set: 0.36


In [17]:
print(confusion_matrix(y_train, y_pred))

[[  0 470]
 [  0 266]]


In [18]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

       False       0.00      0.00      0.00       470
        True       0.36      1.00      0.53       266

    accuracy                           0.36       736
   macro avg       0.18      0.50      0.27       736
weighted avg       0.13      0.36      0.19       736



  'precision', 'predicted', average, warn_for)


In [19]:
logit2 = LogisticRegression(C=.1, class_weight={0:1, 1:99}, random_state=123, intercept_scaling=1, solver='lbfgs')

In [20]:
logit2.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=0.1, class_weight={0: 1, 1: 99}, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=123, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [21]:
print('Coefficient: \n', logit2.coef_)
print('Intercept: \n', logit2.intercept_)

Coefficient: 
 [[-0.94665429 -1.07394577  1.13902305  2.13784568]]
Intercept: 
 [3.77120479]


In [22]:
y_pred2 = logit2.predict(X_train)
y_pred_proba2 = logit2.predict_proba(X_train)

print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit2.score(X_train, y_train)))

print(confusion_matrix(y_train, y_pred2))

print(classification_report(y_train, y_pred2))

Accuracy of Logistic Regression classifier on training set: 0.36
[[  0 470]
 [  0 266]]
              precision    recall  f1-score   support

       False       0.00      0.00      0.00       470
        True       0.36      1.00      0.53       266

    accuracy                           0.36       736
   macro avg       0.18      0.50      0.27       736
weighted avg       0.13      0.36      0.19       736



  'precision', 'predicted', average, warn_for)
