In [6]:
# Import libraries
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Import dataset
df = sns.load_dataset("titanic")
df.head()

# Remove non-numeric columns
df.drop(['pclass', 'sibsp', 'parch', 'fare', 'class', 'who', 'adult_male', 'deck', 'embark_town', 'alive','alone'],axis=1,inplace=True)
df.head()

# Drop rows containing missing values
df.dropna(axis=0, how = 'any', subset = None, inplace = True)
df.head()

# Convert non-numeric data using one-hot encoding
df = pd.get_dummies(df,columns=['sex','embarked'])
df.head()

# Assign Variables
X = df.drop('survived',axis=1)
y = df['survived']

# Split data into test/train set (70/30 split) and shuffle
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3,shuffle=True)

# Assign algorithm
model = LogisticRegression()

# Link algorithm to X and y variables
model.fit(X_train,y_train)

# Run algorithm on test data to make predictions
model_test = model.predict(X_test)

# Evaluate predictions
print(confusion_matrix(y_test, model_test))
print(classification_report(y_test,model_test))

[[110  23]
 [ 29  52]]
              precision    recall  f1-score   support

           0       0.79      0.83      0.81       133
           1       0.69      0.64      0.67        81

    accuracy                           0.76       214
   macro avg       0.74      0.73      0.74       214
weighted avg       0.75      0.76      0.76       214



In [7]:
# Data point to predict 
passenger = [
    22, #age
    0, #sex_female
    1, #sex_male
    0, #embarked_C
    0, #embarked_Q
    1, #embarked_S 
]

# Make prediction 
predict_passenger = model.predict([passenger])
predict_passenger



array([0], dtype=int64)

In [10]:
# count of values in the survived column
print(df['survived'].value_counts())

survived
0    424
1    288
Name: count, dtype: int64
