In [2]:
import pandas as pd
# Load the preprocessed Titanic dataset from a CSV file in the working directory.
titanic_df = pd.read_csv('titanic_processed.csv')

# Preview the first five rows to check the columns and their values.
titanic_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,0,3,0,14.0,0,0,7.8542,0,0,1
1,1,1,1,28.0,0,0,26.55,0,0,1
2,1,1,0,36.0,1,2,120.0,0,0,1
3,0,3,1,17.0,1,0,7.0542,0,0,1
4,0,3,1,4.0,4,2,31.275,0,0,1


In [3]:
# (number of rows, number of columns) of the loaded frame
titanic_df.shape


(712, 10)

# Split your data into training/test sets

In [4]:
from sklearn.model_selection import train_test_split

# Features: every column except the target. axis=1 tells drop() to remove a column
# (https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.drop.html).
X = titanic_df.drop('Survived', axis=1)
# Target: the 'Survived' column, selected as a pandas Series.
Y = titanic_df['Survived']

# Hold out 20% of the rows for testing (test_size=0.2). random_state pins the
# shuffle so the split -- and every metric computed from it -- is identical on
# each Restart & Run All; without it each run evaluates on a different test set.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [5]:
# Shapes of the training features and training target; the row counts should match.
x_train.shape, y_train.shape

((569, 9), (569,))

In [6]:
# Shapes of the held-out test features and test target; the row counts should match.
x_test.shape, y_test.shape

((143, 9), (143,))

### Logistic regression for classification

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html


In [7]:
from sklearn.linear_model import LogisticRegression

# Hyperparameters:
#   penalty='l2'       -> regularize with the L2 norm of the model coefficients
#                         (sum of squared coefficients) to penalize complex models;
#                         'l1' is the alternative penalty
#   C=1.0              -> inverse of regularization strength; smaller values mean
#                         stronger regularization
#   solver='liblinear' -> the optimization solver; liblinear works well for small datasets
logistic_model = LogisticRegression(penalty='l2', C=1.0, solver='liblinear')

# fit() returns the estimator itself, so logistic_model stays bound to the
# same (now trained) model.
logistic_model = logistic_model.fit(x_train, y_train)



In [8]:
# Use the fitted model to predict a class label for every row of the held-out test features.
y_pred = logistic_model.predict(x_test)

In [9]:
# Put the actual test labels and the model's predictions side by side, one row per test sample.
pred_results = pd.DataFrame(dict(y_test=y_test, y_pred=y_pred))


In [10]:
# View the first five rows of actual vs. predicted labels.
pred_results.head()

Unnamed: 0,y_test,y_pred
449,0,0
603,1,0
286,0,1
544,1,1
278,1,0


In [11]:
# Confusion matrix: one row per predicted label (y_pred), one column per actual label (y_test).
titanic_crosstab = pd.crosstab(index=pred_results['y_pred'], columns=pred_results['y_test'])

titanic_crosstab

y_test,0,1
y_pred,Unnamed: 1_level_1,Unnamed: 2_level_1
0,71,25
1,7,40


### Precision-recall scores

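When we use these for multiclass classification, we need to specify an averaging method (the `average` parameter) to determine how the precision and recall scores for different labels should be weighted. For a binary target such as `Survived`, the default `average='binary'` reports the scores for the positive class (label 1).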

https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html
https://scikit-learn.org/stable/modules/generated/sklearn.metrics.recall_score.html

In [12]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [13]:
# Accuracy: fraction of all test rows whose label the model predicted correctly.
acc = accuracy_score(y_test, y_pred)
print("accuracy_score : ", acc)

# Precision: of the passengers predicted to survive, the fraction who actually
# survived -- TP / (TP + FP). A higher value means fewer false positives.
prec = precision_score(y_test, y_pred)
print("precision_score : ", prec)

# Recall: of the passengers who actually survived, the fraction the model
# identified -- TP / (TP + FN). A higher value means fewer false negatives.
recall = recall_score(y_test, y_pred)
print("recall_score : ", recall)

accuracy_score :  0.7762237762237763
precision_score :  0.851063829787234
recall_score :  0.6153846153846154


In [15]:
# The same metrics can also be computed by hand from the crosstab (confusion matrix).
titanic_crosstab


y_test,0,1
y_pred,Unnamed: 1_level_1,Unnamed: 2_level_1
0,71,25
1,7,40


In [16]:
# titanic_crosstab has predicted labels as rows and actual labels as columns,
# so each cell is addressed as .loc[predicted, actual].
TP = titanic_crosstab.loc[1, 1]  # predicted 1, actually 1
TN = titanic_crosstab.loc[0, 0]  # predicted 0, actually 0
FP = titanic_crosstab.loc[1, 0]  # predicted 1, actually 0
FN = titanic_crosstab.loc[0, 1]  # predicted 0, actually 1