In [1]:
# load data and imports
import pandas as pd
import seaborn as sbn
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

clean_titanic_df = pd.read_csv('preprocessed_titanic.csv')

clean_titanic_df = clean_titanic_df.drop(columns=['Name', 'PassengerId', 'Unnamed: 0'])

train, test= train_test_split(clean_titanic_df,random_state=23, test_size = 0.2)

X_train, y_train = train.drop(columns=['Survived']), train['Survived']
X_test, y_test = test.drop(columns=['Survived']), test['Survived']

clean_titanic_df.head()
clean_titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    int64  
 3   Age       891 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  891 non-null    int64  
dtypes: float64(2), int64(6)
memory usage: 55.8 KB


In [2]:
# logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report


X_log = clean_titanic_df.drop(columns=['Survived'])
y_log = clean_titanic_df['Survived']

X_log_num = X_log[['Age', 'Fare']]

X_log_cat_encoded_Pclass = pd.get_dummies(X_log['Pclass'], prefix='Pclass')
X_log_cat_encoded_Sex = pd.get_dummies(X_log['Sex'], prefix='Sex')
X_log_cat_encoded_SibSp = pd.get_dummies(X_log['SibSp'], prefix='SibSp')
X_log_cat_encoded_Parch = pd.get_dummies(X_log['Parch'], prefix='Parch')
X_log_cat_encoded_Embarked = pd.get_dummies(X_log['Embarked'], prefix='Embarked')
X_log_cat_encoded = pd.concat([X_log_cat_encoded_Pclass, X_log_cat_encoded_Sex, X_log_cat_encoded_SibSp, X_log_cat_encoded_Parch, X_log_cat_encoded_Embarked], axis=1)

X_log =  X_log_cat_encoded.join(X_log_num)

X_train_log, X_test_log= train_test_split(X_log,random_state=23, test_size = 0.2)

# print(X_train_log)

lor = LogisticRegression(penalty= 'none', max_iter= 1000, solver='newton-cg').fit(X_train_log, y_train)
print(classification_report(y_test, lor.predict(X_test_log)))
print(lor.score(X_test_log, y_test))

              precision    recall  f1-score   support

           0       0.81      0.83      0.82       115
           1       0.69      0.66      0.67        64

    accuracy                           0.77       179
   macro avg       0.75      0.75      0.75       179
weighted avg       0.77      0.77      0.77       179

0.770949720670391


In [3]:
# Apply 10-fold cross validation on logistic regression

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold
kf = KFold(n_splits=10)

i = 0
average_score = 0
for train_indices, test_indices in kf.split(X_log):
    #print(train_indices)
    start_train, stop_train = train_indices[0], train_indices[-1]+1
    start_test, stop_test = test_indices[0], test_indices[-1]+1
    
    lor = LogisticRegression(penalty= 'none', max_iter= 1000, solver='newton-cg').fit(X_log[start_train:stop_train], y_log[start_train:stop_train])
    print( classification_report( y_log[start_test:stop_test], lor.predict( X_log[start_test:stop_test] ) ) )
    print(lor.score( X_log[start_test:stop_test], y_log[start_test:stop_test] ))
    average_score += lor.score( X_log[start_test:stop_test], y_log[start_test:stop_test] )
    i+=1

print("average score: ", average_score/10)

              precision    recall  f1-score   support

           0       0.78      0.84      0.81        51
           1       0.77      0.69      0.73        39

    accuracy                           0.78        90
   macro avg       0.78      0.77      0.77        90
weighted avg       0.78      0.78      0.78        90

0.7777777777777778
              precision    recall  f1-score   support

           0       0.92      0.86      0.89        69
           1       0.60      0.75      0.67        20

    accuracy                           0.83        89
   macro avg       0.76      0.80      0.78        89
weighted avg       0.85      0.83      0.84        89

0.8314606741573034
              precision    recall  f1-score   support

           0       0.80      0.87      0.83        55
           1       0.76      0.65      0.70        34

    accuracy                           0.79        89
   macro avg       0.78      0.76      0.77        89
weighted avg       0.78      0.79   

In [4]:
# try to find optimal max iteration 

max_iterations = [x*100 for x in range(1,10)]

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

for i in max_iterations:
    lor = LogisticRegression(penalty= 'none', max_iter= i, solver='newton-cg').fit(X_train_log, y_train)
    print(classification_report(y_test, lor.predict(X_test_log)))
    print("Score for max iteration of {}".format(i))
    print(lor.score(X_test_log, y_test))

              precision    recall  f1-score   support

           0       0.81      0.83      0.82       115
           1       0.69      0.66      0.67        64

    accuracy                           0.77       179
   macro avg       0.75      0.75      0.75       179
weighted avg       0.77      0.77      0.77       179

Score for max iteration of 100
0.770949720670391
              precision    recall  f1-score   support

           0       0.81      0.83      0.82       115
           1       0.69      0.66      0.67        64

    accuracy                           0.77       179
   macro avg       0.75      0.75      0.75       179
weighted avg       0.77      0.77      0.77       179

Score for max iteration of 200
0.770949720670391
              precision    recall  f1-score   support

           0       0.81      0.83      0.82       115
           1       0.69      0.66      0.67        64

    accuracy                           0.77       179
   macro avg       0.75      0

In [5]:
# End of Logistic Regression portion

In [6]:
# Categorical Bayes classifier

from sklearn.naive_bayes import CategoricalNB

categorical_NB2 = CategoricalNB()
categorical_NB2.fit(X_train, y_train)
print(classification_report(y_test, categorical_NB2.predict(X_test)))
print(categorical_NB2.score(X_test, y_test))


              precision    recall  f1-score   support

           0       0.81      0.89      0.85       115
           1       0.75      0.62      0.68        64

    accuracy                           0.79       179
   macro avg       0.78      0.76      0.77       179
weighted avg       0.79      0.79      0.79       179

0.7932960893854749


In [7]:
# Apply 10-fold cross validation on Categorical Naive Bayes Classifier

from sklearn.metrics import classification_report
from sklearn.model_selection import KFold
from sklearn.naive_bayes import CategoricalNB
kf = KFold(n_splits=10)

X = clean_titanic_df.drop(columns=['Survived'])
y = clean_titanic_df['Survived']

i = 0
average_score = 0
for train_indices, test_indices in kf.split(X):
    start_train, stop_train = train_indices[0], train_indices[-1]+1
    start_test, stop_test = test_indices[0], test_indices[-1]+1


    categorical_NB2 = CategoricalNB()
    categorical_NB2.fit(X[start_train:stop_train], y[start_train:stop_train])
    print(classification_report(y[start_test:stop_test], categorical_NB2.predict(X[start_test:stop_test])))
    print(categorical_NB2.score(X[start_test:stop_test], y[start_test:stop_test]))
    average_score += categorical_NB2.score(X[start_test:stop_test], y[start_test:stop_test])
    i+=1

print("average score: ", average_score/10)

              precision    recall  f1-score   support

           0       0.65      0.82      0.72        51
           1       0.64      0.41      0.50        39

    accuracy                           0.64        90
   macro avg       0.64      0.62      0.61        90
weighted avg       0.64      0.64      0.63        90

0.6444444444444445
              precision    recall  f1-score   support

           0       0.88      0.87      0.88        69
           1       0.57      0.60      0.59        20

    accuracy                           0.81        89
   macro avg       0.73      0.73      0.73        89
weighted avg       0.81      0.81      0.81        89

0.8089887640449438
              precision    recall  f1-score   support

           0       0.81      0.91      0.85        55
           1       0.81      0.65      0.72        34

    accuracy                           0.81        89
   macro avg       0.81      0.78      0.79        89
weighted avg       0.81      0.81   

In [8]:
# End of Categorical Naive Bayes Classifier portioin