In [214]:
import pandas as pd
import warnings

warnings.filterwarnings('ignore')

In [215]:
train = pd.read_csv('dataset/train.csv')
test = pd.read_csv('dataset/test.csv')
train.head()
train.count()

PassengerId    891
Survived       891
Pclass         891
Name           891
Sex            891
Age            714
SibSp          891
Parch          891
Ticket         891
Fare           891
Cabin          204
Embarked       889
dtype: int64

In [216]:
train = train.drop(columns=['PassengerId', 'Name', 'Cabin', 'Ticket'])
test_ids = test['PassengerId']
test = test.drop(columns=['PassengerId', 'Name', 'Cabin', 'Ticket'])
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [217]:
from sklearn.preprocessing import LabelEncoder

In [218]:
categorical_cols = ['Sex', 'Embarked']

label_encoder = LabelEncoder()

for col in categorical_cols:
    train[col] = label_encoder.fit_transform(train[col])
    test[col] = label_encoder.transform(test[col])

train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.25,2
1,1,1,0,38.0,1,0,71.2833,0
2,1,3,0,26.0,0,0,7.925,2
3,1,1,0,35.0,1,0,53.1,2
4,0,3,1,35.0,0,0,8.05,2


In [219]:
train['Age'].fillna(train['Age'].mean(), inplace=True)

In [220]:
from sklearn.model_selection import train_test_split

X_train, X_val, Y_train, Y_val = train_test_split(train.drop(columns=['Survived']), train['Survived'])

In [221]:
from sklearn.linear_model import LogisticRegression

In [222]:
logistic_regression = LogisticRegression()
logistic_regression.fit(X_train, Y_train)
Y_val_pred = logistic_regression.predict(X_val)

In [223]:
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score

In [224]:
print(f"Accuracy:            {accuracy_score(Y_val, Y_val_pred):.4f}")

print("\nConfusion Matrix:")
print(confusion_matrix(Y_val, Y_val_pred))

print(f"\nPrecision (Micro):   {precision_score(Y_val, Y_val_pred, average='micro'):.4f}")
print(f"Precision (Macro):   {precision_score(Y_val, Y_val_pred, average='macro'):.4f}")

print(f"\nRecall (Micro):      {recall_score(Y_val, Y_val_pred, average='micro'):.4f}")
print(f"Recall (Macro):      {recall_score(Y_val, Y_val_pred, average='macro'):.4f}")

Accuracy:            0.8251

Confusion Matrix:
[[118  17]
 [ 22  66]]

Precision (Micro):   0.8251
Precision (Macro):   0.8190

Recall (Micro):      0.8251
Recall (Macro):      0.8120


In [225]:
test['Age'].fillna(train['Age'].mean(), inplace=True)
test['Fare'].fillna(train['Fare'].median(), inplace=True)

In [226]:
Y_pred = logistic_regression.predict(test)

In [227]:
submission = pd.DataFrame({
    'PassengerId': test_ids,
    'Survived': Y_pred
})

submission.to_csv('submission.csv', index=False)