Kaggle Titanic competition: https://www.kaggle.com/competitions/titanic
- Approach: predict passenger survival with a simple logistic regression.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

### Import all data sets

In [2]:
# Read both competition CSV files: `df` is the training set and
# `test_data` is the set that predictions are later generated for.
df, test_data = (
    pd.read_csv("../Data/titanic/train.csv"),
    pd.read_csv("../Data/titanic/test.csv"),
)

### Data Cleaning

In [3]:
# Fill missing values in Age with the median age.
# The filled column is assigned back instead of calling
# df['Age'].fillna(..., inplace=True): that form is chained assignment on a
# column selection, which pandas warns about and which no longer updates `df`
# when Copy-on-Write is enabled. If the fill were silently skipped, the
# dropna() below would discard every row with a missing Age instead.
df['Age'] = df['Age'].fillna(df['Age'].median())

# Convert Sex into a binary feature: 0 for female and 1 for male
df['Sex'] = df['Sex'].map({'female': 0, 'male': 1})

# Drop columns that require more advanced preprocessing
df.drop(columns=['Name', 'Ticket', 'Cabin', 'Embarked', 'PassengerId'], inplace=True)

# Drop rows that still have a missing value in any of the remaining columns
df.dropna(inplace=True)

### Data Preparation

In [4]:
# Separate the target column from the predictor columns.
y = df['Survived']
X = df.drop(columns='Survived')

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardise the features. The scaler is fitted on the training split only
# and then applied unchanged to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

### Train model

In [5]:
# Fit a logistic regression (default constructor arguments) on the scaled
# training split. fit() returns the estimator itself, so it can be chained.
clf = LogisticRegression().fit(X_train, y_train)
clf  # last expression: shows the fitted estimator as the cell output

LogisticRegression()

In [6]:
# Predict on the held-out split and report how well the labels match.
y_pred = clf.predict(X_test)
holdout_accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", holdout_accuracy)
# Per-class precision / recall / F1 for the same predictions.
print(classification_report(y_test, y_pred))

Accuracy: 0.7988826815642458
              precision    recall  f1-score   support

           0       0.81      0.86      0.83       105
           1       0.78      0.72      0.75        74

    accuracy                           0.80       179
   macro avg       0.80      0.79      0.79       179
weighted avg       0.80      0.80      0.80       179



### Test data

In [7]:
# Count the missing values in each column of test_data and print the counts.
missing_per_column = test_data.isnull().sum()
print(missing_per_column)

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


In [8]:
# Fill the missing Fare value with the median fare of the test set.
# The result is assigned back instead of calling
# test_data['Fare'].fillna(..., inplace=True): that form is chained assignment
# on a column selection, which pandas warns about and which no longer updates
# `test_data` when Copy-on-Write is enabled, so the NaN would otherwise reach
# the scaler and the model.
test_data['Fare'] = test_data['Fare'].fillna(test_data['Fare'].median())

In [9]:
# Preprocessing: impute Age, encode Sex, and drop the unused text columns.
# The filled Age column is assigned back instead of calling
# test_data['Age'].fillna(..., inplace=True): that form is chained assignment
# on a column selection, which pandas warns about and which no longer updates
# `test_data` when Copy-on-Write is enabled.
# NOTE(review): this median is computed from the test set itself, while the
# scaler applied below uses statistics fitted on the training split — confirm
# that imputing with the test-set median is intended.
test_data['Age'] = test_data['Age'].fillna(test_data['Age'].median())
test_data['Sex'] = test_data['Sex'].map({'female': 0, 'male': 1})
test_data.drop(columns=['Name', 'Ticket', 'Cabin', 'Embarked'], inplace=True)

# Scale the test data using the same scaler object that was used for training data.
# PassengerId stays in test_data (the submission needs it) but is removed from
# the matrix passed to the scaler.
test_data_scaled = scaler.transform(test_data.drop('PassengerId', axis=1))

### Predictions 

In [10]:
# Predict a class label for every row of the scaled test features using the
# fitted classifier; `predictions` becomes the Survived column of the submission.
predictions = clf.predict(test_data_scaled)

### Submission

In [11]:
# Build the two-column submission table: the PassengerId of each test row
# alongside the label predicted for it.
submission = test_data[['PassengerId']].assign(Survived=predictions)

# Write the table to CSV without the DataFrame index column.
submission.to_csv('../Submissions/submission_titanic.csv', index=False)