In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, BaggingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

# Step 1: Load and Prepare Data
train = pd.read_csv('dataset/train.csv')
test = pd.read_csv('dataset/test.csv')

# Add a column to differentiate train and test sets
train['is_train'] = 1
test['is_train'] = 0
data = pd.concat([train, test], sort=False)

# Fill missing values
data['Age'] = data['Age'].fillna(data['Age'].median())
data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])
data['Fare'] = data['Fare'].fillna(data['Fare'].median())

# Feature engineering
data['Title'] = data['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
data['Title'] = data['Title'].replace(['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
data['Title'] = data['Title'].replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})
data = pd.get_dummies(data, columns=['Title'], drop_first=True)
data['Sex'] = LabelEncoder().fit_transform(data['Sex'])
data = pd.get_dummies(data, columns=['Embarked'], drop_first=True)
data['FamilySize'] = data['SibSp'] + data['Parch']
data['IsAlone'] = (data['FamilySize'] == 0).astype(int)
data.drop(['PassengerId', 'Name', 'Ticket', 'Cabin', 'is_train'], axis=1, inplace=True)

# Split data back into train and test
train = data[data['Survived'].notna()]
test = data[data['Survived'].isna()]
X = train.drop('Survived', axis=1)
y = train['Survived']
X_test = test.drop('Survived', axis=1)

# Scale numerical features
scaler = StandardScaler()
X = scaler.fit_transform(X)
X_test = scaler.transform(X_test)


### Logistic Regression

In [3]:
# Logistic Regression
log_model = LogisticRegression(random_state=42)
log_model.fit(X, y)
log_preds = log_model.predict(X_test)

# Save submission
submission = pd.read_csv('dataset/gender_submission.csv')
submission['Survived'] = log_preds
submission.to_csv('dataset/submission_logistic.csv', index=False)
print("Logistic Regression: submission_logistic.csv created")


Logistic Regression: submission_logistic.csv created


### Random Forest

In [4]:
# Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X, y)
rf_preds = rf_model.predict(X_test)

# Save submission
submission['Survived'] = rf_preds
submission.to_csv('dataset/submission_random_forest.csv', index=False)
print("Random Forest: submission_random_forest.csv created")


Random Forest: submission_random_forest.csv created


### XGBoost

In [5]:
# XGBoost
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_model.fit(X, y)
xgb_preds = xgb_model.predict(X_test)

# Save submission
submission['Survived'] = xgb_preds
submission.to_csv('dataset/submission_xgboost.csv', index=False)
print("XGBoost: submission_xgboost.csv created")


XGBoost: submission_xgboost.csv created


Parameters: { "use_label_encoder" } are not used.



### Bagging Classifier

In [6]:
# Bagging Classifier
bagging_model = BaggingClassifier(estimator=rf_model, n_estimators=10, random_state=42)
bagging_model.fit(X, y)
bagging_preds = bagging_model.predict(X_test)

# Save submission
submission['Survived'] = bagging_preds
submission.to_csv('dataset/submission_bagging.csv', index=False)
print("Bagging Classifier: submission_bagging.csv created")


Bagging Classifier: submission_bagging.csv created


### Voting Classifier

In [8]:
# Voting Classifier
voting_model = VotingClassifier(
    estimators=[
        ('lr', log_model),
        ('rf', rf_model),
        ('xgb', xgb_model)
    ],
    voting='soft'
)
voting_model.fit(X, y)
voting_preds = voting_model.predict(X_test)

# Save submission
submission['Survived'] = voting_preds
submission.to_csv('dataset/submission_voting.csv', index=False)
print("Voting Classifier: submission_voting.csv created")


Voting Classifier: submission_voting.csv created


Parameters: { "use_label_encoder" } are not used.



### Neural Network with PyTorch

In [9]:
# Convert data to PyTorch tensors
X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y.values, dtype=torch.float32).unsqueeze(1)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)

# Create DataLoader
train_dataset = TensorDataset(X_tensor, y_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Define Neural Network
class TitanicNN(nn.Module):
    def __init__(self, input_size):
        super(TitanicNN, self).__init__()
        self.fc1 = nn.Linear(input_size, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.sigmoid(self.fc3(x))
        return x

# Initialize model, loss, and optimizer
nn_model = TitanicNN(input_size=X_tensor.shape[1])
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(nn_model.parameters(), lr=0.001)

# Train the model
epochs = 50
for epoch in range(epochs):
    nn_model.train()
    epoch_loss = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        outputs = nn_model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {epoch_loss/len(train_loader):.4f}")

# Predict with Neural Network
nn_model.eval()
with torch.no_grad():
    nn_preds = nn_model(X_test_tensor).squeeze()
    nn_preds = (nn_preds >= 0.5).int().numpy()

# Save submission
submission['Survived'] = nn_preds
submission.to_csv('dataset/submission_neural_network.csv', index=False)
print("Neural Network: submission_neural_network.csv created")


Epoch [10/50], Loss: 0.3897
Epoch [20/50], Loss: 0.3724
Epoch [30/50], Loss: 0.3607
Epoch [40/50], Loss: 0.3510
Epoch [50/50], Loss: 0.3442
Neural Network: submission_neural_network.csv created


### SMOTE for Imbalanced Data

In [10]:
# Handle class imbalance with SMOTE
sm = SMOTE(random_state=42)
X_smote, y_smote = sm.fit_resample(X, y)

# Train Random Forest on SMOTE data
rf_smote_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_smote_model.fit(X_smote, y_smote)
rf_smote_preds = rf_smote_model.predict(X_test)

# Save submission
submission['Survived'] = rf_smote_preds
submission.to_csv('dataset/submission_smote_rf.csv', index=False)
print("SMOTE Random Forest: submission_smote_rf.csv created")


SMOTE Random Forest: submission_smote_rf.csv created
