## Titanic Prediction


### Cleaning

In [45]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Load the datasets
gender_submission = pd.read_csv('gender_submission.csv')
test = pd.read_csv('/test.csv')
train = pd.read_csv('train.csv')

# Drop columns that are unnecessary or irrelevant for the analysis
columns_to_drop = ['Cabin', 'Ticket', 'Name']
train = train.drop(columns=columns_to_drop, errors='ignore')
test = test.drop(columns=columns_to_drop, errors='ignore')

# Handle missing values
train['Age'].fillna(train['Age'].median(), inplace=True)
test['Age'].fillna(test['Age'].median(), inplace=True)
train['Embarked'].fillna(train['Embarked'].mode()[0], inplace=True)
test['Embarked'].fillna(test['Embarked'].mode()[0], inplace=True)
test['Fare'].fillna(test['Fare'].median(), inplace=True)

# Convert categorical variables to numerical
train = pd.get_dummies(train, columns=['Sex', 'Embarked'], drop_first=True)
test = pd.get_dummies(test, columns=['Sex', 'Embarked'], drop_first=True)

# Ensure both train and test datasets have the same columns
missing_cols = set(train.columns) - set(test.columns)
for col in missing_cols:
    if col != 'Survived':
        test[col] = 0

test = test[train.columns.drop('Survived')]

# Save cleaned datasets
train.to_csv('cleaned_train.csv', index=False)
test.to_csv('cleaned_test.csv', index=False)

print("\nData cleaning complete.")


Data cleaning complete.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train['Age'].fillna(train['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test['Age'].fillna(test['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we 

### Train/Test

In [46]:
# Split the training data into features and target
X = train.drop('Survived', axis=1).values.astype(np.float64)
y = train['Survived'].values.astype(np.float64)

# Split the training data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


### Modelling

In [47]:
# Train a logistic regression model using gradient descent
# Initialize parameters
n_features = X_train.shape[1]
weights = np.zeros(n_features, dtype=np.float64)
bias = 0.0
learning_rate = 0.01
n_iterations = 10000

# Sigmoid function
def sigmoid(z):
    return 1 / (1 + np.exp(-np.array(z, dtype=np.float64)))

# Training the model using gradient descent
for i in range(n_iterations):
    # Linear model
    linear_model = np.dot(X_train, weights) + bias
    # Prediction
    y_predicted = sigmoid(linear_model)
    
    # Compute gradients
    dw = (1 / len(y_train)) * np.dot(X_train.T, (y_predicted - y_train))
    db = (1 / len(y_train)) * np.sum(y_predicted - y_train)
    
    # Debugging prints
    if i % 1000 == 0:
        print(f"Iteration {i}: weights shape: {weights.shape}, dw shape: {dw.shape}, bias: {bias}, db: {db}")

    # Update parameters
    weights -= learning_rate * dw
    bias -= learning_rate * db

# Train a simple neural network model using TensorFlow
model = Sequential()
model.add(Dense(16, input_dim=n_features, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train.astype(np.float32), y_train.astype(np.float32), epochs=100, batch_size=10, validation_data=(X_val.astype(np.float32), y_val.astype(np.float32)))

  return 1 / (1 + np.exp(-np.array(z, dtype=np.float64)))


Iteration 0: weights shape: (9,), dw shape: (9,), bias: 0.0, db: 0.12359550561797752
Iteration 1000: weights shape: (9,), dw shape: (9,), bias: -0.11993405632764051, db: 0.5559600563994923
Iteration 2000: weights shape: (9,), dw shape: (9,), bias: -0.15926159633809717, db: -0.3464327659073266
Iteration 3000: weights shape: (9,), dw shape: (9,), bias: -0.17938744562925057, db: -0.21099529533604158
Iteration 4000: weights shape: (9,), dw shape: (9,), bias: -0.19253481256711194, db: 0.5397873548492808
Iteration 5000: weights shape: (9,), dw shape: (9,), bias: -0.20595154129565887, db: -0.3477914755649024
Iteration 6000: weights shape: (9,), dw shape: (9,), bias: -0.2038531789167286, db: -0.21485971523788014
Iteration 7000: weights shape: (9,), dw shape: (9,), bias: -0.19961650260612127, db: 0.5333173990763294
Iteration 8000: weights shape: (9,), dw shape: (9,), bias: -0.19874316031565376, db: -0.34754246970984465
Iteration 9000: weights shape: (9,), dw shape: (9,), bias: -0.18472189392163

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5847 - loss: 7.9764 - val_accuracy: 0.6480 - val_loss: 1.2158
Epoch 2/100
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 887us/step - accuracy: 0.5879 - loss: 1.3708 - val_accuracy: 0.6034 - val_loss: 1.0658
Epoch 3/100
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 902us/step - accuracy: 0.5476 - loss: 1.4274 - val_accuracy: 0.6313 - val_loss: 0.7820
Epoch 4/100
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 847us/step - accuracy: 0.5728 - loss: 0.7764 - val_accuracy: 0.6536 - val_loss: 0.6699
Epoch 5/100
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 857us/step - accuracy: 0.5757 - loss: 0.7417 - val_accuracy: 0.4078 - val_loss: 0.8686
Epoch 6/100
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 823us/step - accuracy: 0.5509 - loss: 0.7455 - val_accuracy: 0.6704 - val_loss: 0.6536
Epoch 7/100
[1m72/72[0m [32m━━━━━

<keras.src.callbacks.history.History at 0x2de067903b0>

### Prediction

In [50]:
# Make predictions on the validation set using gradient descent model
linear_model_val = np.dot(X_val, weights) + bias
y_pred_val = sigmoid(linear_model_val)
y_pred_class = np.array([1 if i > 0.5 else 0 for i in y_pred_val])

# Make predictions on the validation set using neural network model
y_pred_nn_val = model.predict(X_val.astype(np.float32)).flatten()
y_pred_nn_class = np.array([1 if i > 0.5 else 0 for i in y_pred_nn_val])

# Make predictions on the test set using gradient descent model
linear_model_test = np.dot(test.values, weights) + bias
y_pred_test = sigmoid(linear_model_test)
test_predictions = np.array([1 if i > 0.5 else 0 for i in y_pred_test])

# Make predictions on the test set using neural network model
y_pred_nn_test = model.predict(test.values.astype(np.float32)).flatten()
test_predictions_nn = np.array([1 if i > 0.5 else 0 for i in y_pred_nn_test])

# Save predictions
gender_submission['Survived'] = test_predictions_nn
gender_submission.to_csv('submission.csv', index=False)

print("\nTraining, testing, and prediction complete. Predictions")


[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 997us/step

Training, testing, and prediction complete. Predictions


### Evaluvation

In [52]:
# Evaluate the gradient descent model
accuracy = accuracy_score(y_val, y_pred_class)
print("\nLogistic Regression Accuracy on validation set:", accuracy)
print("\nClassification Report:")
print(classification_report(y_val, y_pred_class))
print("\nConfusion Matrix:")
print(confusion_matrix(y_val, y_pred_class))

# Evaluate the neural network model
nn_accuracy = accuracy_score(y_val, y_pred_nn_class)
print("\nNeural Network Accuracy on validation set:", nn_accuracy)
print("\nClassification Report:")
print(classification_report(y_val, y_pred_nn_class))
print("\nConfusion Matrix:")
print(confusion_matrix(y_val, y_pred_nn_class))



Logistic Regression Accuracy on validation set: 0.4301675977653631

Classification Report:
              precision    recall  f1-score   support

         0.0       0.60      0.09      0.15       105
         1.0       0.41      0.92      0.57        74

    accuracy                           0.43       179
   macro avg       0.51      0.50      0.36       179
weighted avg       0.52      0.43      0.32       179


Confusion Matrix:
[[ 9 96]
 [ 6 68]]

Neural Network Accuracy on validation set: 0.7821229050279329

Classification Report:
              precision    recall  f1-score   support

         0.0       0.84      0.77      0.81       105
         1.0       0.71      0.80      0.75        74

    accuracy                           0.78       179
   macro avg       0.78      0.78      0.78       179
weighted avg       0.79      0.78      0.78       179


Confusion Matrix:
[[81 24]
 [15 59]]
