## Titanic Prediction


### Cleaning

In [23]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Load the datasets
gender_submission = pd.read_csv('gender_submission.csv')
test = pd.read_csv('test.csv')
train = pd.read_csv('train.csv')

# Drop columns that are unnecessary or irrelevant for the analysis
columns_to_drop = ['Cabin', 'Ticket', 'Name']
train = train.drop(columns=columns_to_drop, errors='ignore')
test = test.drop(columns=columns_to_drop, errors='ignore')

# Handle missing values
train['Age'].fillna(train['Age'].median(), inplace=True)
test['Age'].fillna(test['Age'].median(), inplace=True)
train['Embarked'].fillna(train['Embarked'].mode()[0], inplace=True)
test['Embarked'].fillna(test['Embarked'].mode()[0], inplace=True)
test['Fare'].fillna(test['Fare'].median(), inplace=True)

# Convert categorical variables to numerical
train = pd.get_dummies(train, columns=['Sex', 'Embarked'], drop_first=True)
test = pd.get_dummies(test, columns=['Sex', 'Embarked'], drop_first=True)

# Ensure both train and test datasets have the same columns
missing_cols = set(train.columns) - set(test.columns)
for col in missing_cols:
    if col != 'Survived':
        test[col] = 0

test = test[train.columns.drop('Survived')]

# Save cleaned datasets
train.to_csv('cleaned_train.csv', index=False)
test.to_csv('cleaned_test.csv', index=False)

print("\nData cleaning complete.")


Data cleaning complete.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train['Age'].fillna(train['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test['Age'].fillna(test['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we 

### Train/Test

In [24]:
# Split the training data into features and target
X = train.drop('Survived', axis=1).values
y = train['Survived'].values

# Split the training data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

### Modelling

In [25]:
# Train a logistic regression model using gradient descent
# Initialize parameters
n_features = X_train.shape[1]
weights = np.zeros(n_features)
bias = 0
learning_rate = 0.01
n_iterations = 10000

# Sigmoid function
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

# Training the model using gradient descent
for i in range(n_iterations):
    # Linear model
    linear_model = np.dot(X_train, weights) + bias
    # Prediction
    y_predicted = sigmoid(linear_model)
    
    # Compute gradients
    dw = (1 / len(y_train)) * np.dot(X_train.T, (y_predicted - y_train))
    db = (1 / len(y_train)) * np.sum(y_predicted - y_train)
    
    # Update parameters
    weights -= learning_rate * dw
    bias -= learning_rate * db

# Train a simple neural network model using TensorFlow
model = Sequential()
model.add(Dense(16, input_dim=n_features, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=100, batch_size=10, validation_data=(X_val, y_val))

TypeError: loop of ufunc does not support argument 0 of type float which has no callable exp method

### Prediction

In [26]:
# Make predictions on the validation set using gradient descent model
linear_model_val = np.dot(X_val, weights) + bias
y_pred_val = sigmoid(linear_model_val)
y_pred_class = [1 if i > 0.5 else 0 for i in y_pred_val]

# Make predictions on the validation set using neural network model
y_pred_nn_val = model.predict(X_val)
y_pred_nn_class = [1 if i > 0.5 else 0 for i in y_pred_nn_val]

# Make predictions on the test set using gradient descent model
linear_model_test = np.dot(test.values, weights) + bias
y_pred_test = sigmoid(linear_model_test)
test_predictions = [1 if i > 0.5 else 0 for i in y_pred_test]

# Make predictions on the test set using neural network model
y_pred_nn_test = model.predict(test.values)
test_predictions_nn = [1 if i > 0.5 else 0 for i in y_pred_nn_test]

# Save predictions
gender_submission['Survived'] = test_predictions_nn
gender_submission.to_csv('submission.csv', index=False)

print("\nTraining, testing, and prediction complete. Predictions saved to '/mnt/data/'")


TypeError: loop of ufunc does not support argument 0 of type float which has no callable exp method

### Evaluvation

In [None]:
# Evaluate the gradient descent model
accuracy = accuracy_score(y_val, y_pred_class)
print("\nLogistic Regression Accuracy on validation set:", accuracy)
print("\nClassification Report:")
print(classification_report(y_val, y_pred_class))
print("\nConfusion Matrix:")
print(confusion_matrix(y_val, y_pred_class))

# Evaluate the neural network model
nn_accuracy = accuracy_score(y_val, y_pred_nn_class)
print("\nNeural Network Accuracy on validation set:", nn_accuracy)
print("\nClassification Report:")
print(classification_report(y_val, y_pred_nn_class))
print("\nConfusion Matrix:")
print(confusion_matrix(y_val, y_pred_nn_class))
