# In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score

# Load dataset
file_path = "/content/drive/My Drive/COS30082/W3/Titanic-Dataset.csv"
df = pd.read_csv(file_path)

# Drop identifier / free-text columns that are not used as model features
df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'], inplace=True)

# Encode categorical variables (Sex, Embarked (txt->int)).
# These are fixed row-wise mappings (nothing is learned from the data), so
# applying them before the train/test split cannot leak test information.
df['Sex'] = df['Sex'].map({'male': 1, 'female': 0})
df['Embarked'] = df['Embarked'].map({'C': 1, 'Q': 2, 'S': 3})

# Define features (independent variables) and target (dependent variable).
# X is intentionally left un-imputed and unscaled here: missing-value fill
# values and scaling statistics are learned per split, from training rows only.
X = df.drop(columns=['Survived'])  # Target column removed from the features
y = df['Survived']                 # Target variable

# Columns that get standardized (the remaining features are passed through).
SCALED_COLUMNS = ['Age', 'Fare', 'Embarked', 'Sex']


def split_and_preprocess(features: pd.DataFrame, target: pd.Series,
                         test_size: float, random_state: int = 42) -> tuple:
    """Split the data, then impute and standardize using training rows only.

    Fitting the imputation values and the scaler on the full dataset before
    splitting lets information from the test rows influence preprocessing
    (data leakage) and makes the test metrics optimistic. Here every statistic
    is computed on the training partition and merely applied to the test
    partition.

    Args:
        features: Feature frame containing at least 'Age', 'Fare', 'Embarked'
            and 'Sex'; 'Age', 'Fare' and 'Embarked' may contain NaN.
        target: Target values aligned with ``features``.
        test_size: Fraction of rows held out for testing.
        random_state: Seed passed to ``train_test_split``.

    Returns:
        ``(X_train, X_test, y_train, y_test, scaler)`` where ``scaler`` is the
        ``StandardScaler`` fitted on the training rows of ``SCALED_COLUMNS``.
    """
    X_tr, X_te, y_tr, y_te = train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )

    # Fill values come from the training rows only (median for Age & Fare,
    # most frequent value for Embarked).
    fill_values = {
        'Age': X_tr['Age'].median(),
        'Fare': X_tr['Fare'].median(),
        'Embarked': X_tr['Embarked'].mode()[0],
    }
    # fillna returns new frames, so the column assignments below never write
    # into `features`.
    X_tr = X_tr.fillna(fill_values)
    X_te = X_te.fillna(fill_values)

    # Fit the scaler on the training rows, then apply it unchanged to the test rows.
    fitted_scaler = StandardScaler()
    X_tr[SCALED_COLUMNS] = fitted_scaler.fit_transform(X_tr[SCALED_COLUMNS])
    X_te[SCALED_COLUMNS] = fitted_scaler.transform(X_te[SCALED_COLUMNS])
    return X_tr, X_te, y_tr, y_te, fitted_scaler


# 1. Train-Test Split & Logistic Regression Model (80/20)
X_train, X_test, y_train, y_test, scaler = split_and_preprocess(X, y, test_size=0.2)

# Train logistic regression model
log_reg = LogisticRegression(max_iter=200)
log_reg.fit(X_train, y_train)

# 2. Predictions & Model Evaluation
y_pred = log_reg.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print("\nModel Performance:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Recall: {recall:.4f}")

# 3. Display Theta Parameter Values (Coefficients)
print("\nTheta (Coefficients) for Features:")
theta_values = pd.DataFrame(
    log_reg.coef_.flatten(), index=X_train.columns, columns=['Coefficient']
)
print(theta_values)

# 4. Making Predictions for 3 Sample Passengers (hand-picked values).
# Column order matches the training features; Sex/Embarked use the same
# integer codes as the encoding step above.
sample_data = pd.DataFrame({
    'Pclass': [1, 3, 2],
    'Sex': [0, 1, 0],
    'Age': [25, 40, 3],
    'SibSp': [0, 1, 0],
    'Parch': [0, 2, 1],
    'Fare': [71, 7.5, 12],
    'Embarked': [1, 3, 2],
})

# Apply the same normalization the 80/20 model was trained with
sample_data[SCALED_COLUMNS] = scaler.transform(sample_data[SCALED_COLUMNS])

# Make predictions
sample_predictions = log_reg.predict(sample_data)

# Convert predictions (1/0) to text labels
sample_results = ["Survived" if pred == 1 else "Not Survived" for pred in sample_predictions]
sample_data['Prediction'] = sample_results  # Attach prediction to the sample rows

print("\nPredictions for 3 Sample Passengers:")
# Feature values shown here are post-normalization
print(sample_data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Prediction']])

# 5. Alter Train-Test Split & Max Iterations and Observe Changes
split_variants = [0.1, 0.3, 0.5]
iteration_variants = [100, 500, 1000]

print("\n🛠 Effect of Different Splits & Iterations on Model Performance:")
# NOTE: this loop rebinds X_train/X_test/y_train/y_test/scaler/log_reg, so once
# it finishes they refer to the last (split, max_iter) combination rather than
# the 80/20 model trained above; scaler and log_reg stay a matching pair.
for split in split_variants:
    # Preprocessing depends only on the split, so do it once per test size.
    X_train, X_test, y_train, y_test, scaler = split_and_preprocess(X, y, test_size=split)

    for max_iter in iteration_variants:
        log_reg = LogisticRegression(max_iter=max_iter)
        log_reg.fit(X_train, y_train)
        y_pred = log_reg.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        rec = recall_score(y_test, y_pred)

        print(f"Test Size: {split}, Max Iter: {max_iter} => Accuracy: {acc:.4f}, Recall: {rec:.4f}")
