In [4]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Read the labelled training set and the unlabelled test set from the working directory
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

# Keep the test-set IDs aside; they become the key column of the submission file
test_passenger_ids = test_df['PassengerId']

# Fill values for missing data are derived from the training set only and are
# reused unchanged when preprocessing the test set
numeric_medians = train_df[['Age', 'Fare']].median()
age_median = numeric_medians['Age']
fare_median = numeric_medians['Fare']
# mode() can return several tied values; take the first one
embarked_mode = train_df['Embarked'].mode().iloc[0]

def preprocess_data(df, age_median, embarked_mode, fare_median):
    """Turn a raw passenger frame into the fixed-layout feature matrix.

    The input frame is never modified; all work happens on a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw passenger data. Must contain 'Sex', 'Age', 'Fare', 'SibSp',
        'Parch' and 'Embarked'. Any other columns (for example 'PassengerId',
        'Name', 'Ticket', 'Cabin') are optional and ignored.
    age_median : float
        Value used to fill missing 'Age' entries.
    embarked_mode : str
        Value used to fill missing 'Embarked' entries.
    fare_median : float
        Value used to fill missing 'Fare' entries.

    Returns
    -------
    pandas.DataFrame
        A frame with exactly these columns, in this order: 'Pclass', 'Sex',
        'Age', 'Fare', 'FamilySize', 'IsAlone', 'Embarked_C', 'Embarked_Q',
        'Embarked_S'. The 'Embarked_*' indicators are always integer 0/1.

    Raises
    ------
    KeyError
        If one of the required input columns is absent.
    """
    expected_columns = [
        'Pclass', 'Sex', 'Age', 'Fare', 'FamilySize', 'IsAlone',
        'Embarked_C', 'Embarked_Q', 'Embarked_S'
    ]

    df = df.copy()

    # Handle missing values with the caller-supplied fill values
    df['Age'] = df['Age'].fillna(age_median)
    df['Embarked'] = df['Embarked'].fillna(embarked_mode)
    df['Fare'] = df['Fare'].fillna(fare_median)

    # Convert categorical to numerical.
    # Labels other than 'male'/'female' map to NaN.
    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})

    # Feature engineering: the passenger counts as a family member (+1)
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

    # One-hot encode Embarked. dtype=int keeps the indicators integer on every
    # pandas version, matching the integer 0 used below for absent categories.
    df = pd.get_dummies(df, columns=['Embarked'], prefix='Embarked', dtype=int)

    # A frame may not contain every category, so add the missing columns as 0
    # to keep the train and test layouts identical.
    for col in expected_columns:
        if col not in df.columns:
            df[col] = 0

    # Selecting the expected columns both reorders them and discards everything
    # else, so no separate drop of identifier/text columns is needed.
    return df[expected_columns]

# Build the train and test feature matrices with the same train-derived fill values
imputation_values = {
    'age_median': age_median,
    'embarked_mode': embarked_mode,
    'fare_median': fare_median,
}
y_train = train_df['Survived']
X_train = preprocess_data(train_df, **imputation_values)
X_test = preprocess_data(test_df, **imputation_values)

# Hold out 20% of the training rows to estimate accuracy.
# NOTE(review): the fill values were computed on the full training set before
# this split, so the held-out rows contributed to them.
holdout_parts = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
X_train_split, X_val, y_train_split, y_val = holdout_parts

# Fit on the reduced training set (fit() returns the estimator itself)
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_split, y_train_split
)

# Report accuracy on the held-out rows
val_accuracy = model.score(X_val, y_val)
print(f"Validation Accuracy: {val_accuracy:.4f}")

# Refit the same estimator on every labelled row before predicting
model.fit(X_train, y_train)

# Predict on the test set and write PassengerId/Survived pairs to submission.csv
test_preds = model.predict(X_test)
submission = pd.DataFrame({'PassengerId': test_passenger_ids, 'Survived': test_preds})
submission.to_csv('submission.csv', index=False)

Validation Accuracy: 0.8268
