In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_classif, SelectKBest
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

In [2]:
# 1. Load the dataset
# Read the raw training table from the CSV that sits next to this notebook.
train_csv_path = 'i239e_project_train.csv'
df = pd.read_csv(train_csv_path)

In [3]:
# 2. Preprocessing: separate the label and discard columns not used as predictors
# Target vector: the 'Survived' column.
y = df['Survived']

# Columns removed from the feature matrix. Per the notebook author's notes:
#   'Name'      - non-predictive
#   'Feature#9' - (Cabin) high missingness
#   'Feature#7' - (Ticket) non-predictive
#   'Feature#1' - appears to be a Passenger ID
#   'Survived'  - the label itself, so it must not leak into the features
excluded_columns = ['Name', 'Feature#9', 'Feature#7', 'Feature#1', 'Survived']
X = df.drop(columns=excluded_columns)

In [4]:
# 3. Handle Missing Values (Imputation)
# Every column of X is imputed with its own median; the author notes that
# Feature#4 (Age) is the column known to contain gaps.
imputer = SimpleImputer(strategy='median')
imputed_values = imputer.fit_transform(X)

# fit_transform returns a bare array, so re-attach the original column labels.
X_imputed = pd.DataFrame(imputed_values, columns=X.columns)

In [5]:
# 4. Feature Selection using Mutual Information
# Keep the 5 columns whose estimated mutual information with the 'Survived'
# label is highest.
#
# mutual_info_classif adds a small amount of random noise to continuous
# features, so without a fixed random_state the scores (and therefore the set
# of selected columns) can change from one run to the next. Pinning the seed
# keeps the selection reproducible under Restart & Run All.
MI_RANDOM_STATE = 42


def seeded_mutual_info(features, target):
    """Score features with mutual_info_classif using a fixed random seed.

    Parameters
    ----------
    features : array-like
        Feature matrix handed over by SelectKBest.
    target : array-like
        Class labels handed over by SelectKBest.

    Returns
    -------
    The per-feature scores returned by mutual_info_classif.
    """
    return mutual_info_classif(features, target, random_state=MI_RANDOM_STATE)


selector = SelectKBest(score_func=seeded_mutual_info, k=5)
X_selected = selector.fit_transform(X_imputed, y)

# Identify the selected features.
# The boolean mask is aligned with the columns of the frame the selector was
# fitted on, so the names are read from X_imputed.
selected_mask = selector.get_support()
selected_features = X_imputed.columns[selected_mask].tolist()

print(f"Selected 5 Features: {selected_features}")

Selected 5 Features: ['Feature#2', 'Feature#8', 'Feature#4', 'Feature#3', 'Feature#6']


In [6]:
# 5. Scaling (the author notes this is essential for neural networks)
# Standardize each selected feature to zero mean and unit variance.
scaler = StandardScaler()
X_final = scaler.fit_transform(X_selected)

# Assemble the training table for the model: scaled features followed by the
# label column. y.values is used so the labels are attached positionally.
scaled_features = pd.DataFrame(X_final, columns=selected_features)
output_df = scaled_features.assign(Survived=y.values)

# Persist the table and report what was written.
output_df.to_csv('processed_train_features.csv', index=False)
print(f"Saved processed_train_features.csv with shape {output_df.shape}")
print(output_df.head())

Saved processed_train_features.csv with shape (741, 6)
   Feature#2  Feature#8  Feature#4  Feature#3  Feature#6  Survived
0   0.831680  -0.242902  -0.877071  -0.743948   0.740426         0
1   0.831680  -0.467013   2.565684   1.344179  -0.465411         1
2  -0.367299  -0.395034   0.040997  -0.743948  -0.465411         0
3   0.831680  -0.197641  -0.112014   1.344179   1.946262         1
4  -0.367299  -0.447766  -0.035509   1.344179  -0.465411         1
