In [20]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_classif, SelectKBest
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

In [21]:
# 1. Load the dataset (train and test)
df_train = pd.read_csv('/content/sample_data/i239e_project_train.csv')
df_test = pd.read_csv('/content/sample_data/i239e_project_test.csv')

In [23]:
# 2. Preprocessing: Drop non-predictive or high-missingness columns
# 'Name', 'Feature#9' (Cabin), and 'Feature#7' (Ticket) are dropped.
# 'Feature#1' appears to be a Passenger ID and is also excluded.
X_train = df_train.drop(columns=['Name', 'Feature#9', 'Feature#7', 'Feature#1', 'Survived'])
y_train = df_train['Survived']

X_test = df_test.drop(columns=['Name', 'Feature#9', 'Feature#7', 'Feature#1'])

In [24]:
# 3. Handle Missing Values (Imputation)
# Feature#4 (Age) has missing values. We fill them with the median.
imputer = SimpleImputer(strategy='median')
X_train_imputed = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
X_test_imputed = pd.DataFrame(imputer.fit_transform(X_test), columns=X_test.columns)

In [25]:
# 4. Feature Selection using Mutual Information (using Train Data Only)
# Top 5 features that share the most information with the 'Survived' label.
selector = SelectKBest(score_func=mutual_info_classif, k=5)
X_train_selected = selector.fit_transform(X_train_imputed, y_train)
print(X_train_selected.shape)

# Identify the selected features
selected_mask = selector.get_support()
selected_features = X_train_imputed.columns[selected_mask].tolist()

print(f"Selected 5 Features: {selected_features}")

(741, 5)
Selected 5 Features: ['Feature#5', 'Feature#2', 'Feature#8', 'Feature#4', 'Feature#3']


In [26]:
# 5. Select the same features (as in Train data) for Test data
X_test_selected = X_test_imputed[selected_features]
# Convert X_test_selected to a NumPy array to match X_train_selected's structure
X_test_selected = X_test_selected.values
print(X_test_selected.shape)

(150, 5)


In [27]:
# 6. Sanity check
print(f"X_train_selected type: {type(X_train_selected)}")
print(f"X_test_selected type: {type(X_test_selected)}")
print(f"X_train_selected shape: {X_train_selected.shape}")
print(f"X_test_selected shape: {X_test_selected.shape}")

X_train_selected type: <class 'numpy.ndarray'>
X_test_selected type: <class 'numpy.ndarray'>
X_train_selected shape: (741, 5)
X_test_selected shape: (150, 5)


In [28]:
X_train_selected

array([[ 1.    ,  3.    , 20.2125, 18.    ,  0.    ],
       [ 0.    ,  3.    ,  9.5875, 63.    ,  1.    ],
       [ 0.    ,  2.    , 13.    , 30.    ,  0.    ],
       ...,
       [ 0.    ,  1.    , 38.5   , 47.    ,  0.    ],
       [ 0.    ,  3.    ,  8.1125, 28.    ,  0.    ],
       [ 1.    ,  1.    , 77.9583, 51.    ,  1.    ]])

In [29]:
X_test_selected

array([[  0.    ,   3.    ,   7.775 ,  16.    ,   0.    ],
       [  0.    ,   3.    ,   7.7875,  27.    ,   1.    ],
       [  3.    ,   3.    ,  25.4667,  27.    ,   1.    ],
       [  1.    ,   2.    ,  26.25  ,  31.    ,   0.    ],
       [  0.    ,   3.    ,   7.75  ,  27.    ,   0.    ],
       [  4.    ,   3.    ,  31.275 ,   9.    ,   1.    ],
       [  0.    ,   2.    ,  12.35  ,  30.    ,   1.    ],
       [  0.    ,   1.    ,  26.55  ,  34.    ,   0.    ],
       [  0.    ,   3.    ,   7.7958,  27.    ,   0.    ],
       [  1.    ,   1.    ,  71.    ,  70.    ,   0.    ],
       [  0.    ,   3.    ,   8.05  ,  35.    ,   0.    ],
       [  3.    ,   2.    ,  21.    ,  30.    ,   1.    ],
       [  0.    ,   3.    ,   7.6292,  27.    ,   1.    ],
       [  3.    ,   1.    , 263.    ,  19.    ,   0.    ],
       [  0.    ,   2.    ,  13.    ,  52.    ,   0.    ],
       [  8.    ,   3.    ,  69.55  ,  27.    ,   0.    ],
       [  0.    ,   2.    ,  13.    ,  18.    ,   1.    