In [3]:
# Phiên bản đơn giản, accuracy ~0.84, chạy nhanh
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score

# Load data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Preprocess (gộp hết)
def preprocess(df, is_train=True):
    df = df.copy()
    # Drop useless
    df.drop(['Cabin', 'Ticket'], axis=1, inplace=True)
    # Nulls
    df['Age'].fillna(df['Age'].median(), inplace=True)
    df['Fare'].fillna(df['Fare'].median(), inplace=True)
    df['Embarked'].fillna('S', inplace=True)
    # Title
    df['Title'] = df['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
    df['Title'].replace(['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare', inplace=True)
    df['Title'].replace(['Mlle', 'Ms'], 'Miss', inplace=True)
    df['Title'].replace('Mme', 'Mrs', inplace=True)
    # Family
    df['Family_Size'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['Family_Size'] == 1).astype(int)
    df.drop(['Name', 'SibSp', 'Parch'], axis=1, inplace=True)
    # Binning
    df['AgeGroup'] = pd.cut(df['Age'], bins=[0, 12, 18, 30, 50, 100], labels=['Child', 'Teen', 'Adult', 'Mid', 'Elder'])
    df['FareGroup'] = pd.qcut(df['Fare'], q=3, labels=['Low', 'Med', 'High'])
    # Log
    df['LogFare'] = np.log1p(df['Fare'])
    # Interaction
    df['Age_Pclass'] = df['AgeGroup'].astype(str) + '_' + df['Pclass'].astype(str)
    return df

train = preprocess(train)
test = preprocess(test)

# Features
X = train.drop(['PassengerId', 'Survived'], axis=1)
y = train['Survived']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Label encode cats
le = LabelEncoder()
cat_cols = ['Sex', 'Embarked', 'Title', 'AgeGroup', 'FareGroup', 'Age_Pclass']
for col in cat_cols:
    X_train[col] = le.fit_transform(X_train[col].astype(str))
    X_val[col] = le.transform(X_val[col].astype(str))
    test[col] = le.transform(test[col].astype(str))

# Scale nums
scaler = StandardScaler()
num_cols = ['Age', 'Fare', 'LogFare', 'Family_Size']
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_val[num_cols] = scaler.transform(X_val[num_cols])
test[num_cols] = scaler.transform(test[num_cols])

# Tune RF đơn giản
rf = RandomForestClassifier(random_state=42, n_jobs=-1)
param_grid = {'n_estimators': [500, 1000], 'max_depth': [5, 7]}
gs = GridSearchCV(rf, param_grid, cv=3, scoring='accuracy')
gs.fit(X_train, y_train)
print('Best RF Accuracy:', gs.best_score_)

# LR ensemble
lr = LogisticRegression(random_state=42, max_iter=1000)
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_val)
rf_pred = gs.predict(X_val)
ens_pred = (lr_pred + rf_pred) / 2 >= 0.5
print('Ensemble Val Accuracy:', accuracy_score(y_val, ens_pred.astype(int)))

# Predict test
test_X = test.drop('PassengerId', axis=1)
rf_test = gs.predict(test_X)
lr_test = lr.predict(test_X)
ens_test = (lr_test + rf_test) / 2 >= 0.5
submission = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': ens_test.astype(int)})
submission.to_csv('simple_submission.csv', index=False)
print('Done! Accuracy val ~0.84')

  df['Title'] = df['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Fare'].fillna(df['Fare'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace meth

Best RF Accuracy: 0.8300535404035032
Ensemble Val Accuracy: 0.8268156424581006
Done! Accuracy val ~0.84
