In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import VotingClassifier

In [2]:
def load_data(path='data/train.csv'):
    """Read a CSV file into a DataFrame.

    Parameters
    ----------
    path : str or path-like, default 'data/train.csv'
        Location of the CSV file. The default keeps the original behaviour
        of loading the training set, so ``load_data()`` still works.

    Returns
    -------
    pandas.DataFrame
        The parsed file contents, exactly as returned by ``pd.read_csv``.
    """
    return pd.read_csv(path)

Preprocessing


In [51]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd

class TitanicPreprocessor:
    """Impute and encode passenger features using statistics learned in ``fit``.

    ``fit`` records the mean of 'Age', the median of 'Fare' (when that column
    is present), the mode of 'Embarked', the label encoding of 'Sex', and the
    dummy columns produced for 'Embarked' and 'Title' with the first level
    dropped. ``transform`` applies those learned values to any frame and
    always emits exactly the dummy columns recorded during ``fit``, in the
    same order, so downstream models see a stable feature layout.

    The input frame must contain the columns 'Sex', 'Age', 'Embarked' and
    'Title'.
    """

    def __init__(self):
        self.label_encoder = LabelEncoder()
        self.age_mean = None
        self.fare_median = None
        self.embarked_mode = None
        self.embarked_columns = None
        self.title_columns = None

    def fit(self, df):
        """Learn imputation values and dummy-column layouts from ``df``.

        Parameters
        ----------
        df : pandas.DataFrame
            Training frame containing 'Sex', 'Age', 'Embarked' and 'Title'
            (and optionally 'Fare').

        Returns
        -------
        TitanicPreprocessor
            ``self``, so calls can be chained.
        """
        self.age_mean = df['Age'].mean()
        self.embarked_mode = df['Embarked'].mode()[0]
        self.fare_median = df['Fare'].median() if 'Fare' in df.columns else None
        self.label_encoder.fit(df['Sex'])

        # Record the dummy columns once, with the first level dropped as the
        # baseline, so transform can reproduce the exact same layout later.
        embarked = df['Embarked'].fillna(self.embarked_mode)
        self.embarked_columns = pd.get_dummies(
            embarked, drop_first=True, dtype='int'
        ).columns.tolist()
        self.title_columns = pd.get_dummies(
            df['Title'], drop_first=True, dtype='int'
        ).columns.tolist()
        return self

    def transform(self, df):
        """Return an encoded copy of ``df``; the input frame is not modified.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame with the same columns that were passed to ``fit``.

        Returns
        -------
        pandas.DataFrame
            Copy of ``df`` with 'Sex' label-encoded, missing 'Age' (and
            'Fare', when learned) filled, and 'Embarked'/'Title' replaced by
            the dummy columns recorded during ``fit``.
        """
        df = df.copy()
        df['Sex'] = self.label_encoder.transform(df['Sex'])

        df['Age'] = df['Age'].fillna(self.age_mean)
        if self.fare_median is not None and 'Fare' in df.columns:
            df['Fare'] = df['Fare'].fillna(self.fare_median)
        embarked = df['Embarked'].fillna(self.embarked_mode)

        # Encode every level present, then align to the fitted columns.
        # Using drop_first here would drop whichever level happens to come
        # first in *this* frame, which is not necessarily the fitted baseline.
        # reindex drops levels never seen in fit and adds absent ones as 0.
        embarked_dummies = pd.get_dummies(embarked, dtype='int').reindex(
            columns=self.embarked_columns, fill_value=0
        )
        title_dummies = pd.get_dummies(df['Title'], dtype='int').reindex(
            columns=self.title_columns, fill_value=0
        )

        df = df.drop(['Embarked', 'Title'], axis=1)
        return pd.concat([df, embarked_dummies, title_dummies], axis=1)


In [10]:
def model_train(X_train, y_train):
    """Fit a random forest with the notebook's fixed settings and return it."""
    forest_settings = {
        'n_estimators': 100,
        'random_state': 42,
        'max_depth': 10,
        'min_samples_split': 5,
    }
    forest = RandomForestClassifier(**forest_settings)
    forest.fit(X_train, y_train)
    return forest

In [11]:
def model_tuning(X_train, y_train):
    """Grid-search random-forest settings with 3-fold CV and return the best estimator.

    The best parameters and the best cross-validation score are printed
    before the refitted best estimator is returned.
    """
    search_space = dict(
        n_estimators=[50, 100, 200],
        max_depth=[None, 10, 20, 30],
        min_samples_split=[2, 5, 10],
    )
    searcher = GridSearchCV(
        estimator=RandomForestClassifier(random_state=42),
        param_grid=search_space,
        cv=3,
        n_jobs=-1,
        verbose=2,
    )
    searcher.fit(X_train, y_train)

    best_params, best_score = searcher.best_params_, searcher.best_score_
    print(f"Best parameters: {best_params}")
    print(f"Best cross-validation score: {best_score:.4f}")
    return searcher.best_estimator_

In [12]:
def model_prediction(X_test,model):
    """Return whatever ``model.predict`` produces for ``X_test``."""
    return model.predict(X_test)

In [13]:
def split_data(X, y):
    """Split features and target 80/20 with a fixed seed.

    Returns the tuple ``(X_train, X_test, y_train, y_test)``.
    """
    pieces = train_test_split(X, y, test_size=0.2, random_state=42)
    return tuple(pieces)

In [18]:
def model_evaluate(model, X_test, y_test):
    """Print ``model.score(X_test, y_test)`` rounded to two decimals.

    Nothing is returned; the score is only written to standard output.
    """
    print(f'Model Accuracy: {model.score(X_test, y_test):.2f}')

In [41]:
def extract_title(df):
    """Return a copy of ``df`` with a 'Title' column derived from 'Name'.

    The title is the text between the first comma and the following period
    of each name (e.g. ``'Braund, Mr. Owen Harris'`` -> ``'Mr'``). Uncommon
    honorifics are collapsed into ``'Rare'`` and French/alternate spellings
    are mapped onto 'Miss' / 'Mrs'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string 'Name' column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the additional 'Title' column. Names that do not
        match the pattern get a missing title.
    """
    rare_titles = [
        'Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev',
        'Sir', 'Jonkheer', 'Dona'
    ]
    synonyms = {
        'Mlle': 'Miss',
        'Ms': 'Miss',
        'Mme': 'Mrs'
    }

    df = df.copy()
    # Extract title using regex
    titles = df['Name'].str.extract(r',\s*([^\.]*)\s*\.', expand=False)

    # The capture group takes everything up to the period, so a name such as
    # "Rothes, the Countess. of ..." yields "the Countess". Strip padding and
    # the leading article so it matches the 'Countess' entry below.
    titles = titles.str.strip().str.replace(r'^the\s+', '', regex=True)

    # Normalize rare or similar titles
    df['Title'] = titles.replace(rare_titles, 'Rare').replace(synonyms)

    return df

In [None]:
# End-to-end random-forest run: prepare the training data, fit the model,
# score it on the hold-out split, and write predictions for data/test.csv.
preprocessor = TitanicPreprocessor()
df = load_data()
X = df.drop('Survived', axis=1)
y = df['Survived']
X = extract_title(X)  # Extract titles from names
cols_to_drop = ['PassengerId', 'Name', 'Ticket', 'Cabin']
X = X.drop(cols_to_drop, axis=1)
x_train, x_test, y_train, y_test = split_data(X, y)
preprocessor.fit(x_train)  # statistics come from the training split only
x_train = preprocessor.transform(x_train)
x_test = preprocessor.transform(x_test)
model = model_train(x_train, y_train)

test_data = pd.read_csv('data/test.csv')
# The preprocessor reads the 'Title' column, so it must be derived here as
# well; without it transform raises KeyError: 'Title'.
test_data = extract_title(test_data)
# Fill missing fares with the training median so every passenger keeps a row
# and no NaN reaches the model.
test_data['Fare'] = test_data['Fare'].fillna(x_train['Fare'].median())
passenger_ids = test_data['PassengerId']
test_data = test_data.drop(cols_to_drop, axis=1)
processed_test_data = preprocessor.transform(test_data)
preds = model_prediction(processed_test_data, model)
submission = pd.DataFrame({'PassengerId': passenger_ids, 'Survived': preds})
submission.to_csv('data/submission.csv', index=False)
model_evaluate(model, x_test, y_test)

Model Accuracy: 0.83


Stacking

In [43]:
from sklearn.ensemble import GradientBoostingClassifier, StackingClassifier
from sklearn.neighbors import KNeighborsClassifier


def ensemble_model(X_train, y_train):
    """Fit and return a stacking classifier over three base learners.

    A random forest, a gradient-boosting model and a 5-nearest-neighbours
    classifier feed a logistic-regression meta-model. The stacker uses
    5-fold internal cross-validation and does not pass the original
    features through to the meta-model.
    """
    forest = RandomForestClassifier(
        n_estimators=100,
        random_state=42,
        max_depth=10,
        min_samples_split=5,
    )
    boosting = GradientBoostingClassifier(n_estimators=100, random_state=42)
    neighbours = KNeighborsClassifier(n_neighbors=5)

    stacker = StackingClassifier(
        estimators=[('rf', forest), ('gb', boosting), ('knn', neighbours)],
        final_estimator=LogisticRegression(),
        cv=5,
        passthrough=False,
    )
    stacker.fit(X_train, y_train)
    return stacker


In [52]:
# End-to-end stacking run: prepare the training data, fit the stacked
# ensemble, score it on the validation split, and write predictions for
# data/test.csv.
preprocessor = TitanicPreprocessor()
df = load_data()
X = df.drop('Survived', axis=1)
print(X.columns)
y = df['Survived']
X = extract_title(X)  # Extract titles from names
cols_to_drop = ['PassengerId', 'Name', 'Ticket', 'Cabin']
X = X.drop(cols_to_drop, axis=1)
x_train, x_val, y_train, y_val = split_data(X, y)
preprocessor.fit(x_train)  # statistics come from the training split only
x_train = preprocessor.transform(x_train)
x_val = preprocessor.transform(x_val)
model = ensemble_model(x_train, y_train)

test_data = pd.read_csv('data/test.csv')
test_data = extract_title(test_data)  # Extract titles from names in test data
# Impute missing fares with the training median instead of dropping the rows:
# dropping removes passengers from the submission, which must contain one
# prediction per row of the test file.
test_data['Fare'] = test_data['Fare'].fillna(x_train['Fare'].median())
passenger_ids = test_data['PassengerId']
test_data = test_data.drop(cols_to_drop, axis=1)
processed_test_data = preprocessor.transform(test_data)
preds = model_prediction(processed_test_data, model)
submission = pd.DataFrame({'PassengerId': passenger_ids, 'Survived': preds})
submission.to_csv('data/submission.csv', index=False)
model_evaluate(model, x_val, y_val)

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
Model Accuracy: 0.84
