In [19]:
import pandas as pd
pd.set_option('display.max_columns', None)

import sys
sys.path.append(r'C:\Users\huzef\OneDrive\Documents\Projects\Projects\data_science_code')

from data_processor import DataProcessor
from classification_models import *
from convert_categorical_variables import CategoricalEncoder
from evaluate_classification import ClassificationEvaluator
from sklearn.preprocessing import StandardScaler

In [20]:
data_processor = DataProcessor()
train_df = data_processor.read_csv(r'C:\Users\huzef\OneDrive\Documents\Projects\Projects\data\titanic\train.csv')
test_df = data_processor.read_csv(r'C:\Users\huzef\OneDrive\Documents\Projects\Projects\data\titanic\test.csv')
gender_submission = data_processor.read_csv(r'C:\Users\huzef\OneDrive\Documents\Projects\Projects\data\titanic\gender_submission.csv')

In [21]:
train_data, test_data = data_processor.train_test_split(train_df, test_size=0.2, random_state=42)
X_train, y_train = data_processor.split_features_target(train_data, 'Survived')
X_test, y_test = data_processor.split_features_target(test_data, 'Survived')

Data split into training and testing sets successfully.
Features and target variable split successfully.
Features and target variable split successfully.


# EDA/ Feature Engineering

In [22]:
def eda(df_1):
    df = df_1.copy()
    df.drop('PassengerId', axis=1, inplace=True)

    df['Surname'] = df['Name'].str.split(',').str[0]
    df['FirstName'] = df['Name'].str.split(',').str[1]
    df['Title'] = df['FirstName'].str.split('.').str[0]
    df.drop(['Name', 'FirstName', 'Surname'], axis=1, inplace=True)
    rare_titles = ['Rev', 'Dr', 'Major', 'Col', 'Mlle', 'Capt', 'Mme', 'the Countess', 'Lady', 'Sir', 'Jonkheer', 'Don']
    df['Title'] = df['Title'].apply(lambda x: 'Miss' if x.strip() == 'Ms' else 'Rare' if x.strip() in rare_titles else x.strip())

    df['Sex'] = df['Sex'].map({'male': 1, 'female': 0})

    df['Ticket_string'] = df['Ticket'].str.contains('[a-zA-Z]').astype(int)

    df['Cabin'] = df['Cabin'].fillna('U')

    most_common = df['Embarked'].mode()[0]
    df['Embarked'] = df['Embarked'].fillna(most_common)

    df['Age'] = df['Age'].fillna(df['Age'].median())

    df['Cabin_Letter'] = df['Cabin'].str.extract(r'([A-Za-z])')
    df['Cabin_Number'] = df['Cabin'].str.extract(r'(\d+)')
    df['Cabin_Number'] = df['Cabin_Number'].fillna(0)
    
    df = df[['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked','Title','Ticket_string','Cabin_Letter','Cabin_Number']]

    encoder = CategoricalEncoder(df)
    df = encoder.one_hot_encoding(['Embarked', 'Title', 'Cabin_Letter'])
    df = df.drop(['Cabin_Letter_U'], axis=1)

    return df

X_train_filt = eda(X_train)
X_test_filt = eda(X_test)

X_train_filt = X_train_filt[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Ticket_string',
       'Cabin_Number', 'Embarked_C', 'Embarked_Q', 'Embarked_S',
       'Title_Master', 'Title_Miss', 'Title_Mr', 'Title_Mrs', 'Title_Rare',
       'Cabin_Letter_A', 'Cabin_Letter_B', 'Cabin_Letter_C', 'Cabin_Letter_D',
       'Cabin_Letter_E', 'Cabin_Letter_F', 'Cabin_Letter_G']]

In [None]:
# Example usage with different models:
model_classes = {
    'LogisticRegression': LogisticRegression,
    'SGDClassifier': SGDClassifier,
    'KNeighborsClassifier': KNeighborsClassifier,
    'RandomForestClassifier': RandomForestClassifier,
    'GradientBoostingClassifier': GradientBoostingClassifier,
    'AdaBoostClassifier': AdaBoostClassifier,
    'GaussianNB': GaussianNB,
    'SVC': SVC,
    'QuadraticDiscriminantAnalysis': QuadraticDiscriminantAnalysis,
    'LinearDiscriminantAnalysis': LinearDiscriminantAnalysis,
    'MLPClassifier': MLPClassifier,
    'XGBClassifier': XGBClassifier
}

# Create a model instance and perform operations:
for model_name, model_class in model_classes.items():
    print(f"Using {model_name}")
    model = ModelTrainer(model_class, random_state=42)
    # Add data loading, training, prediction, etc.
    print(f"Model parameters: {model.get_params()}")

In [23]:
log_reg_model = LogisticRegressionModel()
log_reg_model.set_params(random_state=42, max_iter=1000) 
log_reg_model.train(X_train_filt, y_train)
log_reg_predict = log_reg_model.predict(X_test_filt)

In [24]:
rf_model = RandomForestModel(random_state=42)
rf_model.train(X_train_filt, y_train)
best_params = rf_model.tune_hyperparameters(X_train_filt, y_train, param_grid={'n_estimators': [100, 200], 'max_depth': [10, 20]})
rf_model_predict = rf_model.predict(X_test_filt)
(rf_model.get_feature_importances()).head()

Unnamed: 0,Feature,Importance
0,Fare,0.156086
1,Title_Mr,0.151405
2,Sex,0.142978
3,Age,0.121433
4,Pclass,0.06548


In [25]:
naive_bayes = NaiveBayesModel()
naive_bayes.train(X_train_filt, y_train)
naive_bayes_predict = naive_bayes.predict(X_test_filt)

In [26]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_filt)
X_test_scaled = scaler.transform(X_test_filt)

svm_model = SVMModel(random_state=42)
svm_model.train(X_train_scaled, y_train)
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}
best_params = svm_model.tune_hyperparameters(X_train_scaled, y_train, param_grid=param_grid)
svm_model_predict = svm_model.predict(X_test_scaled)

In [27]:
gbm_model = GBMModel(random_state=42)
gbm_model.train(X_train_filt, y_train)
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5]
}
best_params = gbm_model.tune_hyperparameters(X_train_filt, y_train, param_grid=param_grid)

gbm_model_predict = gbm_model.predict(X_test_filt)

In [28]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_filt)
X_test_scaled = scaler.transform(X_test_filt)

knn_model = KNNModel()

knn_model.train(X_train_scaled, y_train)
param_grid = {
    'n_neighbors': [3, 5, 7],
    'weights': ['uniform', 'distance'],
    'p': [1, 2]
}
best_params = knn_model.tune_hyperparameters(X_train_scaled, y_train, param_grid=param_grid)
knn_model_predict = knn_model.predict(X_test_scaled)

In [29]:
ada_model = AdaBoostModel(random_state=42)
ada_model.train(X_train_filt, y_train)
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0]
}
best_params = ada_model.tune_hyperparameters(X_train_filt, y_train, param_grid=param_grid)
ada_model_predict = ada_model.predict(X_test_filt)



In [30]:
qda_model = QDAModel()
qda_model.train(X_train_filt, y_train)
param_grid = {
    'reg_param': [0.0, 0.01, 0.1, 1.0],
    'tol': [1e-4, 1e-3, 1e-2]
}
best_params = qda_model.tune_hyperparameters(X_train_filt, y_train, param_grid=param_grid)
qda_model_predict = qda_model.predict(X_test_filt)



In [None]:
lda_model = LDAModel()
lda_model.train(X_train_filt, y_train)

param_grid = [
    {'solver': ['svd'], 'shrinkage': [None]},  # 'svd' solver doesn't support shrinkage
    {'solver': ['lsqr', 'eigen'], 'shrinkage': ['auto', 0.1, 0.5]},
    {'solver': ['lsqr', 'eigen'], 'shrinkage': [None]}  # Optionally include no shrinkage for these solvers
]

best_params = lda_model.tune_hyperparameters(X_train_filt, y_train, param_grid=param_grid)
print("Best parameters: ", best_params)

lda_model_predict = lda_model.predict(X_test_filt)

In [32]:
X_train_filt = X_train_filt.apply(lambda col: col.astype('category').cat.codes if col.dtype == 'object' else col)
X_test_filt = X_test_filt.apply(lambda col: col.astype('category').cat.codes if col.dtype == 'object' else col)

xgb_model = XGBoostModel(random_state=42)
xgb_model.train(X_train_filt, y_train)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.1, 0.2]
}
best_params = xgb_model.tune_hyperparameters(X_train_filt, y_train, param_grid=param_grid)
xgb_model_predict = xgb_model.predict(X_test_filt)

In [33]:
nn_model = NeuralNetworkModel(random_state=42)
nn_model.train(X_train_filt, y_train)
param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (100, 50)],
    'activation': ['tanh', 'relu'],
    'learning_rate_init': [0.001, 0.01, 0.1]
}
best_params = nn_model.tune_hyperparameters(X_train_filt, y_train, param_grid=param_grid)
nn_model_predict = nn_model.predict(X_test_filt)



In [34]:
evaluator = ClassificationEvaluator(y_true=y_test)
evaluator.add_model('Logistic Regression', log_reg_predict)
evaluator.add_model('RF Model', rf_model_predict)
evaluator.add_model('Naive Bayes', naive_bayes_predict)
evaluator.add_model('SVM', svm_model_predict)
evaluator.add_model('GBM', gbm_model_predict)
evaluator.add_model('KNN', knn_model_predict)
evaluator.add_model('AdaBoost', ada_model_predict)
evaluator.add_model('QDA', qda_model_predict)
evaluator.add_model('LDA', lda_model_predict)
evaluator.add_model('XGBoost', xgb_model_predict)
evaluator.add_model('Neural Network', nn_model_predict)
evaluator.evaluate_all_models()

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,Confusion Matrix,ROC AUC
9,XGBoost,0.826816,0.794521,0.783784,0.789116,90\t15\n16\t58,
0,Logistic Regression,0.821229,0.776316,0.797297,0.786667,88\t17\n15\t59,
3,SVM,0.821229,0.783784,0.783784,0.783784,89\t16\n16\t58,
8,LDA,0.815642,0.773333,0.783784,0.778523,88\t17\n16\t58,
4,GBM,0.821229,0.8,0.756757,0.777778,91\t14\n18\t56,
1,RF Model,0.821229,0.808824,0.743243,0.774648,92\t13\n19\t55,
6,AdaBoost,0.793296,0.734177,0.783784,0.75817,84\t21\n16\t58,
2,Naive Bayes,0.787709,0.719512,0.797297,0.75641,82\t23\n15\t59,
10,Neural Network,0.776536,0.7125,0.77027,0.74026,82\t23\n17\t57,
5,KNN,0.73743,0.670886,0.716216,0.69281,79\t26\n21\t53,
