In [57]:
import numpy as np # linear algebra
import pandas as pd # data processing
from sklearn.preprocessing import OneHotEncoder # for encoding categorical features
from sklearn.impute import KNNImputer # for inputing missing values using KNN
from sklearn.model_selection import train_test_split # for splitting dataset into training and testing sets
from sklearn.compose import ColumnTransformer # for applying different transformers to different columns
from sklearn.metrics import accuracy_score # for evaluating model accuracy
from sklearn.model_selection import cross_val_score # for cross-validation scoring
import warnings
warnings.filterwarnings("ignore") # suppressing warnings

In [58]:
df_train = pd.read_csv("train.csv") # Load the training dataset
df_test = pd.read_csv("test.csv") # Load the test dataset

In [59]:
df_train.head() # Display first few rows of the training data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [60]:
df_test.head() # Display first few rows of the test data

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [61]:
# A new DataFrame is created from the PassengerId column of the test dataset
results_df = pd.DataFrame(data=df_test["PassengerId"], columns=["PassengerId"]) # Create a DataFrame for submission
results_df.head() # Display the first few rows of the submission DataFrame

Unnamed: 0,PassengerId
0,892
1,893
2,894
3,895
4,896


In [62]:
df_train.drop(columns=["PassengerId", "Name", "Cabin", "Ticket"], inplace=True) # Drop unnecessary columns from the training data
df_test.drop(columns=["PassengerId", "Name", "Cabin", "Ticket"], inplace=True) # Drop unnecessary columns from the test data

In [63]:
# X_train contains the feature columns (all columns except Survived), and Y_train contains the target variable Survived
X_train, Y_train = df_train.drop(columns=["Survived"]), df_train["Survived"] # Separate features and target for training

In [64]:
trf = ColumnTransformer(
    transformers=[
        ("trf1", KNNImputer(n_neighbors=5), ["Age"]), # Impute missing "Age" values using KNN
        ("trf2", OneHotEncoder(), ["Sex", "Embarked"]) # Apply OneHotEncoding to "Sex" and "Embarked" columns
    ],
    remainder="passthrough" # Keep other columns as they are without transformation
)

In [65]:
# .fit_transform() learns the transformations from the training set and applies them
# .transform() is applied to the test data using the learned transformations
X_train_trf = trf.fit_transform(X_train) # Apply transformation to the training features
X_test_trf = trf.transform(df_test) # Apply the same transformation to the test data

In [66]:
X_train_trf.shape # Check the shape of the transformed training data
X_test_trf.shape # Check the shape of the transformed test data

(418, 11)

In [67]:
X_train, X_test, Y_train, Y_test = train_test_split(X_train_trf, Y_train, test_size=0.2) # Split the training data into train and test sets

Model training and evaluation

In [68]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [69]:
lr = LogisticRegression() # Logistic Regression
lr.fit(X_train, Y_train) # Fit the model on the training data
y_pred = lr.predict(X_test) # Predict on the test data
print("Logistic Regression")
print("Accuracy Score : ", accuracy_score(Y_test, y_pred)) # Print accuracy score
print("Cross Val Score : ", np.mean(cross_val_score(lr, X_test, y=Y_test, cv=5))) # Print cross-validation score

Logistic Regression
Accuracy Score :  0.8268156424581006
Cross Val Score :  0.7879365079365079


In [70]:
nb = GaussianNB() # Naive Bayes Classifier
nb.fit(X_train, Y_train) # Train the model
y_pred = nb.predict(X_test) # Predictions
print("Naive Bayes Classifier")
print("Accuracy Score : ", accuracy_score(Y_test, y_pred)) # Print accuracy score
print("Cross Val Score : ", np.mean(cross_val_score(nb, X_test, y=Y_test, cv=5))) # Print cross-validation score

Naive Bayes Classifier
Accuracy Score :  0.8100558659217877
Cross Val Score :  0.7988888888888889


In [71]:
dt = DecisionTreeClassifier() # Decision Tree Classifier
dt.fit(X_train, Y_train) # Train the model
y_pred = dt.predict(X_test) # Predictions
print("Decision Tree Classifier")
print("Accuracy Score : ", accuracy_score(Y_test, y_pred)) # Print accuracy score
print("Cross Val Score : ", np.mean(cross_val_score(dt, X_test, y=Y_test, cv=5))) # Print cross-validation score

Decision Tree Classifier
Accuracy Score :  0.8379888268156425
Cross Val Score :  0.8099999999999999


In [72]:
svc = LinearSVC() # Support Vector Classifier
svc.fit(X_train, Y_train) # Train the model
y_pred = svc.predict(X_test) # Predictions
print("Support Vector Classifier")
print("Accuracy Score : ", accuracy_score(Y_test, y_pred)) # Print accuracy score
print("Cross Val Score : ", np.mean(cross_val_score(svc, X_test, y=Y_test, cv=5))) # Print cross-validation score

Support Vector Classifier
Accuracy Score :  0.8435754189944135
Cross Val Score :  0.787936507936508


In [73]:
rf = RandomForestClassifier() # Random Forest Classifier
rf.fit(X_train, Y_train) # Train the model
y_pred = rf.predict(X_test) # Predictions
print("Random Forest Classifier")
print("Accuracy Score : ", accuracy_score(Y_test, y_pred)) # Print accuracy score
print("Cross Val Score : ", np.mean(cross_val_score(rf, X_test, y=Y_test, cv=5))) # Print cross-validation score

Random Forest Classifier
Accuracy Score :  0.8659217877094972
Cross Val Score :  0.8268253968253969


In [74]:
xgb = XGBClassifier() # XGBoost Classifier
xgb.fit(X_train, Y_train) # Train the model
y_pred = xgb.predict(X_test) # Predictions
print("XGBoost Classifier")
print("Accuracy Score : ", accuracy_score(Y_test, y_pred)) # Print accuracy score
print("Cross Val Score : ", np.mean(cross_val_score(xgb, X_test, y=Y_test, cv=5))) # Print cross-validation score

XGBoost Classifier
Accuracy Score :  0.8547486033519553
Cross Val Score :  0.8714285714285713


In [75]:
Y_pred = xgb.predict(X_test_trf) # Final predictions using the best model (XGBoost)

Hyperparameters Tuning

In [76]:
from sklearn.model_selection import GridSearchCV

In [77]:
# Logistic regression tuning
lr_grid = {
    "penalty": ["l2", "elasticnet"],
    "C": [0.001, 0.1, 0.3, 0.5, 1, 2],
    "fit_intercept": [True, False],
    "random_state": [None, 2],
    "solver": ["lbfgs", "saga"],
    "max_iter": [100, 200, 500]
}

In [51]:
g = GridSearchCV(lr, lr_grid, scoring="accuracy", cv=5) # Perform grid search with 5-fold cross-validation
g.fit(X_train, Y_train) # Fit the model
print("Logistic Regression")
print("Best Params : ", g.best_estimator_) # Display the best parameters
print("Tuned Accuracy : ", g.best_score_) # Display the tuned accuracy
accuracy_score(g.best_estimator_.predict(X_test), Y_test) # Evaluate the best model on the test set

Logistic Regression
Best Params :  LogisticRegression(C=0.1)
Tuned Accuracy :  0.8104895104895105


0.8100558659217877

In [52]:
# Decision Tree Classifier Tuning
dt_grid = {
    "criterion":["gini", "entropy"],
    "splitter" :["best", "random"],
    "max_features":["sqrt", "log2", None],
    "random_state":[1, None]
}

In [53]:
g = GridSearchCV(dt, dt_grid, scoring="accuracy", cv=5) # Perform grid search with 5-fold cross-validation
g.fit(X_train, Y_train) # Fit the model
print("Decision Tree Classifier")
print("Best Params : ", g.best_estimator_) # Display the best parameters
print("Tuned Accuracy : ", g.best_score_) # Display the tuned accuracy
accuracy_score(g.best_estimator_.predict(X_test), Y_test) # Evaluate the best model on the test set

Decision Tree Classifier
Best Params :  DecisionTreeClassifier()
Tuned Accuracy :  0.7920811582783414


0.7653631284916201

In [54]:
# Linear SVC Tuning
svc_grid = {
    "penalty": ["l1", "l2"],
    "loss": ["hinge", "squared_hinge"],
    "C": [0.1, 0.3, 0.5, 1, 2],
    "max_iter": [100, 200, 500],
    "dual": [False],
    "tol": [1e-4, 1e-5]
}

In [55]:
g = GridSearchCV(svc, svc_grid, scoring="accuracy", cv=5) # Perform grid search with 5-fold cross-validation
g.fit(X_train, Y_train) # Fit the model
print("Support Vector Classifier")
print("Best Params : ", g.best_estimator_) # Display the best parameters
print("Tuned Accuracy : ", g.best_score_) # Display the tuned accuracy
accuracy_score(g.best_estimator_.predict(X_test), Y_test) # Evaluate the best model on the test set

Support Vector Classifier
Best Params :  LinearSVC(C=0.5, dual=False, max_iter=200, penalty='l1')
Tuned Accuracy :  0.7992711513838275


0.8100558659217877

In [80]:
# RandomForest Classifier Tuning
rf_grid = {
    "n_estimators":[100, 200, 500],
    "criterion":["gini", "entropy"],
    "max_features":["sqrt", "log2", None],
    "random_state":[1, None],
    "warm_start":[True, False],
    "oob_score":[True, False],
    "bootstrap":[True, False]
}

In [81]:
g = GridSearchCV(rf, rf_grid, scoring="accuracy", cv=5) # Perform grid search with 5-fold cross-validation
g.fit(X_train, Y_train) # Fit the model
print("Random Forest Classifier")
print("Best Params : ", g.best_estimator_) # Display the best parameters
print("Tuned Accuracy : ", g.best_score_) # Display the tuned accuracy
accuracy_score(g.best_estimator_.predict(X_test), Y_test) # Evaluate the best model on the test set

Random Forest Classifier
Best Params :  RandomForestClassifier(criterion='entropy', max_features=None, n_estimators=200)
Tuned Accuracy :  0.7961981680291539


0.88268156424581

In [82]:
# XGBoost Tuning
xg_grid = {
    'max_depth': [3, 5, 7, 9, 11],
    'learning_rate': [0.1, 0.01, 0.001, 0.05],
    'subsample': [0.5, 0.7, 0.8, 1.0],
    'gamma': [0.5, 1, 1.5, 2, 5],
    'n_estimators': [100, 200, 500, 1000]
}

In [83]:
g = GridSearchCV(xgb, xg_grid, scoring="accuracy", cv=5) # Perform grid search with 5-fold cross-validation
g.fit(X_train, Y_train) # Fit the model
print("XGBoost Classifier")
print("Best Params : ", g.best_estimator_) # Display the best parameters
print("Tuned Accuracy : ", g.best_score_) # Display the tuned accuracy
accuracy_score(g.best_estimator_.predict(X_test), Y_test) # Evaluate the best model on the test set

XGBoost Classifier
Best Params :  XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=1, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=7, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=100, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...)
Tuned Accuracy :  0.8159164778883088


0.88268156424581

In [84]:
results_df
results_df["Survived"] = g.best_estimator_.predict(X_test_trf) # Add predictions to the results DataFrame
results_df.to_csv("results.csv", index=False) # Save the results file as CSV