# **Imports**

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.metrics import accuracy_score
from sklearn.ensemble import VotingClassifier
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt

## **Functions for preprocessing**

In [None]:
def get_split(df, columns=[], splitstr="None", n=0, col_names=[]):
    """Split string columns on a separator and store the pieces as new columns.

    Args:
        df: DataFrame to modify in place.
        columns: Names of the string columns to split.
        splitstr: Separator passed to ``Series.str.split``.
        n: Maximum number of leading pieces to keep.
        col_names: Destination column names; piece ``i`` goes to ``col_names[i]``.

    Returns:
        The same ``df`` object with the destination columns assigned.
    """
    for source in columns:
        pieces = df[source].str.split(splitstr, expand=True)
        # Keep only as many pieces as were requested, produced and named.
        usable = min(n, len(pieces.columns), len(col_names))
        for position in range(usable):
            df[col_names[position]] = pieces[position]
    return df

In [None]:
def add_feature_sum(df, new_feature=None, col_names=()):
    """Add a feature holding the row-wise sum of the given columns.

    Args:
        df: DataFrame to modify in place.
        new_feature: Name of the column to create. A list/tuple of names is
            also accepted, in which case every name receives the same sums.
            ``None`` or an empty list leaves ``df`` untouched.
        col_names: Columns whose values are summed for each row. When empty,
            ``df`` is left untouched.

    Returns:
        The same ``df`` object.
    """
    source_cols = list(col_names)
    if new_feature is None or not source_cols:
        return df

    if isinstance(new_feature, (list, tuple)):
        targets = list(new_feature)
    else:
        targets = [new_feature]

    # Sum across columns (axis=1) so each row gets its own total, rather than
    # broadcasting the grand total of a single column to every row.
    row_totals = df[source_cols].sum(axis=1)
    for target in targets:
        df[target] = row_totals

    return df

In [None]:
def fill_vals(df, cols_to_fill=(), fill_val=0):
    """Replace missing values in the selected columns with a constant.

    Args:
        df: DataFrame to modify in place.
        cols_to_fill: Names of the columns whose missing values are filled.
        fill_val: Value written in place of each missing entry.

    Returns:
        The same ``df`` object.
    """
    for col in cols_to_fill:
        # Assign the result back instead of calling fillna(inplace=True) on
        # the selected column: the chained in-place form acts on a temporary
        # object and does not update ``df`` under pandas Copy-on-Write.
        df[col] = df[col].fillna(fill_val)

    return df

In [None]:
def fill_missing_with_lr(df):
    """Impute missing values with per-column linear regressions.

    Every column that contains at least one missing value is treated as a
    target. The predictors are the remaining columns, i.e. those without any
    missing value. For each target, a ``LinearRegression`` is fitted on the
    rows where the target is present and used to predict the rows where it is
    missing.

    Args:
        df: Input DataFrame; it is not modified.

    Returns:
        A copy of ``df`` with the missing entries replaced by predictions.
    """
    df_filled = df.copy()
    target_cols = df.columns[df.isnull().any()].tolist()
    if not target_cols:
        return df_filled

    # Predictor columns never change while imputing, so build them once.
    features = df_filled.drop(columns=target_cols)

    for col in target_cols:
        missing = df_filled[col].isnull()
        known = ~missing

        # One fit per target column; all missing rows are predicted in a
        # single call. Passing DataFrames to both fit and predict keeps the
        # feature names consistent.
        model = LinearRegression()
        model.fit(features[known], df_filled.loc[known, col])
        df_filled.loc[missing, col] = model.predict(features[missing])

    return df_filled

In [None]:
def one_hot_encoder(df, columns=[], drop_first=False):
    """One-hot encode the given columns, replacing each with indicator columns.

    Args:
        df: Input DataFrame; a copy is encoded, the input is left as is.
        columns: Names of the columns to encode, processed in order.
        drop_first: Forwarded to ``pd.get_dummies``.

    Returns:
        A new DataFrame where each encoded column is removed and its integer
        indicator columns (prefixed with the column name) are appended.
    """
    encoded = df.copy()
    for name in columns:
        indicator_cols = pd.get_dummies(
            encoded[name], prefix=name, drop_first=drop_first, dtype=int
        )
        remaining = encoded.drop(columns=name)
        encoded = pd.concat([remaining, indicator_cols], axis=1)
    return encoded

In [None]:
def bool_encoder(df, columns=()):
    """Encode ``True``/``False`` values in the given columns as ``1``/``0``.

    Args:
        df: DataFrame to modify in place.
        columns: Names of the columns to encode.

    Returns:
        The same ``df`` object.
    """
    for column in columns:
        values = df[column]
        if pd.api.types.is_bool_dtype(values):
            # Pure boolean column: a plain cast yields an integer column.
            df[column] = values.astype(int)
        else:
            # Mixed column: only True/False entries are replaced. The result
            # is assigned back because an in-place replace on the selected
            # column does not update ``df`` under pandas Copy-on-Write.
            df[column] = values.replace({True: 1, False: 0})

    return df

In [None]:
def map_cat(df, col, mapping):
    """Replace the values of one column through ``Series.map``.

    Args:
        df: DataFrame to modify in place.
        col: Name of the column to transform.
        mapping: Mapping (or callable) handed to ``Series.map``.

    Returns:
        The same ``df`` object with ``col`` overwritten by the mapped values.
    """
    translated = df[col].map(mapping)
    df[col] = translated
    return df

In [None]:
def cols_to_drop(df, cols_to_drop=[]):
    """Remove the listed columns from ``df`` one at a time, in place.

    Args:
        df: DataFrame to modify in place.
        cols_to_drop: Column labels to remove, dropped in the given order.

    Returns:
        The same ``df`` object without the dropped columns.
    """
    for label in cols_to_drop:
        df.drop(columns=label, inplace=True)
    return df

In [None]:
def discretize_data(df, cols_to_discretize, n_bins=4):
    """Bin the selected columns with ``KBinsDiscretizer`` (ordinal encoding).

    Args:
        df: Input DataFrame; a copy is transformed, the input is left as is.
        cols_to_discretize: Names of the columns to bin, each fitted separately.
        n_bins: Number of bins requested from the discretizer.

    Returns:
        A new DataFrame in which every selected column holds the output of
        ``fit_transform`` for that column.
    """
    binned = df.copy()
    for name in cols_to_discretize:
        # A fresh discretizer per column, fitted on that column alone.
        binner = KBinsDiscretizer(n_bins=n_bins, encode='ordinal')
        single_column = binned[[name]]
        binned[name] = binner.fit_transform(single_column)
    return binned

# **One-step DataFrame preprocessing**

In [None]:
def combined_preprocessing(
    df,
    split_cols_config=None,
    add_feature_config=None,
    fill_vals_config=None,
    fill_missing_with_lr_enabled=False,
    one_hot_encoder_config=None,
    bool_encoder_config=None,
    map_cat_config=None,
    cols_to_drop_config=None
):
    """Run the optional preprocessing steps on ``df`` in a fixed order.

    A step runs only when its configuration is truthy. The order is: split
    string columns, add a row-wise sum feature, fill missing values with a
    constant, impute remaining missing values by linear regression, one-hot
    encode, encode booleans as 0/1, map categories, drop columns.

    Args:
        df: DataFrame to preprocess.
        split_cols_config: Dict with keys ``columns``, ``splitstr``, ``n``,
            ``col_names`` for the split step.
        add_feature_config: Dict with keys ``new_feature``, ``col_names``.
        fill_vals_config: Dict with keys ``cols_to_fill``, ``fill_val``.
        fill_missing_with_lr_enabled: Whether to run regression imputation.
        one_hot_encoder_config: Dict with keys ``columns``, ``drop_first``.
        bool_encoder_config: Dict with key ``columns``.
        map_cat_config: Iterable of dicts, each with keys ``col``, ``mapping``.
        cols_to_drop_config: Dict with key ``cols_to_drop``.

    Returns:
        The preprocessed DataFrame. The split, sum, constant-fill, boolean,
        mapping and drop steps modify the frame they receive in place. The
        regression-imputation and one-hot steps work on a copy, so the
        returned object is the input frame only if neither of those ran.
    """

    def _get_split(df, columns=(), splitstr="None", n=0, col_names=()):
        # Store the first pieces of each split column under the given names.
        for column in columns:
            split_cols = df[column].str.split(splitstr, expand=True)
            keep = min(n, len(split_cols.columns), len(col_names))
            for i in range(keep):
                df[col_names[i]] = split_cols[i]
        return df

    def _add_feature_sum(df, new_feature=None, col_names=()):
        # Row-wise sum of the listed columns stored as a new feature.
        if new_feature is not None:
            df[new_feature] = df[list(col_names)].sum(axis=1)
        return df

    def _fill_vals(df, cols_to_fill=(), fill_val=0):
        # Assign back rather than fillna(inplace=True) on the selected column,
        # which does not update the frame under pandas Copy-on-Write.
        for col in cols_to_fill:
            df[col] = df[col].fillna(fill_val)
        return df

    def _fill_missing_with_lr(df):
        # Columns with missing values are targets; columns without any are
        # the predictors. One regression is fitted per target column.
        df_filled = df.copy()
        target_cols = df.columns[df.isnull().any()].tolist()
        if not target_cols:
            return df_filled
        features = df_filled.drop(columns=target_cols)
        for col in target_cols:
            missing = df_filled[col].isnull()
            known = ~missing
            model = LinearRegression()
            model.fit(features[known], df_filled.loc[known, col])
            df_filled.loc[missing, col] = model.predict(features[missing])
        return df_filled

    def _one_hot_encoder(df, columns=(), drop_first=False):
        # Replace each listed column by its integer indicator columns.
        df = df.copy()
        for column in columns:
            dummies = pd.get_dummies(df[column], prefix=column, drop_first=drop_first, dtype=int)
            df = pd.concat([df.drop(columns=column), dummies], axis=1)
        return df

    def _bool_encoder(df, columns=()):
        # True/False -> 1/0, assigned back so the frame is really updated.
        for column in columns:
            values = df[column]
            if pd.api.types.is_bool_dtype(values):
                df[column] = values.astype(int)
            else:
                df[column] = values.replace({True: 1, False: 0})
        return df

    def _map_cat(df, col, mapping):
        # Translate one column through Series.map.
        df[col] = df[col].map(mapping)
        return df

    def _cols_to_drop(df, cols_to_drop=()):
        # Drop all listed columns in a single call.
        df.drop(list(cols_to_drop), axis=1, inplace=True)
        return df

    # Apply each preprocessing step based on the configurations passed
    if split_cols_config:
        df = _get_split(df, **split_cols_config)

    if add_feature_config:
        df = _add_feature_sum(df, **add_feature_config)

    if fill_vals_config:
        df = _fill_vals(df, **fill_vals_config)

    if fill_missing_with_lr_enabled:
        df = _fill_missing_with_lr(df)

    if one_hot_encoder_config:
        df = _one_hot_encoder(df, **one_hot_encoder_config)

    if bool_encoder_config:
        df = _bool_encoder(df, **bool_encoder_config)

    if map_cat_config:
        for config in map_cat_config:
            df = _map_cat(df, **config)

    if cols_to_drop_config:
        df = _cols_to_drop(df, **cols_to_drop_config)

    return df


# **Models training**

In [None]:
# Target is the "Transported" column; all remaining columns of `df` are features.
y = df["Transported"]
X = df.drop(columns="Transported")

In [None]:
# Feature/target split of `dsc_fin`, for the models that use discretized data.
y_d = dsc_fin["Transported"]
X_d = dsc_fin.drop(columns="Transported")

In [None]:
# L1-penalised logistic regression (liblinear solver) fitted on X.
model_1 = LogisticRegression(
    penalty="l1",
    solver="liblinear",
    max_iter=200,
).fit(X, y)
# Predictions are computed on the same rows the model was fitted on.
y_pred_m1 = model_1.predict(X)

In [None]:
# Same logistic-regression configuration as model_1, fitted on X_d instead.
model_2 = LogisticRegression(
    penalty="l1",
    solver="liblinear",
    max_iter=200,
).fit(X_d, y_d)
# Predictions are computed on the same rows the model was fitted on.
y_pred_m2 = model_2.predict(X_d)


In [None]:
# Fixed random-forest hyperparameters: each entry is a single value that is
# unpacked straight into the constructor (no grid search happens here).
param_grid = dict(
    n_estimators=200,
    max_features='sqrt',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=2,
    bootstrap=True,
    criterion='entropy',
)

model_3 = RandomForestClassifier(random_state=42, **param_grid).fit(X, y)

# Predictions on the rows used for fitting.
y_pred_m3 = model_3.predict(X)

In [None]:
# Random forest used as the base learner of the AdaBoost model on X.
rf = RandomForestClassifier(n_estimators=200, max_depth=None, min_samples_split=2, min_samples_leaf=2,
                            max_features='sqrt', bootstrap=True, criterion='entropy', random_state=42)
# NOTE(review): AdaBoost is documented to fit clones of the supplied estimator,
# so this standalone fit presumably does not influence model_4 - confirm
# before removing it.
rf.fit(X, y)
# Newer scikit-learn releases name this parameter `estimator`; the old
# `base_estimator` keyword was deprecated and later removed. Fall back to the
# old keyword only when the installed version rejects the new one.
try:
    model_4 = AdaBoostClassifier(estimator=rf, n_estimators=50, random_state=42)
except TypeError:
    model_4 = AdaBoostClassifier(base_estimator=rf, n_estimators=50, random_state=42)
model_4.fit(X, y)
# Predictions on the rows used for fitting.
y_pred_ada_nd = model_4.predict(X)

In [None]:
# Random forest used as the base learner of the AdaBoost model on X_d.
rf = RandomForestClassifier(n_estimators=200, max_depth=None, min_samples_split=2, min_samples_leaf=2,
                            max_features='sqrt', bootstrap=True, criterion='entropy', random_state=42)
# NOTE(review): AdaBoost is documented to fit clones of the supplied estimator,
# so this standalone fit presumably does not influence model_5 - confirm
# before removing it.
rf.fit(X_d, y_d)

# Newer scikit-learn releases name this parameter `estimator`; the old
# `base_estimator` keyword was deprecated and later removed. Fall back to the
# old keyword only when the installed version rejects the new one.
try:
    model_5 = AdaBoostClassifier(estimator=rf, n_estimators=50, random_state=42)
except TypeError:
    model_5 = AdaBoostClassifier(base_estimator=rf, n_estimators=50, random_state=42)
model_5.fit(X_d, y_d)
# Predictions on the rows used for fitting.
y_pred_ada_d = model_5.predict(X_d)

In [None]:
# Random forest fitted on X_d / y_d.
model_6 = RandomForestClassifier(
    n_estimators=200,
    criterion='entropy',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=2,
    max_features='sqrt',
    bootstrap=True,
    random_state=42,
).fit(X_d, y_d)
# Predictions on the rows used for fitting.
y_pred_rf_d = model_6.predict(X_d)

In [None]:
# Random forest with the same hyperparameters as model_6, fitted on X / y.
model_7 = RandomForestClassifier(n_estimators=200, max_depth=None, min_samples_split=2, min_samples_leaf=2,
                            max_features='sqrt', bootstrap=True, criterion='entropy', random_state=42)
model_7.fit(X, y)
# Stored under the `_nd` suffix (as in y_pred_ada_nd) so these predictions do
# not overwrite `y_pred_rf_d`, which holds model_6's predictions on X_d.
y_pred_rf_nd = model_7.predict(X)

In [None]:
# NOTE(review): this forest is fitted on X_d but is not referenced by the
# gradient-boosting model below - confirm whether it is still needed.
rf = RandomForestClassifier(
    n_estimators=200,
    criterion='entropy',
    max_depth=9,
    min_samples_split=2,
    min_samples_leaf=2,
    max_features='sqrt',
    bootstrap=True,
    random_state=42,
).fit(X_d, y_d)

# Gradient boosting fitted on X_d / y_d.
model_8 = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
).fit(X_d, y_d)
# Predictions on the rows used for fitting.
y_pred_gb_d = model_8.predict(X_d)

In [None]:
# NOTE(review): this forest is fitted on X but is not referenced by the
# gradient-boosting model below - confirm whether it is still needed.
rf = RandomForestClassifier(n_estimators=200, max_depth=9, min_samples_split=2, min_samples_leaf=2,
                            max_features='sqrt', bootstrap=True, criterion='entropy', random_state=42)
rf.fit(X, y)

# Gradient boosting fitted on X / y.
model_9 = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
model_9.fit(X, y)
# Stored under the `_nd` suffix (as in y_pred_ada_nd) so these predictions do
# not overwrite `y_pred_gb_d`, which holds model_8's predictions on X_d.
y_pred_gb_nd = model_9.predict(X)

# **Ensemble model with Voting classifier**

In [None]:
# Hard-voting ensemble over the nine models defined above, keyed by a short
# label for each.
# NOTE(review): some of these models were built for X_d and others for X;
# verify which data the ensemble is expected to be fitted on.
ensemble_model = VotingClassifier(
    voting='hard',
    estimators=list({
        'log_reg': model_1,
        'log_reg_dsc': model_2,
        'rf': model_3,
        'ada_boost_nd': model_4,
        'ada_boost_d': model_5,
        'rf_dsc': model_6,
        'rf_nd': model_7,
        'gb_dsc': model_8,
        'gb_nd': model_9,
    }.items()),
)

In [None]:
# Fit the ensemble on X / y and predict the same rows; the score below is
# therefore accuracy on the data used for fitting.
y_pred = ensemble_model.fit(X, y).predict(X)
accuracy_score(y_true=y, y_pred=y_pred)

In [None]:
# Predict with the fitted ensemble on `df_kaggle` (defined outside this
# section), then cast the predicted labels to booleans.
submission_pred = ensemble_model.predict(df_kaggle)
submission = submission_pred.astype(bool)

In [None]:

# Pair each PassengerId from `new_submission` with its boolean prediction.
transported_predictions = submission
passenger_ids = new_submission['PassengerId'].values

sub_df = pd.DataFrame(
    dict(PassengerId=passenger_ids, Transported=transported_predictions)
)

# Quick look at the first rows before exporting.
print(sub_df.head())

In [None]:
# Write the submission frame to CSV without the DataFrame index column.
sub_df.to_csv("ensemble_model_submission.csv", index = False)