In [132]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import TargetEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

In [133]:
train_df = pd.read_csv("../dataset/train.csv")
train_labels = train_df.pop('Transported')

In [134]:
train_df.drop(["Name"], axis=1, inplace=True)

In [135]:
train_df[['deck', 'num', 'side']] = train_df['Cabin'].str.extract(r'(\w)(\d+)(\w)')

In [136]:
train_df.drop(["Cabin"], axis=1, inplace=True)

In [137]:
categorical_cols = train_df.select_dtypes(["bool_", "object_"]).columns

numeric_cols = train_df.select_dtypes(exclude=["bool_", "object_"]).columns

In [138]:
categorical_cols = categorical_cols.drop("PassengerId")

In [139]:
encoder = TargetEncoder()
train_df[categorical_cols] = encoder.fit_transform(train_df[categorical_cols], train_labels)

In [140]:
# encoder = OneHotEncoder(sparse=False)
# encoded_cols = encoder.fit_transform(train_df[categorical_cols])
# encoded_df = pd.DataFrame(encoded_cols, columns=encoder.get_feature_names_out(categorical_cols), index=train_df.index)

# # Replace the original categorical columns in the DataFrame with the encoded columns
# train_df.drop(columns=categorical_cols, inplace=True)  # Drop original categorical columns
# train_df[encoded_df.columns] = encoded_df  # Assign encoded columns to DataFrame

# train_df.head()

In [141]:
train_df.isna().sum().sum()

1122

In [142]:
iterative_imputer = IterativeImputer()
train_df[numeric_cols] = pd.DataFrame(iterative_imputer.fit_transform(train_df[numeric_cols]), columns= numeric_cols)

interpolation
fwrd bkwrd fill

In [143]:
categorical_imputer = SimpleImputer(strategy= "most_frequent")
train_df[categorical_cols] = pd.DataFrame(categorical_imputer.fit_transform(train_df[categorical_cols]), columns= categorical_cols)

In [144]:
train_df.isna().sum().sum()

0

In [145]:
train_df["group"] = train_df["PassengerId"].str.split("_").str[0] 
train_df["group"] = pd.to_numeric(train_df["group"]) 

In [146]:
train_df.drop("PassengerId", axis= 1, inplace= True)

In [147]:
scaler = StandardScaler()
new_col_names = [col + "_scaled" for col in numeric_cols]

train_df[new_col_names] = scaler.fit_transform(train_df[numeric_cols]) 

View the scaled columns

PCA

In [148]:
mi_scores = mutual_info_classif(train_df, train_labels)
mi_scores = pd.Series(mi_scores, name="MI Scores", index=train_df.columns)
mi_scores = mi_scores.sort_values(ascending=False)
mi_scores

CryoSleep              0.112680
Spa                    0.080086
Spa_scaled             0.077050
RoomService_scaled     0.076004
RoomService            0.070961
VRDeck_scaled          0.066448
VRDeck                 0.066004
ShoppingMall_scaled    0.055964
ShoppingMall           0.049831
FoodCourt              0.046847
FoodCourt_scaled       0.045930
group                  0.023194
HomePlanet             0.021801
Age                    0.015640
Age_scaled             0.014939
num                    0.012624
Destination            0.010089
VIP                    0.000259
side                   0.000000
deck                   0.000000
Name: MI Scores, dtype: float64

In [149]:
train_df.drop(["Destination", "VIP"], axis= 1, inplace= True)

crossfold validation -> pemilihan model terbaik

In [150]:
X_train, X_valid, y_train, y_valid = train_test_split(train_df, train_labels, train_size = 0.9)

In [151]:
from sklearn.model_selection import cross_val_score, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import numpy as np

# Define the number of folds for cross-validation
num_folds = 5

# Define the cross-validation method
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Define a range of hyperparameters to search
n_estimators_range = [100, 500, 1000]  # Example range for number of trees
# Add more hyperparameters like max_depth, min_samples_split, etc., as needed

# Store the mean cross-validation scores for each hyperparameter setting
cv_scores_mean = []

for n_estimators in n_estimators_range:
    # Create a Random Forest classifier with the current hyperparameter settings
    rf_classifier = RandomForestClassifier(n_estimators=n_estimators)
    
    # Perform cross-validation
    cv_scores = cross_val_score(rf_classifier, X_train, y_train, cv=kf, scoring='accuracy')
    
    # Calculate the mean cross-validation score
    cv_scores_mean.append(np.mean(cv_scores))

# Find the hyperparameters with the highest mean cross-validation score
best_n_estimators = n_estimators_range[np.argmax(cv_scores_mean)]

# Train the final Random Forest classifier using the best hyperparameters
final_rf_classifier = RandomForestClassifier(n_estimators=best_n_estimators)
final_rf_classifier.fit(X_train, y_train)

# Evaluate the final classifier on the validation set
validation_accuracy = accuracy_score(y_valid, final_rf_classifier.predict(X_valid))
print("Validation Accuracy with Best Hyperparameters:", validation_accuracy)

Validation Accuracy with Best Hyperparameters: 0.78953421506613


In [152]:
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np

# Define the number of folds for cross-validation
num_folds = 5

# Define the cross-validation method
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Define a range of hyperparameters to search
max_iter_range = [1000, 2000, 3000]  # Example range for maximum iterations
# You can add more hyperparameters like regularization strength (C), penalty type (l1 or l2), etc., as needed

# Store the mean cross-validation scores for each hyperparameter setting
cv_scores_mean = []

for max_iter in max_iter_range:
    # Create a Logistic Regression classifier with the current hyperparameter settings
    lr_classifier = LogisticRegression(max_iter=max_iter, random_state=42)
    
    # Perform cross-validation
    cv_scores = cross_val_score(lr_classifier, X_train, y_train, cv=kf, scoring='accuracy')
    
    # Calculate the mean cross-validation score
    cv_scores_mean.append(np.mean(cv_scores))

# Find the hyperparameters with the highest mean cross-validation score
best_max_iter = max_iter_range[np.argmax(cv_scores_mean)]

# Train the final Logistic Regression classifier using the best hyperparameters
final_lr_classifier = LogisticRegression(max_iter=best_max_iter, random_state=42)
final_lr_classifier.fit(X_train, y_train)

# Evaluate the final classifier on the validation set
validation_accuracy = accuracy_score(y_valid, final_lr_classifier.predict(X_valid))
print("Validation Accuracy with Best Hyperparameters:", validation_accuracy)


Validation Accuracy with Best Hyperparameters: 0.7935595169637722


In [153]:
from sklearn.model_selection import cross_val_score, KFold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
import numpy as np

# Define the number of folds for cross-validation
num_folds = 5

# Define the cross-validation method
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Define a range of hyperparameters to search
n_estimators_range = [100, 200, 300, ]  # Example range for the number of estimators
# You can add more hyperparameters like learning rate, max_depth, etc., as needed

# Store the mean cross-validation scores for each hyperparameter setting
cv_scores_mean = []

for n_estimators in n_estimators_range:
    # Create a Gradient Boosting Classifier with the current hyperparameter settings
    gbc_classifier = GradientBoostingClassifier(n_estimators=n_estimators)
    
    # Perform cross-validation
    cv_scores = cross_val_score(gbc_classifier, X_train, y_train, cv=kf, scoring='accuracy')
    
    # Calculate the mean cross-validation score
    cv_scores_mean.append(np.mean(cv_scores))

# Find the hyperparameters with the highest mean cross-validation score
best_n_estimators = n_estimators_range[np.argmax(cv_scores_mean)]

# Train the final Gradient Boosting Classifier using the best hyperparameters
final_gbc_classifier = GradientBoostingClassifier(n_estimators=best_n_estimators)
final_gbc_classifier.fit(X_train, y_train)

# Evaluate the final classifier on the validation set
validation_accuracy = accuracy_score(y_valid, final_gbc_classifier.predict(X_valid))
print("Validation Accuracy with Best Hyperparameters:", validation_accuracy)


Validation Accuracy with Best Hyperparameters: 0.79700977573318


In [154]:
from xgboost import XGBClassifier

class CustomXGBClassifier(XGBClassifier):
    
    def __init__(self, **params):
        
        super().__init__(**params)
        self.eval_set = params['eval_set']
    
    def fit(self, X, y):
        super().fit(X, y, eval_set=self.eval_set, verbose=100) 

In [155]:
# Define the number of folds for cross-validation
num_folds = 5

# Define the cross-validation method
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Define a range of hyperparameters to search
n_estimators_range = [100, 200, 300]  # Example range for the number of estimators
learning_rate_range = [0.01, 0.1, 0.2]  # Example range for the learning rate
# Add more hyperparameters as needed

# Store the mean cross-validation scores for each hyperparameter setting
cv_scores_mean = []

for n_estimators in n_estimators_range:
    for learning_rate in learning_rate_range:
        # Create a Custom XGBoost Classifier with the current hyperparameter settings
        xgb_classifier = CustomXGBClassifier(n_estimators=n_estimators, learning_rate=learning_rate, eval_set=[(X_valid, y_valid)], early_stopping_rounds=40)
        
        # Perform cross-validation
        cv_scores = cross_val_score(xgb_classifier, X_train, y_train, cv=kf, scoring='accuracy')
        
        # Calculate the mean cross-validation score
        cv_scores_mean.append(np.mean(cv_scores))

# Find the hyperparameters with the highest mean cross-validation score
best_params_index = np.argmax(cv_scores_mean)
best_n_estimators = n_estimators_range[best_params_index // len(learning_rate_range)]
best_learning_rate = learning_rate_range[best_params_index % len(learning_rate_range)]

# Train the final Custom XGBoost Classifier using the best hyperparameters
final_xgb_classifier = CustomXGBClassifier(n_estimators=best_n_estimators, learning_rate=best_learning_rate, eval_set=[(X_valid, y_valid)], early_stopping_rounds=40)
final_xgb_classifier.fit(X_train, y_train)

# Evaluate the final classifier on the validation set
validation_accuracy = accuracy_score(y_valid, final_xgb_classifier.predict(X_valid))
print("Validation Accuracy with Best Hyperparameters:", validation_accuracy)

[0]	validation_0-logloss:0.68942


Parameters: { "eval_set" } are not used.



[99]	validation_0-logloss:0.49863
[0]	validation_0-logloss:0.68938


Parameters: { "eval_set" } are not used.



[99]	validation_0-logloss:0.49841
[0]	validation_0-logloss:0.68922


Parameters: { "eval_set" } are not used.



[99]	validation_0-logloss:0.49846
[0]	validation_0-logloss:0.68993


Parameters: { "eval_set" } are not used.



[99]	validation_0-logloss:0.50412
[0]	validation_0-logloss:0.68940


Parameters: { "eval_set" } are not used.



[99]	validation_0-logloss:0.49694
[0]	validation_0-logloss:0.65602


Parameters: { "eval_set" } are not used.



[99]	validation_0-logloss:0.43588
[0]	validation_0-logloss:0.65515
[79]	validation_0-logloss:0.43897


Parameters: { "eval_set" } are not used.

Parameters: { "eval_set" } are not used.



[0]	validation_0-logloss:0.65517
[75]	validation_0-logloss:0.43605
[0]	validation_0-logloss:0.65638
[85]	validation_0-logloss:0.44342


Parameters: { "eval_set" } are not used.



[0]	validation_0-logloss:0.65500
[81]	validation_0-logloss:0.43544


Parameters: { "eval_set" } are not used.

Parameters: { "eval_set" } are not used.



[0]	validation_0-logloss:0.62303
[71]	validation_0-logloss:0.44448
[0]	validation_0-logloss:0.62122
[62]	validation_0-logloss:0.45502
[0]	validation_0-logloss:0.62146


Parameters: { "eval_set" } are not used.

Parameters: { "eval_set" } are not used.



[61]	validation_0-logloss:0.43628
[0]	validation_0-logloss:0.62318
[68]	validation_0-logloss:0.44952
[0]	validation_0-logloss:0.62090


Parameters: { "eval_set" } are not used.

Parameters: { "eval_set" } are not used.



[66]	validation_0-logloss:0.44384
[0]	validation_0-logloss:0.68942


Parameters: { "eval_set" } are not used.



[100]	validation_0-logloss:0.49780
[199]	validation_0-logloss:0.45466
[0]	validation_0-logloss:0.68938


Parameters: { "eval_set" } are not used.



[100]	validation_0-logloss:0.49756
[199]	validation_0-logloss:0.45199
[0]	validation_0-logloss:0.68922
[100]	validation_0-logloss:0.49763


Parameters: { "eval_set" } are not used.



[199]	validation_0-logloss:0.45027
[0]	validation_0-logloss:0.68993
[100]	validation_0-logloss:0.50327


Parameters: { "eval_set" } are not used.



[199]	validation_0-logloss:0.45826
[0]	validation_0-logloss:0.68940


Parameters: { "eval_set" } are not used.



[100]	validation_0-logloss:0.49615
[199]	validation_0-logloss:0.45038
[0]	validation_0-logloss:0.65602
[100]	validation_0-logloss:0.43544


Parameters: { "eval_set" } are not used.



[121]	validation_0-logloss:0.43900
[0]	validation_0-logloss:0.65515
[79]	validation_0-logloss:0.43897
[0]	validation_0-logloss:0.65517


Parameters: { "eval_set" } are not used.

Parameters: { "eval_set" } are not used.



[76]	validation_0-logloss:0.43621
[0]	validation_0-logloss:0.65638
[85]	validation_0-logloss:0.44342


Parameters: { "eval_set" } are not used.



[0]	validation_0-logloss:0.65500
[81]	validation_0-logloss:0.43544


Parameters: { "eval_set" } are not used.

Parameters: { "eval_set" } are not used.



[0]	validation_0-logloss:0.62303
[72]	validation_0-logloss:0.44448
[0]	validation_0-logloss:0.62122
[62]	validation_0-logloss:0.45502
[0]	validation_0-logloss:0.62146


Parameters: { "eval_set" } are not used.

Parameters: { "eval_set" } are not used.



[61]	validation_0-logloss:0.43628
[0]	validation_0-logloss:0.62318
[69]	validation_0-logloss:0.44922
[0]	validation_0-logloss:0.62090

Parameters: { "eval_set" } are not used.

Parameters: { "eval_set" } are not used.




[65]	validation_0-logloss:0.44434
[0]	validation_0-logloss:0.68942


Parameters: { "eval_set" } are not used.



[100]	validation_0-logloss:0.49780
[200]	validation_0-logloss:0.45443
[299]	validation_0-logloss:0.44179
[0]	validation_0-logloss:0.68938


Parameters: { "eval_set" } are not used.



[100]	validation_0-logloss:0.49756
[200]	validation_0-logloss:0.45178
[299]	validation_0-logloss:0.43827
[0]	validation_0-logloss:0.68922
[100]	validation_0-logloss:0.49763

Parameters: { "eval_set" } are not used.




[200]	validation_0-logloss:0.44999
[299]	validation_0-logloss:0.43642
[0]	validation_0-logloss:0.68993


Parameters: { "eval_set" } are not used.



[100]	validation_0-logloss:0.50327
[200]	validation_0-logloss:0.45799
[299]	validation_0-logloss:0.44461
[0]	validation_0-logloss:0.68940


Parameters: { "eval_set" } are not used.



[100]	validation_0-logloss:0.49615
[200]	validation_0-logloss:0.45016
[299]	validation_0-logloss:0.43545
[0]	validation_0-logloss:0.65602
[100]	validation_0-logloss:0.43544


Parameters: { "eval_set" } are not used.



[122]	validation_0-logloss:0.43896
[0]	validation_0-logloss:0.65515
[79]	validation_0-logloss:0.43897
[0]	validation_0-logloss:0.65517


Parameters: { "eval_set" } are not used.

Parameters: { "eval_set" } are not used.



[75]	validation_0-logloss:0.43605
[0]	validation_0-logloss:0.65638
[86]	validation_0-logloss:0.44353


Parameters: { "eval_set" } are not used.



[0]	validation_0-logloss:0.65500
[81]	validation_0-logloss:0.43544


Parameters: { "eval_set" } are not used.

Parameters: { "eval_set" } are not used.



[0]	validation_0-logloss:0.62303
[71]	validation_0-logloss:0.44448
[0]	validation_0-logloss:0.62122
[62]	validation_0-logloss:0.45502
[0]	validation_0-logloss:0.62146


Parameters: { "eval_set" } are not used.

Parameters: { "eval_set" } are not used.



[61]	validation_0-logloss:0.43628
[0]	validation_0-logloss:0.62318
[69]	validation_0-logloss:0.44922
[0]	validation_0-logloss:0.62090


Parameters: { "eval_set" } are not used.

Parameters: { "eval_set" } are not used.



[66]	validation_0-logloss:0.44384
[0]	validation_0-logloss:0.62040
[78]	validation_0-logloss:0.44769
Validation Accuracy with Best Hyperparameters: 0.7901092581943646


Parameters: { "eval_set" } are not used.



In [156]:
from sklearn.ensemble import VotingClassifier

model = VotingClassifier(
    estimators = [
        ('xgb', final_xgb_classifier),
        ('rf', final_rf_classifier),
        ('lr', final_rf_classifier),
        ('gbc', final_gbc_classifier)
    ],
    voting = 'soft'
)

model.fit(X_train, y_train)

print(accuracy_score(y_valid, model.predict(X_valid)))

[0]	validation_0-logloss:0.62040
[77]	validation_0-logloss:0.44759


Parameters: { "eval_set" } are not used.



0.7929844738355377


**Submit**

In [157]:
test_df = pd.read_csv("../dataset/test.csv")

In [158]:
test_df[['deck', 'num', 'side']] = test_df['Cabin'].str.extract(r'(\w)(\d+)(\w)')

In [159]:
test_df.drop(["Cabin"], axis=1, inplace=True)

In [160]:
test_df[categorical_cols] = encoder.transform(test_df[categorical_cols])

In [161]:
test_df[numeric_cols] = pd.DataFrame(iterative_imputer.transform(test_df[numeric_cols]), columns= numeric_cols)
test_df[categorical_cols] = pd.DataFrame(categorical_imputer.transform(test_df[categorical_cols]), columns= categorical_cols)

In [162]:
test_df["group"] = test_df["PassengerId"].str.split("_").str[0]
test_df["group"] = pd.to_numeric(test_df["group"])

In [163]:
test_df[new_col_names] = scaler.transform(test_df[numeric_cols]) 

In [164]:
preds = [x == 1 for x in model.predict(test_df[train_df.columns])]

In [165]:
submission_df = pd.DataFrame({
    "PassengerId" : test_df["PassengerId"],
    "Transported" : preds
})

In [166]:
submission_df.to_csv("submission_iseng_xixixi2.csv", index=False)