In [None]:
from sklearn.preprocessing import FunctionTransformer,OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer,make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

In [None]:
def get_cabin_group(X):
    """Return the leading "/"-separated token of the first column of ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Only the first column (by position) is read; its values are split
        on "/" through the pandas ``.str`` accessor.

    Returns
    -------
    pandas.DataFrame
        A single-column frame (column label ``0``) holding the text that
        precedes the first "/" in each row, on the same index as ``X``.
    """
    first_column = X.iloc[:, 0]
    # expand=True spreads the split pieces over integer-labelled columns;
    # column 0 is therefore the piece before the first "/".
    pieces = first_column.str.split("/", expand=True)
    leading_piece = pieces[0]
    return leading_piece.to_frame()

In [None]:
# Target vector: the `transported` column of `data`.
y = data['transported']

# Feature frame: every column of `data` except the target and the
# `passengerid` / `name` columns.
X = data.drop(columns=['transported', 'passengerid', 'name'])

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Two-step pipeline: reduce the input column to the token before its first "/"
# (via get_cabin_group), then one-hot encode that token.
#
# No category is dropped here. With handle_unknown='ignore' a category unseen
# during fit is encoded as an all-zero row; if the first category were also
# dropped, that baseline would be encoded as all zeros too and the two could
# not be told apart (scikit-learn emits a warning in that situation).
#
# NOTE(review): no other cell visible here references cabin_pipeline — confirm
# whether it was meant to be registered in the ColumnTransformer.
cabin_pipeline = Pipeline([
    ('extract_group', FunctionTransformer(get_cabin_group)),
    ('ohe', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

In [None]:
# Labels of the training columns stored with object dtype.
cat_cols = X_train.select_dtypes(include=['object']).columns
cat_cols

In [None]:
# Labels of every training column whose dtype is NOT object.
num_cols = X_train.select_dtypes(exclude=['object']).columns
num_cols

In [None]:
# Column-wise preprocessing:
#   * 'ohe'   - one-hot encode the two listed categorical columns.
#   * 'scale' - standardise the six listed numeric columns.
#
# No category is dropped in the encoder. With handle_unknown='ignore' an
# unseen category becomes an all-zero row; dropping the first category would
# give that baseline the same all-zero encoding, making the two
# indistinguishable (scikit-learn warns about this combination).
#
# remainder='passthrough' forwards every column not listed above unchanged.
# NOTE(review): any passed-through column that still holds strings would reach
# the estimator unencoded — confirm the remaining columns are already numeric.
processor = ColumnTransformer(
    transformers=[
        (
            'ohe',
            OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
            ['homeplanet', 'destination'],
        ),
        (
            'scale',
            StandardScaler(),
            ['age', 'roomservice', 'foodcourt', 'shoppingmall', 'spa', 'vrdeck'],
        ),
    ],
    remainder='passthrough',
)

In [None]:
# Sanity check: fit the preprocessor on the training features and display the
# transformed row at position 1.
processor.fit_transform(X=X_train)[1]

In [None]:
# Baseline model: preprocessing followed by a random forest.
# The forest is seeded (same seed as the train/test split) so that a
# Restart & Run All reproduces the same fitted model and accuracy.
model_pipe = Pipeline([
    ('processor', processor),
    ('model', RandomForestClassifier(random_state=42)),
])

In [None]:
# Fit the preprocessing steps and the classifier on the training split.
model_pipe.fit(X_train, y=y_train)

In [None]:
# Predicted labels for the held-out rows.
pred = model_pipe.predict(X=X_test)

In [None]:
# Hold-out accuracy of the baseline pipeline.
# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy is
# symmetric, so the value is unchanged, but the documented order keeps this
# call consistent with order-sensitive metrics such as confusion_matrix.
accuracy_score(y_test, pred)

In [None]:
# Search space for the 'model' step of model_pipe.
# GridSearchCV accepts a list of dicts and explores each dict as its own grid,
# so every entry swaps in one estimator together with only the
# hyper-parameters that estimator actually uses.
#
# The tree-based estimators are seeded so repeated searches give the same
# scores and the same winner.
param_grid = [
    {
        'model': [LogisticRegression()],
        'model__C': [0.01, 0.1, 1, 10],
        'model__penalty': ['l2'],
        'model__solver': ['lbfgs', 'liblinear'],
        'model__max_iter': [100, 200],
    },
    {
        'model': [DecisionTreeClassifier(random_state=42)],
        'model__criterion': ['gini', 'entropy'],
        'model__max_depth': [None, 10, 20],
        'model__min_samples_split': [2, 5],
        'model__min_samples_leaf': [1, 2],
    },
    {
        'model': [RandomForestClassifier(random_state=42)],
        'model__n_estimators': [100, 200],
        'model__max_depth': [None, 10, 20],
        'model__min_samples_split': [2, 5],
        'model__min_samples_leaf': [1, 2],
        'model__max_features': ['sqrt', 'log2'],
    },
    # SVC is split per kernel: `degree` is ignored by every kernel except
    # 'poly', and `gamma` is not used by the 'linear' kernel. Crossing them in
    # a single grid would refit identical models several times over.
    {
        'model': [SVC()],
        'model__kernel': ['linear'],
        'model__C': [0.1, 1, 10],
    },
    {
        'model': [SVC()],
        'model__kernel': ['rbf'],
        'model__C': [0.1, 1, 10],
        'model__gamma': ['scale', 'auto'],
    },
    {
        'model': [SVC()],
        'model__kernel': ['poly'],
        'model__C': [0.1, 1, 10],
        'model__gamma': ['scale', 'auto'],
        'model__degree': [2, 3],
    },
]


In [None]:
# Exhaustive 5-fold cross-validated search over param_grid, scored by accuracy.
# n_jobs=-1 spreads the independent fits over all available CPU cores; it only
# affects wall-clock time, not the scores or the selected configuration.
grid_search = GridSearchCV(
    model_pipe,
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# best_estimator_ is the winning pipeline refit on the full training split;
# best_score_ is its mean cross-validated accuracy.
print("Best model:", grid_search.best_estimator_)
print("Best params:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)