In [None]:
# baseline model
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Columns with pandas "category" dtype are one-hot encoded. Every other column
# is passed through untouched, so it must already be numeric for the scaler.
categorical_columns = features_train.select_dtypes(include=["category"]).columns.tolist()

# Baseline pipeline: one-hot encode -> standardise -> class-balanced logistic regression.
# The step names ('preprocessor', 'encoder', 'scaler', 'classifier') are relied on
# by later cells (named_steps lookups and the grid-search parameter keys), so keep them.
pipeline = Pipeline([
    ('preprocessor', ColumnTransformer(
        [
            # handle_unknown='ignore': a category that appears only in the test data
            # (or only in a CV validation fold) is encoded as all zeros instead of
            # raising at transform time.
            ('encoder', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
        ],
        remainder='passthrough',
        # sparse_threshold=0 forces a dense result. With the default (0.3) the
        # transformer returns a sparse matrix whenever the one-hot block makes the
        # output sparse enough, and StandardScaler (with_mean=True) refuses to
        # centre sparse input. Trade-off: dense output uses more memory when the
        # categorical columns have many levels.
        sparse_threshold=0,
    )),
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression(random_state=42, class_weight='balanced')),
])

pipeline.fit(features_train, target_train)

# Predict on the testing data
target_pred = pipeline.predict(features_test)

# Per-class precision / recall / F1 on the held-out split
report = classification_report(target_test, target_pred)
print("Classification Report:\n", report)

In [None]:
# feature importances
# In a linear model the magnitude of a coefficient, not its sign, indicates how
# strongly a feature moves the prediction. Rank by absolute value so that large
# negative coefficients are not dropped from the top 10; the signed value is kept
# in the table so the direction of the effect stays visible.

# coef_[0] is the first row of the coefficient matrix
# (presumably the only row, i.e. a binary target - verify).
feature_importances = pipeline.named_steps['classifier'].coef_[0]
feature_names = pipeline.named_steps['preprocessor'].get_feature_names_out()
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})

# nlargest on |coefficient| yields the row labels in descending-magnitude order.
top_rows = importance_df['Importance'].abs().nlargest(10).index
importance_df = importance_df.loc[top_rows]

print("\nTop 10 Feature Importances:\n", importance_df)

In [None]:
from sklearn.model_selection import GridSearchCV

# Search space for tuning. Each key uses scikit-learn's "<step>__<parameter>"
# routing to reach a parameter of a (possibly nested) pipeline step.
param_grid = {
    'preprocessor__encoder__drop': [None, 'first'],
    'classifier__C': [0.01, 0.1, 1, 10, 100],
    'classifier__penalty': ['l2'],
    'classifier__solver': ['liblinear', 'lbfgs'],
    'classifier__max_iter': [100, 200, 300],
}

# Exhaustive search over the grid: 5-fold cross-validation, candidates ranked by F1.
grid_search = GridSearchCV(pipeline, param_grid, scoring='f1', cv=5)
grid_search.fit(features_train, target_train)

# Winning parameter combination and its cross-validated score.
best_params, best_score = grid_search.best_params_, grid_search.best_score_
print("Best Parameters:", best_params)
print("Best Score:", best_score)

# Predictions for the test features from the best pipeline found by the search.
best_estimator = grid_search.best_estimator_
target_pred = best_estimator.predict(features_test)