# In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Toy dataset: two categorical columns (color, size) and two numeric columns
# (price, quantity), ten rows in total.
data = pd.DataFrame(
    dict(
        color=['red', 'green', 'blue', 'green', 'blue', 'red', 'yellow', 'purple', 'blue', 'yellow'],
        size=['S', 'M', 'L', 'L', 'M', 'S', 'S', 'M', 'L', 'S'],
        price=[10, 20, 15, 25, 30, 12, 18, 22, 14, 20],
        quantity=[1, 2, 3, 2, 1, 4, 5, 1, 3, 2],
    )
)

# Binary class label for each row of `data`, in row order.
target = [0, 1, 1, 0, 0, 1, 1, 0, 1, 0]

# Step 1: Split the data into train and test sets (70% / 30%).
# Stratify on the target so both splits roughly preserve the class
# proportions. With only ten rows, an unstratified shuffle can leave one class
# with very few training samples, which starves the stratified
# cross-validation folds used during the grid search.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.3,
    random_state=42,  # Fixed seed so the split is reproducible
    stratify=target,
)

# Step 2: Declare which columns are numeric and which are categorical.
numerical_features = ['price', 'quantity']
categorical_features = ['color', 'size']

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode. Categories not seen during fit are ignored at transform time
# instead of raising.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Numeric columns: fill gaps with the column mean, then standardize.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Step 3: Route each column group through its own sub-pipeline.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

# Step 4: Chain preprocessing and the logistic-regression classifier. The step
# names ('preprocessor', 'classifier') are what hyperparameter keys such as
# 'classifier__C' refer to.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression()),
])

# Step 5: Define hyperparameters for GridSearchCV.
# Keys use the '<step name>__<parameter>' convention to reach into the
# pipeline's 'classifier' step.
param_grid = {
    'classifier__C': [0.1, 1, 10],  # Regularization strength
    'classifier__solver': ['lbfgs', 'liblinear'],  # Solvers to choose from
}

# Step 6: Set up GridSearchCV.
# For a classifier, an integer `cv` means stratified K-fold. Stratified folds
# need at least `n_splits` samples of every class; otherwise scikit-learn
# warns and some validation folds contain no sample of the rare class. The
# training set here is tiny, so derive the fold count from the smallest class
# (capped at 5, with the minimum of 2 that K-fold requires) instead of
# hard-coding 5.
min_class_count = int(pd.Series(y_train).value_counts().min())
cv_folds = max(2, min(5, min_class_count))

grid_search = GridSearchCV(
    pipeline,  # The pipeline to tune
    param_grid,  # Hyperparameter grid
    cv=cv_folds,  # Number of (stratified) cross-validation folds
    n_jobs=-1,  # Use all CPU cores for parallel processing
    verbose=1  # Print progress
)

# Step 7: Run the cross-validated search over the training split.
grid_search.fit(X_train, y_train)

# Step 8: Report the winning hyperparameter combination.
print("Best hyperparameters found: ", grid_search.best_params_)

# Step 9: Take the best pipeline found by the search and predict on the
# held-out test split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Step 10: Score the predictions against the true test labels.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Model accuracy: {accuracy:.4f}")
