In [21]:
# NOTE(review): `pdp` is not used in the visible cells, and the stdlib `pipes`
# module was removed in Python 3.13 — confirm whether `pdpipe` was intended.
import pipes as pdp
from functools import partial

import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import KBinsDiscretizer, OneHotEncoder, StandardScaler

In [22]:
# Seed NumPy's legacy global RNG so the synthetic data below is reproducible.
np.random.seed(0)

# Build the synthetic columns one at a time. Every draw advances the same
# global RNG, so the order of these statements determines the values.
n_rows = 1000
synthetic_columns = {}
synthetic_columns['age'] = np.random.randint(20, 70, size=n_rows)
synthetic_columns['gender'] = np.random.choice(['male', 'female'], size=n_rows)
synthetic_columns['income'] = np.random.normal(50000, 10000, size=n_rows)
synthetic_columns['education_level'] = np.random.choice(
    ['high_school', 'college', 'graduate'], size=n_rows
)
synthetic_columns['purchased'] = np.random.choice([0, 1], size=n_rows)

# Assemble the columns into the DataFrame used by the rest of the notebook.
df = pd.DataFrame(synthetic_columns)

# Preview the first rows.
df.head()


Unnamed: 0,age,gender,income,education_level,purchased
0,64,female,61494.406231,high_school,1
1,67,male,35916.732068,graduate,0
2,20,female,50963.107897,high_school,1
3,23,male,49528.705592,graduate,0
4,23,female,41040.729464,graduate,0


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

# --- Data: separate the target column and hold out 20% of the rows ---
y = df['purchased']
X = df.drop(columns='purchased')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# --- Preprocessing: one branch per column type ---
numeric_features = ['age', 'income']
categorical_features = ['gender', 'education_level']

# Numeric columns are standardised.
numeric_transformer = Pipeline([('scaler', StandardScaler())])

# Categorical columns are one-hot encoded; handle_unknown='ignore' means a
# category not seen during fit does not raise at transform time.
categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# --- Baseline model: preprocessing followed by logistic regression ---
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression()),
])

# Fit on the training split and predict the held-out rows.
predictions = clf.fit(X_train, y_train).predict(X_test)


# Search over the classifier's `C` parameter with 5-fold cross-validation on
# the training split, then keep the best-scoring value.
param_grid = {'classifier__C': [0.1, 1.0, 10, 100]}
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5).fit(X_train, y_train)
best_C = grid_search.best_params_['classifier__C']


# Numeric branch: fill missing values with the column median, then standardise.
numeric_transformer_impute_scale = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Same categorical branch as the baseline preprocessor; only the numeric
# branch differs.
preprocessor_impute_scale = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer_impute_scale, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Keep the 2 features with the highest mutual information with the target,
# then fit logistic regression on them.
# mutual_info_classif is randomised (it accepts a random_state); pinning it
# keeps the selected features, and therefore the score, stable when this cell
# is re-run, in line with the random_state=42 used for the train/test split.
# NOTE(review): with the default discrete_features='auto', one-hot columns in
# a dense matrix are scored as continuous features — confirm this is intended.
clf_kbest = Pipeline(steps=[('preprocessor', preprocessor_impute_scale),
                            ('kbest', SelectKBest(partial(mutual_info_classif, random_state=42), k=2)),
                            ('classifier', LogisticRegression())])
clf_kbest.fit(X_train, y_train)



# Discretise age into 5 bins (strategy='uniform'), encoded as ordinal bin
# indices rather than one-hot columns.
age_bin_transformer = Pipeline(steps=[
    ('age_binner', KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform'))])

# Age goes through the binner; income and the categorical columns reuse the
# baseline transformers.
preprocessor_age_bin = ColumnTransformer(
    transformers=[
        ('age', age_bin_transformer, ['age']),
        ('num', numeric_transformer, ['income']),
        ('cat', categorical_transformer, ['gender', 'education_level'])])

# The forest is seeded so that repeated runs of this cell give the same
# cross-validation scores and best parameters. Seeding only the search (below)
# fixes which candidates are tried but not how each forest is grown.
rf_clf = Pipeline(steps=[('preprocessor', preprocessor_age_bin),
                         ('classifier', RandomForestClassifier(random_state=42))])

# 3 * 4 * 3 = 36 possible combinations; the randomized search samples 10.
param_dist = {'classifier__n_estimators': [50, 100, 200],
              'classifier__max_depth': [None, 10, 20, 30],
              'classifier__min_samples_split': [2, 5, 10]}

random_search = RandomizedSearchCV(rf_clf, param_distributions=param_dist, n_iter=10, cv=5, random_state=42)
random_search.fit(X_train, y_train)
best_rf_params = random_search.best_params_

# Cell output: baseline test-set predictions, the best C from the grid search,
# the SelectKBest pipeline's test-set score, and the best random-forest
# parameters from the randomized search.
kbest_test_score = clf_kbest.score(X_test, y_test)
predictions, best_C, kbest_test_score, best_rf_params

