# Preprocessing

## Data Loading

In [None]:
import numpy as np
data = np.loadtxt('file_name.csv', delimiter=',')


In [None]:
import pandas as pd
df=pd.read_csv("file_name.csv",header=0)

## Train Test split

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X,y, random_state=0)

## Data Preparation

### Standardization

In [None]:
# minmaxscaler, standardscalar, normalizer
    """Use 
    minmaxscaler - if you have many outliers and you want to preserve the distribution
    standardscaler -  to reshape to a normal/gaussian distribution
    """

In [None]:
from sklearn.preprocessing import StandardScaler
get_names = df.columns
scaler = StandardScaler()
scaled_df = scaler.fit_transform(df)
scaled_df = pd.DataFrame(scaled_df, columns=get_names)

In [None]:
X_scaled = (X - X.mean(axis=0)) / X.std(axis=0)

In [None]:
from sklearn.preprocessing import Normalizer
scaler = Normalizer()
scaled_df = scaler.fit_transform(df)
scaled_df = pd.DataFrame(scaled_df, columns=get_names)

In [None]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
norm_df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)


### Missing Values

In [None]:
# Drop Values

cols_with_missing = [col for col in X_train.columns if X_train[col].isnull().any()]
reduced_X_train = X_train.drop(cols_with_missing, axis=1)

In [None]:
#Imputation
from sklearn.impute import SimpleImputer
my_imputer = SimpleImputer()

imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train))
imputed_X_valid = pd.DataFrame(my_imputer.transform(X_valid))

imputed_X_train.columns = X_train.columns
imputed_X_valid.columns = X_valid.columns



In [None]:
#Imputation with extension

cols_with_missing = [col for col in X_train.columns if X_train[col].isnull().any()]

from sklearn.impute import SimpleImputer
my_imputer = SimpleImputer()

imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train))
imputed_X_valid = pd.DataFrame(my_imputer.transform(X_valid))

imputed_X_train.columns = X_train.columns
imputed_X_valid.columns = X_valid.columns

for col in cols_with_missing:
    imputed_X_train[col + 'was_missing'] = X_train[col].isnull()
    imputed_X_valid[col + 'was_missing'] = X_valid[col].isnull()

### Categorical Encoding

In [None]:
# Drop Categorical column
drop_X_train = X_train.select_dtypes(exclude=['object'])

In [None]:
#Create Discrete Features
# Label encoding for categoricals
for colname in X.select_dtypes("object"):
    X[colname], _ = X[colname].factorize()

# All discrete features should now have integer dtypes (double-check this before using MI!)
discrete_features = X.dtypes == int

In [None]:
# One Hot Encoding
X_train = pd.get_dummies(X_train)
X_train, X_valid = X_train.align(X_valid, join="left", axis=1)

In [None]:
# One Hot Encoding
X_train = pd.get_dummies(df.categorical, prefix='cat')

In [None]:
s = (X_train.dtypes == 'object')
object_cols = list(s[s].index)

In [None]:
# One Hot Encoding
from sklearn.preprocessing import OneHotEncoder

oh_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
oh_cols_train = pd.DataFrame(oh_encoder.fit_transform(X_train[object_cols]))
oh_cols_train.index = X_train.index
num_X_train = X_train.drop(object_cols, axis=1)
oh_X_train = pd.concat([num_X_train, oh_cols_train], axis=1)

oh_cols_valid = pd.DataFrame(oh_encoder.transform(X_valid[object_cols]))
oh_cols_valid.index = X_valid.index
num_X_valid = X_valid.drop(object_cols, axis=1)
oh_X_valid = pd.concat([num_X_valid, oh_cols_valid], axis=1)

In [None]:
# Ordinal Encoding
#for categories more than 10

from sklearn.preprocessing import OrdinalEncoder
ordinal_encoder = OrdinalEncoder()

s = (X_train.dtypes == 'object')
object_cols = list(s[s].index)

label_X_train[object_cols] = ordinal_encoder.fit_transform(X_train[object_cols])
label_X_valid[object_cols] = ordinal_encoder.transform(X_valid[object_cols])



In [None]:

##### Selecting good labels
good_label_cols = [col for col in object_cols if set(X_valid[col]).issubset(set(X_train[col]))]
bad_label_cols = list(set(object_cols) - set(good_label_cols))

In [None]:
#### Low cardinality
low_cardinality_cols = [col for col in object_cols if X_train[col].nunique()<10]
high_cardinality_cols = list(set(object_cols) - set(low_cardinality_cols))

In [None]:
from sklearn.preprocessing import LabelEncoder
enc = LabelEncoder()
X_train[object_cols] = enc.fit_transform(X_train[object_cols])

### OverSampling

In [None]:
from imblearn.over_sampling import SMOTE
X, y = SMOTE(sampling_strategy=1, random_state=0).fit_resample(X, y)

### Mutual Information

In [None]:
# Label encoding for categoricals
for colname in X.select_dtypes("object"):
    X[colname], _ = X[colname].factorize()

# All discrete features should now have integer dtypes (double-check this before using MI!)
discrete_features = X.dtypes == int

from sklearn.feature_selection import mutual_info_regression

def make_mi_scores(X, y, discrete_features):
    mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores

mi_scores = make_mi_scores(X, y, discrete_features)

def plot_mi_scores(scores):
    scores = scores.sort_values(ascending=True)
    width = np.arange(len(scores))
    ticks = list(scores.index)
    plt.barh(width, scores)
    plt.yticks(width, ticks)
    plt.title("Mutual Information Scores")


plt.figure(dpi=100, figsize=(8, 5))
plot_mi_scores(mi_scores)

### Clustering

In [None]:
from sklearn.cluster import KMeans

# Create cluster feature
kmeans = KMeans(n_clusters=6, n_init=10, random_state=0)
X["Cluster"] = kmeans.fit_predict(X)
X["Cluster"] = X["Cluster"].astype("category")

X.head()

X_cd = kmeans.fit_transform(X_scaled)


# Label features and join to dataset
X_cd = pd.DataFrame(X_cd, columns=[f"Centroid_{i}" for i in range(X_cd.shape[1])])
X = X.join(X_cd)

### Principal Component Analysis

In [None]:
from sklearn.decomposition import PCA

# Create principal components
pca = PCA()
X_pca = pca.fit_transform(X_scaled)

# Convert to dataframe
component_names = [f"PC{i+1}" for i in range(X_pca.shape[1])]
X_pca = pd.DataFrame(X_pca, columns=component_names)

X_pca.head()

loadings = pd.DataFrame(
    pca.components_.T,  # transpose the matrix of loadings
    columns=component_names,  # so the columns are the principal components
    index=X.columns,  # and the rows are the original features
)
loadings

In [None]:
# Setup feedback system
from learntools.core import binder
binder.bind(globals())
from learntools.feature_engineering_new.ex5 import *

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor

# Set Matplotlib defaults
plt.style.use("seaborn-whitegrid")
plt.rc("figure", autolayout=True)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)


def apply_pca(X, standardize=True):
    # Standardize
    if standardize:
        X = (X - X.mean(axis=0)) / X.std(axis=0)
    # Create principal components
    pca = PCA()
    X_pca = pca.fit_transform(X)
    # Convert to dataframe
    component_names = [f"PC{i+1}" for i in range(X_pca.shape[1])]
    X_pca = pd.DataFrame(X_pca, columns=component_names)
    # Create loadings
    loadings = pd.DataFrame(
        pca.components_.T,  # transpose the matrix of loadings
        columns=component_names,  # so the columns are the principal components
        index=X.columns,  # and the rows are the original features
    )
    return pca, X_pca, loadings


def plot_variance(pca, width=8, dpi=100):
    # Create figure
    fig, axs = plt.subplots(1, 2)
    n = pca.n_components_
    grid = np.arange(1, n + 1)
    # Explained variance
    evr = pca.explained_variance_ratio_
    axs[0].bar(grid, evr)
    axs[0].set(
        xlabel="Component", title="% Explained Variance", ylim=(0.0, 1.0)
    )
    # Cumulative Variance
    cv = np.cumsum(evr)
    axs[1].plot(np.r_[0, grid], np.r_[0, cv], "o-")
    axs[1].set(
        xlabel="Component", title="% Cumulative Variance", ylim=(0.0, 1.0)
    )
    # Set up figure
    fig.set(figwidth=8, dpi=100)
    return axs


def make_mi_scores(X, y):
    X = X.copy()
    for colname in X.select_dtypes(["object", "category"]):
        X[colname], _ = X[colname].factorize()
    # All discrete features should now have integer dtypes
    discrete_features = [pd.api.types.is_integer_dtype(t) for t in X.dtypes]
    mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features, random_state=0)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores


def score_dataset(X, y, model=XGBRegressor()):
    # Label encoding for categoricals
    for colname in X.select_dtypes(["category", "object"]):
        X[colname], _ = X[colname].factorize()
    # Metric for Housing competition is RMSLE (Root Mean Squared Log Error)
    score = cross_val_score(
        model, X, y, cv=5, scoring="neg_mean_squared_log_error",
    )
    score = -1 * score.mean()
    score = np.sqrt(score)
    return score


df = pd.read_csv("../input/fe-course-data/ames.csv")

### Target Encoding

In [None]:
X = df.copy()
y = X.pop('Rating')

X_encode = X.sample(frac=0.25)
y_encode = y[X_encode.index]
X_pretrain = X.drop(X_encode.index)
y_train = y[X_pretrain.index]

from category_encoders import MEstimateEncoder

# Create the encoder instance. Choose m to control noise.
encoder = MEstimateEncoder(cols=["Zipcode"], m=5.0)

# Fit the encoder on the encoding split.
encoder.fit(X_encode, y_encode)

# Encode the Zipcode column to create the final training data
X_train = encoder.transform(X_pretrain)

# Modelling

## Pipelines

In [None]:
categorical_cols = [cname for cname in X_train.columns if X_train[cname].nunique() < 10 and X_train[cname].dtype=='object']
numerical_cols = [cname for cname in X_train.columns if X_train.dtype in ['int64', 'float64']]
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()
X_test = X_test_full[my_cols].copy()

from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

numerical_transformer = SimpleImputer(strategy='constant')

categorical_transformer = Pipeline(
    steps= [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ]
)

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Define model
model = XGBRegressor()

# Bundle preprocessing and modeling code in a pipeline
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', model)
                     ])

# Preprocessing of training data, fit model 
clf.fit(X_train, y_train)

# Preprocessing of validation data, get predictions
preds = clf.predict(X_valid)

print('MAE:', mean_absolute_error(y_valid, preds))

## XGBoost

In [None]:
from xgboost import XGBRegressor
my_model = XGBRegressor(
    n_estimators=1000,
    learning_rate = 0.001
    n_jobs = 4
    random_state = 1
)

my_model.fit(X_train, y_train, early_stopping_rounds = 5, eval_set=[(X_valid, y_valid)], verbose = False)

## Grid Search

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier
from sklearn.datasets import load_iris

# Load dataset (for demonstration, you can replace this with your dataset)
iris = load_iris()
X = iris.data
y = iris.target

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize XGBoost classifier
xgb_clf = XGBClassifier()

# Define hyperparameters grid for Grid Search
param_grid = {
    'alpha': [0, 0.1, 0.5, 1, 5],  # L1 regularization
    'lambda': [0, 0.1, 0.5, 1, 5],  # L2 regularization
    'learning_rate': [0.01, 0.1, 0.5],
    'max_depth': [3, 5, 7],
    'n_estimators': [50, 100, 200]
}

# Initialize Grid Search with cross-validation
grid_search = GridSearchCV(estimator=xgb_clf, param_grid=param_grid, cv=3, scoring='accuracy', verbose = 2)

# Perform Grid Search to find the best hyperparameters
grid_search.fit(X_train, y_train)

# Get the best hyperparameters and corresponding accuracy score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Hyperparameters:", best_params)
print("Best Accuracy Score:", best_score)

# Train XGBoost classifier with the best hyperparameters
best_xgb_clf = XGBClassifier(**best_params)
best_xgb_clf.fit(X_train, y_train,
    early_stopping_rounds=5,
    eval_set=[(X_valid, y_valid)],
    verbose=True)

# Evaluate the trained model on the test set
test_accuracy = best_xgb_clf.score(X_test, y_test)
print("Test Set Accuracy:", test_accuracy)


In [None]:
import numpy as np
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBRegressor

# Load dataset (for demonstration, you can replace this with your dataset)
boston = load_boston()
X = boston.data
y = boston.target

# Split data into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'alpha': [0, 0.1, 0.5],  # L1 regularization
    'lambda': [0, 0.1, 0.5],  # L2 regularization
}

# Initialize the XGBoost regressor
xgb_reg = XGBRegressor(n_jobs=4, random_state=1)

# Define the GridSearchCV with early stopping
grid_search = GridSearchCV(
    estimator=xgb_reg,
    param_grid=param_grid,
    cv=3,
    scoring='r2',
    verbose=1
)

# Add early stopping to the fit parameters
fit_params = {
    'early_stopping_rounds': 5,
    'eval_set': [(X_valid, y_valid)],
    'verbose': False
}

# Fit the grid search with early stopping
grid_search.fit(X_train, y_train, **fit_params)

# Get the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Get the best estimator and evaluate on the validation set
best_xgb_reg = grid_search.best_estimator_
y_pred = best_xgb_reg.predict(X_valid)

# Calculate R² score
from sklearn.metrics import r2_score
r2 = r2_score(y_valid, y_pred)
print("R² Score on Validation Set:", r2)


## Tuning on a metric with Grid Search

In [None]:
grid_search = GridSearchCV(estimator=xgb_reg, param_grid=param_grid, cv=3, scoring='r2')

my_model = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.001,
    n_jobs=4,
    random_state=1,
    eval_metric='r2'
)

## Grid Search on preprocessing

In [1]:
# numerical features from the dataset
numerical_features = ['age', 'fare']

# categorical features from the dataset
categorical_features = ['embarked', 'sex', 'pclass']

# Applying SimpleImputer and will search for different scalers using GridSearchCV
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer()),
    ('scaler', 'passthrough')])

# Applying SimpleImputer and then OneHotEncoder
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer()),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

data_transformer = ColumnTransformer(
    transformers=[
        ('numerical', numerical_transformer, numerical_features),
        ('categorical', categorical_transformer, categorical_features)])

# Creating preprocessor pipeline which will first transform the data
# and then apply PCA.
preprocessor = Pipeline(steps=[('data_transformer', data_transformer),
                             ('reduce_dim',PCA())])


# we are using Logistics Regression here
classifier = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression(random_state=0, max_iter=10000))])

# We can utilize params grid to check for best hyperparameters or transformers
# The syntax here is pipeline_step_name__parameters and we need to chain if we have nested pipelines 
param_grid = {
    'preprocessor__data_transformer__numerical__imputer__strategy': ['mean', 'median'],
    'preprocessor__data_transformer__categorical__imputer__strategy': ['constant','most_frequent'],
    'preprocessor__data_transformer__numerical__scaler': [StandardScaler(), RobustScaler(), \
                                                          MinMaxScaler()],
    'classifier__C': [0.1, 1.0, 10, 100],
    'preprocessor__reduce_dim__n_components': [2, 5, 10],
    'classifier__solver': ['liblinear','newton-cg', 'lbfgs','sag','saga']
}

# Doing a Grid Search
grid_search = GridSearchCV(classifier, param_grid=param_grid)

# fitting on our dataset
grid_search.fit(X_train, y_train); # Semicolon to not print estimator in notebook



# imports 
from sklearn import set_config                      # to change the display
from sklearn.utils import estimator_html_repr       # to save the diagram into HTML format

# set config to diagram for visualizing the pipelines/composite estimators
set_config(display='diagram')

# Lets visualize the best estimator from grid search.
grid_search.best_estimator_

# saving pipeline as html format
with open('titanic_data_pipeline_estimator.html', 'w') as f:  
    f.write(estimator_html_repr(grid_search.best_estimator_))

NameError: name 'Pipeline' is not defined

## L1 and L2 Regularization

xgb_clf_l1 with L1 regularization by setting alpha=1 and reg_lambda=0.

xgb_clf_l2 with L2 regularization by setting alpha=0 and reg_lambda=1.

xgb_clf_l1_l2 with both L1 and L2 regularization by setting alpha=1 and reg_lambda=1.


xgb_reg_l1 with L1 regularization by setting alpha=1 and lambda=0.
xgb_reg_l2 with L2 regularization by setting alpha=0 and lambda=1.
xgb_reg_l1_l2 with both L1 and L2 regularization by setting alpha=1 and lambda=1.

In [None]:
xgb_clf_l1 = XGBClassifier(objective='multi:softmax', num_class=3, alpha=1)
xgb_clf_l2 = XGBClassifier(objective='multi:softmax', num_class=3, reg_lambda=1)
xgb_clf_l1_l2 = XGBClassifier(objective='multi:softmax', num_class=3, alpha=1, reg_lambda=1)

In [None]:
xgb_reg_l1 = XGBRegressor(alpha=1, lambda=0)
xgb_reg_l2 = XGBRegressor(alpha=0, lambda=1)
xgb_reg_l1_l2 = XGBRegressor(alpha=1, lambda=1)


## Random Search

In [None]:
import numpy as np
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import r2_score
from xgboost import XGBRegressor
from scipy.stats import uniform

# Load dataset (for demonstration, you can replace this with your dataset)
boston = load_boston()
X = boston.data
y = boston.target

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize XGBoost regressor
xgb_reg = XGBRegressor()

# Define hyperparameters distribution for Random Search
param_dist = {
    'alpha': uniform(0, 5),  # L1 regularization
    'lambda': uniform(0, 5),  # L2 regularization
    'learning_rate': uniform(0.01, 0.2),
    'max_depth': [3, 5, 7, 10],
    'n_estimators': [50, 100, 200, 300]
}

# Initialize Random Search with cross-validation
random_search = RandomizedSearchCV(estimator=xgb_reg, param_distributions=param_dist, n_iter=50, cv=3, scoring='r2', random_state=42)

# Perform Random Search to find the best hyperparameters
random_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = random_search.best_params_
print("Best Hyperparameters:", best_params)

# Train XGBoost regressor with the best hyperparameters
best_xgb_reg = XGBRegressor(**best_params)
best_xgb_reg.fit(X_train, y_train)

# Make predictions on the test set
y_pred = best_xgb_reg.predict(X_test)

# Calculate R² score
r2 = r2_score(y_test, y_pred)
print("R² Score on Test Set:", r2)


# Post Modelling

## Evaluation

### Classification

In [None]:
# Confusion Matrix:

from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_pred))


#Accuracy Score

knn.score(X_test, y_test)
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

### Regression

In [None]:
# Mean Absolute Error:

from sklearn.metrics import mean_absolute_error
y_true = [3, -0.5, 2]
mean_absolute_error(y_true, y_predict)


# Mean Squared Error:

from sklearn.metrics import mean_squared_error
mean_squared_error(y_test, y_predict)


# R² Score

from sklearn.metrics import r2_score
r2_score(y_true, y_predict)

### Clustering

In [None]:
# Homogeneity:

from sklearn.metrics import homogeneity_score
homogeneity_score(y_true, y_predict)


# V-measure:

from sklearn.metrics import v_measure_score
metrics.v_measure_score(y_true, y_predict)



### Cross Validation

In [None]:
# Cross-validation:

from sklearn.model_selection import cross_val_score
print(cross_val_score(knn, X_train, y_train, cv=4))
print(cross_val_score(new_lr, X, y, cv=2))

scores = -1 * cross_val_score(pipeline, X, y, cv=5, scoring='neg_mean_absolute_error')

### Score Dataset

In [None]:
def score_dataset(X, y, model=XGBRegressor()):
    # Label encoding for categoricals
    for colname in X.select_dtypes(["category", "object"]):
        X[colname], _ = X[colname].factorize()
    # Metric for Housing competition is RMSLE (Root Mean Squared Log Error)
    score = cross_val_score(
        model, X, y, cv=5, scoring="neg_mean_squared_log_error",
    )
    score = -1 * score.mean()
    score = np.sqrt(score)
    return score

# Summary


1. Data Collection and Exploration

Load your data and perform initial exploration (e.g., summary statistics, data types, missing values, class distribution).

2. Data Preprocessing

Handle Missing Values: Impute missing values using methods like mean/mode/median imputation or more advanced techniques like k-NN imputation.
Category Encoding: Encode categorical variables using techniques like one-hot encoding or target encoding (for high cardinality features).
Standardization: Standardize numerical features to have zero mean and unit variance, especially if algorithms like SVM, KNN, or PCA are used.

3. Train-Test Split

Split your data into training and testing sets (e.g., 70-30 or 80-20) to evaluate the model's performance on unseen data.

4. Feature Engineering

Mutual Information: Assess the importance of features and select the most relevant ones based on mutual information scores.
Clustering for New Features: Use clustering algorithms (e.g., K-means) to create new features that capture the inherent structure of the data.
Principal Component Analysis (PCA): Reduce dimensionality and remove multicollinearity by transforming features into principal components.

5. Handling Imbalanced Data

Oversampling: Use techniques like SMOTE (Synthetic Minority Over-sampling Technique) to balance the class distribution in the training data.

6. Model Training

L1/L2 Regularization: Incorporate regularization to prevent overfitting and improve model generalization. For example, L1 for feature selection (Lasso) and L2 for ridge regression.
Using XGBoost: Train an XGBoost model, which is robust and handles various types of data well.

7. Hyperparameter Tuning

Grid Search/Random Search: Optimize hyperparameters using Grid Search or Random Search with cross-validation to find the best model parameters.
Cross-Validation: Use k-fold cross-validation to evaluate the model performance and ensure it generalizes well to unseen data.

8. Model Evaluation

Scoring: Assess the model using appropriate metrics (e.g., accuracy, precision, recall, F1-score, ROC-AUC) based on the problem context.

9. Model Interpretation and Deployment

Interpret the model results, understand feature importance, and deploy the model for production use if it meets the desired performance criteria.

In [None]:
# Data Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pandas as pd

# Load data
data = pd.read_csv('your_data.csv')
X = data.drop('target', axis=1)
y = data['target']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Pipeline for preprocessing
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Feature Engineering
from sklearn.feature_selection import mutual_info_classif
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# Compute mutual information
mi = mutual_info_classif(X_train, y_train)

# PCA for dimensionality reduction
pca = PCA(n_components=10)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Clustering
kmeans = KMeans(n_clusters=5)
X_train['cluster'] = kmeans.fit_predict(X_train)
X_test['cluster'] = kmeans.predict(X_test)

# Model Training and Hyperparameter Tuning
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score

# Define XGBoost model
model = XGBClassifier()

# Define hyperparameter grid for GridSearch
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.01, 0.1, 0.2],
    'reg_alpha': [0, 0.1, 0.5],
    'reg_lambda': [0, 0.1, 0.5]
}

# Perform Grid Search with cross-validation
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_pca, y_train)

# Evaluate model
best_model = grid_search.best_estimator_
scores = cross_val_score(best_model, X_train_pca, y_train, cv=5, scoring='accuracy')
print('Cross-validation accuracy:', scores.mean())

# Final evaluation on test set
test_score = best_model.score(X_test_pca, y_test)
print('Test set accuracy:', test_score)
