<a href="https://colab.research.google.com/github/M-Amrollahi/Personal-Notes/blob/master/ML-notes/sklearn_techniques.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## How to use FunctionTransformer to set custom transformer

In [None]:
import numpy as np
from sklearn.preprocessing import FunctionTransformer

# Generate some random data
X = np.random.normal(0, 5, size=(100,))

transformer = FunctionTransformer(func=np.exp,
                                  inverse_func=np.log,
                                  )

x_log = transformer.transform(X)
X_new = transformer.inverse_transform(x_log)

np.allclose(X, X_new)

True

## How to set parameters for FunctionTransformer

In [None]:
from sklearn.preprocessing import FunctionTransformer
import numpy as np

# Define a custom function to normalize data
def normalize(X):
    return (X - np.mean(X, axis=0)) / np.std(X, axis=0), np.mean(X, axis=0),np.std(X, axis=0)

def inverse_normalize(X_norm, mu, sigma):
    X = X_norm * sigma + mu
    return X

# Create a FunctionTransformer object to apply normalization
transformer = FunctionTransformer(normalize, inverse_func=inverse_normalize)

# Generate some random data
X = np.random.normal(0, 1, size=(100, 3))

# Apply the transformer to the input data
X_transformed,x_mean,x_std = transformer.transform(X)
transformer.inv_kw_args = {"mu": x_mean, "sigma": x_std}

# Apply the inverse transformation to the transformed data
X_inverse = transformer.inverse_transform(X_transformed)

# Check if the original data is equal to the inverse-transformed data
np.allclose(X, X_inverse)

True

In [None]:
from sklearn.preprocessing import PowerTransformer
import numpy as np

# Generate some random data with a non-Gaussian distribution
X = np.random.gamma(1, size=(100, 3))

# Create a PowerTransformer object
transformer = PowerTransformer()

# Apply the transformer to the input data
X_transformed = transformer.fit_transform(X)

# Print the mean and standard deviation of the transformed data
print("Mean:", np.mean(X_transformed, axis=0))
print("Standard deviation:", np.std(X_transformed, axis=0))

Mean: [-2.49800181e-17  1.20681243e-15 -3.04756220e-16]
Standard deviation: [1. 1. 1.]


## Voting Classifier , StackingClassifier

In [3]:
from sklearn.ensemble import VotingClassifier, StackingClassifier
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=1000)

ensemble = VotingClassifier(
    estimators=[
        ("xgb", xgb.XGBClassifier(eval_metric="auc")),
        ("lgbm", lgbm.LGBMClassifier()),
        ("cb", cb.CatBoostClassifier(verbose=False)),
    ],
    voting="soft",
    # n_jobs=-1,
)

_ = ensemble.fit(X, y)

#######
ensemble = StackingClassifier(
    estimators=[
        ("xgb", xgb.XGBClassifier(eval_metric="auc")),
        ("lgbm", lgbm.LGBMClassifier()),
        ("cb", cb.CatBoostClassifier(verbose=False)),
    ],
    final_estimator=LogisticRegression(),
    cv=5,
    passthrough=False
    # n_jobs=-1,
)

_ = ensemble.fit(X, y)

NameError: ignored

## PCA, tSNE, UMAP for dim resuction

In [5]:
#######

## Detecting outliers
As for LocalOutlierFactor, it is a neighbors-based algorithm designed to work fast with large datasets.

In [None]:
import umap  # pip install umap
from sklearn.neighbors import LocalOutlierFactor

X, y = make_classification(n_samples=5000, n_classes=2, n_features=10)
X_reduced = umap.UMAP(n_components=2).fit_transform(X, y)

lof = LocalOutlierFactor()
labels = lof.fit_predict(X_reduced, y)

## Using joblib module to save the model in sklearn

In [None]:
from sklearn import svm
from sklearn import datasets
from joblib import dump

# Load a sample dataset
iris = datasets.load_iris()
X, y = iris.data, iris.target

# Train a model
model = svm.SVC()
model.fit(X, y)

# Save the model to a file
dump(model, 'model.joblib')
