In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier  # imported in this cell so it also runs on a fresh kernel

# Candidate values for each hyperparameter; every combination (3 x 2 = 6) is evaluated.
param_grid = {'max_depth': [3, 5, 7], 'criterion': ['gini', 'entropy']}

# Exhaustive grid search with 5-fold cross-validation.
# The tree is seeded with 123 (the seed used by the other cells in this notebook)
# so that tree construction is repeatable from one run to the next.
grid_search = GridSearchCV(DecisionTreeClassifier(random_state=123), param_grid, cv=5)
grid_search.fit(X_train, y_train)  # X_train / y_train come from a train_test_split cell
print(f"Best parameters found: {grid_search.best_params_}")

# PARAMETERS:
# estimator --> model to optimize
# param_grid --> dictionary mapping parameter names to the values to test
# cv --> number of folds for cross-validation
# scoring --> strategy for evaluating the model (not passed in this cell)
# n_jobs --> number of parallel processes to run (not passed in this cell)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# A Pipeline runs its steps in sequence: here the features are standardized
# and the scaled features are then passed to the support vector classifier.
pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('svc', SVC(kernel='rbf', C=1)),
    ]
)

# Grid keys for a pipeline are written as '<step name>__<parameter name>'.
# This grid only illustrates the naming scheme: it is not used by the fit below.
param_grid = {
    'svc__C': [0.1, 1, 10],
    'svc__kernel': ['linear', 'rbf'],
}

# Fit the pipeline as configured above (rbf kernel, C=1) and predict on the test split.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# PARAMETERS:
# steps --> list of tuples: (step_name, transformer_or_estimator)
# each step name must be unique and must not contain a double underscore ('__'),
# because '__' is the separator between step name and parameter name

In [None]:
# Reference cell listing the main arguments of make_classification.
# NOTE(review): no import is visible in this cell - presumably
# `from sklearn.datasets import make_classification` was run elsewhere; confirm.
# The call below passes no arguments and does not bind its return value to a name.
make_classification()

# PARAMETERS:
# n_samples --> total number of samples
# n_features --> total number of features
# n_informative --> number of informative features
# random_state --> seed

In [None]:
# Reference cell listing the main arguments of SVC.
# Relies on `from sklearn.svm import SVC` having been run in the pipeline cell.
SVC()  # Support Vector Classification; created with no arguments and not assigned to a name

# PARAMETERS:
# C --> regularization parameter
# kernel --> kernel type ('linear', 'poly', 'rbf', 'sigmoid')
# gamma --> kernel coefficient

In [1]:
from scipy.stats import randint as sp_randint
from scipy.stats import uniform

In [None]:
# Hold out 20% of the rows as a test set, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
    stratify=y,
)

# stratify --> when set to y, the class proportions of y are maintained in both splits

In [None]:
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingClassifier

# Three sequential steps: standardize -> apply PCA -> gradient boosting classifier
# (a classification model based on boosting and ensembles), seeded with 123.
pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('pca', PCA()),
        ('gbc', GradientBoostingClassifier(random_state=123)),
    ]
)

In [None]:
# Search space for a randomized search over the 'pca' and 'gbc' pipeline steps.
# Keys follow the '<step name>__<parameter name>' convention. Values are either
# scipy.stats distributions (sampled on each iteration) or lists (one item picked at random).
param_dist = {
    'pca__n_components': sp_randint(5, 13),  # integer distribution between 5 (inclusive) and 13 (exclusive)
    'gbc__n_estimators': sp_randint(50, 200),  # integers 50..199
    'gbc__learning_rate': uniform(0.01, 0.2),  # uniform(loc, scale) --> uniform distribution on [0.01, 0.01+0.2]
    'gbc__max_depth': sp_randint(1, 5),  # integers 1..4
    'gbc__subsample': uniform(0.6, 0.4),  # uniform distribution on [0.6, 0.6+0.4]
    'gbc__min_samples_split': sp_randint(2, 10),  # integers 2..9
    'gbc__min_samples_leaf': sp_randint(1, 10),  # integers 1..9
    # 'auto' is intentionally not listed: for GradientBoostingClassifier it was an alias
    # of 'sqrt', was deprecated in scikit-learn 1.1 and removed in 1.3, so candidates
    # sampled with it fail to fit on current versions.
    'gbc__max_features': ['sqrt', 'log2', None]
}

# PARAMETERS:
# PCA
#   n_components --> number of principal components to keep
# GradientBoostingClassifier
#   n_estimators --> number of boosting rounds
#   learning_rate --> shrinks the contribution of each tree
#   max_depth --> maximum depth for each tree
#   subsample --> fraction of samples to use for each tree
#   min_samples_split --> minimum samples required to split a node
#   min_samples_leaf --> minimum samples required in each leaf (result of splitting a node)
#   max_features --> maximum number of features to consider when looking for the next split

In [None]:
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold

# Cross-validation splitter: 5 shuffled folds, ensuring class proportions are maintained.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)

# Configure the randomized search; this cell only builds the object, it does not fit it.
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=50,
    cv=cv,
    scoring='accuracy',
    random_state=42,
    n_jobs=-1,
)

# PARAMETERS:
# estimator --> the model to tune; here a previously-defined pipeline of processes
# param_distributions --> dictionary of parameters (lists or distributions) to sample from
# n_iter --> number of parameter combinations to sample and test
# cv --> cross-validation method
# scoring --> performance metric used to evaluate each combination
# random_state --> seed for the parameter sampling
# n_jobs --> number of cores to use for parallel computation; if -1, it will use all cores available

In [4]:
# Linear Regression

from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Load the California housing dataset and separate features from target.
data = fetch_california_housing()
X, y = data.data, data.target

# Keep 20% of the rows for testing; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

# fit() returns the fitted estimator, so creation and fitting can be chained.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# One coefficient per feature, followed by the intercept term.
print("Coefficients:", model.coef_)
print("Intercept:", model.intercept_)

# PARAMETERS:
# fit_intercept: (default True) --> if True, it will include an intercept
# normalize: (deprecated, removed in recent scikit-learn versions) --> if you need to standardize, do it BEFORE regressing
# copy_X: (default True) --> if True, it will copy the input data, otherwise it may be overwritten
# n_jobs --> number of cores to use for parallel computation; if -1, it will use all cores available

Coefficients: [ 4.39988248e-01  9.15770358e-03 -1.11827735e-01  6.47857908e-01
 -6.55068105e-06 -3.92330215e-03 -4.17033805e-01 -4.27676550e-01]
Intercept: -36.25561939898517
