## Supervised Learning 
1. Import Libraries
2. Import and Read the Data 
3. Split the Data Sets into: 
	a. Training Set 
	b. Cross-validation/Validation/Dev Set 
	c. Test Set
4. Modify or Feature Engineer the data set 
    - Handle missing data, create new features, encode categorical variables, etc.
5. (If Needed) Set feature scaling to the three data sets 
6. Create the Model 
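	- Use a linear output layer for regression (for classification, either pair it with a loss configured with `from_logits=True` or use a sigmoid/softmax output)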
7. Train the Model 
	- Set the loss function (SparseCategoricalCrossEntropy, BinaryCrossEntropy, etc.)
	- Set the optimizer (e.g., the Adam algorithm)
8. Evaluate the Model
    - Assess model performance using metrics (e.g., accuracy, F1-score, RMSE) on the test set.
    - Generate confusion matrices, ROC curves, etc., as needed.
9. Hyperparameter Tuning
    - Optimize hyperparameters (learning rate, regularization strength, etc.) 
    - Using grid search, random search, or manual tuning on the validation set.
10. Cross-Validation (if needed)
    - Use k-fold cross-validation to further validate the robustness of the model.
11. Model Deployment (optional)
    - Save the model, implement it in a production environment, and monitor its performance.


In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
import numpy as np

# Models
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Dot
from tensorflow.keras.activations import linear, relu, softmax
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.losses import MeanSquaredError, BinaryCrossentropy, SparseCategoricalCrossentropy, CategoricalCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.linalg import norm, l2_normalize

# Data and Evaluation Tools
from sklearn.model_selection import train_test_split, GridSearchCV,
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

%matplotlib inline
tf.autograph.set_verbosity(0)
np.set_printoptions(precision = 2)
## IMPORT LIBRARIES ##

## Decision Trees
1. Import Libraries
2. Import and Read the Data 
3. Split the Data Sets into: 
	a. Training Set 
	b. Test Set
4. Modify or Feature Engineer the data set 
    - Handle missing data, create new features, encode categorical variables, etc.
5. (If Needed) Set feature scaling to both data sets (tree-based models usually do not require it)
6. Create the Model 
	- e.g., DecisionTreeClassifier, RandomForestClassifier, or XGBoost
7. Train the Model 
	- Set the split criterion (gini, entropy, etc.)
	- Fit the model on the training set
8. Hyperparameter Tuning
    - Optimize hyperparameters (e.g., max depth, min samples split) using grid search, random search, or cross-validation.
9. Pruning (optional)
    - Apply pruning techniques to avoid overfitting and improve generalization.
10. Model Interpretation
    - Interpret the decision tree model by visualizing the tree or using feature importance.
11. Model Deployment (optional)
    - Save the model, implement it in a production environment, and monitor its performance.


In [None]:
# Models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# The xgboost package exposes its scikit-learn style estimator as `XGBClassifier`;
# it has no importable `xgboost.xgboost` name.
from xgboost import XGBClassifier

# Data and Evaluation Tools
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Randomized Search CV
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV

## Typical Data Cleaning and Preprocessing

In [None]:
# Checking for duplicates
# NOTE(review): `data` is assumed to be a DataFrame loaded in an earlier cell — confirm.
# The boolean mask from `data.duplicated()` selects the rows flagged as duplicates;
# `duplicate_rows` is kept for inspection and is not used by the cells below.
duplicate_rows = data[data.duplicated()]

# Dropping duplicates (rebinds `data` to the de-duplicated frame)
data = data.drop_duplicates()

In [None]:
# Checking for missing values: per-column count of null entries
missing_values = data.isnull().sum()

# NOTE(review): the statements below operate on `df` while the ones above use `data`;
# presumably both names refer to the same DataFrame — confirm before running.

# Dropping columns (the column names are placeholders)
df = df.drop(columns = ['column1', 'column2', 'column3'])

# Dropping rows with missing values in any of the listed columns
df = df.dropna(subset = ['column1', 'column2', 'column3'])

### Identifying garbage values

In [None]:
# Print the frequency of every distinct value in each object-dtype column,
# followed by a separator line, so that odd placeholder entries stand out.
object_columns = df.select_dtypes(include='object').columns
separator = "***" * 10
for i in object_columns:
    counts = df[i].value_counts()
    print(counts)
    print(separator)

### Selecting Features and Target

In [None]:
# Selecting columns for features and target
# NOTE(review): START_COL, END_COL and TARGET_COL look like placeholders (they are not
# defined in the visible cells); replace them with integer column positions.
X = df.iloc[:, START_COL:END_COL] # All rows, positional column slice -> features
y = df.iloc[:, TARGET_COL] # All rows, one positional column -> target

### Imputing Missing Data

In [None]:
from sklearn.impute import SimpleImputer

# Impute missing values
# NOTE(review): the imputers below read as a menu of alternative strategies; each
# `fit_transform` call overwrites `X` with that imputer's output. Presumably only the
# strategy that fits the data is meant to be run — confirm.
# `imp_mean` and `imp_constant_cat` are reused by the Column Transformer cell further down.
imp_mean = SimpleImputer(strategy = 'mean') # Mean for numerical values
X = imp_mean.fit_transform(X)

imp_median = SimpleImputer(strategy = 'median') # Median for numerical values
X = imp_median.fit_transform(X)

imp_mode = SimpleImputer(strategy = 'most_frequent') # Most frequent for categorical values
X = imp_mode.fit_transform(X)

imp_constant_cat = SimpleImputer(strategy = 'constant', fill_value = 0) # Constant for categorical and/or numerical values
X = imp_constant_cat.fit_transform(X)

imp_mean_marked = SimpleImputer(strategy = 'mean', add_indicator = True) # Add indicator for missing values
X = imp_mean_marked.fit_transform(X)

### One-hot Encoding 

In [None]:
# Pandas Implementation
cat_variables = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope'] # Columns with categorical variables
df = pd.get_dummies(data = df, prefix = cat_variables, columns = cat_variables)

######################################################################################

# SciKit Learn Implementation
# Alternative to the pandas version above: `get_dummies` already replaces the original
# categorical columns, so run one implementation or the other on a given DataFrame.
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(sparse_output=False) # Sparse output is False for a DataFrame
df_encoded = ohe.fit_transform(df[cat_variables])
df_encoded = pd.DataFrame(
    df_encoded,
    # `get_feature_names_out` is the method available alongside `sparse_output`;
    # the older `get_feature_names` no longer exists in those releases.
    columns = ohe.get_feature_names_out(cat_variables),
    # Reuse df's index so concat aligns row-for-row even after rows were dropped earlier
    index = df.index
)
df = pd.concat([df, df_encoded], axis=1)
df = df.drop(columns = cat_variables)

### Ordinal Encoding 

In [None]:
# Pandas Implementation
# Cast the column to the pandas `category` dtype and replace each value with its integer code
df['column'] = df['column'].astype('category').cat.codes

# for loop: same conversion applied to every column listed in `cat_variables`
for categ in cat_variables:
    df[categ] = df[categ].astype('category').cat.codes

######################################################################################

# Scikit Learn Implementation
from sklearn.preprocessing import OrdinalEncoder

ode = OrdinalEncoder()
# reshape(-1, 1) turns the column's values into a single-column 2-D array before encoding
df['column'] = ode.fit_transform(df['column'].values.reshape(-1, 1))

# for loop: `ode` is fitted again on every column, so after the loop it reflects
# only the last column it was fitted on
for categ in cat_variables:
    df[categ] = ode.fit_transform(df[categ].values.reshape(-1, 1))

### Column Transformer

In [None]:
from sklearn.compose import make_column_transformer

df = pd.read_csv('data.csv') # Load data

# Column transformers for encoding categorical columns and for imputing missing values.
# `ode`, `ohe`, `imp_constant_cat` and `imp_mean` are the objects created in earlier cells.
ct = make_column_transformer(
    (ode, ["Sport-type"]), # Ordinal encode 'Sport-type' column
    (ohe, ["gender"]), # One-hot encode 'gender' column
    remainder = 'passthrough' # Pass through the remaining columns
)

# NOTE(review): `ct` is rebound here, so the encoder-based transformer above is replaced
# and only this imputing transformer is fitted below — presumably two alternative examples.
ct = make_column_transformer(
    (imp_constant_cat, ['Name']), # Impute missing values in 'Name' column with a constant
    (imp_mean, ["farthest_run_mi"]), # Impute missing values in 'farthest_run_mi' column with the mean
    remainder = 'passthrough' # Pass through the remaining columns
)

ct.set_output(transform='pandas') # Ask the transformer to return a pandas object
df = ct.fit_transform(df)

Note: Normally, Pipelines are used to chain multiple transformations together for a single column. This is shown later.

### Removing Outliers

In [None]:
# Define a function to remove outliers based on the IQR method
def remove_outliers(df, feature, factor=1.5):
    """
    Remove outliers from a specified feature in the DataFrame using the IQR method.

    A row is kept when its value lies inside the closed interval
    [Q1 - factor * IQR, Q3 + factor * IQR]. Rows whose value is missing fail
    both comparisons and are therefore dropped as well.

    Parameters:
    df (pd.DataFrame): The DataFrame from which to remove outliers.
    feature (str): The name of the column (feature) to check for outliers.
    factor (float): Multiplier applied to the IQR when building the bounds.
        Defaults to 1.5, the value that was previously hard-coded.

    Returns:
    pd.DataFrame: The DataFrame with outliers removed for the specified feature.
    """
    values = df[feature]

    # First and third quartiles of the feature, and the spread between them
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1

    # Bounds outside of which a value counts as an outlier
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr

    # Keep only the rows whose value falls inside the bounds (inclusive)
    within_bounds = (values >= lower_bound) & (values <= upper_bound)
    return df[within_bounds]

# List of features to check for outliers
features = ['Age', 'Weight', 'Height', 'BMI']

# Apply the outlier removal function to each feature.
# `data` is rebound on every pass, so each feature is filtered on the rows
# that survived the previous features.
for feature in features:
    data = remove_outliers(data, feature)

# Display the cleaned DataFrame shape
print("Shape of DataFrame after outlier removal:", data.shape)

### 

### Splitting Datasets 
into (1) Training Set, (2) Cross-validation Set, and (3) Test Set using Scikit-learn train_test_split()

In [None]:
from sklearn.model_selection import train_test_split

# The feature matrix is named `X` (capital) where it is selected, and the scaling,
# pipeline and tuning cells read `X_train` / `X_test`, so the split uses the same names.

# Get 60% of the dataset as the training set. Put the remaining 40% in temporary variables: X_ and y_.
X_train, X_, y_train, y_ = train_test_split(X, y, test_size=0.40, random_state=1)

# Split the 40% subset above into two: one half for cross validation and the other for the test set
X_cv, X_test, y_cv, y_test = train_test_split(X_, y_, test_size=0.50, random_state=1)

# Delete temporary variables
del X_, y_

### Feature Scaling

### a) Normalization

In [None]:
from sklearn.preprocessing import MinMaxScaler
# MinMaxScaler for target variables
scaler = MinMaxScaler((-1, 1)) # Feature range parameter: from -1 to 1, typically used for y_train
# np.asarray lets this work whether y_train is a NumPy array or a pandas Series
# (a Series has no .reshape); the scaler needs a single-column 2-D array.
y_train = scaler.fit_transform(np.asarray(y_train).reshape(-1, 1))

### b) Standardization

In [None]:
from sklearn.preprocessing import StandardScaler

# X_train = df.iloc[:, 1:].to_numpy()
# labels = df.columns.to_numpy()

# Standardize the training features: learn the scaling statistics from X_train
# and apply them to X_train in a single step
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

# X_train = pd.DataFrame(X_train, columns = labels) # If converted back to a DataFrame

## Pipelines

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder # Module name is `preprocessing`

In [None]:
# Column groups that the ColumnTransformer below routes to the numerical and
# categorical pipelines respectively
num_cols = ["Social_media_followers"] # numerical columns
cat_cols = ["Genre"] # categorical columns

In [None]:
# Numerical columns: fill missing values with the column mean, then standardize
num_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent value, then
# one-hot encode (unknown categories are ignored rather than raising)
cat_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('encode', OneHotEncoder(handle_unknown='ignore')),
])

In [None]:
from sklearn.compose import ColumnTransformer

In [None]:
# Column Transformer with Pipelines
col_transformer = ColumnTransformer(
    transformers = [
        ('num_pipeline', num_pipeline, num_cols), # Apply the num_pipeline to the numerical columns
        ('cat_pipeline', cat_pipeline, cat_cols) # Apply the cat_pipeline to the categorical columns
    ],
    remainder = 'drop', # Drop columns not specified
    n_jobs = -1         # Use all processors
)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline # make_pipeline lives in sklearn.pipeline, not sklearn.compose

In [None]:
# Chain preprocessing and the classifier so both are fitted and applied together
dtc = DecisionTreeClassifier()
pipe_final = make_pipeline(col_transformer, dtc) # Combine the column transformer and the model
pipe_final.fit(X_train, y_train) # Fit the whole pipeline on the training set
pipe_final.score(X_test, y_test) # Score the fitted pipeline on the test set

In [None]:
# Saving Pipeline
import joblib
joblib.dump(pipe_final, "pipe.jolib")

# Loading Pipeline
pipe_final2 = joblib.load('pipe.joblib')

## Hyper-parameter Tuning

Parameter references can always be found in the docs: 
<ul>
<li><a href="https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html">Random Forest</a></li>
<li><a href="https://xgboost.readthedocs.io/en/stable/parameter.html">XGBoost</a></li>
<!-- <li><a href=""></a></li> -->
</ul>

#### 1) Grid Search Hyperparameter Tuning

In [None]:
# Search space for RandomForestClassifier.
# `n_estimators` is the number of trees and takes integers; the split-quality
# measure belongs under `criterion` ('gain' is not an accepted value).
param_grid = {
    "n_estimators": [100, 200],
    "criterion": ["gini", "entropy"],
    "min_samples_split": [5, 10, 15],
    "min_samples_leaf": [1, 2, 4],
    "max_depth": [10, 20, 30]
}

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over `param_grid` with 5-fold cross-validation.
# scoring: for a regression task use "neg_mean_squared_error" or look at DOCS:
# https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)

# y_train must be a 1D array: use y_train.values.ravel() or y_train.reshape(-1,) if needed
grid_search.fit(X_train, y_train)
# After fitting, grid_search.best_score_ and grid_search.best_params_ hold the best score
# and the best parameters respectively

In [1]:
# Inspect the search results. Run this cell only after the cell that creates and fits
# `grid_search`; otherwise the name does not exist yet.
grid_search.best_score_ # Best score (according to the scoring parameter, if "neg_mean_squared_error" is used, negate the value)
grid_search.best_estimator_ # Best estimator
grid_search.best_params_ # Best parameters

NameError: name 'grid_search' is not defined

#### 2) Randomized Search Hyperparameter Tuning

In [9]:
# Search space sampled by RandomizedSearchCV (a list holding one parameter dict).
# `n_estimators` is the number of trees and takes integers; the split-quality
# measure belongs under `criterion` ('gain' is not an accepted value).
random_param_grid = [{
    "n_estimators": [100, 200],
    "criterion": ["gini", "entropy"],
    "min_samples_split": [5, 10, 15],
    "min_samples_leaf": [1, 2, 4],
    "max_depth": [10, 20, 30]
}]

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Randomized search: evaluates `n_iter` sampled parameter combinations with 5-fold CV
random_search = RandomizedSearchCV(
    RandomForestClassifier(),
    random_param_grid,
    n_iter = 10,
    cv = 5,
    scoring = 'accuracy'
)

# The search has to be fitted before any result attribute exists
# (make sure y_train is a 1D array, as for the grid search)
random_search.fit(X_train, y_train)

# random_search.best_score_ and random_search.best_params_ will give you the best score and the best parameters respectively

# 3. Hyperparameter Tuning for Neural Networks

> Use the SciKeras wrapper so that a neural network built with Keras can be used with Scikit-learn's cross-validation search tools 

In [None]:
%pip install scikeras # !pip install scikeras for Kaggle

In [None]:
# Define the model build function for KerasRegressor
def build_regressor_model(n_hidden, n_neurons, learning_rate):
    """
    Build and compile a fully connected regression network.

    Parameters:
    n_hidden (int): Number of extra hidden layers added after the first Dense layer.
    n_neurons (int): Number of units in every hidden layer.
    learning_rate (float): Learning rate passed to the Adam optimizer.

    Returns:
    Sequential: Compiled model with a single linear output unit and MSE loss.
    """
    model = Sequential()
    # Input size is the number of features (columns of X_train), not the number of samples
    model.add(Dense(n_neurons, activation = 'relu', input_shape = (X_train.shape[1],)))
    for _ in range(n_hidden):
        model.add(Dense(n_neurons, activation = 'relu'))
    model.add(Dense(1)) # Linear output for regression
    model.compile(optimizer = Adam(learning_rate = learning_rate), loss = 'mean_squared_error')
    return model

def build_binary_classifier(n_hidden, n_neurons, learning_rate):
    """
    Build and compile a fully connected binary classifier.

    Parameters:
    n_hidden (int): Number of extra hidden layers added after the first Dense layer.
    n_neurons (int): Number of units in every hidden layer.
    learning_rate (float): Learning rate passed to the Adam optimizer.

    Returns:
    Sequential: Compiled model with one sigmoid output unit and binary cross-entropy loss.
    """
    model = Sequential()
    # Input size is the number of features (columns of X_train), not the number of samples
    model.add(Dense(n_neurons, activation = 'relu', input_shape = (X_train.shape[1],)))
    for _ in range(n_hidden):
        # ReLU like the other builders: without an activation the stacked layers stay linear
        model.add(Dense(n_neurons, activation = 'relu'))
    model.add(Dense(1, activation = 'sigmoid'))
    model.compile(optimizer = Adam(learning_rate = learning_rate), loss = 'binary_crossentropy')
    return model

def build_multiclass_classifier(n_hidden, n_neurons, learning_rate, num_classes):
    """
    Build and compile a fully connected multiclass classifier.

    Parameters:
    n_hidden (int): Number of extra hidden layers added after the first Dense layer.
    n_neurons (int): Number of units in every hidden layer.
    learning_rate (float): Learning rate passed to the Adam optimizer.
    num_classes (int): Number of output classes (units in the output layer).

    Returns:
    Sequential: Compiled model with a softmax output and categorical cross-entropy loss.
    """
    model = Sequential()
    # Input size is the number of features (columns of X_train), not the number of samples
    model.add(Dense(n_neurons, activation = 'relu', input_shape = (X_train.shape[1],)))
    for _ in range(n_hidden):
        model.add(Dense(n_neurons, activation = 'relu'))
    # The string loss 'categorical_crossentropy' expects probabilities, so the output
    # layer applies softmax instead of emitting raw logits
    model.add(Dense(num_classes, activation = 'softmax'))
    model.compile(optimizer = Adam(learning_rate = learning_rate), loss = 'categorical_crossentropy')
    return model

## Cross-Validation 