# Title: ABC

Problem Statement (What we are predicting/clustering?)

Type of Machine Learning (Regression/Classification/Clustering)

Success Metrics (RMSE, R2, Accuracy, Precision, Recall etc.)

Constraints or Assumptions

## Environment setup

In [None]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.svm import SVC, SVR
from sklearn.metrics import *
from sklearn.pipeline import Pipeline

import warnings
# NOTE: this silences every warning category for the rest of the session,
# so deprecation/future warnings emitted by the libraries used below will
# not be shown either.
warnings.filterwarnings('ignore')

## Data loading

In [None]:
# Location of the raw dataset; replace with the real file path.
DATA_PATH = 'path_to_data.csv'

# Read the CSV into a dataframe and preview the first rows.
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# (number of rows, number of columns) of the loaded dataframe
df.shape

from the df.shape, write how many features and rows we have in a markdown as a conclusion of this step

## Data Understanding - EDA

Basic info

In [None]:
# Column names, non-null counts and dtypes in one summary
df.info()

Number of null values in each feature

In [None]:
# Per-column count of missing values (isnull is an alias of isna)
df.isnull().sum()

identifying the numerical columns and categorical columns in our dataframe

In [None]:
# Split the column names by dtype family: numbers vs. text/categoricals.
numeric_frame = df.select_dtypes(include='number')
categorical_frame = df.select_dtypes(include=['object', 'category'])

numerical_columns = numeric_frame.columns
categorical_columns = categorical_frame.columns

print(f'Numerical Columns: {numerical_columns}')
print(f'Categorical Columns: {categorical_columns}')

statistical inference of numerical features

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numerical columns
df.describe()

### Visuals

distribution of our target variable

In [None]:
# Histogram with a KDE overlay for the target column.
target_values = df['target_variable']

plt.figure(figsize=(10, 6))
sns.histplot(target_values, bins=30, kde=True)
plt.title('Distribution of Target Variable')
plt.show()

now let's visualise our categorical columns

In [None]:
def _open_grid(n_plots, n_cols, cell_height):
    """Open a figure sized for ``n_plots`` subplots and return the row count."""
    n_rows = math.ceil(n_plots / n_cols)
    plt.figure(figsize=(5 * n_cols, cell_height * n_rows))
    return n_rows


def _close_grid(title):
    """Add the overall title, tidy the layout and render the current figure."""
    plt.suptitle(title, fontsize=16)
    plt.tight_layout()
    plt.show()


def eda_plots(df, top_n=5, n_cols=3, bins=30):
    """
    Draw three grids of exploratory plots for a dataframe:

    1. Pie charts for all categorical (object/category) columns
    2. Histograms for all numerical columns, with the mean as a dashed line
    3. Boxplots for all numerical columns

    Parameters:
    df (pd.DataFrame): Input dataframe
    top_n (int): Number of most frequent categories shown as their own pie
        wedge; the remaining categories are summed into an 'Others' wedge
    n_cols (int): Number of subplots per row in every grid
    bins (int): Number of histogram bins

    Raises:
    ValueError: If top_n or n_cols is smaller than 1
    """
    if top_n < 1 or n_cols < 1:
        raise ValueError("top_n and n_cols must both be at least 1")

    categorical_cols = df.select_dtypes(include=['object', 'category']).columns
    numerical_cols = df.select_dtypes(include='number').columns

    # -----------------------------
    # Pie Charts (Categorical)
    # -----------------------------
    if len(categorical_cols) > 0:
        n_rows = _open_grid(len(categorical_cols), n_cols, 5)
        for i, col in enumerate(categorical_cols, 1):
            plt.subplot(n_rows, n_cols, i)
            plt.title(f"distribution of {col}")

            counts = df[col].value_counts()
            if counts.empty:
                # Column holds only nulls: there is nothing to draw a pie from.
                plt.axis('off')
                continue

            top = counts.iloc[:top_n]
            labels = [str(label) for label in top.index]
            sizes = top.tolist()
            if len(counts) > top_n:
                # Append the tail as an extra wedge instead of assigning into
                # `top`, which would overwrite the count of a real category
                # that happens to be named 'Others'.
                labels.append('Others')
                sizes.append(counts.iloc[top_n:].sum())

            plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=140)
            plt.ylabel('')

        _close_grid('Categorical Columns - Pie Charts')

    # -----------------------------
    # Histograms (Numerical)
    # -----------------------------
    if len(numerical_cols) > 0:
        n_rows = _open_grid(len(numerical_cols), n_cols, 4)
        for i, col in enumerate(numerical_cols, 1):
            plt.subplot(n_rows, n_cols, i)
            plt.hist(df[col].dropna(), bins=bins)
            # Dashed red line marks the column mean
            plt.axvline(df[col].mean(), color='r', linestyle='dashed', linewidth=1)
            plt.title(f'Histogram of {col}')
            plt.xlabel(col)
            plt.ylabel('Frequency')

        _close_grid('Numerical Columns - Histograms')

    # -----------------------------
    # Boxplots (Numerical)
    # -----------------------------
    if len(numerical_cols) > 0:
        n_rows = _open_grid(len(numerical_cols), n_cols, 4)
        for i, col in enumerate(numerical_cols, 1):
            plt.subplot(n_rows, n_cols, i)
            plt.boxplot(df[col].dropna())
            plt.title(f'Boxplot of {col}')
            plt.ylabel(col)

        _close_grid('Numerical Columns - Boxplots')

eda_plots(df)

pairplot of our dataframe to check correlation of each feature with each other feature

In [None]:
# Scatter-matrix of every pair of numerical columns.
numeric_only_df = df.select_dtypes(include="number")
sns.pairplot(numeric_only_df)
plt.show()

## Data preprocessing

First make a copy of our dataframe so that the original dataframe remains unchanged.

In [None]:
# Work on an independent deep copy (deep=True is the default of DataFrame.copy)
df_copy = df.copy()

now we have to remove the null values from our data which is very common in a lot of datasets

we have two ways to handle null values.

1. Drop the null values: do this when only a small share of the rows (around 5% or less) contain nulls, the dataset is large, and the missingness is unrelated to the other features. Drop an entire feature if it has too many null values (more than 50–70%).
2. Impute the null values: do this when roughly 5–30% of the rows contain nulls, the dataset is small, or the missing values are related to other important features or to the target variable.
    - there are two ways to impute data:
        1. Univariate
            - Mean/Median/Mode
            - Arbitrary value
        2. Multivariate
            - Regression
            - KNN
            - Train a machine learning model to handle those values

In [None]:
from sklearn.impute import KNNImputer

# Pick exactly ONE strategy. Running all of them back to back makes every
# step after the first a no-op, because the nulls are already gone.
#   'drop'       -> remove every row that contains a null
#   'univariate' -> median for numerical columns, mode for categorical columns
#   'knn'        -> KNN imputation of the numerical columns
null_strategy = 'drop'

if null_strategy == 'drop':
    # -------------------
    # 1. Drop null values
    # -------------------
    df_copy.dropna(inplace=True)

elif null_strategy == 'univariate':
    # ------------------------
    # 2. Univariate imputation
    # ------------------------
    # Assign the filled column back instead of calling fillna(inplace=True)
    # on `df_copy[col]`: that chained in-place form is deprecated and does
    # not modify the dataframe when copy-on-write is enabled.
    for col in numerical_columns:
        # Median is used here; swap in df_copy[col].mean() to impute the mean.
        df_copy[col] = df_copy[col].fillna(df_copy[col].median())

    for col in categorical_columns:
        modes = df_copy[col].mode()
        if not modes.empty:  # a column that is entirely null has no mode
            df_copy[col] = df_copy[col].fillna(modes.iloc[0])

elif null_strategy == 'knn':
    # ---------------------------------------------
    # 3. Multi-variate imputation using KNN Imputer
    # ---------------------------------------------
    # Only the numerical columns are imputed here; nulls in categorical
    # columns are left untouched and must be handled separately.
    imputer = KNNImputer(n_neighbors=5)
    df_copy[numerical_columns] = imputer.fit_transform(df_copy[numerical_columns])

else:
    raise ValueError(f"Unknown null_strategy: {null_strategy!r}")

handle outliers

In [None]:
numerical_columns = df_copy.select_dtypes(include='number').columns

# Cap outliers at the Tukey fences (1.5 * IQR beyond the quartiles).
# Alternative: drop the offending rows with
#   df_copy = df_copy[df_copy[col].between(lower_bound, upper_bound)]
for col in numerical_columns:
    if col == 'target_variable':
        # Leave the target as it is; capping it would change what we predict.
        continue

    Q1 = df_copy[col].quantile(0.25)
    Q3 = df_copy[col].quantile(0.75)
    IQR = Q3 - Q1
    if IQR == 0:
        # Near-constant column (e.g. a 0/1 flag): both fences collapse onto
        # one value and capping would erase every other value in the column.
        continue

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df_copy[col] = df_copy[col].clip(lower=lower_bound, upper=upper_bound)

encode our categorical variable

In [None]:
# Keep one fitted encoder per column. Refitting a single shared encoder
# discards every mapping except the last one, so the codes could not be
# inverted or applied to new data later.
label_encoders = {}

for col in categorical_columns:
    le = LabelEncoder()
    df_copy[col] = le.fit_transform(df_copy[col])
    label_encoders[col] = le  # label_encoders[col].inverse_transform(...) restores the labels

## Feature selection 1

1. if a numeric feature has very low variance then it is probably useless.
2. Plot a correlation heatmap of the dataframe and remove features that are highly correlated with each other or only weakly correlated with the target variable.

In [None]:
# Restrict to numeric columns: corr() raises on non-numeric data in recent
# pandas versions, and such columns have no correlation coefficient anyway.
corr_matrix = df_copy.corr(numeric_only=True)

# A larger canvas keeps the annotated cells readable.
plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm', cbar=True, yticklabels=True)
plt.title('Correlation Heatmap')
plt.show()

In [None]:
columns_to_drop = ['unnecessary_column1', 'unnecessary_column2']
# drop(..., inplace=True) returns None, so assigning its result would replace
# df_copy with None. Use the returning form and rebind the name instead.
df_copy = df_copy.drop(columns=columns_to_drop)

In [None]:
# Separate the label column from the feature matrix.
target_name = 'target_variable'
y = df_copy[target_name]
X = df_copy.drop(columns=[target_name])

## Data splitting

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
# also add `stratify=y` if classification problem
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the shape of each part, in the order X_train, y_train, X_test, y_test.
split_parts = {
    'X_train': X_train,
    'y_train': y_train,
    'X_test': X_test,
    'y_test': y_test,
}
for part_name, part in split_parts.items():
    print(f'{part_name} shape: {part.shape}')

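We do the scaling after the split: the scaler is fitted on the training data only, and the test data is then transformed with the statistics (mean and standard deviation) learned from the training set. This prevents information from the test set from leaking into training, so the test score is an honest estimate of performance on unseen data.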

In [None]:
# Learn the scaling statistics from the training features only ...
sc = StandardScaler()
sc.fit(X_train)

# ... then apply the same transformation to both splits.
X_train_sc = sc.transform(X_train)
X_test_sc = sc.transform(X_test)

print("Scaling completed.")

## Model training - Base model

Train a very basic model such as linear regression or logistic regression first to establish a baseline. Simple models have low variance and are less prone to overfitting than fancier models, so the baseline shows how much the more complex models actually improve on a simple approach.

In [None]:
# Baseline: ordinary least squares fitted on the scaled training features.
lr = LinearRegression()
lr.fit(X_train_sc, y_train)
y_pred = lr.predict(X_test_sc)

# root_mean_squared_error already returns the RMSE and has no `squared`
# parameter (that flag belonged to mean_squared_error), so passing it raises
# a TypeError.
print(f"RMSE: {root_mean_squared_error(y_test, y_pred)}")
print(f"R2: {r2_score(y_test, y_pred)}")

## Model training - selecting a model from a range of models

In [None]:
# Candidate estimators with default hyperparameters, grouped by task.

# for classification
classification_models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree Classifier': DecisionTreeClassifier(),
    'Random Forest Classifier': RandomForestClassifier(),
    'Support Vector Classifier': SVC(),
}

# for regression
regression_models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree Regressor': DecisionTreeRegressor(),
    'Random Forest Regressor': RandomForestRegressor(),
    'Support Vector Regressor': SVR(),
}

# Single lookup of display name -> estimator, classifiers first.
models = {**classification_models, **regression_models}

In [None]:
# `models` holds classifiers and regressors side by side, but only one family
# fits a given target. Set this to 'classification' when the target is a
# class label; models of the other family are skipped.
problem_type = 'regression'

for model_name, model in models.items():
    # Classifiers are recognised by their display name in `models`.
    is_classification_model = 'Classifier' in model_name or 'Logistic' in model_name
    if is_classification_model != (problem_type == 'classification'):
        continue

    model.fit(X_train_sc, y_train)
    y_pred = model.predict(X_test_sc)

    if is_classification_model:
        print(classification_report(y_test, y_pred))
        cm = confusion_matrix(y_test, y_pred)
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.title(f'Confusion Matrix - {model_name}')
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
        plt.show()
    else:
        # root_mean_squared_error has no `squared` parameter; it always
        # returns the RMSE.
        print(f"{model_name} - RMSE: {root_mean_squared_error(y_test, y_pred)}")
        print(f"{model_name} - R2: {r2_score(y_test, y_pred)}")

## Feature selection 2

in this we will perform forward selection or backward elimination to find out the best features for our best model

In [None]:
# Forward selection: features are added one at a time, each candidate set
# being scored by 5-fold cross-validated (negated) RMSE of a random forest.
selection_estimator = RandomForestRegressor(random_state=42)
sfs_settings = {
    'n_features_to_select': 'auto',
    'direction': 'forward',
    'scoring': 'neg_root_mean_squared_error',
    'cv': 5,
    'n_jobs': -1,
}
sfs = SequentialFeatureSelector(estimator=selection_estimator, **sfs_settings)

sfs.fit(X_train, y_train)

In [None]:
# get the selected feature names
selected_features = X.columns[sfs.get_support()]
print(f'Selected Features: {selected_features}')

In [None]:
# Keep only the selected columns in both splits.
X_train, X_test = X_train[selected_features], X_test[selected_features]

print("Feature selection completed.")

we will also have to rescale our data

In [None]:
# Refit the scaler on the reduced training features, then transform both splits.
sc = StandardScaler().fit(X_train)
X_train_sc, X_test_sc = sc.transform(X_train), sc.transform(X_test)

In [None]:
# Compare the feature count before and after the sequential selection.
n_features_before = X.shape[1]
n_features_after = X_train.shape[1]
print(f"Original number of features: {n_features_before}")
print(f"Reduced number of features after selection: {n_features_after}")

Now pick the model with the best performance; we run this second feature selection for it and then tune its hyperparameters for optimal performance.

## Hyperparameter tuning

now we will test which hyperparameters are the best for our model and gives us the best performance

GridSearchCV is used to systematically search for the best hyperparameters of a model using cross-validation and a chosen evaluation metric. but we only use GridSearchCV on small datasets

In [None]:
# Re-import the class under an alias. Later cells refer to the search object
# as `GridSearchCV`, so that name is rebound to the instance below; building
# it from the alias keeps this cell runnable more than once.
from sklearn.model_selection import GridSearchCV as _GridSearchCVClass

# 5 x 5 x 5 = 125 parameter combinations, each evaluated with 5-fold CV
param_grid = {
    'n_estimators': [50, 100, 200, 300, 400],
    'max_depth': [None, 10, 20, 30, 40],
    'min_samples_split': [2, 5, 10, 15, 20]
}

grid_search = _GridSearchCVClass(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    # Scorers follow "greater is better", so RMSE is exposed negated;
    # 'root_mean_squared_error' is not a valid scorer name.
    scoring='neg_root_mean_squared_error',
    cv=5,       # 5-Fold Cross Validation
    n_jobs=-1,  # Use all available cores
    verbose=2   # Verbosity level
)

# Keep the original name bound to the search instance for the cells below.
GridSearchCV = grid_search

In [None]:
# `GridSearchCV` is the search instance created in the previous cell (the
# name shadows the class). Runs the search on the scaled training data.
GridSearchCV.fit(X_train_sc, y_train)

In [None]:
# Winning parameter combination and the magnitude of its cross-validated score.
best_params = GridSearchCV.best_params_
best_rmse = abs(GridSearchCV.best_score_)
print(f"Best Parameters: {best_params}")
print(f"Best RMSE: {best_rmse}")

In [None]:
# Evaluate the refitted best estimator on the held-out test set.
best_model = GridSearchCV.best_estimator_
y_pred_best = best_model.predict(X_test_sc)

# root_mean_squared_error has no `squared` parameter; it always returns the RMSE.
print(f"Tuned Model RMSE: {root_mean_squared_error(y_test, y_pred_best)}")
print(f"Tuned Model R2: {r2_score(y_test, y_pred_best)}")

In [None]:
# to see why these parameters were chosen

results_df = pd.DataFrame(GridSearchCV.cv_results_)

results_df = results_df[
    ["params", "mean_test_score", "std_test_score", "rank_test_score"]
].sort_values("rank_test_score")

results_df.head()

## Make the pipeline

In [None]:
# Bundle scaling and the tuned model so raw (unscaled) features can be passed
# straight to fit/predict.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    # Same seed as the estimator used in the grid search, so the refit is
    # reproducible and comparable with the tuned model.
    ('model', RandomForestRegressor(random_state=42, **GridSearchCV.best_params_))
])
pipeline.fit(X_train, y_train)
y_pred_pipeline = pipeline.predict(X_test)

# root_mean_squared_error has no `squared` parameter; it always returns the RMSE.
print(f"Pipeline Model RMSE: {root_mean_squared_error(y_test, y_pred_pipeline)}")
print(f"Pipeline Model R2: {r2_score(y_test, y_pred_pipeline)}")

## Exporting our best model

We export our model so that we can integrate it into a Python backend and use it to make predictions on live data; this is what the entire training was for.

We can export either the entire pipeline or just the model, as we see fit.

In [None]:
import joblib

# Serialize the fitted pipeline (scaler + model) to disk.
MODEL_PATH = 'final_model_pipeline.pkl'
joblib.dump(pipeline, MODEL_PATH)