# **___ DataSet Description**

| Feature Name    | Description                                                                                     |
| --------------- | ----------------------------------------------------------------------------------------------- |
| **column1** | description. |



# **Libraries**


In [None]:
# Pandas : Data Manipulation
import pandas as pd


# NumPy : Math
import numpy as np
import math


# MatPlotLib
import matplotlib
from matplotlib import style

import matplotlib.pyplot as plt

import seaborn as sns

import plotly.figure_factory as ff
import plotly.express as px
import plotly.graph_objects as go
from scipy.stats import skew


# Missing Value Imputation
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer


# handle imbalanced datasets
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline


# additional preprocessing
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import PowerTransformer, QuantileTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline


# Additional Metrics
from sklearn.metrics import log_loss, balanced_accuracy_score
from sklearn.metrics import precision_recall_curve, roc_curve, auc
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_log_error


# Additional Statistical & Utility Imports
from scipy import stats
from scipy.stats import norm, boxcox
import warnings
warnings.filterwarnings('ignore')


# Splitting
from sklearn.model_selection import train_test_split


# Scaling
from sklearn.preprocessing import RobustScaler , MinMaxScaler , StandardScaler


# Encoding
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
# from category_encoders import OneHotEncoder,BinaryEncoder


# Correlation
# Feature Selection : Categorical
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest


# Feature Selection : Numerical
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import mutual_info_classif

# Feature Selection : Recursive Feature Elimination
from sklearn.feature_selection import RFE

# Advanced Feature Selection
from sklearn.feature_selection import SelectFromModel
from sklearn.inspection import permutation_importance
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.manifold import TSNE


# Model Selection
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score, StratifiedKFold, KFold
from sklearn.model_selection import RandomizedSearchCV


# Classifiers
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis


# regressors
from sklearn.ensemble import RandomForestRegressor , RandomForestClassifier
from sklearn.tree import DecisionTreeRegressor , DecisionTreeClassifier
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression


# classifiers & regressors
from sklearn.neighbors import KNeighborsClassifier , KNeighborsRegressor
import xgboost as xgb
from xgboost import XGBClassifier, XGBRegressor
import lightgbm as lgb
from lightgbm import LGBMClassifier, LGBMRegressor
# from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor
from sklearn.ensemble import BaggingClassifier, BaggingRegressor
from sklearn.ensemble import VotingClassifier, VotingRegressor
from sklearn.ensemble import StackingClassifier, StackingRegressor


# clustering
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN


# classification
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score , accuracy_score
from sklearn.metrics import confusion_matrix , classification_report


# regression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


# unsupervised
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score , adjusted_rand_score


# handle overfitting
from sklearn.linear_model import Ridge, Lasso


# Utility for saving models
import joblib
import pickle

# **Class**


## **Data Information**

In [None]:
class MachineLearning:

  def data_information(self, data_frame):
    """
        Summarise every column of ``data_frame`` in a single table.

        This is the first look at a data set before any preprocessing:
        names and dtypes tell what each column is, unique values hint at
        categorical vs numerical, and the null percentage (rather than the
        raw null count alone) shows how severe the missing data is.

        Returns a DataFrame with one row per column of ``data_frame``:

          - 'Names'           : column label
          - 'Values'          : distinct values found in the column
          - 'Data Type'       : column dtype
          - 'Unique Num'      : number of distinct values
          - 'Null Num'        : number of null entries
          - 'Null Percentage' : null entries as a percentage of the row count
          - 'Duplicates'      : number of fully duplicated rows in the whole
                                frame (the same figure repeated on every row)
    """
    row_total = len(data_frame)

    labels = []
    distinct_values = []
    dtypes = []
    distinct_counts = []
    missing_counts = []
    missing_shares = []

    # One pass per column; each list receives exactly one entry per column.
    for label in data_frame.columns:
      series = data_frame[label]

      labels.append(label)
      dtypes.append(series.dtype)

      missing_counts.append(series.isnull().sum())
      missing_shares.append(series.isnull().sum() / row_total * 100)

      distinct_counts.append(series.nunique())
      distinct_values.append(series.unique())

    # Frame-level figure: rows that repeat an earlier row exactly.
    duplicated_rows = data_frame.duplicated().sum()

    summary = {
        'Names': labels,
        'Values': distinct_values,
        'Data Type': dtypes,
        'Unique Num': distinct_counts,
        'Null Num': missing_counts,
        'Null Percentage': missing_shares,
        'Duplicates': duplicated_rows,
    }

    return pd.DataFrame(summary)

## **Plots**

Categorical plots: count, bar, box, violin, strip, swarm

Distribution plots: hist, kde, violin

Relational plots: scatter, line, joint, pair

Matrix plots: heatmap

### **Bar Plot**

In [None]:
  def bar_plot(self , column , data_frame):
    """
        Bar chart of the value counts of ``column`` with a percentage label
        on top of every bar.

        column     : name of the (categorical) column to count.
        data_frame : DataFrame that contains ``column``.
    """
    sns.set_style("whitegrid")
    counts = data_frame[column].value_counts()
    total = counts.sum()

    plt.figure(figsize=(6, 6))
    # Plain arrays are passed for the heights so seaborn pairs x and y by
    # position instead of trying to align on the Series index.
    ax = sns.barplot(x=counts.index, y=counts.values, palette="rocket")

    plt.title(f"Percentage of {column} and Non-{column}", fontsize=14, fontweight='bold')
    plt.xlabel(f"{column} Status", fontsize=12, fontweight='bold')
    plt.ylabel("Count", fontsize=12, fontweight='bold')

    # The percentage is derived from each bar's own height. Pairing bars with
    # value_counts() by position is unsafe: value_counts() is ordered by
    # frequency while seaborn may order the categories differently
    # (e.g. numeric categories are sorted), which mislabels the bars.
    for p in ax.patches:
      height = p.get_height()
      if not height > 0:
        continue
      ax.annotate(f'{height / total * 100:.1f}%',
                  (p.get_x() + p.get_width() / 2, height),
                  ha='center', va='bottom',
                  fontsize=12, fontweight='bold', color='black')
    plt.show()

### **Histogram**

In [None]:
  def histogram_plot(self , column , data_frame):
    """
        Histogram (50 bins) with a KDE curve for a numerical column, annotated
        with the column's skewness.

        column     : name of the numerical column to plot.
        data_frame : DataFrame that contains ``column``.
    """
    sns.set_style("whitegrid")
    plt.figure(figsize=(10, 6))

    # Nulls are ignored; float() flattens whatever scalar-like object scipy
    # returns so the format spec below always works.
    skewness = float(skew(data_frame[column], nan_policy="omit"))
    ax = sns.histplot(data_frame[column], bins=50, kde=True, color="navy", edgecolor="black")

    plt.xlabel(column, fontsize=14, fontweight='bold')
    plt.ylabel("Frequency", fontsize=14, fontweight='bold')
    plt.title(f"Distribution of {column}", fontsize=16, fontweight='bold')

    # Position the note in axes-fraction coordinates. Using 0.7 * max in data
    # coordinates puts the text in the wrong place (or outside the plot) when
    # the values are negative or do not start near zero.
    ax.text(
        0.7, 0.7,
        f"Skewness: {skewness:.2f}",
        transform=ax.transAxes,
        fontsize=15, fontweight="bold", color="navy"
    )

    plt.show()

### **Strip Plot**

In [None]:
  def strip_plot(self , column_x , column_y , data_frame):
    """
        Jittered strip plot of ``column_y`` (vertical axis) for every value
        of ``column_x`` (horizontal axis).
    """
    two_tone = ["#1f77b4", "#ff7f0e"]
    axis_font = {"fontsize": 14, "fontweight": "bold"}

    plt.figure(figsize=(10, 6))
    sns.stripplot(
        x=data_frame[column_x],
        y=data_frame[column_y],
        jitter=True,
        alpha=0.7,
        palette=two_tone,
    )

    plt.xlabel(column_x, **axis_font)
    plt.ylabel(column_y, **axis_font)
    plt.title(f"Strip Plot of {column_y} by {column_x}", fontsize=16, fontweight="bold")
    plt.show()

### **Pie Plot**

In [None]:
  def pie_chart(self , column , data_frame):
    """
        Pie chart of the value counts of ``column``; every slice is labelled
        with its category and its share in percent.
    """
    slice_sizes = data_frame[column].value_counts()

    pie_options = {
        'labels': slice_sizes.index,
        'autopct': '%1.1f%%',
        'startangle': 90,
        'colors': plt.cm.Dark2.colors,
    }

    plt.figure(figsize=(8, 6))
    plt.pie(slice_sizes, **pie_options)

    plt.title(f"Distribution of {column}")
    # Equal aspect ratio so the pie is drawn as a circle.
    plt.axis('equal')
    plt.show()

### **Box Plot**

In [None]:
  def box_plot(self , column_x , column_y , data_frame):
    """
        Box plot of ``column_y`` for each category of ``column_x``.
    """
    plt.figure(figsize=(10, 5))
    sns.boxplot(data=data_frame, x=column_x, y=column_y, palette="rocket")

    chart_title = f'Box Plot of {column_y} by {column_x}'
    plt.xlabel(column_x)
    plt.ylabel(column_y)
    plt.title(chart_title)
    plt.show()

### **Count Plot**

In [None]:
  def count_plot(self , column , hue , data_frame):
    """
        Count plot of ``column`` split by ``hue``. Each non-empty bar is
        labelled with its share of all rows of ``data_frame`` in percent.
    """
    row_total = len(data_frame)
    ax = sns.countplot(data=data_frame, x=column, hue=hue, palette='rocket')

    for bar in ax.patches:
        bar_height = bar.get_height()
        # Bars without a positive height get no label.
        if not bar_height > 0:
            continue
        share = bar_height / row_total * 100
        ax.text(
            bar.get_x() + bar.get_width() / 2,
            bar_height,
            f'{share:.2f}%',
            ha='center', va='bottom', fontsize=10, color='black',
        )

    plt.title(f'Count of {column} with {hue} Status')
    plt.show()

### **Scatter Plot**

In [None]:
   def scatter_plot(self, x, y, hue, data_frame):
        """
        Scatter plot of column ``y`` against column ``x``, coloured by ``hue``.

        The palette is taken from ``self.palette`` when the instance defines
        it; otherwise "rocket" (the palette the other plots hard-code) is
        used, so the method no longer fails with AttributeError.
        """
        palette = getattr(self, "palette", "rocket")
        sns.scatterplot(x=x, y=y, data=data_frame, hue=hue, palette=palette)
        plt.title(f'Scatter plot of {x} vs {y} by {hue}')
        plt.show()

### **Heatmap Plot**

In [None]:
    def heatmap(self, data_frame):
        """
        Annotated heatmap of the pairwise correlations between the numeric
        (and boolean) columns of ``data_frame``.
        """
        # Restrict to numeric/boolean columns first: recent pandas versions
        # raise in corr() when the frame still contains text columns.
        numeric_frame = data_frame.select_dtypes(include=["number", "bool"])
        corr = numeric_frame.corr()
        sns.heatmap(corr, annot=True, cmap="rocket", fmt=".2f")
        plt.title("Correlation Heatmap")
        plt.show()

### **Violin Plot**

In [None]:
  def violin_plot(self, x, y, hue, data_frame):
        """
        Violin plot of ``y`` for each category of ``x``, grouped by ``hue``.

        The palette is taken from ``self.palette`` when the instance defines
        it, otherwise "rocket" is used. Violins are drawn split (one half per
        hue level) only when ``hue`` has exactly two levels, which is the
        only case where a split violin is meaningful.
        """
        palette = getattr(self, "palette", "rocket")
        split = hue is not None and data_frame[hue].nunique() == 2
        sns.violinplot(x=x, y=y, data=data_frame, hue=hue, palette=palette, split=split)
        plt.title(f'Violin plot of {y} across {x} grouped by {hue}')
        plt.show()

### **Swarm Plot**

In [None]:
    def swarm_plot(self, x, y, hue, data_frame):
        """
        Swarm plot of ``y`` for each category of ``x``; the ``hue`` groups
        are drawn side by side (dodge).

        Uses ``self.palette`` when the instance defines it, otherwise the
        "rocket" palette, so the method no longer fails with AttributeError.
        """
        palette = getattr(self, "palette", "rocket")
        sns.swarmplot(x=x, y=y, data=data_frame, hue=hue, palette=palette, dodge=True)
        plt.title(f'Swarm plot of {y} across {x} grouped by {hue}')
        plt.show()

### **KDE Plot**

In [None]:
    def kde_plot(self, column, hue, data_frame):
        """
        Filled kernel-density estimate of ``column``, one curve per ``hue``
        level.

        Uses ``self.palette`` when the instance defines it, otherwise the
        "rocket" palette, so the method no longer fails with AttributeError.
        """
        palette = getattr(self, "palette", "rocket")
        sns.kdeplot(data=data_frame, x=column, hue=hue, fill=True, palette=palette)
        plt.title(f'KDE plot of {column} by {hue}')
        plt.show()

### **Line Plot**

In [None]:
    def line_plot(self, x, y, hue, data_frame):
        """
        Line plot of ``y`` against ``x``, one line per ``hue`` level.

        Uses ``self.palette`` when the instance defines it, otherwise the
        "rocket" palette, so the method no longer fails with AttributeError.
        """
        palette = getattr(self, "palette", "rocket")
        sns.lineplot(x=x, y=y, data=data_frame, hue=hue, palette=palette)
        plt.title(f'Line plot of {y} vs {x} by {hue}')
        plt.show()

### **Pair Plot**

In [None]:
   def pair_plot(self, data_frame, hue=None):
        """
        Pairwise relationship grid for the columns of ``data_frame``,
        optionally coloured by the ``hue`` column.

        Uses ``self.palette`` when the instance defines it, otherwise the
        "rocket" palette, so the method no longer fails with AttributeError.
        """
        palette = getattr(self, "palette", "rocket")
        sns.pairplot(data_frame, hue=hue, palette=palette)
        # y > 1 lifts the title above the grid instead of over the top row.
        plt.suptitle("Pairplot of features", y=1.02)
        plt.show()

### **Joint Plot**

In [None]:
    def joint_plot(self, x, y, data_frame, kind="scatter"):
        """
        Joint plot of ``x`` and ``y``; ``kind`` is forwarded to
        ``sns.jointplot`` and defaults to "scatter".

        Uses ``self.palette`` when the instance defines it, otherwise the
        "rocket" palette, so the method no longer fails with AttributeError.
        """
        palette = getattr(self, "palette", "rocket")
        sns.jointplot(x=x, y=y, data=data_frame, kind=kind, palette=palette)
        # y > 1 lifts the title above the marginal plot.
        plt.suptitle(f'Joint plot of {x} vs {y}', y=1.02)
        plt.show()

## **Preprocessing**

### **Null**

In [None]:
  def handle_null_values(self , handling_type , columns , data_frame):
    """
        Fill the null values of the given columns in place.

        handling_type :
          - 'mode'        : most frequent value of the column.
          - 'median'      : column median (robust when the data is skewed).
          - 'mean'        : column mean (not preferred for skewed data).
          - 'ffill'       : previous valid value, then the next valid value
                            for leading nulls.
          - 'knn imputer' : kept for backward compatibility; despite its
                            name it behaves exactly like 'ffill' and does
                            NOT run a KNN imputation.
        columns       : iterable of column names to process.
        data_frame    : DataFrame to modify.

        Any other ``handling_type`` prints "Invalid Value" and leaves the
        data untouched. Returns ``data_frame`` (the same, modified object).
    """

    if handling_type == 'mode' :

      for col in columns:
        modes = data_frame[col].mode()
        # A column that is entirely null has no mode; nothing to fill with.
        if not modes.empty:
          data_frame[col] = data_frame[col].fillna(modes.iloc[0])

    elif handling_type in ('knn imputer', 'ffill') :

      for col in columns:
        # ffill()/bfill() replace fillna(method=...), which is deprecated
        # in recent pandas versions.
        data_frame[col] = data_frame[col].ffill().bfill()

    elif handling_type == 'median' :

      for col in columns :
        data_frame[col] = data_frame[col].fillna(data_frame[col].median())

    elif handling_type == 'mean' :

      for col in columns :
        data_frame[col] = data_frame[col].fillna(data_frame[col].mean())

    else :
      print("Invalid Value")

    return data_frame























### **Outliers**

#### **Checking**

In [None]:
  def check_outliers(self , columns , data_frame ,whis = 1.5):
    """
        Draw one box plot per column to inspect outliers visually.

        columns    : column name or iterable of column names to plot.
        data_frame : DataFrame that contains the columns.
        whis       : whisker length passed to ``sns.boxplot``.

        The plots are laid out three per row and the number of rows grows
        with the number of columns, so more than nine columns no longer
        overflow a fixed 3x3 grid. Nothing is drawn for an empty selection.
    """
    if isinstance(columns, str):
        columns = [columns]
    columns = list(columns)

    if not columns:
        print("No columns to plot.")
        return

    per_row = 3
    n_rows = -(-len(columns) // per_row)  # ceiling division

    # squeeze=False keeps `axes` two-dimensional even for a single row.
    fig, axes = plt.subplots(n_rows, per_row, figsize=(20, 5 * n_rows), squeeze=False)
    axes = axes.flatten()

    for ax, col in zip(axes, columns):
        sns.boxplot(data=data_frame, y=col, ax=ax, palette='magma',whis=whis)
        ax.set_title(f'Boxplot of {col}', fontsize=12)
        ax.set_xlabel('')
        ax.set_ylabel(col)

    # Remove the unused cells of the last row.
    for ax in axes[len(columns):]:
        fig.delaxes(ax)

    plt.tight_layout()
    plt.show()

#### **Handling**

In [None]:
  def handle_outliers(self , data_frame , column, upper_value = 1.5 , lower_value = 1.5 , handle = 'no'):
    """
        Report and optionally cap IQR outliers of the given columns.

        data_frame  : DataFrame to inspect (modified in place when capping).
        column      : column name or iterable of column names.
        upper_value : IQR multiplier for the upper bound (Q3 + upper_value * IQR).
        lower_value : IQR multiplier for the lower bound (Q1 - lower_value * IQR).
        handle      : 'yes' caps values outside the bounds to the bounds;
                      any other value only prints the outlier counts.

        Returns ``data_frame``.
    """
    # Accept a single column name as well as a list of names.
    columns = [column] if isinstance(column, str) else column

    for col in columns:

        Q1 = data_frame[col].quantile(0.25)
        Q3 = data_frame[col].quantile(0.75)
        IQR = Q3 - Q1

        lower_bound = Q1 - (lower_value * IQR)
        upper_bound = Q3 + (upper_value * IQR)

        # The masks are built from the column being processed (this used to
        # be hard-coded to 'Age', which mis-counted every other column and
        # raised KeyError when the frame had no 'Age' column).
        outliers_count_lower = (data_frame[col] < lower_bound).sum()
        outliers_count_upper = (data_frame[col] > upper_bound).sum()
        print(f"Number of lower Outliers in {col} : {outliers_count_lower}")
        print(f"Number of upper Outliers in {col} : {outliers_count_upper}")

        if handle == 'yes' :
            data_frame[col] = np.where(data_frame[col] < lower_bound, lower_bound, data_frame[col])
            data_frame[col] = np.where(data_frame[col] > upper_bound, upper_bound, data_frame[col])

    return data_frame

## **Scaling**

In [None]:
  def scaling_data(self , scaler_type , data_frame , features_train , features_test , columns_list):
    """
        Scale ``columns_list`` of the train and test features.

        scaler_type : 'standard scaler', 'min max scaler' or 'robust scaler'.
        data_frame  : not used by this method.

        The scaler is fitted on the training features only and then applied
        to both sets. An unknown ``scaler_type`` prints a message and returns
        both sets unchanged.

        Returns (features_train, features_test).
    """
    # Step 1: pick the scaler.
    if scaler_type == 'standard scaler':
      scaler = StandardScaler()
    elif scaler_type == 'min max scaler':
      scaler = MinMaxScaler()
    elif scaler_type == 'robust scaler':
      scaler = RobustScaler()
    else:
      print("There is no scaler type with this name.")
      return features_train , features_test

    # Step 2: learn the scaling on train, reuse it unchanged on test.
    features_train[columns_list] = scaler.fit_transform(features_train[columns_list])
    features_test[columns_list] = scaler.transform(features_test[columns_list])

    return features_train , features_test

## **Encoding**

In [None]:
  def encoding_data(self , encoding_type , features_train , features_test , data_frame , columns_list):
    """
        Encode categorical data.

        encoding_type :
          - 'label'   : fit a LabelEncoder on ``features_train`` and apply it
                        to ``features_test``; returns the two encoded arrays.
                        NOTE(review): LabelEncoder works on one-dimensional
                        input, so this branch presumably expects a single
                        column or the target - confirm against the callers.
          - 'ordinal' : ordinal-encode ``columns_list`` using the categories
                        seen in the training set; categories that appear
                        only in the test set get one shared code that no
                        training category uses. Returns
                        (features_train, features_test).
          - 'onehot'  : one-hot encode ``columns_list`` of ``data_frame``;
                        returns the encoded DataFrame.

        Any other ``encoding_type`` prints a message and returns
        (features_train, features_test) unchanged.
    """

    if encoding_type == 'label':

      label_encoding = LabelEncoder()
      features_train = label_encoding.fit_transform(features_train)
      features_test = label_encoding.transform(features_test)

      # This branch used to fall through and return None.
      return features_train , features_test

    if encoding_type == 'ordinal':

      # Categories are learned from the training set only.
      categories = [sorted(set(features_train[col].unique())) for col in columns_list]

      # One code for unseen categories. It must not collide with a real code
      # of ANY column, so it is the largest category count across columns
      # (the previous code used only the last column's count).
      unknown_code = max((len(cats) for cats in categories), default=0)

      ordinal_encoder = OrdinalEncoder(
          categories=categories,
          handle_unknown='use_encoded_value',
          unknown_value=unknown_code
        )

      features_train[columns_list] = ordinal_encoder.fit_transform(features_train[columns_list])
      features_test[columns_list] = ordinal_encoder.transform(features_test[columns_list])

      return features_train , features_test

    if encoding_type == 'onehot':

      data_frame = pd.get_dummies(data_frame, columns = columns_list)
      return data_frame

    print("There is no encoding type with this name.")
    return features_train , features_test

## **Splitting**

In [None]:
  def spliting_data(self , data_frame , label , test_size = 0.2 , random_state = 42):
    """
        Split ``data_frame`` into train and test parts.

        label        : name of the target column; every other column is a feature.
        test_size    : share of the rows that goes to the test set.
        random_state : fixed seed so the split is the same on every run.

        Returns (features_train, features_test, target_train, target_test).
    """
    x_all = data_frame.drop(columns=[label])
    y_all = data_frame[label]

    x_fit , x_holdout , y_fit , y_holdout = train_test_split(
        x_all,
        y_all,
        test_size=test_size,
        random_state=random_state,
    )

    return x_fit , x_holdout , y_fit , y_holdout



## **Correlation**

In [None]:
  def correlation(self , features_train , target_train , data_frame , numerical_columns , categorical_columns):
    """
        Score the training features against the target.

        Numerical columns are scored with ANOVA (``f_classif``) and
        categorical columns with chi-squared; ``data_frame`` is not used.

        Returns two DataFrames, each sorted from the highest score down:
          - numerical   : 'Feature', 'F-Score', 'P-Value'
          - categorical : 'Feature', 'Chi2 Score', 'P-Value'
    """

    # Numerical : anova
    anova_scores, anova_p_values = f_classif(features_train[numerical_columns], target_train)

    anova_table = pd.DataFrame({
        'Feature': numerical_columns,
        'F-Score': anova_scores,
        'P-Value': anova_p_values
    })
    anova_table = anova_table.sort_values(by='F-Score', ascending=False)

    # Categorical : chi2
    categorical_features = features_train[categorical_columns]

    selector = SelectKBest(score_func=chi2, k='all')
    selector.fit(categorical_features, target_train)

    chi2_table = pd.DataFrame({
        'Feature': categorical_features.columns,
        'Chi2 Score': selector.scores_,
        'P-Value': selector.pvalues_
    })
    chi2_table = chi2_table.sort_values(by='Chi2 Score', ascending=False)

    return anova_table , chi2_table

## **Grid Search**

In [None]:
  def best_parameters(self , text , model_type , params , model , x_train, x_test, y_train, y_test):
    """
        Tune ``model`` with a 5-fold grid search, then evaluate the best
        estimator on the train and the test data.

        text       : label used in the printed report and in the result keys.
        model_type : 'classification', 'regression' or 'unsupervised'.
        params     : parameter grid handed to GridSearchCV.
        model      : estimator to tune.
        x_train, x_test, y_train, y_test : the data split.

        Returns a dict with the keys
          - f'Best {text} Model'    : the refitted best estimator
          - f'{text} Train Metrics' : metrics on the training data
          - f'{text} Test Metrics'  : metrics on the test data
        Both metric dicts also carry 'best_params'.

        Raises ValueError for an unknown ``model_type`` (checked before the
        search starts, so no time is wasted on a doomed run).
    """

    # The search must optimise a score that fits the task: accuracy is only
    # meaningful for classification. None falls back to the estimator's own
    # score() method. NOTE(review): the 'unsupervised' path needs an
    # estimator that provides both score() and predict() - confirm per model.
    scoring_by_type = {
        'classification': 'accuracy',
        'regression': 'r2',
        'unsupervised': None,
    }
    if model_type not in scoring_by_type:
      raise ValueError(
          f"Unknown model_type {model_type!r}; expected one of {sorted(scoring_by_type)}"
      )

    result = {}

    # grid search : to find the best hyperparameters
    grid = GridSearchCV(
        estimator = model,
        # what parameters to try
        param_grid = params,
        # 5-fold cross validation
        cv=5,
        scoring=scoring_by_type[model_type],
        # use all available CPU cores
        n_jobs=-1,

        verbose=1
    )

    grid.fit(x_train, y_train)

    # best_estimator_ : the model refitted with the best hyperparameters found
    best = grid.best_estimator_
    result[f'Best {text} Model'] = best

    y_train_pred = best.predict(x_train)
    y_test_pred = best.predict(x_test)

    if model_type == 'classification':
      metric_functions = {
          # accuracy : (TP + TN) / (TP + TN + FP + FN)
          'accuracy': accuracy_score,
          # precision : TP / (TP + FP)
          'precision': precision_score,
          # recall : TP / (TP + FN)
          'recall': recall_score,
          # f1 score : 2 * (precision * recall) / (precision + recall)
          'f1': f1_score,
          # auc : area under the roc curve (computed from the predicted labels)
          'auc': roc_auc_score,
      }
      train_reference, test_reference = y_train, y_test
      headline_key, headline_name = 'accuracy', 'Accuracy'

    elif model_type == 'regression':
      metric_functions = {
          # R² : proportion of variance explained by the model
          'r2': r2_score,
          # MAE : mean absolute error
          'mae': mean_absolute_error,
          # MSE : mean squared error
          'mse': mean_squared_error,
          # RMSE : root mean squared error
          'rmse': lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred)),
      }
      train_reference, test_reference = y_train, y_test
      headline_key, headline_name = 'r2', 'R2'

    else:
      # Clustering metrics compare the samples with the predicted labels.
      metric_functions = {
          # silhouette : cohesion vs separation (-1 to 1, higher is better)
          'silhouette': silhouette_score,
          # davies-bouldin : average similarity between clusters (lower is better)
          'davies_bouldin': davies_bouldin_score,
          # calinski-harabasz : between- vs within-cluster dispersion (higher is better)
          'calinski_harabasz': calinski_harabasz_score,
      }
      train_reference, test_reference = x_train, x_test
      headline_key, headline_name = 'silhouette', 'Silhouette Score'

    metrics_train = {
        name: score(train_reference, y_train_pred) for name, score in metric_functions.items()
    }
    metrics_train['best_params'] = grid.best_params_

    metrics_test = {
        name: score(test_reference, y_test_pred) for name, score in metric_functions.items()
    }
    metrics_test['best_params'] = grid.best_params_

    # The report names the metric that is actually shown (train scores used
    # to be printed as "Silhouette Score" for every model type).
    print(f"Best {text}")
    print(f"Best Parameters: {grid.best_params_}")
    print(f"Train {headline_name}: {metrics_train[headline_key]:.4f}")
    print(f"Test {headline_name}: {metrics_test[headline_key]:.4f}")

    result[f'{text} Train Metrics'] = metrics_train
    result[f'{text} Test Metrics'] = metrics_test

    return result



### **Parameters Guide**

#### **Regression**

In [None]:
# Search space for Ridge regression.
ridge = {
    # Regularization strength.
    "alpha": [0.01, 0.1, 1, 10],
    # Optional: "solver": ["auto", "svd", "cholesky"],
}

In [None]:
# Search space for Lasso regression.
lasso = {
    # Regularization strength.
    "alpha": [0.01, 0.1, 1, 10],
    # Optional: "max_iter": [1000, 2000],
}

#### **Classification**

In [None]:
# Search space for LogisticRegression.
# Only penalty/solver pairs that every listed solver supports are included,
# so each grid point can actually be fitted:
#   - 'liblinear' supports l1 and l2 only.
#   - 'elasticnet' needs solver='saga' plus an 'l1_ratio' value, and an
#     unpenalized model is not supported by 'liblinear'; search those in a
#     separate grid if they are needed.
logistic_regression = {
    'C': [0.01, 0.1, 1, 10],   # Regularization strength (smaller = stronger regularization)
    'penalty': ['l1', 'l2'],   # l1 = Lasso (feature selection), l2 = Ridge (spread error)
    'solver': ['saga' , 'liblinear'],  # both support l1 and l2
    # 'class_weight': [None, 'balanced'], # Handle class imbalance
    # 'max_iter': [100, 200, 500],  # less critical
}

In [None]:
# Search space for XGBoost models.
xg_boost = {
    # Number of boosting rounds
    "n_estimators": [100, 200, 500],
    # Step size shrinkage
    "learning_rate": [0.01, 0.1, 0.2],
    # Tree depth
    "max_depth": [3, 5, 7],
    # Fraction of samples
    "subsample": [0.8, 1.0],
    # Fraction of features per tree
    "colsample_bytree": [0.8, 1.0],
    # Minimum loss reduction for split
    "gamma": [0, 0.1, 0.5],
}

In [None]:
# Search space for Gaussian Naive Bayes.
naive_bayes_or_GaussianNB = {
    # Stability parameter to avoid division by zero
    "var_smoothing": [1e-09, 1e-08, 1e-07],
}

#### **Supervised**

In [None]:
# Search space for support vector machines.
support_vector_machine = {
    # Regularization strength
    "C": [0.1, 1, 10],
    # Kernel type
    "kernel": ["linear", "rbf", "poly"],
    # Kernel coefficient (rbf/poly)
    "gamma": ["scale", "auto"],
    # Optional: "class_weight": [None, "balanced"]  (handle imbalance)
    # Optional: "degree": [2, 3, 4]
}

In [None]:
# Search space for decision trees.
decision_tree = {
    # Split quality measure
    "criterion": ["gini", "entropy"],
    # Limit depth to avoid overfitting
    "max_depth": [None, 5, 10, 20],
    # Minimum samples to split a node
    "min_samples_split": [2, 5, 10],
    # Minimum samples per leaf
    "min_samples_leaf": [1, 2, 4],
    # Handle imbalance
    "class_weight": [None, "balanced"],
}

In [None]:
# Search space for random forests.
random_forest = {
    # Number of trees
    "n_estimators": [100, 200, 500],
    # Split quality measure
    "criterion": ["gini", "entropy"],
    # Tree depth
    "max_depth": [None, 10, 20],
    # Minimum samples to split
    "min_samples_split": [2, 5, 10],
    # Minimum samples per leaf
    "min_samples_leaf": [1, 2, 4],
    # Optional: "bootstrap": [True, False]  (whether to use bootstrapped samples)
    # Optional: "class_weight": [None, "balanced"]  (handle imbalance)
}

In [None]:
# Search space for k-nearest neighbours.
k_nearest_neighbors = {
    # Number of neighbors
    "n_neighbors": [3, 5, 7, 11],
    # Uniform = equal weight, distance = closer neighbors matter more
    "weights": ["uniform", "distance"],
    # Distance metric
    "metric": ["euclidean", "manhattan", "minkowski"],
}

In [None]:
# Search space for gradient boosting.
gradient_boosting = {
    # Number of boosting stages
    "n_estimators": [100, 200, 500],
    # Shrinks contribution of each tree
    "learning_rate": [0.01, 0.1, 0.2],
    # Depth of individual trees
    "max_depth": [3, 5, 7],
    # Minimum samples to split
    "min_samples_split": [2, 5, 10],
    # Minimum samples per leaf
    "min_samples_leaf": [1, 2, 4],
    # Optional: "subsample": [0.8, 1.0]  (fraction of samples used per tree)
}

#### **Unsupervised**

In [None]:
# Search space for KMeans.
kmeans = {
    "n_clusters": [2, 3, 5, 10],
    "init": ["k-means++", "random"],
    "n_init": [10, 20],
    # Optional: "max_iter": [300, 500],
}

In [None]:
# Search space for agglomerative clustering.
agglomerative_clustering = {
    "n_clusters": [2, 3, 5, 10],
    "linkage": ["ward", "complete", "average"],
    # Optional: "affinity": ["euclidean", "manhattan"],
}

In [None]:
# Search space for DBSCAN.
dbscan = {
    "eps": [0.3, 0.5, 1.0],
    "min_samples": [5, 10],
    # Optional: "metric": ["euclidean", "manhattan"],
}

## **Train & Test**

In [15]:
  def train_test_evaluate(self , scaler_type , x_train , x_test , y_train , y_test , model):
    """
        Fit ``model`` on the training data and print its train and test
        accuracy.

        scaler_type : name of the scaler that produced the data; used only
                      as a label in the printed report. (The parameter was
                      misspelled ``caler_type`` while the body read
                      ``scaler_type``, so every call ended in a NameError.)
        x_train, x_test, y_train, y_test : the data split.
        model       : estimator with ``fit`` and ``predict``.
    """

    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    y_pred_train = model.predict(x_train)
    print(f"{scaler_type} Scaler")
    print("Train Accuracy:", accuracy_score(y_train, y_pred_train))
    print("Test Accuracy:", accuracy_score(y_test, y_pred))

## **Evaluation**

In [None]:
def evaluate_classification(self , y_test, y_pred, labels=None):
    """
    Print the classification metrics and plot the confusion matrix.

    Accuracy plus support-weighted precision, recall and F1 are printed,
    followed by the full classification report.

    y_test : true labels.
    y_pred : predicted labels.
    labels : optional tick labels for the confusion-matrix axes; when omitted
             the heatmap falls back to seaborn's automatic tick labels.

    NOTE(review): this function takes ``self`` but sits at module level in
    the visible source - confirm whether it is meant to be a method.
    """
    # Metrics
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average="weighted")
    rec = recall_score(y_test, y_pred, average="weighted")
    f1 = f1_score(y_test, y_pred, average="weighted")

    print(f"Accuracy: {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall: {rec:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print("\nClassification Report:\n", classification_report(y_test, y_pred))

    # seaborn cannot handle None as tick labels, so the default (no labels
    # given) is mapped to its "auto" setting.
    tick_labels = "auto" if labels is None else labels

    # Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(6,4))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=tick_labels, yticklabels=tick_labels)
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title("Confusion Matrix")
    plt.show()

In [None]:
def evaluate_regression(self , y_test, y_pred):
    """
    Print MAE, MSE, RMSE and R² for a set of regression predictions.

    y_test : true target values.
    y_pred : predicted target values.
    """
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    print(f"MAE: {mae:.4f}")
    print(f"MSE: {mse:.4f}")
    print(f"RMSE: {rmse:.4f}")
    # The label used to be printed with mis-encoded characters.
    print(f"R² Score: {r2:.4f}")


In [None]:
def evaluate_clustering(self , X, labels, true_labels=None):
    """
    Print quality metrics for a clustering result.

    X           : the samples that were clustered.
    labels      : cluster label assigned to every sample.
    true_labels : optional ground-truth labels; when given, the Adjusted
                  Rand Index against ``labels`` is printed as well.
    """
    # Internal metrics: computed from the data and the assignment only.
    silhouette = silhouette_score(X, labels)
    davies_bouldin = davies_bouldin_score(X, labels)

    print(f"Silhouette Score: {silhouette:.4f}")
    print(f"Davies-Bouldin Index: {davies_bouldin:.4f}")

    # External metric: needs the ground truth.
    if true_labels is None:
        return

    rand_index = adjusted_rand_score(true_labels, labels)
    print(f"Adjusted Rand Index (vs true labels): {rand_index:.4f}")

# **Code**

## **Loading Data**


In [None]:
# Load the data set into a DataFrame ('file.csv' is the path read here).
data_frame = pd.read_csv('file.csv')

In [None]:
# Helper object that exposes the MachineLearning methods used below.
algorithm = MachineLearning()

## **Information**


In [None]:
# (rows, columns) of the data set.
data_frame.shape

In [None]:
# Column dtypes, non-null counts and memory usage.
data_frame.info()

In [None]:
# Per-column overview: values, dtype, unique count, nulls and duplicates.
algorithm.data_information(data_frame)

In [None]:
# Column-name lists, left empty here; to be filled in per data set.
numerical_columns = [ ]
categorical_columns = [ ]

In [None]:
# Total number of null cells in the whole data set.
print(data_frame.isnull().sum().sum())

In [None]:
# Styled summary statistics: bars for the mean, a colour gradient on
# std / median / max. The caption's leading emoji was mis-encoded and is
# restored here.
data_frame.describe().T.style.bar(subset=['mean'], color='#FFA07A').background_gradient(
    subset=['std', '50%', 'max'], cmap='Blues').set_properties(
        **{'font-size': '12pt', 'border': '1.5px solid black'}).set_caption("🔍 Summary Statistics of the Dataset")

In [None]:
# Same per-column overview again (identical call to the one above).
algorithm.data_information(data_frame)

### **Data Overview Insights**


1. Shape of data set is - rows & - columns

2. Nulls are - null values

3. There are _ integer columns, _ float columns and _ object columns

4. Outliers :

   - _ column has a maximum value of _ , much higher than the 75th percentile (_) , but it is _ a problem.

5. Target Variable: _
   - _% of _ _ (mean = _)


## **Data Analysis**

## **Preprocessing**

### **Irrelevant Columns**

### **Data type inspection**

### **Nulls**

### **Outliers**

### **Feature Engineering**

### **Feature Extraction**

### **Dimensionality Reduction**

### **Duplicates**

## **Ensemble Methods**

Bagging , Boosting , Stacking , Voting

### **Draft**

In [None]:
# # Bagging: Random Forest
# rf = Pipeline(steps=[("preprocessor", preprocessor),
#                      ("classifier", RandomForestClassifier(n_estimators=200, random_state=42))])

# # Boosting: Gradient Boosting
# gb = Pipeline(steps=[("preprocessor", preprocessor),
#                      ("classifier", GradientBoostingClassifier(n_estimators=200, random_state=42))])

# # Boosting: AdaBoost
# ab = Pipeline(steps=[("preprocessor", preprocessor),
#                      ("classifier", AdaBoostClassifier(n_estimators=200, random_state=42))])

# # Voting Ensemble (Hard Voting)
# voting = VotingClassifier(
#     estimators=[("rf", RandomForestClassifier(random_state=42)),
#                 ("gb", GradientBoostingClassifier(random_state=42)),
#                 ("lr", LogisticRegression(max_iter=1000))],
#     voting="hard"
# )

# # Stacking Ensemble
# stacking = StackingClassifier(
#     estimators=[("rf", RandomForestClassifier(random_state=42)),
#                 ("gb", GradientBoostingClassifier(random_state=42))],
#     final_estimator=LogisticRegression(max_iter=1000)
# )

## **Splitting**

## **Scaling**

## **Encoding**

## **Feature Engineering After Encoding ?**

## **Feature Selection / Correlation**

## **Best Case**

## **Models**

In [None]:
    """
    Best _ : -
    Best Parameters: { }
    Train Accuracy: _
    Test Accuracy: _
    """

## **Regularization**