# ETL Process
---
 In this part, it will 
 * read data inside 'fall2024data/'
 * convert data into one dataframe
 * change the name of features
 * drop rows which contain NaN or Inf values
 * save plots of the features to 'Features_plot/'
 * save analysis data to 'Analysis/'
 * save processed data to csv in 'Datasets/'

    'Dataset.csv'           - whole dataset of traffic\
    'BENIGN.csv'            - set of data labeled 'BENIGN'\
    'DoS_GoldenEye.csv'     - set of data labeled 'DoS_GoldenEye'\
    'DoS_Hulk.csv'          - set of data labeled 'DoS_Hulk'\
    'DoS_Slowhttptest.csv'  - set of data labeled 'DoS_Slowhttptest'
    
 ...

## Extraction
---
This part will load data from the folder and concatenate them into one DataFrame


In [None]:
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import glob
import time
from functools import wraps

In [None]:
def logged(func):
    """Decorate ``func`` so that every call is announced on stdout.

    Before the call a line with the local wall-clock time and the function
    name is printed; after the call a second line reports the wall-clock
    time again together with the elapsed seconds. The wrapped function's
    return value is passed through unchanged.

    Args:
        func: the callable to wrap.

    Returns:
        The wrapping callable (metadata copied from ``func`` via ``wraps``).
    """
    def _clock(moment):
        # Local time of day, formatted as HH:MM:SS.
        return time.strftime('%H:%M:%S', time.localtime(moment))

    @wraps(func)
    def wrapper(*args, **kwargs):
        began = time.time()
        print(f"[{_clock(began)}] Function '{func.__name__}' start.")

        outcome = func(*args, **kwargs)

        finished = time.time()
        print(f"[{_clock(finished)}] Function End, Time Elapsed: {finished - began:.4f}Sec")

        return outcome
    return wrapper

In [None]:
@logged
def get_Dset(fpath:str)->"pd.DataFrame | None":
    """Load every CSV, JSON and Parquet file directly inside ``fpath``.

    Files are discovered with ``glob`` (``*.csv``, ``*.json``, ``*.parquet``,
    in that order), each one is read into a DataFrame, and all frames are
    concatenated row-wise with a fresh 0..n-1 index. JSON files are read as
    JSON Lines (one record per line).

    Args:
        fpath (str): folder to scan (not recursive).

    Returns:
        pd.DataFrame | None: the concatenated data, or ``None`` when nothing
        could be loaded (no matching file, or a read/concat error). The
        reason is printed; callers should check for ``None``.
    """
    frames = list()
    try:
        # get csv files
        for csvfile in glob.glob(f'{fpath}/*.csv'):
            print('{:30s}'.format(csvfile), 'found')
            frames.append(pd.read_csv(csvfile, sep=','))

        # get json files
        for jsonfile in glob.glob(f'{fpath}/*.json'):
            print('{:30s}'.format(jsonfile), 'found')
            frames.append(pd.read_json(jsonfile, lines=True))

        # get parquet files
        for pqfile in glob.glob(f'{fpath}/*.parquet'):
            print('{:30s}'.format(pqfile), 'found')
            frames.append(pq.read_table(pqfile).to_pandas())

        # pd.concat([]) fails with an unhelpful "No objects to concatenate";
        # report the real cause (wrong or empty folder) instead.
        if not frames:
            print('Exception:', f"no .csv, .json or .parquet file found in '{fpath}'")
            return None

        return pd.concat(frames, ignore_index=True)

    except Exception as e:
        # Best effort by design: report the problem and hand back None.
        print('Exception:', e)
        return None
    
    

In [None]:
ids = get_Dset('fall2024data')

In [None]:
ids.shape

## Transform
---
In this part, data will be separated by its Label and processed to show some insight
* Data types conversion
* Data format conversion (cm to inches, etc.)
* Identifying errors in data
* Handling out-of-range and outlier data
* Add any other transformations you find necessary.

Also, Drop Label 'Heartbleed'


In [None]:
@logged
def drop_Heartbleed(data:pd.DataFrame):
    """Return ``data`` without the rows labelled 'Heartbleed'.

    The label is read positionally from the last column. The number of
    removed index labels is printed. ``data`` itself is not modified.

    Args:
        data (pd.DataFrame): frame whose last column holds the label.

    Returns:
        pd.DataFrame: a new frame with the matching rows removed.
    """
    is_heartbleed = data.iloc[:, -1] == 'Heartbleed'
    # Index labels of the rows to discard.
    doomed = is_heartbleed.loc[is_heartbleed].index
    print(len(doomed), 'items dropped')
    return data.drop(doomed)

df = drop_Heartbleed(ids)


### General Info about data

In [None]:
# 61117 samples with 78 features and 1 label
print(df.shape)

In [None]:
#All of features are in numerical type, thus, we don't need to transform it.
df.info()

In [None]:
print(pd.unique(df.iloc[:,-1]))

### Change the name of the features

In [None]:
@logged
def strip_cols(df:pd.DataFrame)->pd.DataFrame:
    """Return a copy of ``df`` whose column names have no surrounding whitespace.

    Each rename is printed as ``old -> new`` (both padded to 30 characters).

    Args:
        df (pd.DataFrame): frame with string column names.

    Returns:
        pd.DataFrame: new frame with the stripped column names.
    """
    tidy_names = []
    for raw_name in df.columns.to_list():
        print('{:30} ->'.format(raw_name), end= ' ')
        tidy = raw_name.strip()
        print('{:30}'.format(tidy))
        tidy_names.append(tidy)

    return df.set_axis(tidy_names, axis=1)
        

In [None]:
df_stripped = strip_cols(df)

In [None]:
df_stripped.columns

### Drop rows which contain NaN or Inf values

In [None]:
@logged
def drop_anomaly(df:pd.DataFrame)->pd.DataFrame:
    """Return ``df`` without the rows that contain NaN or +/-Inf.

    Infinite values are first turned into NaN, then every row with at least
    one NaN is removed. The number of removed rows and the labels (last
    column) of those rows, with their counts, are printed.

    The input frame is left untouched; the work is done on a copy.

    Args:
        df (pd.DataFrame): frame whose last column holds the label.

    Returns:
        pd.DataFrame: new frame with only complete, finite rows.
    """
    # Convert Inf values into NaN on a copy, so the caller's frame is not rewritten
    cleaned = df.replace([np.inf, -np.inf], np.nan)

    # Rows holding at least one NaN: exactly the rows dropna() removes below
    bad_rows = cleaned.isna().any(axis=1)
    print(int(bad_rows.sum()), 'items dropped')

    # Labels of the dropped rows and how often each occurs.
    # The mask is built from this frame itself, so it always lines up with it.
    print(np.unique(cleaned.loc[bad_rows].iloc[:, -1].to_numpy(), return_counts=True))
    return cleaned.dropna()


In [None]:
df_dropped = drop_anomaly(df_stripped)

In [None]:
df_dropped.isna().sum().sum()

In [None]:
df_dropped.shape

In [None]:
# Split the cleaned data into one frame per traffic class, selected by the
# value of the 'Label' column.
BENIGN = df_dropped.loc[df_dropped['Label'] == 'BENIGN']
DoS_GoldenEye = df_dropped.loc[df_dropped['Label'] == 'DoS GoldenEye']
DoS_Hulk = df_dropped.loc[df_dropped['Label'] == 'DoS Hulk']
DoS_Slowhttptest = df_dropped.loc[df_dropped['Label'] == 'DoS Slowhttptest']

### Plotting each feature
Just to look at the distributions

In [None]:
import matplotlib.pyplot as plt

@logged
def show_fig(target:list):
    """Save one scatter-plot figure per feature, with one subplot per frame.

    For every column except the last one (the label), a figure is created
    with ``len(target)`` stacked subplots; subplot ``i`` plots the values of
    that column in ``target[i]`` against the row position. Each figure is
    written to ``Features_plot/<column index>_<column name>.jpeg`` and the
    path is printed.

    Args:
        target (list): DataFrames sharing the same column layout, label in
            the last column. The first frame supplies the column names.
    """
    import os  # local import: only needed to create the output folder

    if not target:
        # Nothing to plot; subplots(0, 1) would raise.
        return

    out_dir = 'Features_plot'
    os.makedirs(out_dir, exist_ok=True)

    # Take the column names from the argument rather than from a global frame.
    columns = target[0].columns

    for j, col_name in enumerate(columns[:-1]):
        # squeeze=False keeps `axes` two-dimensional even for a single frame.
        fig, axes = plt.subplots(len(target), 1, constrained_layout=True, squeeze=False)
        fig.set_dpi(600)
        fig.suptitle(col_name)

        for ax, frame in zip(axes[:, 0], target):
            if len(frame):
                # Title each subplot with the label of the frame's first row.
                ax.set_title(frame.iloc[0, -1])
            ax.scatter(range(frame.shape[0]),
                       frame.iloc[:, j].to_numpy(),
                       marker='x',
                       s=5)

        # '/' cannot appear in a file name, so it is removed from the column name.
        out_path = f"{out_dir}/{j}_{col_name.replace('/', '')}.jpeg"
        fig.savefig(out_path, dpi=600)
        plt.close(fig)
        print(out_path)

In [None]:
show_fig([BENIGN, DoS_GoldenEye, DoS_Hulk, DoS_Slowhttptest])

In [None]:
df_dropped.describe().to_csv('Analysis/ids_describe.csv')

In [None]:
DoS_GoldenEye.describe().to_csv('Analysis/GoldenEye_describe.csv')
DoS_Hulk.describe().to_csv('Analysis/Hulk_describe.csv')
DoS_Slowhttptest.describe().to_csv('Analysis/Slowhttptest_describe.csv')

## Load
---


In [None]:
# Write one CSV per traffic class. 'BENIGN.csv' is listed among the pipeline
# outputs at the top of this notebook, so it is saved with the others.
BENIGN.to_csv('Datasets/BENIGN.csv')
DoS_Slowhttptest.to_csv('Datasets/DoS_Slowhttptest.csv')
DoS_Hulk.to_csv('Datasets/DoS_Hulk.csv')
DoS_GoldenEye.to_csv('Datasets/DoS_GoldenEye.csv')

In [None]:
df_dropped.to_csv('Datasets/Dataset.csv')

# Model Part
---
In this part, It will
* Read data from .csv file
* Exploratory Data Analysis
* Data Preprocessing
* Feature Engineering
* Model Selecting


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## 6. Read Data
---

In [None]:
# Reading the dataset
file_path = 'Datasets/Dataset.csv'
data = pd.read_csv(file_path, index_col=0)

## 7. Exploratory Data Analysis (EDA)
---

### 1. Identify the shape of the dataset

In [None]:
print(f"Dataset shape: {data.shape}")

### 2. Unify the columns/features names

In [None]:
data.columns = [col.strip() for col in data.columns]
data.columns

### 3. Unique values in the class label

In [None]:
print("Unique class labels:", data['Label'].unique())

### 4. Checking for missing data


In [None]:
missing_data = data.isnull().sum()
print("Missing data per column:", missing_data[missing_data > 0])

### 5. Columns with highly missing data

In [None]:
threshold = 0.5  # 50% threshold
high_missing = missing_data[missing_data / data.shape[0] > threshold]
print("Columns with >50% missing data:", high_missing)

### 6. Univariate Analysis: Statistics and Boxplots


In [None]:
import seaborn as sns

In [None]:
# Statisctics
print(data.describe())

In [None]:
# Verifying the data
print(data.info())

In [None]:
# Boxplot of every column except the last one (one box per column)
sns.boxplot(data=data.iloc[:, :-1])
plt.title("Boxplot of numerical features")
plt.show()

### 7. Bivariate Analysis: Correlation Matrix


In [None]:
# Exclude non-numeric columns
numeric_data = data.select_dtypes(include=[np.number])

# Compute the correlation matrix
corr_matrix = numeric_data.corr()

# Visualize the correlation matrix
plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', fmt=".2f")
plt.title("Correlation Matrix (Numeric Features Only)")
plt.show()

In [None]:
# Sort correlation matrix
@logged
def corr_sort(corr_matrix:pd.DataFrame, get_val=True)->pd.DataFrame:
    """Unpack a correlation matrix into feature pairs sorted by |correlation|.

    Every unordered pair of distinct features appears exactly once;
    self-correlations are left out. The ten strongest pairs are printed.

    Args:
        corr_matrix (pd.DataFrame): square correlation matrix whose rows and
            columns list the same features in the same order.
        get_val (bool, optional): when True (default) the 'Corr' column is
            included in the result; when False only the two feature-name
            columns are returned.

    Returns:
        corr_sorted (pd.DataFrame): columns 'Feature_1', 'Feature_2' and,
        if ``get_val``, 'Corr'; sorted by absolute correlation, strongest
        first, with a fresh 0..n-1 index.
    """
    names = corr_matrix.columns.to_numpy()
    values = corr_matrix.to_numpy()

    # Enumerate each pair once via the strict upper triangle (first < second).
    # This avoids a row-by-row apply and never assigns into a filtered slice,
    # which is what triggers SettingWithCopyWarning.
    first, second = np.triu_indices(len(names), k=1)
    corr_sorted = pd.DataFrame({
        'Feature_1': names[first],
        'Feature_2': names[second],
        # row = Feature_2, column = Feature_1
        'Corr': values[second, first],
    })

    # Sort by absolute value; a stable sort keeps tied pairs in matrix order.
    corr_sorted = corr_sorted.sort_values(by='Corr', key=lambda s: s.abs(),
                                          ascending=False, kind='stable')
    corr_sorted = corr_sorted.reset_index(drop=True)

    print(corr_sorted.head(10))

    if get_val:
        return corr_sorted
    return corr_sorted[['Feature_1', 'Feature_2']]

In [None]:
# Take the 10 most strongly correlated feature pairs and flatten them into a
# single array of feature names: up to 20 entries, and a feature that appears
# in several pairs is repeated.
top = corr_sort(corr_matrix, get_val=False)
top_10_corr_features = top.head(10).to_numpy().flatten()
top_10_corr_features

### 8. Multivariate Analysis: Clustering with HDBSCAN


In [None]:
from sklearn.cluster import HDBSCAN

In [None]:
# Convert to numpy array for HDBSCAN
features = numeric_data.to_numpy()

# Apply HDBSCAN clustering
hdbscan = HDBSCAN(min_samples=5000, store_centers='centroid', n_jobs=-1)
cluster_res = hdbscan.fit_predict(features)

# Visualize the clusters (using the first two numeric features)
plt.figure(figsize=(8, 6))
plt.scatter(features[:, 0], features[:, 1], c=cluster_res, cmap='viridis', s=10)
plt.title("HDBSCAN Clustering Visualization (First Two Features)")
plt.xlabel(numeric_data.columns[0])
plt.ylabel(numeric_data.columns[1])
plt.colorbar(label="Cluster")
plt.show()

# Analyze HDBSCAN results (the messages previously named DBSCAN, which is a
# different algorithm from the one fitted above)
unique_clusters = set(cluster_res.tolist())
print("Unique clusters identified by HDBSCAN:", unique_clusters)
# Label -1 marks points that were not assigned to any cluster
print("Number of outliers (label = -1):", int((cluster_res == -1).sum()))

In [None]:
# compare cluster with actual label
comparison_table = pd.crosstab(cluster_res, data['Label'])
print(comparison_table)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score

In [None]:
# map cluster with label * manually *
# (example mapping - it has to be re-checked against the crosstab above
# whenever the clustering is re-run)
cluster_to_label_map = {-1: 'Others',0: 'DoS Slowhttptest', 1: 'DoS Hulk', 2: 'BENIGN', 3: 'DoS GoldenEye'}
# The number of clusters is not fixed in advance, so any cluster id missing
# from the manual map is reported as 'Others' instead of raising KeyError.
mapped_clusters = [cluster_to_label_map.get(c, 'Others') for c in cluster_res]

# Score accuracy
accuracy = accuracy_score(data['Label'], mapped_clusters)
print("Accuracy:", accuracy)

# Visualize confusion matrix
ConfusionMatrixDisplay.from_predictions(data['Label'], mapped_clusters)

In [None]:
# result is positive,
# HDBSCAN found some structures among features
from sklearn.metrics import silhouette_score
print('Silhouette Score:', silhouette_score(data.iloc[:,:-1], cluster_res))

In [None]:
# Some insights
print("Cluster Labels:", cluster_res)
print("Probability of Membership:", hdbscan.probabilities_)
print("Centroids:", hdbscan.centroids_)
# This information can be used to determine correlations between labels and features.

### 9. Outlier Detection

In [None]:
# Calculate Q1 (25th percentile) and Q3 (75th percentile)
Q1 = numeric_data.quantile(0.25)
Q3 = numeric_data.quantile(0.75)
IQR = Q3 - Q1

# Flag every value lying outside the 1.5 * IQR fences of its own column
outlier_mask = (numeric_data < (Q1 - 1.5 * IQR)) | (numeric_data > (Q3 + 1.5 * IQR))
outliers = outlier_mask.sum()

# Display results
print("Number of outliers per column:")
print(outliers)

# outliers.sum() counts individual values, so it is compared with the number
# of values; the row-level figure is reported separately.
print("Total outliers:", outliers.sum(), 'among', numeric_data.size, 'values')
print("Rows with at least one outlier:", int(outlier_mask.any(axis=1).sum()),
      'among', numeric_data.shape[0])

# IQR Calculation
---
The interquartile range (IQR) is calculated based on numeric columns only:

$$\mathrm{IQR} = Q_3 - Q_1$$

Outliers are values outside:

$$\text{Lower Bound} = Q_1 - 1.5 \times \mathrm{IQR}$$

$$\text{Upper Bound} = Q_3 + 1.5 \times \mathrm{IQR}$$


# Justifications
---
EDA Techniques: These methods provide a comprehensive overview of the dataset.

For example, correlation matrices help identify relationships between features, while boxplots reveal outliers.

Tools: Seaborn is ideal for creating advanced visualizations with minimal code.

We used HDBSCAN from Scikit-learn for clustering since our dataset is not scaled;

Scikit-learn is also a reliable library for clustering and machine learning tasks.

# Machine Learning Pipeline

## 8 Data Preprocessing
---


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

### 1. Duplicate Removal


In [None]:
data = data.drop_duplicates()

### 2. Splitting Dataset and Unify Dtype


In [None]:
X = data.select_dtypes(include=[np.number]).astype(np.float64)  # Select Numeric features and unify datatypes
                                                                # Ensure 'Label' or any categorical columns are excluded from features
y = data['Label']  # Target variable (categorical)

### 3. Encoding Categorical Data

In [None]:
label_encoder = LabelEncoder()
print(pd.unique(y))
y = label_encoder.fit_transform(y)
print(pd.unique(y))

### 4. Train, Validation, and Test Split

In [None]:
# Train set split (70%)
X_train, X_rest, y_train, y_rest = train_test_split(X, y, train_size=0.7, random_state=42, stratify=y)

# Validation, Test set split (15% each)
# Same fixed seed as the first split, so the validation/test membership
# (and every score computed on them) is reproducible between runs.
X_valid, X_test, y_valid, y_test = train_test_split(X_rest, y_rest, test_size=0.5, random_state=42, stratify=y_rest)

### 5. Handling Missing Data


In [None]:
# Compute the mean from the training data only
train_mean = X_train.mean()

# Fill missing values using the training data mean.
# Rebinding the result (instead of inplace=True) avoids modifying frames that
# were derived from another DataFrame, which can raise SettingWithCopyWarning.
X_train = X_train.fillna(train_mean)
X_valid = X_valid.fillna(train_mean)
X_test = X_test.fillna(train_mean)
print('Missing in X_train:', X_train.isnull().sum().sum())
print('Missing in X_valid:', X_valid.isnull().sum().sum())
print('Missing in X_test:', X_test.isnull().sum().sum())

### 6. Verify the processed data


In [None]:
# Report the size of each split; each line is labelled with the split it prints.
print("X_train shape:", X_train.shape)
print("X_valid shape:", X_valid.shape)
print("X_test shape:", X_test.shape)
print("y_train unique labels:", label_encoder.classes_)

8.2 Justifications
---
Preprocessing Choices

* Duplicate Removal:
 Ensures that redundant records do not bias the model or distort statistical properties of the dataset.
 Reduces computational overhead during training.

* Encoding Categorical Data:
 LabelEncoder is used to transform categorical target labels into numeric values required for machine learning models.
 Ensures that class labels are represented consistently without introducing unnecessary dimensions.

* Splitting the Dataset:
 A 70/15/15 train/validation/test split (stratified by label) ensures sufficient data for training while keeping separate validation and test sets, balancing computational feasibility.
 This is a standard practice for general model validation.

* Handling Missing Data:
 Filling missing values with the mean is computationally efficient and maintains the dataset's distribution for numeric features.
 Ensures that no validation or test statistics are leaked during preprocessing, as the means are computed on the training set only and then applied to the validation and test sets.

* Technology Choices:
 Pandas: For efficient preprocessing operations like deduplication and handling missing values.
 Scikit-learn: For splitting datasets and encoding categorical variables, ensuring compatibility with downstream modeling.

## 9. Feature Engineering
---

In a Nutshell

* X_train, X_valid, X_test: Split datasets
* X__scaled: Standardized datasets
* X__reduced_var: Datasets of selected features, standardized
* X__reduced_RF_imp: Unscaled datasets restricted to the 20 features ranked most important by the Random Forest


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier

In [None]:
@logged
def train_RF(n_estimators=200, xtrain=X_train,
             ytrain=y_train, xtest=X_test, ytest=y_test)->RandomForestClassifier:

    """Fit a RandomForestClassifier and report binary accuracy and recall.

    The model is trained on the multi-class labels. For scoring, both the
    true and the predicted class ids are collapsed to two classes: ids >= 1
    become 1, id 0 is kept (presumably 0 is 'BENIGN' and the rest are attack
    classes - confirm against ``label_encoder.classes_``). Recall on class 1
    is then the share of those samples that the model caught.

    Args:
        n_estimators (int, optional): number of trees. Defaults to 200.
        xtrain (optional): training features. Defaults to X_train.
        ytrain (optional): training labels. Defaults to y_train.
        xtest (optional): features to evaluate on. Defaults to X_test.
        ytest (optional): labels to evaluate against. Defaults to y_test.

    Returns:
        RandomForestClassifier: trained model
    """
    # Training
    forest = RandomForestClassifier(n_estimators=n_estimators, random_state=42)
    forest.fit(xtrain, ytrain)

    # Predicting (raw multi-class predictions are printed as-is)
    predicted = forest.predict(xtest)
    print(predicted)

    # Collapse to a binary problem before scoring
    truth_binary = np.where(ytest >= 1, 1, ytest)
    predicted_binary = np.where(predicted >= 1, 1, predicted)

    # Plotting
    ConfusionMatrixDisplay.from_predictions(truth_binary, predicted_binary)

    print('Accuracy:', accuracy_score(truth_binary, predicted_binary))
    print('Recall:', recall_score(truth_binary, predicted_binary))

    return forest

In [None]:
# Reference model without any tweak
# Fit model
rf_model = train_RF(200, X_train, y_train, X_valid, y_valid)

Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardize the numeric features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)
X_test_scaled = scaler.transform(X_test)

In [None]:

print('Mean of X_train_scaled:', X_train_scaled.mean())
print('Mean of X_valid_scaled:', X_valid_scaled.mean())
print('Mean of X_test_scaled:', X_test_scaled.mean())
print('Std of X_train_scaled:', X_train_scaled.std())
print('Std of X_valid_scaled:', X_valid_scaled.std())
print('Std of X_test_scaled:', X_test_scaled.std())

Feature Selection

In [None]:
from sklearn.feature_selection import VarianceThreshold

2.1 Remove features with near-zero variance


In [None]:
# Fit VarianceThreshold on the (standardized) training data only
# NOTE(review): the input was already passed through StandardScaler, which
# rescales every non-constant feature to unit variance, so this threshold
# presumably only removes constant columns - confirm whether the filter was
# meant to run on the unscaled data.
variance_filter = VarianceThreshold(threshold=0.01)  # Keep features whose variance exceeds 0.01
X_train_reduced_var = variance_filter.fit_transform(X_train_scaled)

# Apply the same filter to X_valid and X_test
X_valid_reduced_var = variance_filter.transform(X_valid_scaled)
X_test_reduced_var = variance_filter.transform(X_test_scaled)

# The filter was fitted on a plain array, so the original column names are
# attached by hand to make get_feature_names_out() return readable names.
variance_filter.feature_names_in_ = X_train.columns
print('Selected Features:', variance_filter.get_feature_names_out())

In [None]:
# Fit model
rf_model_reduced_var = train_RF(200, X_train_reduced_var, y_train, X_valid_reduced_var, y_valid)

In [None]:
# Feature importance
importances = rf_model_reduced_var.feature_importances_
indices = np.argsort(importances)[::-1]  # Sort by importance

# The model was trained on the variance-filtered matrix, so position k in
# `importances` refers to the k-th column that survived the filter, not to
# the k-th column of X_train. Map positions through the filter's support mask.
kept_columns = X_train.columns[variance_filter.get_support()]

# Select dominant 20 features
num_features = 20
sel_features = kept_columns[indices[:num_features]]

print("Selected Features:", sel_features)

# Reduce dataset
X_train_reduced_RF_imp = X_train[sel_features]
X_valid_reduced_RF_imp = X_valid[sel_features]
X_test_reduced_RF_imp = X_test[sel_features]

In [None]:
# Train RandomForest model
rf_model_reduced_imp = train_RF(200, X_train_reduced_RF_imp, y_train, X_valid_reduced_RF_imp, y_valid)

In [None]:
# Compute correlation matrix on X_train_reduced_var
correlation_matrix = np.corrcoef(X_train_reduced_var, rowvar=False)
correlated_features = set()
threshold = 0.9

# For every pair whose absolute correlation exceeds the threshold, keep the
# earlier column and mark the later one (j) for removal.
for i in range(correlation_matrix.shape[0]):
    for j in range(i + 1, correlation_matrix.shape[1]):
        if abs(correlation_matrix[i, j]) > threshold:
            correlated_features.add(j)

print("X_train shape before filtering:", X_train_reduced_var.shape)
print("X_valid shape before filtering:", X_valid_reduced_var.shape)
print("X_test shape before filtering:", X_test_reduced_var.shape)

# The columns to drop are decided on the training set and applied to all splits
X_train_uncorrelated = np.delete(X_train_reduced_var, list(correlated_features), axis=1)
X_valid_uncorrelated = np.delete(X_valid_reduced_var, list(correlated_features), axis=1)
X_test_uncorrelated = np.delete(X_test_reduced_var, list(correlated_features), axis=1)

# Final datasets
print("X_train shape after filtering:", X_train_uncorrelated.shape)
print("X_valid shape after filtering:", X_valid_uncorrelated.shape)
print("X_test shape after filtering:", X_test_uncorrelated.shape)


In [None]:
# Fit model with uncorrelated
rf_model_uncorrelated = train_RF(200, X_train_uncorrelated, y_train, X_valid_uncorrelated, y_valid)

In [None]:
from sklearn.linear_model import LassoCV

In [None]:
# LASSO for feature selection
lasso = LassoCV(random_state=42)
lasso.fit(X_train_uncorrelated, y_train)

# Select features with non-zero coefficients
selected_features = np.where(lasso.coef_ != 0)[0]
X_train_lasso = X_train_uncorrelated[:, selected_features]
X_valid_lasso = X_valid_uncorrelated[:, selected_features]
X_test_lasso = X_test_uncorrelated[:, selected_features]

print("X_train shape after filtering:", X_train_lasso.shape)
print("X_valid shape after filtering:", X_valid_lasso.shape)
print("X_test shape after filtering:", X_test_lasso.shape)

In [None]:
# Fit model with lasso
rf_model_lasso = train_RF(200, X_train_lasso, y_train, X_valid_lasso, y_valid)

In [None]:
# Feature importance
importances = rf_model_lasso.feature_importances_
indices = np.argsort(importances)[::-1]  # Sort by importance

# rf_model_lasso was trained on X_train_lasso, i.e. on the columns left after
# three successive filters (variance -> correlation -> LASSO). Replay those
# filters on the column names so importance positions map to the right names.
var_columns = X_train.columns[variance_filter.get_support()].to_numpy()
uncorr_columns = np.delete(var_columns, list(correlated_features))
lasso_columns = pd.Index(uncorr_columns[selected_features])

# Select dominant 20 features
num_features = 20
sel_features = lasso_columns[indices[:num_features]]

print("Selected Features:", sel_features)
X_train_reduced_RF_imp2 = X_train[sel_features]
X_valid_reduced_RF_imp2 = X_valid[sel_features]
X_test_reduced_RF_imp2 = X_test[sel_features]

In [None]:
rf_model_reduced_imp2 = train_RF(200, X_train_reduced_RF_imp2, y_train, X_valid_reduced_RF_imp2, y_valid)

Feature Extraction

In [None]:
from sklearn.decomposition import PCA

# PCA for dimensionality reduction
pca = PCA(n_components=5)  # Retain 5 principal components
X_train_pca = pca.fit_transform(X_train_lasso)
X_valid_pca = pca.transform(X_valid_lasso)
# Project the *test* split (this line used to transform the training data
# again, which made X_test_pca a copy of the training projection)
X_test_pca = pca.transform(X_test_lasso)

print("X_train shape after filtering:", X_train_pca.shape)
print("X_valid shape after filtering:", X_valid_pca.shape)
print("X_test shape after filtering:", X_test_pca.shape)

In [None]:
rf_model_pca = train_RF(200, X_train_pca, y_train, X_valid_pca, y_valid)

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# LDA for dimensionality reduction
lda = LDA(n_components=3)  # Number of classes - 1
X_train_lda = lda.fit_transform(X_train_lasso, y_train)
X_valid_lda = lda.transform(X_valid_lasso)
X_test_lda = lda.transform(X_test_lasso)

print("X_train shape after filtering:", X_train_lda.shape)
print("X_valid shape after filtering:", X_valid_lda.shape)
print("X_test shape after filtering:", X_test_lda.shape)

In [None]:
rf_model_lda = train_RF(200, X_train_lda, y_train, X_valid_lda, y_valid)

# Why Random Forest ?
---
* Ease of Use: Random Forest requires less hyperparameter tuning compared to XGBoost, making it quicker to implement for feature selection.
* Feature Importance: Random Forest provides clear rankings of feature importance, which is directly used for selecting the top features.
* Efficiency: It handles large datasets with many features effectively and is robust to overfitting when used for feature selection.
* Interpretability: The feature importance scores are straightforward to interpret, unlike the complexity of interpreting gradient-boosted trees.

In [None]:
from xgboost import XGBClassifier

In [None]:
@logged
def train_XG(n_estimators=200, xtrain=X_train,
             ytrain=y_train, xtest=X_test, ytest=y_test)->XGBClassifier:

    """Fit an XGBClassifier and report binary accuracy and recall.

    The model is trained on the multi-class labels. For scoring, both the
    true and the predicted class ids are collapsed to two classes: ids >= 1
    become 1, id 0 is kept (presumably 0 is 'BENIGN' and the rest are attack
    classes - confirm against ``label_encoder.classes_``). Recall on class 1
    is then the share of those samples that the model caught.

    Args:
        n_estimators (int, optional): number of boosting rounds. Defaults to 200.
        xtrain (optional): training features. Defaults to X_train.
        ytrain (optional): training labels. Defaults to y_train.
        xtest (optional): features to evaluate on. Defaults to X_test.
        ytest (optional): labels to evaluate against. Defaults to y_test.

    Returns:
        XGBClassifier: trained model
    """
    # Training
    booster = XGBClassifier(n_estimators=n_estimators, random_state=42)
    booster.fit(xtrain, ytrain)

    # Predicting (raw multi-class predictions are printed as-is)
    predicted = booster.predict(xtest)
    print(predicted)

    # Collapse to a binary problem before scoring
    truth_binary = np.where(ytest >= 1, 1, ytest)
    predicted_binary = np.where(predicted >= 1, 1, predicted)

    # Plotting
    ConfusionMatrixDisplay.from_predictions(truth_binary, predicted_binary)

    print('Accuracy:', accuracy_score(truth_binary, predicted_binary))
    print('Recall:', recall_score(truth_binary, predicted_binary))

    return booster

In [None]:
xg_model = train_XG(200, X_train, y_train, X_valid, y_valid)

In [None]:
xg_model_reduced_var = train_XG(200, X_train_reduced_var, y_train,
                                X_valid_reduced_var, y_valid)

In [None]:
# Feature importance
importances = xg_model_reduced_var.feature_importances_
indices = np.argsort(importances)[::-1]  # Sort by importance

# The model was trained on the variance-filtered matrix, so position k in
# `importances` refers to the k-th column that survived the filter, not to
# the k-th column of X_train. Map positions through the filter's support mask.
kept_columns = X_train.columns[variance_filter.get_support()]

# Select dominant 20 features
num_features = 20
sel_features = kept_columns[indices[:num_features]]

print("Selected Features:", sel_features)
X_train_reduced_XG_imp = X_train[sel_features]
X_valid_reduced_XG_imp = X_valid[sel_features]
X_test_reduced_XG_imp = X_test[sel_features]

In [None]:
xg_model_reduced_imp = train_XG(200, X_train_reduced_XG_imp, y_train, X_valid_reduced_XG_imp, y_valid)

In [None]:
xg_model_uncorrelated = train_XG(200, X_train_uncorrelated, y_train, X_valid_uncorrelated, y_valid)

In [None]:
xg_model_lasso = train_XG(200, X_train_lasso, y_train, X_valid_lasso, y_valid)


In [None]:
xg_model_pca = train_XG(200, X_train_pca, y_train, X_valid_pca, y_valid)

In [None]:
xg_model_lda = train_XG(200, X_train_lda, y_train, X_valid_lda, y_valid)

# Why XGBoost?
---
* High Performance: XGBoost often achieves superior accuracy due to its sophisticated boosting algorithm, making it ideal for complex datasets.
* Handles Non-Linear Features: XGBoost excels at capturing intricate relationships between features, making it suitable for datasets with non-linear dependencies.
* Feature Importance: Like Random Forest, XGBoost provides feature importance rankings but also supports advanced metrics like SHAP values for deeper interpretability.
* Efficiency with Sparse Data: XGBoost is optimized for handling missing or sparse data, making it robust in real-world scenarios.
* Customizability: It offers extensive hyperparameter tuning options, allowing for precise control over model behavior and better optimization.

In [None]:
# Final datasets after feature engineering
# (each line is labelled with the variable whose shape it prints)
print("X_train_lasso shape:", X_train_lasso.shape)
print("X_valid_lasso shape:", X_valid_lasso.shape)
print("X_test_lasso shape:", X_test_lasso.shape)

In [None]:
# Convert processed arrays to DataFrames
X_train_df = pd.DataFrame(X_train_lasso, columns=[f"Lasso_{i+1}" for i in range(X_train_lasso.shape[1])])
X_valid_df = pd.DataFrame(X_valid_lasso, columns=[f"Lasso_{i+1}" for i in range(X_valid_lasso.shape[1])])
X_test_df = pd.DataFrame(X_test_lasso, columns=[f"Lasso_{i+1}" for i in range(X_test_lasso.shape[1])])

# Include target variable
X_train_df['Label'] = y_train
X_valid_df['Label'] = y_valid
X_test_df['Label'] = y_test

# Save to CSV
X_train_df.to_csv("processed_X_train.csv", index=False)
X_valid_df.to_csv("processed_X_valid.csv", index=False)
X_test_df.to_csv("processed_X_test.csv", index=False)

print("Processed data saved to 'processed_X_train.csv', 'processed_X_valid.csv', and 'processed_X_test.csv'")


In [None]:
# Save to Parquet for more efficient storage
X_train_df.to_parquet("processed_X_train.parquet", index=False)
X_valid_df.to_parquet("processed_X_valid.parquet", index=False)
X_test_df.to_parquet("processed_X_test.parquet", index=False)

# The message lists the three files actually written above
print("Processed data saved to 'processed_X_train.parquet', 'processed_X_valid.parquet', and 'processed_X_test.parquet'")


# Model Selection and Training
---

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [None]:
@logged
def mass_train(name, model, xtrain, ytrain, xtest, yvalid, results):
    """Fit ``model``, score it as a binary classifier and record the metrics.

    Both the true labels and the predictions are collapsed to two classes
    (ids >= 1 become 1, id 0 is kept) before accuracy and weighted
    precision / recall / F1 are computed. The metrics are stored under
    ``results[name]`` and printed on one line.

    Args:
        name: key under which the metrics are stored in ``results``.
        model: estimator exposing ``fit`` and ``predict``.
        xtrain: training features.
        ytrain: training labels.
        xtest: features of the evaluation split.
        yvalid: labels of the evaluation split.
        results (dict): updated in place with the metrics of this model.
    """
    print(f"Training {model}...")
    model.fit(xtrain, ytrain)
    raw_pred = model.predict(xtest)  # predictions for the evaluation split

    # Collapse to a binary problem before scoring
    truth_binary = np.where(yvalid >= 1, 1, yvalid)
    pred_binary = np.where(raw_pred >= 1, 1, raw_pred)

    scores = {
        "Accuracy": accuracy_score(truth_binary, pred_binary),
        "Precision": precision_score(truth_binary, pred_binary, average="weighted"),
        "Recall": recall_score(truth_binary, pred_binary, average="weighted"),
        "F1 Score": f1_score(truth_binary, pred_binary, average="weighted"),
    }
    results[name] = scores

    print(f"{name} - Accuracy: {scores['Accuracy']:.4f}, Precision: {scores['Precision']:.4f}, Recall: {scores['Recall']:.4f}, F1 Score: {scores['F1 Score']:.4f}")

In [None]:
# Define models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Support Vector Machine": SVC(kernel="linear", random_state=42),
}

In [None]:
# Initialize a dictionary to store performance metrics
results = {}

for name, model in models.items():
    mass_train(name, model, X_train_lasso, y_train, X_valid_lasso, y_valid, results)


In [None]:
# Convert results to a DataFrame for easier comparison
results_df = pd.DataFrame(results).T
print("Model Comparison:")
print(results_df)

In [None]:
from sklearn.model_selection import GridSearchCV

# Example: tuning XGBoost (the estimator passed to GridSearchCV below is
# XGBClassifier) over the number of trees and the maximum tree depth
param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [5, 15, None],
}

# 5-fold cross-validation on the training split, ranked by weighted recall
grid_search = GridSearchCV(XGBClassifier(), param_grid, cv=5
                           , scoring="recall_weighted", verbose=2)
grid_search.fit(X_train_lasso, y_train)

# Get best parameters and score
print("Best Parameters:", grid_search.best_params_)
print("Best Recall Score:", grid_search.best_score_)

In [None]:
xg_model_final = XGBClassifier(n_estimators=100, max_depth=5, random_state=42)
xg_model_final.fit(X_train_lasso, y_train)
y_pred = xg_model_final.predict(X_test_lasso)

In [None]:
y_class = np.where(y_test >= 1, 1, y_test)
y_pred = np.where(y_pred >= 1, 1, y_pred)

# Evaluate performance
accuracy = accuracy_score(y_class, y_pred)
precision = precision_score(y_class, y_pred, average="weighted")
recall = recall_score(y_class, y_pred, average="weighted")
f1 = f1_score(y_class, y_pred, average="weighted")

print(f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")

In [None]:
ConfusionMatrixDisplay.from_predictions(y_class, y_pred)