<a href="https://colab.research.google.com/github/MorganChidley/Final-Year-Project/blob/main/ml_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# EDA/Feature Engineering

**Importing original dataset**

In [1]:
import pandas as pd

# Load the dataset
file_path = "MyDataSET.csv"
data = pd.read_csv(file_path)

# Display the first few rows of the data
data.head()

Unnamed: 0,URL,ClassLabel
0,https://keraekken-loagginnusa.godaddysites.com/,0
1,https://metamsk01lgiix.godaddysites.com/,0
2,http://myglobaltech.in/,0
3,http://djtool-for-spotify.com/,0
4,https://scearmcoommunnlty.com/invent/freind/get,0


**Displaying number of Rows and Columns**

In [2]:
import pandas as pd

# Load the dataset
file_path = "MyDataSET.csv"
data = pd.read_csv(file_path)

# Get the shape of the data (number of rows and columns)
data.shape

(905, 2)

**Checking dataset for missing values**

In [3]:
import pandas as pd

# Load the dataset
file_path = "MyDataSET.csv"
data= pd.read_csv(file_path)

# Check for missing values
data.isnull().sum()

Unnamed: 0,0
URL,0
ClassLabel,0


**Removing duplicates from dataset**

In [4]:
import pandas as pd

# Load the dataset
file_path = "MyDataSET.csv"
data= pd.read_csv(file_path)

# Remove duplicate rows
data.drop_duplicates(inplace=True)

# Check the new shape of the dataset
data.shape



(903, 2)

**Feature Extraction**

In [5]:
import pandas as pd
import re
from urllib.parse import urlparse

# Feature extraction functions

def url_length(url):
    """Returns the length of the URL."""
    return len(url)

def has_ip_address(url):
    """Checks if the URL contains an IP address."""
    ip_pattern = r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}"
    return 1 if re.search(ip_pattern, url) else 0

def count_special_chars(url):
    """Counts occurrences of special characters."""
    special_chars = ["-", "_", "%", "/", ".", "#"]
    return {char: url.count(char) for char in special_chars}

def count_subdomains(url):
    """Counts the number of subdomains."""
    parsed_url = urlparse(url).netloc
    return parsed_url.count('.')

def has_https(url):
    """Checks if the URL uses HTTPS."""
    return 1 if urlparse(url).scheme == "https" else 0


def count_query_parameters(url):
    """Counts the number of query parameters in the URL."""
    query = urlparse(url).query
    return len(query.split("&")) if query else 0

# Apply feature extraction

data["url_length"] = data["URL"].apply(url_length)
data["subdomain_count"] = data["URL"].apply(count_subdomains)
data["https"] = data["URL"].apply(has_https)
data["has_ip_address"] = data["URL"].apply(has_ip_address)
data["query_parameters_count"] = data["URL"].apply(count_query_parameters)

# Extract special character counts only once
if not all(char in data.columns for char in ["-", "_", "%", "/", ".", "#"]):
    special_chars_df = data["URL"].apply(count_special_chars).apply(pd.Series)
    data = pd.concat([data, special_chars_df], axis=1)

# Remove duplicate columns
data = data.loc[:, ~data.columns.duplicated()]

# Converts all feature changes to a new csv file
data.to_csv("modified_dataset.csv", index=False)

# Display first few rows with new features
data.head()

Unnamed: 0,URL,ClassLabel,url_length,subdomain_count,https,has_ip_address,query_parameters_count,-,_,%,/,.,#
0,https://keraekken-loagginnusa.godaddysites.com/,0,47,2,1,0,0,1,0,0,3,2,0
1,https://metamsk01lgiix.godaddysites.com/,0,40,2,1,0,0,0,0,0,3,2,0
2,http://myglobaltech.in/,0,23,1,0,0,0,0,0,0,3,1,0
3,http://djtool-for-spotify.com/,0,30,1,0,0,0,2,0,0,3,1,0
4,https://scearmcoommunnlty.com/invent/freind/get,0,47,1,1,0,0,0,0,0,5,1,0


**Deploying Histogram**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#Load Dataset
data = pd.read_csv("modified_dataset.csv")

# Drop the 'ClassLabel' column
data = data.drop('ClassLabel', axis=1)

data.hist(figsize=(15, 30))  # Adjust figsize as needed
plt.show()

**Deploying Correlation matrix**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#Load Dataset
data = pd.read_csv("modified_dataset.csv")

# Drop the 'ClassLabel' column
data = data.drop('ClassLabel', axis=1)

# Select numeric columns
numeric_data = data.select_dtypes(include=np.number)

# Calculate correlation matrix
correlation_matrix = numeric_data.corr()

# Display correlation matrix as a styled table
correlation_table = correlation_matrix.style.background_gradient(cmap='coolwarm')

correlation_table

**Deploying Mutual Information**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import mutual_info_classif
from IPython.display import display # Import for display function

# Load dataset
data = pd.read_csv("modified_dataset.csv")

X = data.drop(['ClassLabel', "URL"], axis=1)
y = data['ClassLabel']

# Mutual Information
mutual_info = mutual_info_classif(X, y)
feature_scores = pd.DataFrame({'Feature': X.columns, 'Mutual_Information': mutual_info})
feature_scores = feature_scores.sort_values(by=['Mutual_Information'], ascending=False)


# Display mutual information scores in a table
display(feature_scores)



# Baseline Model Development

**Training Models with basic parameters**

In [6]:
from pickle import TRUE
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from IPython.display import display

# Load Dataset
data = pd.read_csv("modified_dataset.csv")

# Feature Selection
features = ['url_length', 'subdomain_count', 'https', "-", "_", "%", "/", ".", "#",
            "has_ip_address", "query_parameters_count"]
X = data[features]
y = data['ClassLabel']  # Target variable

# Handle missing values (fill with median)
X.fillna(X.median(), inplace=True)

# Feature Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Train and evaluate multiple models
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "SVM": SVC(probability=True),
    "MLP": MLPClassifier(hidden_layer_sizes=(100,), max_iter=500)
}

# Cross-validation setup
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Evaluate models using cross-validation
kfold_results = {}
for model_name, model in models.items():
    scores = cross_val_score(model, X_scaled, y, cv=kf, scoring='accuracy')
    kfold_results[model_name] = {
        "Model": model_name,
        "Mean Accuracy": np.mean(scores),
        "Standard Deviation": np.std(scores)
    }

# Store results in a list of dictionaries
results = []
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
# Display the DataFrame

   # Extract precision, recall, and F1-score for each class
    report_dict = classification_report(y_test, y_pred, output_dict=True)
    precision = report_dict['weighted avg']['precision']
    recall = report_dict['weighted avg']['recall']
    f1_score = report_dict['weighted avg']['f1-score']

    results.append({
        'Model': name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1_score,
        'ROC-AUC': roc_auc,
        'Confusion Matrix': conf_matrix,
    })

# Create a Pandas DataFrame from the results
results_df = pd.DataFrame(results)
kfold_results_df = pd.DataFrame(kfold_results.values())

# Reorder columns
kfold_results_df = kfold_results_df[['Model', 'Mean Accuracy', 'Standard Deviation']]

# Display the DataFrame
display(results_df)
display(kfold_results_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.fillna(X.median(), inplace=True)


Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score,ROC-AUC,Confusion Matrix
0,Logistic Regression,0.906077,0.921702,0.906077,0.909989,0.963705,"[[128, 14], [3, 36]]"
1,Decision Tree,0.961326,0.963061,0.961326,0.961836,0.985645,"[[137, 5], [2, 37]]"
2,Random Forest,0.966851,0.966851,0.966851,0.966851,0.994222,"[[139, 3], [3, 36]]"
3,SVM,0.961326,0.961761,0.961326,0.961502,0.993319,"[[138, 4], [3, 36]]"
4,MLP,0.955801,0.958564,0.955801,0.956563,0.994583,"[[136, 6], [2, 37]]"


Unnamed: 0,Model,Mean Accuracy,Standard Deviation
0,Logistic Regression,0.947968,0.021678
1,Decision Tree,0.956839,0.012725
2,Random Forest,0.964579,0.010789
3,SVM,0.961246,0.006042
4,MLP,0.964567,0.004384


# Hyperparameter Tuning

**Hyperparamter Tuning using grid search**

In [7]:
from pickle import TRUE
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from IPython.display import display

# Load Dataset
data = pd.read_csv("modified_dataset.csv")

# Feature Selection
features = ['url_length', 'subdomain_count', 'https', "-", "_", "%", "/", ".", "#",
            "has_ip_address", "query_parameters_count"]
X = data[features]
y = data['ClassLabel']  # Target variable

# Handle missing values (fill with median)
X.fillna(X.median(), inplace=True)

# Feature Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Define models and parameter grids for tuning
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "SVM": SVC(probability=True),
    "MLP": MLPClassifier(hidden_layer_sizes=(100,), max_iter=500)
}

# Define models and parameter grids for tuning
param_grids = {
    "Logistic Regression": {
        'C': [0.01, 0.1, 1, 10, 100],
        'solver': ['liblinear', 'lbfgs']
    },
    "Decision Tree": {
        'max_depth': [5, 10, 20, None],
        'min_samples_split': [2, 5, 10]
    },
    "Random Forest": {
        'n_estimators': [50, 100, 200],
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5, 10]
    },
    "SVM": {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf']
    },
    "MLP": {
        'hidden_layer_sizes': [(50,), (100,), (100, 50)],
        'activation': ['relu', 'tanh'],
        'solver': ['adam', 'sgd'],
        'alpha': [0.0001, 0.001, 0.01]
    }
}


# Perform hyperparameter tuning and evaluate models
cv_results = {}
for model_name, param_grid in param_grids.items():
    print(f"Tuning {model_name}...")
    model = models[model_name]  # Get model instance from dictionary
    grid_search = GridSearchCV(model, param_grid, cv=kf, scoring='accuracy', n_jobs=-1, verbose=2)
    grid_search.fit(X_scaled, y)

    # Store results
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_
    cv_results[model_name] = {
        "Best Parameters": best_params,
        "Best Accuracy": best_score
    }

# Print and log results
log_data = []
for model, result in cv_results.items():
    print(f"{model}: Best Accuracy = {result['Best Accuracy']:.4f}, Best Parameters = {result['Best Parameters']}")
    log_data.append([model, result['Best Accuracy'], result['Best Parameters']])

# Save tuning results to CSV
df_log = pd.DataFrame(log_data, columns=['Model', 'Best Accuracy', 'Best Parameters'])
df_log.to_csv("Model_Tuning_Results.csv", index=False)

print("Hyperparameter tuning complete! Results saved in Model_Tuning_Results.csv")


Tuning Logistic Regression...
Fitting 5 folds for each of 10 candidates, totalling 50 fits


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.fillna(X.median(), inplace=True)


Tuning Decision Tree...
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Tuning Random Forest...
Fitting 5 folds for each of 27 candidates, totalling 135 fits
Tuning SVM...
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Tuning MLP...
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Logistic Regression: Best Accuracy = 0.9568, Best Parameters = {'C': 0.1, 'solver': 'lbfgs'}
Decision Tree: Best Accuracy = 0.9602, Best Parameters = {'max_depth': 10, 'min_samples_split': 10}
Random Forest: Best Accuracy = 0.9679, Best Parameters = {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 100}
SVM: Best Accuracy = 0.9723, Best Parameters = {'C': 10, 'kernel': 'rbf'}
MLP: Best Accuracy = 0.9734, Best Parameters = {'activation': 'relu', 'alpha': 0.001, 'hidden_layer_sizes': (100, 50), 'solver': 'adam'}
Hyperparameter tuning complete! Results saved in Model_Tuning_Results.csv


**Model Optimisation, Validation Curves, Over/Under fitting**

In [None]:
from pickle import TRUE
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import learning_curve
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from IPython.display import display

# Load Dataset
data = pd.read_csv("modified_dataset.csv")

# Feature Selection
features = ['url_length', 'subdomain_count', 'https', "-", "_", "%", "/", ".", "#",
            "has_ip_address", "query_parameters_count"]
X = data[features]
y = data['ClassLabel']  # Target variable

# Handle missing values (fill with median)
X.fillna(X.median(), inplace=True)

# Feature Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Define models and parameter grids for tuning
models = {
    "Logistic Regression": LogisticRegression(C=0.1, solver='lbfgs'),
    "Decision Tree": DecisionTreeClassifier(max_depth=10, min_samples_split=10),
    "Random Forest": RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split=10),
    "SVM": SVC(C=10, kernel='rbf'),
    "MLP": MLPClassifier(hidden_layer_sizes=(100,50), activation='tanh', solver='adam', alpha=0.001, max_iter=500)
}

# Function to plot learning curves
def plot_learning_curve(model, X, y, model_name):
    train_sizes, train_scores, val_scores = learning_curve(model, X, y, cv=5, scoring='accuracy', n_jobs=-1, train_sizes=np.linspace(0.1, 1.0, 10))

    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    val_mean = np.mean(val_scores, axis=1)
    val_std = np.std(val_scores, axis=1)

    plt.figure(figsize=(8, 5))
    plt.plot(train_sizes, train_mean, 'o-', color='blue', label='Training Accuracy')
    plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, color='blue', alpha=0.2)
    plt.plot(train_sizes, val_mean, 'o-', color='red', label='Validation Accuracy')
    plt.fill_between(train_sizes, val_mean - val_std, val_mean + val_std, color='red', alpha=0.2)

    plt.xlabel("Training Set Size")
    plt.ylabel("Accuracy")
    plt.title(f"Learning Curve: {model_name}")
    plt.legend()
    plt.show()

# Plot learning curves for all models
for model_name, model in models.items():
    plot_learning_curve(model, X_scaled, y, model_name)

# Feature Importance (for Decision Tree & Random Forest)
def plot_feature_importance(model, model_name):
    feature_importance = model.feature_importances_
    sorted_idx = np.argsort(feature_importance)
    plt.figure(figsize=(8, 5))
    plt.barh(np.array(features)[sorted_idx], feature_importance[sorted_idx], color='teal')
    plt.xlabel("Feature Importance")
    plt.title(f"Feature Importance: {model_name}")
    plt.show()

# Train tree-based models and plot feature importance
tree_models = ["Decision Tree", "Random Forest"]
for model_name in tree_models:
    model = models[model_name].fit(X_scaled, y)
    plot_feature_importance(model, model_name)

print("Analysis complete! Check the plots for insights.")



**Validation for performance esitmates**

In [None]:
from pickle import TRUE
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import learning_curve
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from IPython.display import display
from sklearn.pipeline import Pipeline

# Load Dataset
data = pd.read_csv("modified_dataset.csv")

# Feature Selection
features = ['url_length', 'subdomain_count', 'https', "-", "_", "%", "/", ".", "#",
            "has_ip_address", "query_parameters_count"]
X = data[features]
y = data['ClassLabel']  # Target variable

# Handle missing values (fill with median)
X.fillna(X.median(), inplace=True)

# Feature Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

from sklearn.model_selection import train_test_split

# Split training data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Define models and parameter grids for tuning
models = {
    "Logistic Regression": LogisticRegression(C=0.1, solver='lbfgs'),
    "Decision Tree": DecisionTreeClassifier(max_depth=10, min_samples_split=10),
    "Random Forest": RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split=10),
    "SVM": SVC(C=10, kernel='rbf'),
    "MLP": MLPClassifier(hidden_layer_sizes=(100,50), activation='tanh', solver='adam', alpha=0.001, max_iter=500)
}

# Define models with pipelines
models = {
    "Logistic Regression": Pipeline([
        ('scaler', StandardScaler()),
        ('model', LogisticRegression(C=0.1, solver='lbfgs'))
    ]),
    "Decision Tree": DecisionTreeClassifier(max_depth=10, min_samples_split=10),
    "Random Forest": RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split=10),
    "SVM": Pipeline([
        ('scaler', StandardScaler()),
        ('model', SVC(C=10, kernel='rbf'))
    ]),
    "MLP": Pipeline([
        ('scaler', StandardScaler()),
        ('model', MLPClassifier(hidden_layer_sizes=(100,), activation='tanh', solver='adam', alpha=0.001, max_iter=500))
    ])
}

# Nested Cross-Validation Setup
outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)
inner_cv = KFold(n_splits=3, shuffle=True, random_state=42)

# Perform Nested CV Evaluation
cv_results = {}
for model_name, model in models.items():
    print(f"Evaluating {model_name} using Nested Cross-Validation...")
    scores = cross_val_score(model, X_train, y_train, cv=outer_cv, scoring='accuracy', n_jobs=-1)
    mean_score, std_score = np.mean(scores), np.std(scores)

    # Store results
    cv_results[model_name] = {
        "Mean Accuracy": mean_score,
        "Std Deviation": std_score
    }

# Print results
for model, result in cv_results.items():
    print(f"{model}: Mean Accuracy = {result['Mean Accuracy']:.4f}, Std Dev = {result['Std Deviation']:.4f}")

# Evaluate final models on the validation set
val_results = {}
for model_name, model in models.items():
       model.fit(X_train, y_train)
       val_accuracy = model.score(X_val, y_val)  # Use X_val and y_val here
       val_results[model_name] = val_accuracy


# Print validation results
print("\nFinal Validation Set Performance:")
for model, acc in val_results.items():
    print(f"{model}: Validation Accuracy = {acc:.4f}")

print("Nested Cross-Validation and Validation Set Evaluation Complete!")




# Advanced Modelling & Ensemble Techniques

**Implementing Ensemble methods**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
import xgboost as xgb
import lightgbm as lgb
from sklearn.pipeline import Pipeline

# Load Dataset
data = pd.read_csv("modified_dataset.csv")

# Feature Selection
features = ['url_length', 'subdomain_count', 'https', "-", "_", "%", "/", ".", "#",
            "has_ip_address", "query_parameters_count"]
X = data[features]
y = data['ClassLabel']  # Target variable

# Handle missing values (fill with median)
X.fillna(X.median(), inplace=True)

# Feature Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data into training and test sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Define base models for stacking
base_models = [
    ('rf', RandomForestClassifier(n_estimators=100, max_depth=20, min_samples_split=5)),
    ('svm', SVC(C=1, kernel='rbf', probability=True)),
    ('mlp', MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='adam', alpha=0.001, max_iter=500))
]

# Meta-model (Logistic Regression) for stacking
stacking_model = StackingClassifier(estimators=base_models, final_estimator=LogisticRegression())

# Boosting models
xgb_model = xgb.XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=6, use_label_encoder=False, eval_metric='logloss')
lgb_model = lgb.LGBMClassifier(n_estimators=100, learning_rate=0.1, max_depth=6)

# Train and evaluate ensemble models
ensemble_models = {
    "Stacking Classifier": stacking_model,
    "XGBoost": xgb_model,
    "LightGBM": lgb_model
}

ensemble_results = {}
for model_name, model in ensemble_models.items():
    model.fit(X_train, y_train)
    val_accuracy = model.score(X_val, y_val)
    ensemble_results[model_name] = val_accuracy

# Print validation results
print("\nEnsemble Model Performance on Validation Set:")
for model, acc in ensemble_results.items():
    print(f"{model}: Validation Accuracy = {acc:.4f}")

print("Ensemble Learning Implementation Complete!")


**Cross-validation to benchmark the performance of ensemble models against individual models**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
import xgboost as xgb
import lightgbm as lgb
from sklearn.pipeline import Pipeline

# Load Dataset
data = pd.read_csv("modified_dataset.csv")

# Feature Selection
features = ['url_length', 'subdomain_count', 'https', "-", "_", "%", "/", ".", "#",
            "has_ip_address", "query_parameters_count"]
X = data[features]
y = data['ClassLabel']  # Target variable

# Handle missing values (fill with median)
X.fillna(X.median(), inplace=True)

# Feature Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data into training and test sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Define individual models
individual_models = {
    "Logistic Regression": Pipeline([
        ('scaler', StandardScaler()),
        ('model', LogisticRegression(C=1, solver='lbfgs'))
    ]),
    "Decision Tree": DecisionTreeClassifier(max_depth=10, min_samples_split=10,),
    "Random Forest": RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split=10),
    "SVM": Pipeline([
        ('scaler', StandardScaler()),
        ('model', SVC(C=10, kernel='rbf', probability=True))
    ]),
    "MLP": Pipeline([
        ('scaler', StandardScaler()),
        ('model', MLPClassifier(hidden_layer_sizes=(100,50), activation='tanh', solver='adam', alpha=0.001, max_iter=500))
    ])
}

# Define ensemble models
base_models = [
    ('rf', RandomForestClassifier(n_estimators=100, max_depth=20, min_samples_split=5)),
    ('svm', SVC(C=1, kernel='rbf', probability=True)),
    ('mlp', MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='adam', alpha=0.001, max_iter=500))
]
stacking_model = StackingClassifier(estimators=base_models, final_estimator=LogisticRegression())
xgb_model = xgb.XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=6, use_label_encoder=False, eval_metric='logloss')
lgb_model = lgb.LGBMClassifier(n_estimators=100, learning_rate=0.1, max_depth=6)

ensemble_models = {
    "Stacking Classifier": stacking_model,
    "XGBoost": xgb_model,
    "LightGBM": lgb_model
}

# Cross-validation setup
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Evaluate individual models
cv_results = {}
for model_name, model in {**individual_models, **ensemble_models}.items():
    print(f"Evaluating {model_name} using Cross-Validation...")
    scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy', n_jobs=-1)
    mean_score, std_score = np.mean(scores), np.std(scores)

    # Store results
    cv_results[model_name] = {
        "Mean Accuracy": mean_score,
        "Std Deviation": std_score
    }

# Print results
print("\nCross-Validation Performance Comparison:")
for model, result in cv_results.items():
    print(f"{model}: Mean Accuracy = {result['Mean Accuracy']:.4f}, Std Dev = {result['Std Deviation']:.4f}")

print("Cross-Validation Benchmarking Complete!")





In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
import pickle

# Load the dataset
data = pd.read_csv("modified_dataset.csv")

# Feature Selection
features = ['url_length', 'subdomain_count', 'https', "-", "_", "%", "/", ".", "#", "has_ip_address", "query_parameters_count"]
X = data[features]
y = data['ClassLabel']

# Handle missing values (fill with median)
X.fillna(X.median(), inplace=True)

# Feature Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Initialize and train the SVM model with the best parameters
svm_model = SVC(C=1, kernel='rbf', probability=True)
svm_model.fit(X_train, y_train)

# Save the trained model as a pickle file
filename = 'trained_svm_model.pkl'
pickle.dump(svm_model, open(filename, 'wb'))

print(f"Trained SVM model saved as {filename}")

Trained SVM model saved as trained_svm_model.pkl


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.fillna(X.median(), inplace=True)
