In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import accuracy_score

In [3]:
from sklearn.datasets import load_breast_cancer, load_iris
# Load breast cancer dataset from scikit-learn
breast_cancer_data = load_breast_cancer()
breast_cancer_df = pd.DataFrame(data=breast_cancer_data.data, columns=breast_cancer_data.feature_names)
breast_cancer_df['target'] = breast_cancer_data.target

# Load iris dataset from scikit-learn
iris_data = load_iris()
iris_df = pd.DataFrame(data=iris_data.data, columns=iris_data.feature_names)
iris_df['target'] = iris_data.target

# Load phishing dataset from local CSV file
phishing_df = pd.read_csv(r'C:\Users\diljo\Downloads\Phishing_Legitimate_full.csv')

# Display the shape of each dataset
print("Breast Cancer Dataset Shape:", breast_cancer_df.shape)
print("Iris Dataset Shape:", iris_df.shape)
print("Phishing Dataset Shape:", phishing_df.shape)

Breast Cancer Dataset Shape: (569, 31)
Iris Dataset Shape: (150, 5)
Phishing Dataset Shape: (10000, 50)


In [4]:
# Breast Cancer dataset
X_bc = breast_cancer_df.drop('target', axis=1)
y_bc = breast_cancer_df['target']
X_bc_train, X_bc_test, y_bc_train, y_bc_test = train_test_split(X_bc, y_bc, test_size=0.2, random_state=42)

# Train XGBoost model on Breast Cancer dataset
bc_dtrain = xgb.DMatrix(X_bc_train, label=y_bc_train)
bc_dtest = xgb.DMatrix(X_bc_test, label=y_bc_test)
params_bc = {'objective': 'binary:logistic', 'eval_metric': 'error'}
num_rounds_bc = 100
xgb_model_bc = xgb.train(params_bc, bc_dtrain, num_rounds_bc)

In [5]:
# Predictions on Breast Cancer test set
y_bc_pred = xgb_model_bc.predict(bc_dtest)
y_bc_pred_labels = [round(value) for value in y_bc_pred]
bc_accuracy = accuracy_score(y_bc_test, y_bc_pred_labels)
print("Breast Cancer XGBoost Accuracy:", bc_accuracy)

Breast Cancer XGBoost Accuracy: 0.956140350877193


In [6]:
# Iris dataset
X_iris = iris_df.drop('target', axis=1)
y_iris = iris_df['target']
X_iris_train, X_iris_test, y_iris_train, y_iris_test = train_test_split(X_iris, y_iris, test_size=0.2, random_state=42)

In [7]:
# Train XGBoost model on Iris dataset
iris_dtrain = xgb.DMatrix(X_iris_train, label=y_iris_train)
iris_dtest = xgb.DMatrix(X_iris_test, label=y_iris_test)
params_iris = {'objective': 'multi:softmax', 'num_class': 3, 'eval_metric': 'merror'}
num_rounds_iris = 100
xgb_model_iris = xgb.train(params_iris, iris_dtrain, num_rounds_iris)

In [8]:
# Predictions on Iris test set
y_iris_pred = xgb_model_iris.predict(iris_dtest)
iris_accuracy = accuracy_score(y_iris_test, y_iris_pred)
print("Iris XGBoost Accuracy:", iris_accuracy)

Iris XGBoost Accuracy: 1.0


In [12]:
# Split dataset into features (X) and target variable (y)
X_phishing = phishing_df.drop(columns=['CLASS_LABEL'])
y_phishing = phishing_df['CLASS_LABEL']

# Split data into training and testing sets
X_phishing_train, X_phishing_test, y_phishing_train, y_phishing_test = train_test_split(X_phishing, y_phishing, test_size=0.3, random_state=42)

# Train XGBoost model
phishing_dtrain = xgb.DMatrix(X_phishing_train, label=y_phishing_train)
phishing_dtest = xgb.DMatrix(X_phishing_test, label=y_phishing_test)
params_phishing = {'objective': 'binary:logistic', 'eval_metric': 'error'}
num_rounds_phishing = 24
xgb_model_phishing = xgb.train(params_phishing, phishing_dtrain, num_rounds_phishing)

In [14]:
# Make predictions on the test data
y_pred_phishing = xgb_model_phishing.predict(phishing_dtest)

# Convert probabilities to binary predictions (0 or 1)
y_pred_binary_phishing = [1 if pred > 0.5 else 0 for pred in y_pred_phishing]

# Calculate accuracy
accuracy_phishing = sum(y_pred_binary_phishing == y_phishing_test) / len(y_phishing_test)
print("Phishing Xgboost accuracy:", accuracy_phishing)

Phishing Xgboost accuracy: 1.0
