In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler


# Load the phishing dataset; the 'id' column is dropped before modelling.
df = pd.read_csv("uci-ml-phishing-dataset.csv").drop(columns=['id'])

# Separate the 'Result' target from the remaining feature columns.
y = df['Result']
X = df.drop(columns=['Result'])

# Hold out 10% of the rows for testing, stratified on the target, with a
# fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    stratify=y,
    random_state=42,
)

# Report the class counts in the training split (checks for imbalance).
print("Class distribution before undersampling:", Counter(y_train), sep="\n")

Class distribution before undersampling:
Counter({1: 5541, -1: 4408})


In [3]:
# Balance the classes by random undersampling. Only the training split is
# passed to the sampler; X_test / y_test are left untouched.
undersample = RandomUnderSampler(random_state=42, sampling_strategy='auto')
X_resampled, y_resampled = undersample.fit_resample(X_train, y_train)

# Show the class counts after resampling to verify the balance.
print("\nClass distribution after undersampling:", Counter(y_resampled), sep="\n")


Class distribution after undersampling:
Counter({-1: 4408, 1: 4408})


In [4]:
import pandas as pd
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Number of features RFE should keep; also used in the report line below so
# the message cannot drift from the actual setting.
N_FEATURES_TO_SELECT = 16

# Base estimator that RFE repeatedly refits to rank and prune features.
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Run recursive feature elimination on the undersampled training data.
rfe = RFE(estimator=rf, n_features_to_select=N_FEATURES_TO_SELECT)
rfe.fit(X_resampled, y_resampled)

# rfe.support_ is a boolean mask aligned positionally with the columns of the
# frame RFE was fitted on, so look the names up on that same frame
# (X_resampled) rather than on a different frame from an earlier cell.
selected_features = X_resampled.columns[rfe.support_]
print(f"Top {N_FEATURES_TO_SELECT} Selected Features:", selected_features)

Top 16 Selected Features: Index(['having_IP_Address', 'Prefix_Suffix', 'having_Sub_Domain',
       'SSLfinal_State', 'Domain_registeration_length', 'Request_URL',
       'URL_of_Anchor', 'Links_in_tags', 'SFH', 'Submitting_to_email',
       'age_of_domain', 'DNSRecord', 'web_traffic', 'Page_Rank',
       'Google_Index', 'Links_pointing_to_page'],
      dtype='object')


In [5]:
# Restrict both the resampled training frame and the held-out test frame to
# the columns chosen by RFE, so they share the same feature layout.
X_train_selected = X_resampled.loc[:, selected_features]
X_test_selected = X_test.loc[:, selected_features]

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

import pickle

# Build the final Random Forest and fit it on the balanced, feature-reduced
# training data (fit returns the estimator itself).
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    min_samples_leaf=1,
    min_samples_split=2,
).fit(X_train_selected, y_resampled)

# Predict on the held-out test split, which was never resampled.
rf_predictions = rf_model.predict(X_test_selected)

# Score the predictions against the true test labels.
rf_accuracy = accuracy_score(y_test, rf_predictions)
rf_report = classification_report(y_test, rf_predictions)

print(f"Random Forest Accuracy: {rf_accuracy * 100:.2f}%")
print("Random Forest Classification Report:\n", rf_report)

# Persist the fitted model to disk in binary mode.
# NOTE(review): only the estimator is pickled; the selected feature list is
# not stored with it, so whoever loads the file must supply the same columns.
filename = 'random_forest_model.pkl'
with open(filename, 'wb') as file:
    pickle.dump(rf_model, file)

print(f"Model successfully saved to {filename}")

Random Forest Accuracy: 97.83%
Random Forest Classification Report:
               precision    recall  f1-score   support

          -1       0.98      0.98      0.98       490
           1       0.98      0.98      0.98       616

    accuracy                           0.98      1106
   macro avg       0.98      0.98      0.98      1106
weighted avg       0.98      0.98      0.98      1106

Model successfully saved to random_forest_model.pkl
