In [1]:
!pip install pandas scikit-learn joblib


Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import joblib


In [3]:
# Read phishing.csv (expected next to this notebook) and discard its "Index" column
df = pd.read_csv("phishing.csv").drop(columns=["Index"])

# Report the frame's dimensions, then display the first rows as the cell output
print("Dataset shape:", df.shape)
df.head()


Dataset shape: (11054, 31)


Unnamed: 0,UsingIP,LongURL,ShortURL,Symbol@,Redirecting//,PrefixSuffix-,SubDomains,HTTPS,DomainRegLen,Favicon,...,UsingPopupWindow,IframeRedirection,AgeofDomain,DNSRecording,WebsiteTraffic,PageRank,GoogleIndex,LinksPointingToPage,StatsReport,class
0,1,1,1,1,1,-1,0,1,-1,1,...,1,1,-1,-1,0,-1,1,1,1,-1
1,1,0,1,1,1,-1,-1,-1,-1,1,...,1,1,1,-1,1,-1,1,0,-1,-1
2,1,0,1,1,1,-1,-1,-1,1,1,...,1,1,-1,-1,1,-1,1,-1,1,-1
3,1,0,-1,1,1,-1,1,1,-1,1,...,-1,1,-1,-1,0,-1,1,1,1,1
4,-1,0,-1,1,-1,-1,1,1,-1,1,...,1,1,1,1,1,-1,1,-1,-1,1


In [4]:
# The "class" column is the target; every remaining column is a feature
y = df["class"]
X = df.drop(columns="class")

# Show the dimensions of both pieces
for caption, part in (("Features shape:", X), ("Target shape:", y)):
    print(caption, part.shape)


Features shape: (11054, 30)
Target shape: (11054,)


In [6]:
# Standardize the feature columns with StandardScaler.
# NOTE(review): the scaler is fitted on every row of X, so its statistics
# include any rows that are later held out for evaluation.
scaler = StandardScaler()

# Wrap the scaled array in a DataFrame so the original column names are kept
X_scaled = pd.DataFrame(scaler.fit(X).transform(X), columns=X.columns)


In [7]:
# Train-test split (80% train, 20% test)
# The *unscaled* features are split so that the scaler below never sees the
# held-out rows; fitting it on the full dataset would leak test-set statistics
# into preprocessing and into the scaler that is saved later.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both partitions.
scaler = StandardScaler().fit(X_train_raw)

# Keep DataFrames (column names + original row index) for readability downstream
X_train = pd.DataFrame(
    scaler.transform(X_train_raw), columns=X_train_raw.columns, index=X_train_raw.index
)
X_test = pd.DataFrame(
    scaler.transform(X_test_raw), columns=X_test_raw.columns, index=X_test_raw.index
)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)


Train shape: (8843, 30)
Test shape: (2211, 30)


In [8]:
# Build a 100-tree random forest (fixed random_state) and fit it on the
# training partition; fit() returns the estimator itself, so it can be chained.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

print("Model training complete.")


Model training complete.


In [10]:
# Score the held-out partition with the fitted model
y_pred = model.predict(X_test)

# Print each metric under its heading: accuracy, confusion matrix, per-class report
for heading, metric in (
    ("Accuracy:", accuracy_score),
    ("Confusion Matrix:\n", confusion_matrix),
    ("Classification Report:\n", classification_report),
):
    print(heading, metric(y_test, y_pred))


Accuracy: 0.9692446856625961
Confusion Matrix:
 [[ 937   39]
 [  29 1206]]
Classification Report:
               precision    recall  f1-score   support

          -1       0.97      0.96      0.96       976
           1       0.97      0.98      0.97      1235

    accuracy                           0.97      2211
   macro avg       0.97      0.97      0.97      2211
weighted avg       0.97      0.97      0.97      2211



In [11]:
# Persist the fitted classifier and the fitted scaler to the working directory
for artifact, filename in ((model, "phishing_model.pkl"), (scaler, "scaler.pkl")):
    joblib.dump(artifact, filename)

print("Model and scaler saved as phishing_model.pkl and scaler.pkl")


Model and scaler saved as phishing_model.pkl and scaler.pkl


In [12]:
# Spot-check: classify the first row of the test partition and compare with its label
true_label = y_test.iloc[0]
sample = X_test.head(1)  # one-row DataFrame, so predict() receives 2-D input
predicted = model.predict(sample)

print(f"Predicted: {predicted[0]}, Actual: {true_label}")


Predicted: -1, Actual: -1


In [4]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
import joblib

# 1. Load the dataset and discard its "Index" column
df = pd.read_csv("phishing.csv").drop(columns=["Index"])

# 2. Separate target ("class") from the feature columns
y = df["class"]
X = df.drop(columns=["class"])

# 3. Hold out 20% of the rows for testing (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 4. Scale the features: statistics are learned from the training rows only
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 5. Train a random forest with balanced class weights
model = RandomForestClassifier(
    n_estimators=100,
    class_weight="balanced",
    random_state=42,
).fit(X_train_scaled, y_train)

# 6. Evaluate on the held-out rows
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")

# 7. Persist the fitted model and scaler
for artifact, filename in ((model, "phishing_model.pkl"), (scaler, "scaler.pkl")):
    joblib.dump(artifact, filename)
print("✅ Model and scaler saved!")


Accuracy: 0.9665
Classification Report:
              precision    recall  f1-score   support

          -1       0.97      0.96      0.96       976
           1       0.97      0.97      0.97      1235

    accuracy                           0.97      2211
   macro avg       0.97      0.97      0.97      2211
weighted avg       0.97      0.97      0.97      2211

✅ Model and scaler saved!
