In [1]:

# Step 1: Import libraries
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import plotly.express as px
import plotly.graph_objects as go


In [2]:

# Step 2: Load dataset
df = pd.read_csv('/content/PhiUSIIL_Phishing_URL_Dataset.csv')
df.head()


Unnamed: 0,FILENAME,URL,URLLength,Domain,DomainLength,IsDomainIP,TLD,URLSimilarityIndex,CharContinuationRate,TLDLegitimateProb,...,Pay,Crypto,HasCopyrightInfo,NoOfImage,NoOfCSS,NoOfJS,NoOfSelfRef,NoOfEmptyRef,NoOfExternalRef,label
0,521848.txt,https://www.southbankmosaics.com,31,www.southbankmosaics.com,24,0,com,100.0,1.0,0.522907,...,0,0,1,34,20,28,119,0,124,1
1,31372.txt,https://www.uni-mainz.de,23,www.uni-mainz.de,16,0,de,100.0,0.666667,0.03265,...,0,0,1,50,9,8,39,0,217,1
2,597387.txt,https://www.voicefmradio.co.uk,29,www.voicefmradio.co.uk,22,0,uk,100.0,0.866667,0.028555,...,0,0,1,10,2,7,42,2,5,1
3,554095.txt,https://www.sfnmjournal.com,26,www.sfnmjournal.com,19,0,com,100.0,1.0,0.522907,...,1,1,1,3,27,15,22,1,31,1
4,151578.txt,https://www.rewildingargentina.org,33,www.rewildingargentina.org,26,0,org,100.0,1.0,0.079963,...,1,0,1,244,15,34,72,1,85,1


In [3]:

# Step 3: Preprocess data
features = df.drop(columns=['FILENAME', 'URL', 'Domain', 'TLD', 'Title', 'label'])
labels = df['label']

le = LabelEncoder()
for col in features.columns:
    if features[col].dtype == 'object':
        features[col] = le.fit_transform(features[col])


In [4]:

# Step 4: Train-test split and model training
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluation
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     20124
           1       1.00      1.00      1.00     27035

    accuracy                           1.00     47159
   macro avg       1.00      1.00      1.00     47159
weighted avg       1.00      1.00      1.00     47159



In [5]:

# Step 5: Show top 15 important features
importances = model.feature_importances_
importance_df = pd.DataFrame({
    'Feature': features.columns,
    'Importance': importances
}).sort_values(by='Importance', ascending=False).head(15)

fig = px.bar(importance_df, x='Importance', y='Feature', orientation='h',
             title="Top 15 Most Important Features", color='Importance')
fig.show()


In [7]:

# Step 6: URL detection logic with explanation
input_url = input("Enter a URL to check: ")

# Match URL in the dataset
matched_row = df[df['URL'].str.contains(input_url, case=False, na=False)]

if not matched_row.empty:
    sample = matched_row.drop(columns=['FILENAME', 'URL', 'Domain', 'TLD', 'Title', 'label'])
    for col in sample.columns:
        if sample[col].dtype == 'object':
            sample[col] = le.fit_transform(sample[col])
    prediction = model.predict(sample)[0]
    prediction_text = "Phishing 🚨" if prediction == 1 else "Legitimate ✅"

    print("\nMatched URL:", matched_row.iloc[0]['URL'])
    print("Prediction:", prediction_text)

    # Show top contributing features
    importance_df = pd.DataFrame({
        'Feature': sample.columns,
        'Value': sample.values[0],
        'Importance': model.feature_importances_
    }).sort_values(by='Importance', ascending=False).head(5)

    print("\nTop features influencing this prediction:")
    print(importance_df[['Feature', 'Value', 'Importance']])

    fig = px.bar(importance_df, x='Importance', y='Feature', orientation='h',
                 title="Top Contributing Features for This URL", color='Importance')
    fig.show()
else:
    print("URL not found in dataset. Try one that exists in the dataset.")


Enter a URL to check: http://www.f0534243.xsph.ru

Matched URL: http://www.f0534243.xsph.ru
Prediction: Legitimate ✅

Top features influencing this prediction:
               Feature      Value  Importance
3   URLSimilarityIndex  61.580882    0.173672
49     NoOfExternalRef   0.000000    0.169276
22          LineOfCode   9.000000    0.153836
47         NoOfSelfRef   0.000000    0.106813
44           NoOfImage   0.000000    0.093526
