In [1]:
import pandas as pd

# Load only 5000 rows from the CSV
df = pd.read_csv("malicious_phish.csv").sample(n=5000, random_state=42)

print("Dataset shape:", df.shape)
print(df.head())


Dataset shape: (5000, 2)
                                                      url        type
536448             http://37.49.226.178/deusbins/deus.sh4     malware
40630   medical-dictionary.thefreedictionary.com/Galt+...      benign
630496                         www.jscape.com/sshfactory/    phishing
426724  http://www.wsnc.org.au/component/jcalpro/view/983  defacement
184034  virtualtourist.com/travel/North_America/Canada...      benign


In [2]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
df['type'] = encoder.fit_transform(df['type'])
print(df.head())





                                                      url  type
536448             http://37.49.226.178/deusbins/deus.sh4     2
40630   medical-dictionary.thefreedictionary.com/Galt+...     0
630496                         www.jscape.com/sshfactory/     3
426724  http://www.wsnc.org.au/component/jcalpro/view/983     1
184034  virtualtourist.com/travel/North_America/Canada...     0


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['url'])
y = df['type']

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))


Accuracy: 0.905


In [5]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.91      0.98      0.94       661
           1       0.91      0.94      0.93       145
           2       0.98      0.82      0.89        50
           3       0.86      0.56      0.68       144

    accuracy                           0.91      1000
   macro avg       0.91      0.83      0.86      1000
weighted avg       0.90      0.91      0.90      1000



In [None]:
# 🔹 Step 8: User Input Prediction
while True:
    user_url = input("Enter a URL to check (or type 'exit' to quit): ")
    if user_url.lower() == 'exit':
        break
    
    # Convert user input into same feature format (TF-IDF)
    user_feature = vectorizer.transform([user_url])
    
    # Predict
    pred = model.predict(user_feature)[0]
    
    # Convert back to label name
    result = encoder.inverse_transform([pred])[0]
    print("Prediction:", result)


Prediction: phishing
Prediction: phishing
