# Import Required Libraries

In [1]:
# Import necessary libraries
import re  # Regular expressions for pattern matching

import joblib  # saving and loading models
import numpy as np  # numerical operations
import pandas as pd  #handling dataframes

# Import Machine Learning libraries
from sklearn.ensemble import RandomForestClassifier  # Random Forest classifier
from sklearn.metrics import classification_report, accuracy_score  # Model evaluation
from sklearn.model_selection import cross_val_score  # K-fold cross-validation
from sklearn.model_selection import train_test_split  # Splitting data

# Load and Explore the Dataset

In [2]:
# Read the labelled URL dataset; the path is relative to the notebook's working directory
url_data = pd.read_csv(filepath_or_buffer="malicious_phish1.csv")

In [3]:
# Peek at the first ten rows to see what the raw URLs and labels look like
print(url_data.head(n=10))

                                                 url        type
0                                   br-icloud.com.br    phishing
1                mp3raid.com/music/krizz_kaliko.html      benign
2                    bopsecrets.org/rexroth/cr/1.htm      benign
3  http://www.garage-pirenne.be/index.php?option=...  defacement
4  http://adventure-nicaragua.net/index.php?optio...  defacement
5  http://buzzfil.net/m/show-art/ils-etaient-loin...      benign
6      espn.go.com/nba/player/_/id/3457/brandon-rush      benign
7     yourbittorrent.com/?q=anthony-hamilton-soulife      benign
8       http://www.pashminaonline.com/pure-pashminas  defacement
9      allmusic.com/album/crazy-from-the-heat-r16990      benign


In [4]:
# Check basic dataset information (columns, datatypes, non-null counts).
# DataFrame.info() writes its report itself and returns None, so it is called
# bare: wrapping it in print() only appends a stray "None" line to the output.
url_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 651191 entries, 0 to 651190
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   url     651191 non-null  object
 1   type    651191 non-null  object
dtypes: object(2)
memory usage: 9.9+ MB
None


In [5]:
# Count the missing entries per column
print(url_data.isna().sum())

url     0
type    0
dtype: int64


In [6]:
# Remove every row that has at least one missing value
# (the null count printed above shows none for this file)
url_data = url_data.dropna(axis=0, how="any")

In [7]:
# Summarise each column; both columns hold text at this point, so the
# summary lists count / unique / top / freq rather than numeric statistics
summary_stats = url_data.describe()
print(summary_stats)

                                                      url    type
count                                              651191  651191
unique                                             641119       4
top     http://style.org.hc360.com/css/detail/mysite/s...  benign
freq                                                  180  428103


# Convert Categorical Labels into Numerical Values

In [8]:
# Define a mapping dictionary to convert text labels to integer class ids
label_mapping = {'phishing': 1, 'benign': 0, 'defacement': 2, 'malware': 3}

# Convert only while the column still holds text labels. Re-running this cell
# after a successful conversion would otherwise push the integers through the
# mapping, where none of them is a key, and every label would become NaN.
if not pd.api.types.is_numeric_dtype(url_data["type"]):
    mapped_labels = url_data["type"].map(label_mapping)

    # Series.map produces NaN for any value that is not a key of the mapping;
    # fail loudly here instead of handing NaN labels to the classifier later.
    unmapped_rows = mapped_labels.isna()
    if unmapped_rows.any():
        unknown_labels = sorted(url_data.loc[unmapped_rows, "type"].astype(str).unique())
        raise ValueError(f"Unmapped values in 'type' column: {unknown_labels}")

    url_data["type"] = mapped_labels.astype("int64")

# Display first 10 rows after label conversion
print(url_data.head(10))

                                                 url  type
0                                   br-icloud.com.br     1
1                mp3raid.com/music/krizz_kaliko.html     0
2                    bopsecrets.org/rexroth/cr/1.htm     0
3  http://www.garage-pirenne.be/index.php?option=...     2
4  http://adventure-nicaragua.net/index.php?optio...     2
5  http://buzzfil.net/m/show-art/ils-etaient-loin...     0
6      espn.go.com/nba/player/_/id/3457/brandon-rush     0
7     yourbittorrent.com/?q=anthony-hamilton-soulife     0
8       http://www.pashminaonline.com/pure-pashminas     2
9      allmusic.com/album/crazy-from-the-heat-r16990     0


# Define Feature Extraction Function

In [9]:
def extract_feature(url):
    """Turn a raw URL string into a flat dict of numeric lexical features.

    The keys are inserted in a fixed order. Callers build DataFrames from these
    dicts, so that insertion order becomes the feature-column order; keep it
    stable when editing this function.

    Parameters
    ----------
    url : str
        URL text, with or without a scheme prefix.

    Returns
    -------
    dict
        Ten integer-valued features: ``length``, ``num_special_chars``,
        ``num_subdomain``, ``is_https``, ``contains_ip``, ``num_digits``,
        ``num_hyphens``, ``num_slashes``, ``has_suspicious_keyword`` and
        ``domain_length``.
    """
    # Lower-case once; reused by the scheme check and the keyword scan
    url_lower = url.lower()
    features = {}

    # Feature 1: Length of URL
    features["length"] = len(url)

    # Feature 2: Count special characters
    special_chars = ['@', '?', '&', '=', '#', '%']
    features["num_special_chars"] = sum(url.count(char) for char in special_chars)

    # Feature 3: Dot count. Note this counts every "." in the whole URL
    # (path and query included), not only the separators between host labels.
    features["num_subdomain"] = url.count(".")

    # Feature 4: Check if URL uses HTTPS.
    # Require the complete "https://" prefix, compared case-insensitively:
    # a bare startswith("https") also fires on scheme-less hosts such as
    # "httpsecure-bank.com" and misses "HTTPS://...".
    features["is_https"] = 1 if url_lower.startswith("https://") else 0

    # Feature 5: Check if URL contains a dotted-quad (IPv4-looking) sequence anywhere
    ip_pattern = r'(\d{1,3}\.){3}\d{1,3}'
    features["contains_ip"] = 1 if re.search(ip_pattern, url) else 0

    # Feature 6: Count digits
    features["num_digits"] = sum(char.isdigit() for char in url)

    # Feature 7: Count hyphens
    features["num_hyphens"] = url.count("-")

    # Feature 8: Count slashes
    features["num_slashes"] = url.count("/")

    # Feature 9: Check for suspicious keywords (case-insensitive substring match)
    suspicious_keywords = ["login", "paypal", "bank", "secure", "account", "update"]
    features["has_suspicious_keyword"] = 1 if any(keyword in url_lower for keyword in suspicious_keywords) else 0

    # Feature 10: Domain length = everything up to the first "/" after an
    # optional http(s):// prefix. The prefix is matched case-insensitively so
    # that "HTTP://host/..." measures "host" rather than "HTTP:".
    # When nothing matches (empty string or a leading "/"), fall back to len(url).
    domain = re.match(r'(?:https?://)?([^/]+)', url, flags=re.IGNORECASE)
    features["domain_length"] = len(domain.group(1)) if domain else len(url)

    return features

# Apply Feature Extraction to the Dataset

In [10]:
# Extract one feature dict per URL, in the same order as the rows of url_data
features_list = [extract_feature(url) for url in url_data["url"]]

# Convert the list of extracted features into a DataFrame.
# Reuse url_data's index so every feature row keeps the label of the URL it
# came from. A default 0..n-1 index only lines up with url_data while no rows
# have been dropped; after a dropna() that removed rows, a later label-based
# lookup against url_data's index would fail or pair features with wrong labels.
features_df = pd.DataFrame(features_list, index=url_data.index)

# Display first 5 rows of extracted features
print(features_df.head())

   length  num_special_chars  num_subdomain  is_https  contains_ip  \
0      16                  0              2         0            0   
1      35                  0              2         0            0   
2      31                  0              2         0            0   
3      88                  8              3         0            0   
4     235                  6              2         0            0   

   num_digits  num_hyphens  num_slashes  has_suspicious_keyword  domain_length  
0           0            1            0                       0             16  
1           1            0            2                       0             11  
2           1            0            3                       0             14  
3           7            1            3                       0             21  
4          22            1            3                       0             23  


# Prepare the Data for Training

In [11]:
# Target vector: the numeric class id of every URL
y = url_data.loc[:, "type"]

# Select the feature rows by the target's index labels so both share one row order
features_df = features_df.loc[y.index, :]

In [12]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
split_parts = train_test_split(
    features_df,
    y,
    test_size=0.2,
    random_state=42,
)
x_train, x_test, y_train, y_test = split_parts

In [13]:

# Report the dimensions of each split: features first, then labels
shape_report = [
    ("Training Features Shape:", x_train),
    ("Testing Features Shape:", x_test),
    ("Training Labels Shape:", y_train),
    ("Testing Labels Shape:", y_test),
]
for caption, split_part in shape_report:
    print(caption, split_part.shape)

Training Features Shape: (520952, 10)
Testing Features Shape: (130239, 10)
Training Labels Shape: (520952,)
Testing Labels Shape: (130239,)


# Train Random Forest Classifier

In [None]:
# Configure a 200-tree forest with unlimited depth; class_weight="balanced"
# is requested because the classes are far from evenly sized (see describe() above)
rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    class_weight="balanced",
    random_state=42,
)

# Fit on the training split, then predict the held-out rows
rf_model.fit(x_train, y_train)
y_pred_rf = rf_model.predict(x_test)

# Overall accuracy followed by the per-class report
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print("Random Forest Accuracy:", rf_accuracy)
rf_report = classification_report(y_test, y_pred_rf)
print(rf_report)

In [None]:
# Perform 5-fold cross-validation, scored by accuracy, over the full feature set.
# cross_val_score is imported in the notebook's top import cell with the other
# scikit-learn names, so this cell carries no import of its own.
scores = cross_val_score(rf_model, features_df, y, cv=5, scoring="accuracy")
print("Cross-Validation Accuracy Scores:", scores)
print("Average CV Accuracy:", scores.mean())

# Save the Trained Model

In [None]:
# Persist the fitted forest to disk with joblib (written to a .joblib file)
joblib.dump(value=rf_model, filename="random_forest_model.joblib")

print("Model saved successfully!")


# Load the Model for Future Predictions

In [None]:
# Restore the forest written by the save cell.
# NOTE(review): joblib files are pickle-based, so only load files from a trusted source.
rf_loaded_model = joblib.load(filename="random_forest_model.joblib")
print("Model loaded successfully!")

# Create a Function to Predict Any URL

In [None]:
def predict_url(url):
    """Classify a single URL with the reloaded random forest.

    Relies on two notebook globals: ``extract_feature`` to build the feature
    dict and ``rf_loaded_model`` to score it.

    Parameters
    ----------
    url : str
        The URL to classify.

    Returns
    -------
    str
        One of 'Benign', 'Phishing', 'Defacement' or 'Malware'.
    """
    # Class id -> display name
    class_names = {0: 'Benign', 1: 'Phishing', 2: 'Defacement', 3: 'Malware'}

    # The model expects a 2-D table, so wrap the single feature dict as a one-row frame
    single_row = pd.DataFrame([extract_feature(url)])
    predicted_class = rf_loaded_model.predict(single_row)[0]

    return class_names[predicted_class]

# Test with New URLs

In [None]:
# A handful of hand-written URLs to sanity-check the predictor
test_urls = [
    "http://malicious-site.com/paypal-login",
    "https://www.paypal.com/signin",
    "http://192.168.1.1/login",
    "https://example.com",
    "http://fake-bank.com/account-update"
]

# Print one line per URL with the class the model assigns to it
for candidate in test_urls:
    verdict = predict_url(candidate)
    print(f"URL: {candidate} – Prediction: {verdict}")