In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
phishing_df = pd.read_csv('Data/phishing_2022.csv')
benign_df = pd.read_csv('Data/benign_2022.csv')


In [3]:
benign_df['label'] = 0  # Benign
phishing_df['label'] = 1  # Phishing

In [4]:
df = pd.concat([benign_df, phishing_df], ignore_index=True)


In [5]:
df.head()

Unnamed: 0,id,datetime,url,title,source_code,redirected_urls,header_info,certificate_information,screenshot_id,label
0,1,2022-07-21 13:07:36.840313,https://webfoundation.org/donate/,Donate – World Wide Web Foundation,"<html lang=""en"" style=""height: 100%;""><head><m...",{https://webfoundation.org/take-action},"{\n ""server"": ""nginx"",\n ""date"": ""Thu, 21 Ju...","{\n ""OCSP"": [\n ""http://r3.o.lencr.org""\n ...",2022_07_21_13_07_36_webfoundation.org0.png,0
1,2,2022-07-21 13:07:43.201543,https://webfoundation.org/our-work/projects/ta...,Tackling Online Gender-Based Violence and Abus...,"<html lang=""en"" style=""height: 100%;""><head><m...",,"{\n ""server"": ""nginx"",\n ""date"": ""Thu, 21 Ju...","{\n ""OCSP"": [\n ""http://r3.o.lencr.org""\n ...",2022_07_21_13_07_43_webfoundation.org0.png,0
2,3,2022-07-21 13:07:49.625375,https://webfoundation.org/2022/06/the-economic...,The economic costs of digital exclusion in Sou...,"<html lang=""en"" style=""height: 100%;""><head><m...",,"{\n ""server"": ""nginx"",\n ""date"": ""Thu, 21 Ju...","{\n ""OCSP"": [\n ""http://r3.o.lencr.org""\n ...",2022_07_21_13_07_49_webfoundation.org0.png,0
3,4,2022-07-21 13:08:22.565578,https://webfoundation.org/news/,News – World Wide Web Foundation,"<html lang=""en"" style=""height: 100%;""><head><m...",,"{\n ""server"": ""nginx"",\n ""date"": ""Thu, 21 Ju...","{\n ""OCSP"": [\n ""http://r3.o.lencr.org""\n ...",2022_07_21_13_08_22_webfoundation.org0.png,0
4,5,2022-07-21 13:08:48.198104,https://webfoundation.org/our-work/,Our Work – World Wide Web Foundation,"<html lang=""en"" style=""height: 100%;""><head><m...",,"{\n ""server"": ""nginx"",\n ""date"": ""Thu, 21 Ju...","{\n ""OCSP"": [\n ""http://r3.o.lencr.org""\n ...",2022_07_21_13_08_48_webfoundation.org0.png,0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   id                       200 non-null    int64 
 1   datetime                 200 non-null    object
 2   url                      200 non-null    object
 3   title                    186 non-null    object
 4   source_code              200 non-null    object
 5   redirected_urls          39 non-null     object
 6   header_info              200 non-null    object
 7   certificate_information  200 non-null    object
 8   screenshot_id            200 non-null    object
 9   label                    200 non-null    int64 
dtypes: int64(2), object(8)
memory usage: 15.8+ KB


In [7]:
df['url_length'] = df['url'].apply(len)
df['num_subdomains'] = df['url'].apply(lambda x: x.count('.') - 1)
df['has_https'] = df['url'].apply(lambda x: 1 if x.startswith('https') else 0)
df['has_at_symbol'] = df['url'].apply(lambda x: 1 if '@' in x else 0)
df['num_hyphens'] = df['url'].apply(lambda x: x.count('-'))
df['num_special_chars'] = df['url'].apply(lambda x: sum(x.count(char) for char in ['@', '%', '#', '$']))
suspicious_keywords = ['login', 'secure', 'account', 'verify', 'update', 'password']
df['has_suspicious_keyword'] = df['url'].apply(lambda x: 1 if any(keyword in x.lower() for keyword in suspicious_keywords) else 0)
df['num_external_links'] = df['source_code'].apply(lambda x: str(x).lower().count('href='))

In [9]:
onesite = df['source_code'].iloc[0]
print(onesite)

<html lang="en" style="height: 100%;"><head><meta charset="utf-8"><title>Donate – World Wide Web Foundation</title>
<meta name="robots" content="max-image-preview:large">
<link rel="dns-prefetch" href="//js.stripe.com">
<link rel="dns-prefetch" href="//fonts.googleapis.com">
<link rel="dns-prefetch" href="//s.w.org">
<link rel="alternate" type="application/rss+xml" title="World Wide Web Foundation » Feed" href="https://webfoundation.org/feed/">
<link rel="alternate" type="application/rss+xml" title="World Wide Web Foundation » Comments Feed" href="https://webfoundation.org/comments/feed/">
<script type="text/javascript" async="" src="https://www.google-analytics.com/analytics.js"></script><script async="" src="https://www.googletagmanager.com/gtm.js?id=GTM-KCFFDTJ"></script><script type="text/javascript">
window._wpemojiSettings = {"baseUrl":"https:\/\/s.w.org\/images\/core\/emoji\/13.1.0\/72x72\/","ext":".png","svgUrl":"https:\/\/s.w.org\/images\/core\/emoji\/13.1.0\/svg\/","svgExt":"

In [10]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin

def get_outbound_links(website):
    # Fetch the page content
    soup = BeautifulSoup(website, 'html.parser')
    
    # Parse the base domain of the given URL
    base_domain = urlparse(url).netloc
    
    outbound_links = set()
    
    for a_tag in soup.find_all('a', href=True):  # Find all <a> tags with href
        link = a_tag['href']
        parsed_link = urlparse(link)
        
        # Ignore anchor links (e.g., "#section")
        if link.startswith("#") or parsed_link.fragment:
            continue
        
        # Convert relative links to absolute
        absolute_link = urljoin(url, link)
        parsed_absolute = urlparse(absolute_link)
        
        # Ignore internal links (same domain)
        if parsed_absolute.netloc == base_domain:
            continue
        
        # Ignore image links (ends with image extensions)
        if parsed_absolute.path.lower().endswith(('.jpg', '.jpeg', '.png', '.gif', '.svg', '.webp')):
            continue
        
        # Add to the outbound links set
        outbound_links.add(absolute_link)

    return outbound_links

# Example usage
url = "https://example.com"
outbound_links = get_outbound_links(url)

# Display results
for link in outbound_links:
    print(link)

  soup = BeautifulSoup(website, 'html.parser')


In [None]:
# List of columns to drop
columns_to_drop = ['id', 'datetime', 'url', 'title', 'source_code', 'redirected_urls', 'header_info', 
                   'certificate_information', 'screenshot_id',]

# Drop the columns
df = df.drop(columns=columns_to_drop)


In [None]:
df.tail(40)

In [None]:
df.info()

In [None]:
X = df.drop(columns=['label'])
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

In [None]:
model.feature_importances_

In [None]:
df.columns

In [None]:
plt.bar(df.columns[1:], model.feature_importances_)
plt.xticks(rotation=90)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))