1. Scrape & preprocess webpage text (remove noise, extract useful parts).
2. Extract NLP features (TF-IDF, embeddings, readability scores).
3. Train ML model (Logistic Regression, Random Forest, LightGBM).
4. Fine-tune deep learning model (BERT-based phishing detection).
5. Deploy in Streamlit UI (user enters URL → NLP model predicts phishing risk).

Extract Features (SEO + NLP):
URL-based Features: Length, special characters, HTTPS usage.
NLP on Page Content: TF-IDF, embeddings (BERT, Word2Vec).
SEO Features:
Meta Tags: Title, description, keywords.
H1-H6 Tags: Content in headings.
Keyword Density: Check keyword stuffing.
Backlinks & Page Authority: Use tools like Moz API (if available).
Domain Age & WHOIS: Check if the domain is newly registered.
Redirect Chains: Phishing pages often use excessive redirections.

In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)

In [10]:
df = pd.read_csv("Dataset_with_downloaded_bad.csv")
df.head(20)

Unnamed: 0.3,Unnamed: 0.2,FILENAME,URL,URLLength,Domain,DomainLength,IsDomainIP,TLD,URLSimilarityIndex,CharContinuationRate,TLDLegitimateProb,URLCharProb,TLDLength,NoOfSubDomain,HasObfuscation,NoOfObfuscatedChar,ObfuscationRatio,NoOfLettersInURL,LetterRatioInURL,NoOfDegitsInURL,DegitRatioInURL,NoOfEqualsInURL,NoOfQMarkInURL,NoOfAmpersandInURL,NoOfOtherSpecialCharsInURL,SpacialCharRatioInURL,IsHTTPS,LineOfCode,LargestLineLength,HasTitle,Title,DomainTitleMatchScore,URLTitleMatchScore,HasFavicon,Robots,IsResponsive,NoOfURLRedirect,NoOfSelfRedirect,HasDescription,NoOfPopup,NoOfiFrame,HasExternalFormSubmit,HasSocialNet,HasSubmitButton,HasHiddenFields,HasPasswordField,Bank,Pay,Crypto,HasCopyrightInfo,NoOfImage,NoOfCSS,NoOfJS,NoOfSelfRef,NoOfEmptyRef,NoOfExternalRef,label,Unnamed: 0.1,Unnamed: 0,status_code
0,3237,mw158850.txt,http://www.tasut.com,20,www.tasut.com,13,0,com,88.395062,1.0,0.522907,0.064505,3,1,0,0,0.0,8,0.4,0,0.0,0,0,0,1,0.05,0,48,138,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,846.0,846.0,200.0
1,3238,mw158850.txt,http://www.tasut.com,20,www.tasut.com,13,0,com,88.395062,1.0,0.522907,0.064505,3,1,0,0,0.0,8,0.4,0,0.0,0,0,0,1,0.05,0,48,138,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1846.0,1846.0,200.0
2,3239,mw158850.txt,http://www.tasut.com,20,www.tasut.com,13,0,com,88.395062,1.0,0.522907,0.064505,3,1,0,0,0.0,8,0.4,0,0.0,0,0,0,1,0.05,0,48,138,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2846.0,2846.0,200.0
3,3240,mw158850.txt,http://www.tasut.com,20,www.tasut.com,13,0,com,88.395062,1.0,0.522907,0.064505,3,1,0,0,0.0,8,0.4,0,0.0,0,0,0,1,0.05,0,48,138,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4346.0,,200.0
4,3241,mw158850.txt,http://www.tasut.com,20,www.tasut.com,13,0,com,88.395062,1.0,0.522907,0.064505,3,1,0,0,0.0,8,0.4,0,0.0,0,0,0,1,0.05,0,48,138,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5846.0,,200.0
5,3242,mw158850.txt,http://www.tasut.com,20,www.tasut.com,13,0,com,88.395062,1.0,0.522907,0.064505,3,1,0,0,0.0,8,0.4,0,0.0,0,0,0,1,0.05,0,48,138,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7346.0,,200.0
6,3556,mw76389.txt,http://www.integrity-roofers.com,31,www.integrity-roofers.com,25,0,com,72.047001,0.588235,0.522907,0.063989,3,1,0,0,0.0,18,0.581,0,0.0,0,0,0,2,0.065,0,2,2826,1,integrity-roofers,100.0,100.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,96.0,96.0,200.0
7,3557,mw76389.txt,http://www.integrity-roofers.com,31,www.integrity-roofers.com,25,0,com,72.047001,0.588235,0.522907,0.063989,3,1,0,0,0.0,18,0.581,0,0.0,0,0,0,2,0.065,0,2,2826,1,integrity-roofers,100.0,100.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,1096.0,1096.0,200.0
8,3558,mw76389.txt,http://www.integrity-roofers.com,31,www.integrity-roofers.com,25,0,com,72.047001,0.588235,0.522907,0.063989,3,1,0,0,0.0,18,0.581,0,0.0,0,0,0,2,0.065,0,2,2826,1,integrity-roofers,100.0,100.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,2096.0,2096.0,200.0
9,3559,mw76389.txt,http://www.integrity-roofers.com,31,www.integrity-roofers.com,25,0,com,72.047001,0.588235,0.522907,0.063989,3,1,0,0,0.0,18,0.581,0,0.0,0,0,0,2,0.065,0,2,2826,1,integrity-roofers,100.0,100.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,3596.0,,200.0


In [4]:
#drop 
df = df.drop(columns = "Unnamed: 0.2")

In [5]:
import os
from bs4 import BeautifulSoup
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def extract_features_from_file(file_path, url=None):
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            html_content = f.read()
    except Exception:
        return None

    soup = BeautifulSoup(html_content, 'html.parser')

    for script in soup(["script", "style"]):
        script.extract()

    page_text = soup.get_text(separator=" ").strip()
    page_text = re.sub(r'\s+', ' ', page_text)
    page_text = re.sub(r'[^a-zA-Z0-9\s]', '', page_text)

    meta_desc_tag = soup.find("meta", attrs={"name": "description"})
    meta_desc = meta_desc_tag.get("content", "") if meta_desc_tag else ""


    headings = " ".join([h.get_text(strip=True) for h in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])])

    words = page_text.lower().split()
    filtered_words = [word for word in words if word not in stop_words]
    word_count = len(filtered_words)

    return {
        "URL": url or os.path.basename(file_path),
        "Meta_Description": meta_desc,
        "Headings": headings,
        "Page_Content": page_text,
        "Word_Count": word_count}


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ssawka\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
html_folder = "downloaded_pages"  # folder where HTML files are stored
extracted_data = []

for filename in os.listdir(html_folder):
    if filename.endswith(".html"):
        file_path = os.path.join(html_folder, filename)
        result = extract_features_from_file(file_path)
        if result:
            extracted_data.append(result)

# Convert to DataFrame and save
df = pd.DataFrame(extracted_data)
df.to_csv("content_dataset_from_files_bad.csv", index=False)

print("✅ Feature extraction complete from saved HTML files.")


✅ Feature extraction complete from saved HTML files.


In [22]:
content = pd.read_csv("content_dataset_from_files_bad.csv")

#change column name
content = content.rename(columns={"URL": "FILENAME"})

content.head(3)


Unnamed: 0,FILENAME,Meta_Description,Headings,Page_Content,Word_Count
0,100697.html,Shop our selection of height adjustable standi...,Categories Additional Information Account Inf...,Shop Standing Desks Ergonomic Chairs Monitor A...,1166
1,101082.html,HYbrid Coordinate Ocean Model,Need Help? Tools Disclaimer Alphabet Soup,HYCOM Home Support Us Need Help Ask a Question...,451
2,101360.html,マネジメント、プロモーター、テレビ・ラジオ番組製作、演芸の興行等を行う芸能プロダクション。,事業内容Business サービスService トピックスTopics SDGsへの取り組...,Business NSC FANY FANY BS BS...,17


In [15]:
#Change filename column into string
df["FILENAME"] = df["FILENAME"].astype(str)


#Change filename from txt to html
df["FILENAME"] = df["FILENAME"].str.replace("txt", "html")
df.head(20)


Unnamed: 0.3,Unnamed: 0.2,FILENAME,URL,URLLength,Domain,DomainLength,IsDomainIP,TLD,URLSimilarityIndex,CharContinuationRate,TLDLegitimateProb,URLCharProb,TLDLength,NoOfSubDomain,HasObfuscation,NoOfObfuscatedChar,ObfuscationRatio,NoOfLettersInURL,LetterRatioInURL,NoOfDegitsInURL,DegitRatioInURL,NoOfEqualsInURL,NoOfQMarkInURL,NoOfAmpersandInURL,NoOfOtherSpecialCharsInURL,SpacialCharRatioInURL,IsHTTPS,LineOfCode,LargestLineLength,HasTitle,Title,DomainTitleMatchScore,URLTitleMatchScore,HasFavicon,Robots,IsResponsive,NoOfURLRedirect,NoOfSelfRedirect,HasDescription,NoOfPopup,NoOfiFrame,HasExternalFormSubmit,HasSocialNet,HasSubmitButton,HasHiddenFields,HasPasswordField,Bank,Pay,Crypto,HasCopyrightInfo,NoOfImage,NoOfCSS,NoOfJS,NoOfSelfRef,NoOfEmptyRef,NoOfExternalRef,label,Unnamed: 0.1,Unnamed: 0,status_code
0,3237,mw158850.html,http://www.tasut.com,20,www.tasut.com,13,0,com,88.395062,1.0,0.522907,0.064505,3,1,0,0,0.0,8,0.4,0,0.0,0,0,0,1,0.05,0,48,138,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,846.0,846.0,200.0
1,3238,mw158850.html,http://www.tasut.com,20,www.tasut.com,13,0,com,88.395062,1.0,0.522907,0.064505,3,1,0,0,0.0,8,0.4,0,0.0,0,0,0,1,0.05,0,48,138,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1846.0,1846.0,200.0
2,3239,mw158850.html,http://www.tasut.com,20,www.tasut.com,13,0,com,88.395062,1.0,0.522907,0.064505,3,1,0,0,0.0,8,0.4,0,0.0,0,0,0,1,0.05,0,48,138,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2846.0,2846.0,200.0
3,3240,mw158850.html,http://www.tasut.com,20,www.tasut.com,13,0,com,88.395062,1.0,0.522907,0.064505,3,1,0,0,0.0,8,0.4,0,0.0,0,0,0,1,0.05,0,48,138,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4346.0,,200.0
4,3241,mw158850.html,http://www.tasut.com,20,www.tasut.com,13,0,com,88.395062,1.0,0.522907,0.064505,3,1,0,0,0.0,8,0.4,0,0.0,0,0,0,1,0.05,0,48,138,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5846.0,,200.0
5,3242,mw158850.html,http://www.tasut.com,20,www.tasut.com,13,0,com,88.395062,1.0,0.522907,0.064505,3,1,0,0,0.0,8,0.4,0,0.0,0,0,0,1,0.05,0,48,138,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7346.0,,200.0
6,3556,mw76389.html,http://www.integrity-roofers.com,31,www.integrity-roofers.com,25,0,com,72.047001,0.588235,0.522907,0.063989,3,1,0,0,0.0,18,0.581,0,0.0,0,0,0,2,0.065,0,2,2826,1,integrity-roofers,100.0,100.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,96.0,96.0,200.0
7,3557,mw76389.html,http://www.integrity-roofers.com,31,www.integrity-roofers.com,25,0,com,72.047001,0.588235,0.522907,0.063989,3,1,0,0,0.0,18,0.581,0,0.0,0,0,0,2,0.065,0,2,2826,1,integrity-roofers,100.0,100.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,1096.0,1096.0,200.0
8,3558,mw76389.html,http://www.integrity-roofers.com,31,www.integrity-roofers.com,25,0,com,72.047001,0.588235,0.522907,0.063989,3,1,0,0,0.0,18,0.581,0,0.0,0,0,0,2,0.065,0,2,2826,1,integrity-roofers,100.0,100.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,2096.0,2096.0,200.0
9,3559,mw76389.html,http://www.integrity-roofers.com,31,www.integrity-roofers.com,25,0,com,72.047001,0.588235,0.522907,0.063989,3,1,0,0,0.0,18,0.581,0,0.0,0,0,0,2,0.065,0,2,2826,1,integrity-roofers,100.0,100.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,3596.0,,200.0


In [12]:
df.duplicated().sum()

0

In [21]:
df.head(5)
#content.head(5)

Unnamed: 0.3,Unnamed: 0.2,FILENAME,URL,URLLength,Domain,DomainLength,IsDomainIP,TLD,URLSimilarityIndex,CharContinuationRate,TLDLegitimateProb,URLCharProb,TLDLength,NoOfSubDomain,HasObfuscation,NoOfObfuscatedChar,ObfuscationRatio,NoOfLettersInURL,LetterRatioInURL,NoOfDegitsInURL,DegitRatioInURL,NoOfEqualsInURL,NoOfQMarkInURL,NoOfAmpersandInURL,NoOfOtherSpecialCharsInURL,SpacialCharRatioInURL,IsHTTPS,LineOfCode,LargestLineLength,HasTitle,Title,DomainTitleMatchScore,URLTitleMatchScore,HasFavicon,Robots,IsResponsive,NoOfURLRedirect,NoOfSelfRedirect,HasDescription,NoOfPopup,NoOfiFrame,HasExternalFormSubmit,HasSocialNet,HasSubmitButton,HasHiddenFields,HasPasswordField,Bank,Pay,Crypto,HasCopyrightInfo,NoOfImage,NoOfCSS,NoOfJS,NoOfSelfRef,NoOfEmptyRef,NoOfExternalRef,label,Unnamed: 0.1,Unnamed: 0,status_code
0,3237,mw158850.html,http://www.tasut.com,20,www.tasut.com,13,0,com,88.395062,1.0,0.522907,0.064505,3,1,0,0,0.0,8,0.4,0,0.0,0,0,0,1,0.05,0,48,138,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,846.0,846.0,200.0
1,3238,mw158850.html,http://www.tasut.com,20,www.tasut.com,13,0,com,88.395062,1.0,0.522907,0.064505,3,1,0,0,0.0,8,0.4,0,0.0,0,0,0,1,0.05,0,48,138,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1846.0,1846.0,200.0
2,3239,mw158850.html,http://www.tasut.com,20,www.tasut.com,13,0,com,88.395062,1.0,0.522907,0.064505,3,1,0,0,0.0,8,0.4,0,0.0,0,0,0,1,0.05,0,48,138,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2846.0,2846.0,200.0
3,3240,mw158850.html,http://www.tasut.com,20,www.tasut.com,13,0,com,88.395062,1.0,0.522907,0.064505,3,1,0,0,0.0,8,0.4,0,0.0,0,0,0,1,0.05,0,48,138,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4346.0,,200.0
4,3241,mw158850.html,http://www.tasut.com,20,www.tasut.com,13,0,com,88.395062,1.0,0.522907,0.064505,3,1,0,0,0.0,8,0.4,0,0.0,0,0,0,1,0.05,0,48,138,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5846.0,,200.0


In [23]:
#merge content with Dataset_with_downloaded.csv based on URL

final_dataset = pd.merge(df, content, on = "FILENAME", how = "left")
final_dataset = final_dataset.drop_duplicates()
final_dataset.to_csv("Final_dataset_v1.csv", index = False)