## Phase 1: Dataset Preparation

### Download and merge both datasets to get all five categories.

In [1]:
import pandas as pd
import os

In [2]:
# Define the folder path where your CSV files are stored
folder_path = r'./datasets/URL/'

# Create a dictionary to map file names (without extension) to types
file_to_type = {
    'Benign_list_big_final': 'benign',
    'DefacementSitesURLFiltered': 'defacement',
    'Malware_dataset': 'malware',
    'phishing_dataset': 'phishing',
    'spam_dataset': 'spam'
}

# Get the list of CSV files in the folder
files_in_folder = [f for f in os.listdir(folder_path) if f.endswith('.csv')]

# Empty list to store DataFrames
dfs = []

# Loop through the files in the folder
for file in files_in_folder:
    # Remove the extension and get the file name without it
    file_name = os.path.splitext(file)[0]
    
    # Check if the file name exists in the dictionary
    if file_name in file_to_type:
        # Read the CSV into a DataFrame (assuming one column with URLs)
        file_path = os.path.join(folder_path, file)
        df = pd.read_csv(file_path, header=None)  # No header, as the file only has URLs
        
        # Rename the column to 'url' since it contains the URLs
        df.columns = ['url']
        
        # Add the 'type' column based on the file name
        df['type'] = file_to_type[file_name]
        
        # Append this DataFrame to the list
        dfs.append(df)
    else:
        print(f"Warning: {file_name} not found in the dictionary.")

# Concatenate all dataframes into one
final_df = pd.concat(dfs, ignore_index=True)

final_df

Unnamed: 0,url,type
0,http://1337x.to/torrent/1048648/American-Snipe...,benign
1,http://1337x.to/torrent/1110018/Blackhat-2015-...,benign
2,http://1337x.to/torrent/1122940/Blackhat-2015-...,benign
3,http://1337x.to/torrent/1124395/Fast-and-Furio...,benign
4,http://1337x.to/torrent/1145504/Avengers-Age-o...,benign
...,...,...
165361,http://archive.salisburyjournal.co.uk/2001/3/7/,spam
165362,http://astore.amazon.co.uk/allezvinsfrenchr/de...,spam
165363,http://archive.thisischeshire.co.uk/2000/1/14/...,spam
165364,http://applerugs.co.uk/rugs/product_info.php?p...,spam


In [3]:
# Now, read the malicious_phish.csv dataset
malicious_df = pd.read_csv(r".\datasets\malicious_phish.csv")

# Concatenate malicious_df with final_df (merge both DataFrames)
final_df = pd.concat([final_df, malicious_df], ignore_index=True)

final_df

Unnamed: 0,url,type
0,http://1337x.to/torrent/1048648/American-Snipe...,benign
1,http://1337x.to/torrent/1110018/Blackhat-2015-...,benign
2,http://1337x.to/torrent/1122940/Blackhat-2015-...,benign
3,http://1337x.to/torrent/1124395/Fast-and-Furio...,benign
4,http://1337x.to/torrent/1145504/Avengers-Age-o...,benign
...,...,...
816552,xbox360.ign.com/objects/850/850402.html,phishing
816553,games.teamxbox.com/xbox-360/1860/Dead-Space/,phishing
816554,www.gamespot.com/xbox360/action/deadspace/,phishing
816555,en.wikipedia.org/wiki/Dead_Space_(video_game),phishing


In [4]:
# Save the combined DataFrame to a CSV
final_df.to_csv('./datasets/merged_dataset.csv', index=False)

In [5]:
merged_df = pd.read_csv(r".\datasets\merged_dataset.csv")
merged_df

Unnamed: 0,url,type
0,http://1337x.to/torrent/1048648/American-Snipe...,benign
1,http://1337x.to/torrent/1110018/Blackhat-2015-...,benign
2,http://1337x.to/torrent/1122940/Blackhat-2015-...,benign
3,http://1337x.to/torrent/1124395/Fast-and-Furio...,benign
4,http://1337x.to/torrent/1145504/Avengers-Age-o...,benign
...,...,...
816552,xbox360.ign.com/objects/850/850402.html,phishing
816553,games.teamxbox.com/xbox-360/1860/Dead-Space/,phishing
816554,www.gamespot.com/xbox360/action/deadspace/,phishing
816555,en.wikipedia.org/wiki/Dead_Space_(video_game),phishing


In [10]:
import urllib.parse
import tldextract
import numpy as np
import re
from collections import Counter
from scipy.stats import entropy

def extract_features(url):
    parsed_url = urllib.parse.urlparse(url)
    ext = tldextract.extract(url)

    domain = ext.domain + "." + ext.suffix if ext.suffix else ext.domain
    path = parsed_url.path
    query = parsed_url.query

    # Length-Based Features
    url_length = len(url)
    domain_length = len(domain)
    path_length = len(path)
    subdir_length = len(parsed_url.path.rsplit('/', 1)[0]) if '/' in parsed_url.path else 0
    filename_length = len(parsed_url.path.split('/')[-1])
    file_extension_length = len(parsed_url.path.split('.')[-1]) if '.' in parsed_url.path else 0
    arg_length = len(query)

    # Token-Based Features
    domain_tokens = domain.split('.')
    path_tokens = path.split('/')
    query_tokens = query.split('&')

    avg_domain_token_len = np.mean([len(token) for token in domain_tokens]) if domain_tokens else 0
    long_domain_token_len = max([len(token) for token in domain_tokens], default=0)
    avg_path_token_len = np.mean([len(token) for token in path_tokens]) if path_tokens else 0

    # Character-Based Features
    digit_count = sum(c.isdigit() for c in url)
    letter_count = sum(c.isalpha() for c in url)
    special_char_count = sum(not c.isalnum() for c in url)

    # Entropy Features
    def calc_entropy(text):
        if not text:
            return 0
        text_count = Counter(text)
        probs = [text_count[c] / len(text) for c in text_count]
        return entropy(probs, base=2)

    url_entropy = calc_entropy(url)
    entropy_domain = calc_entropy(domain)
    entropy_path = calc_entropy(path)
    entropy_query = calc_entropy(query)

    # Obfuscation Features
    is_ip = bool(re.match(r"^(?:[0-9]{1,3}\.){3}[0-9]{1,3}$", domain))
    dot_count = url.count(".")
    
    # Ratio-Based Features
    path_url_ratio = path_length / url_length if url_length else 0
    arg_url_ratio = arg_length / url_length if url_length else 0
    domain_url_ratio = domain_length / url_length if url_length else 0
    path_domain_ratio = path_length / domain_length if domain_length else 0
    arg_path_ratio = arg_length / path_length if path_length else 0  # FIXED THIS

    # Threat Indicators
    sensitive_keywords = ['login', 'bank', 'secure', 'account', 'update', 'verify']
    url_sensitive_word = any(word in url.lower() for word in sensitive_keywords)

    # Executable Indicators
    executable = 1 if url.endswith(('.exe', '.zip', '.rar', '.tar', '.gz')) else 0

    # Number of symbols
    delimiter_count = url.count('.') + url.count('/') + url.count('?') + url.count('=') + url.count('&')

    # Max Token Lengths
    max_path_token_len = max(len(token) for token in path_tokens) if path_tokens else 0
    max_domain_token_len = max(len(token) for token in domain_tokens) if domain_tokens else 0
    max_query_token_len = max(len(token) for token in query_tokens) if query_tokens else 0

    # Additional Features to Make Exactly 79
    number_rate_url = digit_count / url_length if url_length else 0
    number_rate_domain = digit_count / domain_length if domain_length else 0
    number_rate_path = digit_count / path_length if path_length else 0
    number_rate_filename = digit_count / filename_length if filename_length else 0
    number_rate_extension = digit_count / file_extension_length if file_extension_length else 0
    number_rate_query = digit_count / arg_length if arg_length else 0
    
    special_domain_count = sum(c in '.-' for c in domain)
    special_path_count = sum(c in '.-' for c in path)
    special_filename_count = sum(c in '.-' for c in parsed_url.path.split('/')[-1])
    special_extension_count = sum(c in '.-' for c in parsed_url.path.split('.')[-1]) if '.' in parsed_url.path else 0
    special_query_count = sum(c in '.-' for c in query)

    feature_vector = [
        url_length, domain_length, path_length, subdir_length, filename_length, file_extension_length, arg_length,
        len(domain_tokens), len(path_tokens), avg_domain_token_len, long_domain_token_len, avg_path_token_len,
        digit_count, letter_count, special_char_count,
        url_entropy, entropy_domain, entropy_path, entropy_query,
        is_ip, dot_count, delimiter_count, path_url_ratio, arg_url_ratio, domain_url_ratio, path_domain_ratio, arg_path_ratio,
        max_path_token_len, max_domain_token_len, max_query_token_len,
        url_sensitive_word, executable,
        number_rate_url, number_rate_domain, number_rate_path, number_rate_filename, number_rate_extension, number_rate_query,
        special_domain_count, special_path_count, special_filename_count, special_extension_count, special_query_count
    ]

    # **Ensure Exactly 79 Features**
    while len(feature_vector) < 80:
        feature_vector.append(0)

    return feature_vector

# **Test the Function**
test_url = "http://example.com/test?param=value"
features = extract_features(test_url)

print(f"Extracted {len(features)} features")
print("Feature Vector:", features)


Extracted 80 features
Feature Vector: [35, 11, 5, 0, 4, 0, 11, 2, 2, 5.0, 7, 2.0, 0, 28, 7, 3.978864088188099, 3.095795255000934, 1.9219280948873625, 3.0271691184406184, False, 1, 6, 0.14285714285714285, 0.3142857142857143, 0.3142857142857143, 0.45454545454545453, 2.2, 4, 7, 11, False, 0, 0.0, 0.0, 0.0, 0.0, 0, 0.0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [13]:
import urllib.parse
import tldextract
import numpy as np
import re
from collections import Counter
from scipy.stats import entropy

def extract_features(url):
    parsed_url = urllib.parse.urlparse(url)
    ext = tldextract.extract(url)

    domain = ext.domain + "." + ext.suffix if ext.suffix else ext.domain
    path = parsed_url.path
    query = parsed_url.query

    # Length-Based Features
    features = {
        'urlLen': len(url),
        'domainlength': len(domain),
        'pathLength': len(path),
        'subDirLen': len(parsed_url.path.rsplit('/', 1)[0]) if '/' in parsed_url.path else 0,
        'fileNameLen': len(parsed_url.path.split('/')[-1]),
        'this.fileExtLen': len(parsed_url.path.split('.')[-1]) if '.' in parsed_url.path else 0,
        'ArgLen': len(query),
    }

    # Token-Based Features
    domain_tokens = domain.split('.')
    path_tokens = path.split('/')
    query_tokens = query.split('&')

    features.update({
        'domain_token_count': len(domain_tokens),
        'path_token_count': len(path_tokens),
        'avgdomaintokenlen': np.mean([len(token) for token in domain_tokens]) if domain_tokens else 0,
        'longdomaintokenlen': max([len(token) for token in domain_tokens], default=0),
        'avgpathtokenlen': np.mean([len(token) for token in path_tokens]) if path_tokens else 0,
    })

    # Character-Based Features
    features.update({
        'URL_DigitCount': sum(c.isdigit() for c in url),
        'URL_Letter_Count': sum(c.isalpha() for c in url),
        'spcharUrl': sum(not c.isalnum() for c in url),
    })

    # Entropy Features
    def calc_entropy(text):
        if not text:
            return 0
        text_count = Counter(text)
        probs = [text_count[c] / len(text) for c in text_count]
        return entropy(probs, base=2)

    features.update({
        'Entropy_URL': calc_entropy(url),
        'Entropy_Domain': calc_entropy(domain),
        'Entropy_DirectoryName': calc_entropy(path),
        'Entropy_Afterpath': calc_entropy(query),
    })

    # Obfuscation Features
    features.update({
        'ISIpAddressInDomainName': bool(re.match(r"^(?:[0-9]{1,3}\.){3}[0-9]{1,3}$", domain)),
        'NumberofDotsinURL': url.count("."),
        'delimeter_Count': url.count('.') + url.count('/') + url.count('?') + url.count('=') + url.count('&'),
    })

    # Ratio-Based Features
    features.update({
        'pathurlRatio': features['pathLength'] / features['urlLen'] if features['urlLen'] else 0,
        'ArgUrlRatio': features['ArgLen'] / features['urlLen'] if features['urlLen'] else 0,
        'domainUrlRatio': features['domainlength'] / features['urlLen'] if features['urlLen'] else 0,
        'pathDomainRatio': features['pathLength'] / features['domainlength'] if features['domainlength'] else 0,
        'argPathRatio': features['ArgLen'] / features['pathLength'] if features['pathLength'] else 0,
    })

    # Threat Indicators
    sensitive_keywords = ['login', 'bank', 'secure', 'account', 'update', 'verify']
    features['URL_sensitiveWord'] = any(word in url.lower() for word in sensitive_keywords)

    # Executable Indicators
    features['executable'] = 1 if url.endswith(('.exe', '.zip', '.rar', '.tar', '.gz')) else 0

    # Symbol Counts
    features.update({
        'SymbolCount_URL': sum(c in '.-' for c in url),
        'SymbolCount_Domain': sum(c in '.-' for c in domain),
        'SymbolCount_Directoryname': sum(c in '.-' for c in path),
        'SymbolCount_Afterpath': sum(c in '.-' for c in query),
    })

    # **Ensure Exactly 80 Features**
    while len(features) < 80:
        features[f"extra_feature_{len(features)}"] = 0  # Add dummy features if necessary

    return features

# **Test the Function**
features_dict = extract_features("http://example.com/test?param=value")

print(f"Extracted {len(features_dict)} features")
print(features_dict)
# # **Convert to DataFrame**
# features_df = pd.DataFrame(merged_df['url'].apply(extract_features).tolist())

# # **Check if Column Names Match**
# print("Matching Columns:", set(features_df.columns) == set(feature_columns))

# # **Save the Dataset**
# final_df = pd.concat([merged_df[['url']], features_df, merged_df[['type']]], axis=1)
# final_df.to_csv("./datasets/final_dataset.csv", index=False)

# print(final_df.head())

# Convert extracted features to a set
extracted_feature_names = set(features_dict.keys())

# Convert expected feature names to a set
expected_feature_names = set(feature_columns)

# Find missing and extra features
missing_features = expected_feature_names - extracted_feature_names
extra_features = extracted_feature_names - expected_feature_names

print("\nMissing Features:", missing_features)
print("\nExtra Features:", extra_features)

Extracted 80 features
{'urlLen': 35, 'domainlength': 11, 'pathLength': 5, 'subDirLen': 0, 'fileNameLen': 4, 'this.fileExtLen': 0, 'ArgLen': 11, 'domain_token_count': 2, 'path_token_count': 2, 'avgdomaintokenlen': 5.0, 'longdomaintokenlen': 7, 'avgpathtokenlen': 2.0, 'URL_DigitCount': 0, 'URL_Letter_Count': 28, 'spcharUrl': 7, 'Entropy_URL': 3.978864088188099, 'Entropy_Domain': 3.095795255000934, 'Entropy_DirectoryName': 1.9219280948873625, 'Entropy_Afterpath': 3.0271691184406184, 'ISIpAddressInDomainName': False, 'NumberofDotsinURL': 1, 'delimeter_Count': 6, 'pathurlRatio': 0.14285714285714285, 'ArgUrlRatio': 0.3142857142857143, 'domainUrlRatio': 0.3142857142857143, 'pathDomainRatio': 0.45454545454545453, 'argPathRatio': 2.2, 'URL_sensitiveWord': False, 'executable': 0, 'SymbolCount_URL': 1, 'SymbolCount_Domain': 1, 'SymbolCount_Directoryname': 0, 'SymbolCount_Afterpath': 0, 'extra_feature_33': 0, 'extra_feature_34': 0, 'extra_feature_35': 0, 'extra_feature_36': 0, 'extra_feature_37': 

In [14]:
import urllib.parse
import tldextract
import numpy as np
import re
from collections import Counter
from scipy.stats import entropy

def calc_entropy(text):
    if not text:
        return 0
    text_count = Counter(text)
    probs = [text_count[c] / len(text) for c in text_count]
    return entropy(probs, base=2)

def extract_features(url):
    parsed_url = urllib.parse.urlparse(url)
    ext = tldextract.extract(url)

    domain = ext.domain + "." + ext.suffix if ext.suffix else ext.domain
    path = parsed_url.path
    query = parsed_url.query
    tld = ext.suffix

    domain_tokens = domain.split('.')
    path_tokens = path.split('/')
    query_tokens = query.split('&')
    
    features = {
        'Querylength': len(query),
        'domain_token_count': len(domain_tokens),
        'path_token_count': len(path_tokens),
        'avgdomaintokenlen': np.mean([len(token) for token in domain_tokens]) if domain_tokens else 0,
        'longdomaintokenlen': max([len(token) for token in domain_tokens], default=0),
        'avgpathtokenlen': np.mean([len(token) for token in path_tokens]) if path_tokens else 0,
        'tld': tld,
        'charcompvowels': sum(c in 'aeiou' for c in url),
        'charcompace': sum(c.isspace() for c in url),
        'ldl_url': len(url),
        'ldl_domain': len(domain),
        'ldl_path': len(path),
        'ldl_filename': len(path.split('/')[-1]),
        'ldl_getArg': len(query),
        'dld_url': len(url) - len(domain),
        'dld_domain': len(domain) - len(tld),
        'dld_path': len(path) - len(domain),
        'dld_filename': len(path.split('/')[-1]) if '.' in path else 0,
        'dld_getArg': len(query),
        'urlLen': len(url),
        'domainlength': len(domain),
        'pathLength': len(path),
        'subDirLen': len(parsed_url.path.rsplit('/', 1)[0]) if '/' in parsed_url.path else 0,
        'fileNameLen': len(parsed_url.path.split('/')[-1]),
        'this.fileExtLen': len(parsed_url.path.split('.')[-1]) if '.' in parsed_url.path else 0,
        'ArgLen': len(query),
        'pathurlRatio': len(path) / len(url) if len(url) else 0,
        'ArgUrlRatio': len(query) / len(url) if len(url) else 0,
        'argDomanRatio': len(query) / len(domain) if len(domain) else 0,
        'domainUrlRatio': len(domain) / len(url) if len(url) else 0,
        'pathDomainRatio': len(path) / len(domain) if len(domain) else 0,
        'argPathRatio': len(query) / len(path) if len(path) else 0,
        'executable': 1 if url.endswith(('.exe', '.zip', '.rar', '.tar', '.gz')) else 0,
        'isPortEighty': ':80' in url,
        'NumberofDotsinURL': url.count('.'),
        'ISIpAddressInDomainName': bool(re.match(r"^(?:[0-9]{1,3}\.){3}[0-9]{1,3}$", domain)),
        'CharacterContinuityRate': sum(1 for i in range(len(url)-1) if url[i] == url[i+1]) / len(url) if len(url) > 1 else 0,
        'LongestVariableValue': max(map(len, query_tokens), default=0),
        'URL_DigitCount': sum(c.isdigit() for c in url),
        'host_DigitCount': sum(c.isdigit() for c in domain),
        'Directory_DigitCount': sum(c.isdigit() for c in path),
        'File_name_DigitCount': sum(c.isdigit() for c in path.split('/')[-1]),
        'Extension_DigitCount': sum(c.isdigit() for c in tld),
        'Query_DigitCount': sum(c.isdigit() for c in query),
        'URL_Letter_Count': sum(c.isalpha() for c in url),
        'host_letter_count': sum(c.isalpha() for c in domain),
        'Directory_LetterCount': sum(c.isalpha() for c in path),
        'Filename_LetterCount': sum(c.isalpha() for c in path.split('/')[-1]),
        'Extension_LetterCount': sum(c.isalpha() for c in tld),
        'Query_LetterCount': sum(c.isalpha() for c in query),
        'LongestPathTokenLength': max(map(len, path_tokens), default=0),
        'Domain_LongestWordLength': max(map(len, domain_tokens), default=0),
        'Path_LongestWordLength': max(map(len, path_tokens), default=0),
        'sub-Directory_LongestWordLength': max(map(len, path_tokens[:-1]), default=0) if path_tokens else 0,
        'Arguments_LongestWordLength': max(map(len, query_tokens), default=0),
        'URL_sensitiveWord': any(word in url.lower() for word in ['login', 'bank', 'secure', 'account', 'update', 'verify']),
        'URLQueries_variable': len(query_tokens),
        'spcharUrl': sum(not c.isalnum() for c in url),
        'delimeter_Domain': domain.count('-'),
        'delimeter_path': path.count('-'),
        'delimeter_Count': url.count('.') + url.count('/') + url.count('?') + url.count('=') + url.count('&'),
        'NumberRate_URL': sum(c.isdigit() for c in url) / len(url) if len(url) else 0,
        'NumberRate_Domain': sum(c.isdigit() for c in domain) / len(domain) if len(domain) else 0,
        'NumberRate_DirectoryName': sum(c.isdigit() for c in path) / len(path) if len(path) else 0,
        'NumberRate_FileName': sum(c.isdigit() for c in path.split('/')[-1]) / len(path.split('/')[-1]) if len(path.split('/')[-1]) else 0,
        'NumberRate_Extension': sum(c.isdigit() for c in tld) / len(tld) if len(tld) else 0,
        'NumberRate_AfterPath': sum(c.isdigit() for c in query) / len(query) if len(query) else 0,
        'SymbolCount_URL': url.count('-'),
        'SymbolCount_Domain': domain.count('-'),
        'SymbolCount_Directoryname': path.count('-'),
        'SymbolCount_FileName': path.split('/')[-1].count('-'),
        'SymbolCount_Extension': tld.count('-'),
        'SymbolCount_Afterpath': query.count('-'),
        'Entropy_URL': calc_entropy(url),
        'Entropy_Domain': calc_entropy(domain),
        'Entropy_DirectoryName': calc_entropy(path),
        'Entropy_Filename': calc_entropy(path.split('/')[-1]),
        'Entropy_Extension': calc_entropy(tld),
        'Entropy_Afterpath': calc_entropy(query)
    }
    return features

# **Test the Function**
features_dict = extract_features("http://example.com/test?param=value")

print(f"Extracted {len(features_dict)} features")
print(features_dict)

Extracted 79 features
{'Querylength': 11, 'domain_token_count': 2, 'path_token_count': 2, 'avgdomaintokenlen': 5.0, 'longdomaintokenlen': 7, 'avgpathtokenlen': 2.0, 'tld': 'com', 'charcompvowels': 10, 'charcompace': 0, 'ldl_url': 35, 'ldl_domain': 11, 'ldl_path': 5, 'ldl_filename': 4, 'ldl_getArg': 11, 'dld_url': 24, 'dld_domain': 8, 'dld_path': -6, 'dld_filename': 0, 'dld_getArg': 11, 'urlLen': 35, 'domainlength': 11, 'pathLength': 5, 'subDirLen': 0, 'fileNameLen': 4, 'this.fileExtLen': 0, 'ArgLen': 11, 'pathurlRatio': 0.14285714285714285, 'ArgUrlRatio': 0.3142857142857143, 'argDomanRatio': 1.0, 'domainUrlRatio': 0.3142857142857143, 'pathDomainRatio': 0.45454545454545453, 'argPathRatio': 2.2, 'executable': 0, 'isPortEighty': False, 'NumberofDotsinURL': 1, 'ISIpAddressInDomainName': False, 'CharacterContinuityRate': 0.05714285714285714, 'LongestVariableValue': 11, 'URL_DigitCount': 0, 'host_DigitCount': 0, 'Directory_DigitCount': 0, 'File_name_DigitCount': 0, 'Extension_DigitCount': 0,

In [16]:
# Apply feature extraction function to all URLs and create a DataFrame
features_df = merged_df['url'].apply(extract_features).apply(pd.Series)

# Ensure the feature column names match the expected feature_columns list
features_df = features_df[feature_columns]

# Concatenate 'url', extracted features, and 'type' in order
final_df = pd.concat([merged_df[['url']], features_df, merged_df[['type']]], axis=1)

# Save the updated dataset
final_df.to_csv("./datasets/Final_dataset.csv", index=False)

# Print sample output
print(final_df.head())

                                                 url  Querylength  \
0  http://1337x.to/torrent/1048648/American-Snipe...            0   
1  http://1337x.to/torrent/1110018/Blackhat-2015-...            0   
2  http://1337x.to/torrent/1122940/Blackhat-2015-...            0   
3  http://1337x.to/torrent/1124395/Fast-and-Furio...            0   
4  http://1337x.to/torrent/1145504/Avengers-Age-o...            0   

   domain_token_count  path_token_count  avgdomaintokenlen  \
0                   2                 5                3.5   
1                   2                 5                3.5   
2                   2                 5                3.5   
3                   2                 5                3.5   
4                   2                 5                3.5   

   longdomaintokenlen  avgpathtokenlen tld  charcompvowels  charcompace  ...  \
0                   5             12.8  to              10            0  ...   
1                   5             12.8  to          

In [17]:
final_df.tail()

Unnamed: 0,url,Querylength,domain_token_count,path_token_count,avgdomaintokenlen,longdomaintokenlen,avgpathtokenlen,tld,charcompvowels,charcompace,...,SymbolCount_FileName,SymbolCount_Extension,SymbolCount_Afterpath,Entropy_URL,Entropy_Domain,Entropy_DirectoryName,Entropy_Filename,Entropy_Extension,Entropy_Afterpath,type
816552,xbox360.ign.com/objects/850/850402.html,0,2,4,3.0,3,9.0,com,5,0,...,0,0,0,4.355539,2.807355,4.355539,3.277613,1.584963,0.0,phishing
816553,games.teamxbox.com/xbox-360/1860/Dead-Space/,0,2,5,5.5,8,8.0,com,11,0,...,0,0,0,4.2433,3.084963,4.2433,0.0,1.584963,0.0,phishing
816554,www.gamespot.com/xbox360/action/deadspace/,0,2,5,5.5,8,7.6,com,12,0,...,0,0,0,4.147921,3.251629,4.147921,0.0,1.584963,0.0,phishing
816555,en.wikipedia.org/wiki/Dead_Space_(video_game),0,2,3,6.0,9,14.333333,org,18,0,...,0,0,0,4.102313,3.334679,4.102313,3.675311,1.584963,0.0,phishing
816556,www.angelfire.com/goth/devilmaycrytonite/,0,2,4,6.0,9,9.5,com,12,0,...,0,0,0,4.143541,3.546594,4.143541,0.0,1.584963,0.0,phishing


In [18]:
final_df.iloc[12002]

url                      http://likemag.com/sites/default/files/css/css...
Querylength                                                              0
domain_token_count                                                       2
path_token_count                                                         6
avgdomaintokenlen                                                      5.0
                                               ...                        
Entropy_DirectoryName                                              4.87505
Entropy_Filename                                                  4.778119
Entropy_Extension                                                 1.584963
Entropy_Afterpath                                                      0.0
type                                                                benign
Name: 12002, Length: 81, dtype: object

### Handle missing values, duplicates, and inconsistencies.


### Balance classes using SMOTE, oversampling, or undersampling.


## Phase 2: Exploratory Data Analysis (EDA)

### Visualize URL distributions, token lengths, and special characters.

### Generate at least five meaningful graphs to highlight patterns.

### Identify relationships between URL structure and malicious behavior.

## Phase 3: Feature Engineering

### Extract structural features (length, subdomains, special characters).

### Use TF-IDF, Word2Vec, or Transformers for NLP-based embeddings.

### Consider character-level models for sequence-based feature extraction.

## Phase 4: Model Training

### Train 3 Traditional ML models (Random Forest, SVM, XGBoost).

### Train Deep Learning models (LSTM, CNN) for sequence-based classification.

### Fine-tune BERT/GPT to capture deeper URL semantics.

### Compare model performance with confusion matrices & ROC curves.