In [None]:
import pandas as pd
# Load the four raw inputs from the notebook's working directory.
# Three of the files are read with header=None, so their columns are labelled
# by position (0, 1, ...); Malicious_URLs.csv is read with its own header row.
benign_df, mixed_df, tranco_df = (
    pd.read_csv(csv_name, header=None)
    for csv_name in (
        "benign_domain_list.csv",
        "mixed_domain_list.csv",
        "tranco_full_list_for_ranking.csv",
    )
)
malicious_df = pd.read_csv("Malicious_URLs.csv")

In [None]:
# Display benign_df as loaded from benign_domain_list.csv (read with
# header=None, so its columns are labelled by position).
benign_df

In [None]:
# Display malicious_df as loaded from Malicious_URLs.csv (read with the file's
# own header row; a later cell reads its 'url' column).
malicious_df

In [None]:
# Display mixed_df as loaded from mixed_domain_list.csv (read with
# header=None, so its columns are labelled by position).
mixed_df

In [None]:
# Display tranco_df as loaded from tranco_full_list_for_ranking.csv (read with
# header=None; later cells use column 0 as the rank and column 1 as the domain).
tranco_df

In [None]:
# Block for installs
%pip install dnstwist
%pip install newspaper3k
%pip install requests
%pip install beautifulsoup4

In [None]:
### Defining methods block
from urllib.parse import urlparse
import tldextract
import dnstwist
from newspaper import Article, Config
import requests
from bs4 import BeautifulSoup

def calculate_percentage_digits(domain):
    """Return the percentage (0-100) of characters in ``domain`` that are digits.

    Args:
        domain: String to inspect (a domain / FQDN in this notebook).

    Returns:
        float: ``digits / len(domain) * 100``, or ``0.0`` for an empty string.
    """
    total_chars = len(domain)
    if total_chars == 0:
        # get_fqdn can yield '' (URL with no network location); an empty string
        # contains no digits, and dividing by its length would raise
        # ZeroDivisionError and abort the whole DataFrame.apply.
        return 0.0
    digits_count = sum(c.isdigit() for c in domain)
    return digits_count / total_chars * 100

def get_fqdn(url):
    """Return the network-location part of ``url``.

    ``urlparse`` only recognises a netloc when it is introduced by ``//``, so a
    scheme-less entry such as ``example.com/login`` would parse to an empty
    netloc. Such inputs are parsed a second time with ``//`` prepended.

    Args:
        url: URL string, with or without a scheme.

    Returns:
        str: The ``netloc`` component (host, plus port/credentials if the URL
        carries them), or ``''`` when no network location can be found.
    """
    netloc = urlparse(url).netloc
    if not netloc:
        # No "//" introducer was found: retry treating the text as "host/path".
        netloc = urlparse("//" + url).netloc
    return netloc

def count_unique_chars(domain):
    """Count the distinct characters in ``domain``, ignoring the '.' separator.

    Args:
        domain: String to inspect (a domain / FQDN in this notebook).

    Returns:
        int: Number of distinct characters other than ``'.'``.
    """
    # Remove '.' from the set instead of subtracting 1 unconditionally, so
    # inputs without a dot (or empty inputs) are not under-counted.
    return len(set(domain) - {'.'})

def count_hyphens(domain):
    """Return how many '-' characters ``domain`` contains."""
    return domain.count('-')

def count_dots(domain):
    """Return how many '.' characters ``domain`` contains."""
    return domain.count('.')

def get_tld(domain):
    """Return the ``suffix`` field of ``tldextract.extract(domain)``."""
    return tldextract.extract(domain).suffix

def dnstwist_results(domain):
    """Generate dnstwist permutations for ``domain``.

    Args:
        domain: Domain name handed to ``dnstwist.Fuzzer``.

    Returns:
        The value returned by ``Fuzzer.generate()`` when it is not ``None``;
        otherwise the fuzzer's ``domains`` attribute (``None`` if it has none).
    """
    fuzzer = dnstwist.Fuzzer(domain)
    generated = fuzzer.generate()
    if generated is not None:
        return generated
    # NOTE(review): generate() appears to populate fuzzer.domains in place and
    # return None, which would make the original `return results.generate()`
    # always None — confirm against the installed dnstwist version.
    return getattr(fuzzer, 'domains', None)

def get_reputation_score(tld, timeout=10):
    """Scrape the score shown on the Spamhaus statistics page for ``tld``.

    Args:
        tld: TLD string interpolated into the statistics page URL.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        str: Stripped text of the first ``<div class="score">`` on the page,
        or ``'Not found'`` when the page has no such element.

    Raises:
        requests.exceptions.RequestException: On connection failure or timeout.
    """
    url = f'https://www.spamhaus.org/statistics/tlds/{tld}/'
    # requests has no default timeout; without one a stalled connection would
    # block the per-TLD loop indefinitely.
    r = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(r.content, 'html.parser')
    score_div = soup.find('div', {'class': 'score'})
    if score_div is None:
        return 'Not found'
    return score_div.text.strip()


def generate_potential_typosquatting_domains(domain):
    """Return the set of ``.domain`` values from dnstwist variations of ``domain``.

    Args:
        domain: Domain name handed to ``dnstwist.Fuzzer``.

    Returns:
        set: The ``domain`` attribute of every variation returned by
        ``Fuzzer.generate``; an empty set when it returns ``None``.
    """
    # NOTE(review): the max_changes / max_additions / max_deletions keywords
    # could not be matched to dnstwist's documented API — confirm that the
    # installed version accepts them and that generate() returns the variations.
    variations = dnstwist.Fuzzer(domain).generate(max_changes=2, max_additions=2, max_deletions=2)
    if variations is None:
        return set()
    return {variation.domain for variation in variations}

In [None]:
# Lexical features for the benign list. Column 0 holds the domain string
# because the CSV was read with header=None.
benign_domains = benign_df[0]
benign_df['Domain'] = benign_domains
benign_df['Domain Length'] = benign_domains.str.len()
benign_df['Digit Percentage'] = benign_domains.apply(calculate_percentage_digits)
benign_df['Unique Character Count'] = benign_domains.apply(count_unique_chars)
# NOTE(review): 'Hypen Count' looks like a typo for 'Hyphen Count' (the
# malicious_df cell uses the latter); kept as-is in case other cells read it.
benign_df['Hypen Count'] = benign_domains.apply(count_hyphens)
benign_df['Dot Count'] = benign_domains.apply(count_dots)
benign_df['TLD'] = benign_domains.apply(get_tld)
# Disabled enrichment steps (dnstwist permutations, per-TLD reputation scrape):
#benign_df['DNSTwist Results'] = benign_df[0].apply(dnstwist_results)
# tlds = benign_df['TLD']
# scores = []
# for tld in tlds:
#     score = get_reputation_score(tld)
#     scores.append(score)
# benign_df['TLD Reputation Score'] = scores
# Index the Tranco list by its raw domain column (1) rather than 'Domain':
# tranco_df only gains a 'Domain' column in a later cell, so on a fresh
# top-to-bottom run set_index('Domain') raises KeyError. Column 0 is the rank.
tranco_rank_by_domain = tranco_df.set_index(1)[0]
benign_df['Tranco Rank'] = benign_df['Domain'].map(tranco_rank_by_domain)
benign_df

In [None]:
import multiprocessing

# NOTE(review): with the 'spawn' start method (default on Windows and macOS)
# worker processes must be able to import the mapped function; a function
# defined in a notebook cell may not be reachable there — confirm on the
# target platform.
num_processes = multiprocessing.cpu_count()
with multiprocessing.Pool(num_processes) as pool:
    # One set of generated look-alike domains per benign domain, in input order.
    potential_typosquatting_domains_list = pool.map(
        generate_potential_typosquatting_domains,
        benign_df['Domain'].tolist(),
    )
# Merge the per-domain sets into a single lookup set.
potential_typosquatting_domains = set().union(*potential_typosquatting_domains_list)

# Flag every benign domain that occurs among the generated variations.
# NOTE(review): if dnstwist includes the original domain in its own variation
# list, every row will be flagged True — verify this is the intended signal.
benign_df['potential_typosquatting'] = benign_df["Domain"].isin(potential_typosquatting_domains)

In [None]:
# Lexical features for the malicious URL list; the domain is taken from the
# 'url' column via get_fqdn.
malicious_domains = malicious_df['url'].apply(get_fqdn)
malicious_df['Domain'] = malicious_domains
malicious_df['Domain Length'] = malicious_domains.str.len()
malicious_df['Digit Percentage'] = malicious_domains.apply(calculate_percentage_digits)
# NOTE(review): this column is named 'Digit Count' but holds the output of
# count_unique_chars (benign_df stores the same feature as
# 'Unique Character Count'). Name kept as-is; decide which one is intended.
malicious_df['Digit Count'] = malicious_domains.apply(count_unique_chars)
malicious_df['Hyphen Count'] = malicious_domains.apply(count_hyphens)
malicious_df['Dot Count'] = malicious_domains.apply(count_dots)
# Index the Tranco list by its raw domain column (1) rather than 'Domain':
# tranco_df only gains a 'Domain' column in a later cell, so on a fresh
# top-to-bottom run set_index('Domain') raises KeyError. Column 0 is the rank.
tranco_rank_by_domain = tranco_df.set_index(1)[0]
malicious_df['Tranco Rank'] = malicious_df['Domain'].map(tranco_rank_by_domain)
malicious_df

In [None]:

# Lexical features for the mixed list. Column 0 holds the domain string
# because the CSV was read with header=None.
mixed_domains = mixed_df[0]
mixed_df['Domain'] = mixed_domains
mixed_df['Domain Length'] = mixed_domains.str.len()
mixed_df['Digit Percentage'] = mixed_domains.apply(calculate_percentage_digits)
# NOTE(review): this column is named 'Digit Count' but holds the output of
# count_unique_chars (benign_df stores the same feature as
# 'Unique Character Count'). Name kept as-is; decide which one is intended.
mixed_df['Digit Count'] = mixed_domains.apply(count_unique_chars)
# NOTE(review): 'Hypen Count' looks like a typo for 'Hyphen Count' (the
# malicious_df cell uses the latter); kept as-is in case other cells read it.
mixed_df['Hypen Count'] = mixed_domains.apply(count_hyphens)
mixed_df['Dot Count'] = mixed_domains.apply(count_dots)
# Index the Tranco list by its raw domain column (1) rather than 'Domain':
# tranco_df only gains a 'Domain' column in a later cell, so on a fresh
# top-to-bottom run set_index('Domain') raises KeyError. Column 0 is the rank.
tranco_rank_by_domain = tranco_df.set_index(1)[0]
mixed_df['Tranco Rank'] = mixed_df['Domain'].map(tranco_rank_by_domain)
mixed_df

In [None]:
# Lexical features for the Tranco list. Column 1 holds the domain string
# (the file was read with header=None); it is copied to 'Domain' first and
# every feature below is derived from that copy.
tranco_df['Domain'] = tranco_df[1]
tranco_df['Domain Length'] = tranco_df['Domain'].str.len()
tranco_df['Digit Percentage'] = tranco_df['Domain'].map(calculate_percentage_digits)
# NOTE(review): named 'Digit Count' but filled by count_unique_chars — confirm
# which feature is intended.
tranco_df['Digit Count'] = tranco_df['Domain'].map(count_unique_chars)
# NOTE(review): 'Hypen Count' looks like a typo for 'Hyphen Count'.
tranco_df['Hypen Count'] = tranco_df['Domain'].map(count_hyphens)
tranco_df['Dot Count'] = tranco_df['Domain'].map(count_dots)
tranco_df