In [27]:
import ssl
import socket
from pprint import pprint
from datetime import datetime

def parse_certificate_timestamp(timestamp_str):
    # Example: 'Nov  6 04:14:47 2023 GMT'
    return datetime.strptime(timestamp_str, '%b %d %H:%M:%S %Y %Z')

def print_certificate_details(cert):
    # Check if the subject is empty
    if not cert.get('subject', []):
        print("Subject is empty.")
        return

    # Extract and print the state and locality from the subject
    state = locality = common_name = None
    for field in cert['subject']:
        if len(field) == 2:
            if field[0][0] == 'S':
                state = field[0][1]
            elif field[0][0] == 'L':
                locality = field[0][1]
            elif field[0][0] == 'CN':
                common_name = field[0][1]

    # Print the extracted information
    if state is not None:
        print(f"State/Province (S): {state}")

    if locality is not None:
        print(f"Locality/City (L): {locality}")

    if common_name is not None:
        print(f"Common Name (CN): {common_name}")

    # Number of fields in the subject RDN
    print(f"Number of fields in subject RDN: {len(cert['subject'])}")

    # Length of the subject
    print(f"Length of the subject: {len(str(cert['subject']))}")

    # Validity period of the certificate
    not_before = parse_certificate_timestamp(cert['notBefore'])
    not_after = parse_certificate_timestamp(cert['notAfter'])
    print(f"Validity Period: {not_before} to {not_after}")

    # Number of policy identifiers embedded in certificate
    print(f"Number of Policy Identifiers: {len(cert.get('policy', []))}")

    # Check if the certificate has an OSCP (Online Certificate Status Protocol) link
    ocsp = next((url for url in cert.get('OCSP', []) if url.startswith("http")), None)
    print(f"OCSP Link: {ocsp}" if ocsp else "No OCSP Link found.")

    # Check if the certificate has a CDP (CRL Distribution Point) link
    cdp = next((url for url in cert.get('crlDistributionPoints', []) if url.startswith("http")), None)
    print(f"CDP Link: {cdp}" if cdp else "No CDP Link found.")

    # The number of Subject Alternative Names (SANs)
    sans = cert.get('subjectAltName', [])
    print(f"Number of Subject Alternative Names (SANs): {len(sans)}")

    # The unique number of TLDs of SANs
    unique_tlds = {san[1].split('.')[-1] for san in sans} if sans else set()
    print(f"Unique TLDs of SANs: {len(unique_tlds)}")

    # Attempt to extract public key information
    public_key_info = None

    # First, try to get from 'subjectPublicKeyInfo'
    if 'subjectPublicKeyInfo' in cert:
        public_key_info = cert['subjectPublicKeyInfo']
    # If not found, try to get from 'extensions'
    elif 'extensions' in cert:
        for ext in cert['extensions']:
            if ext[0] == 'subjectPublicKeyInfo':
                public_key_info = ext[1]
                break

    # Print public key information if available
    if public_key_info:
        public_key_algorithm = public_key_info['algorithm']['algorithm']
        public_key_bits = public_key_info['bits']
        public_key = public_key_info['key']

        print(f"Public Key Algorithm: {public_key_algorithm}")
        print(f"Public Key Size: {public_key_bits} bits")
        print(f"Public Key: {public_key}")
    else:
        print("Public Key information not found in the certificate.")

    # The key usage
    key_usage = cert.get('keyUsage', [])
    print(f"Key Usage: {', '.join(key_usage)}" if key_usage else "No Key Usage found.")

    # The version of the certificate
    print(f"Certificate Version: {cert['version'] + 1}")

    # The signature algorithm
    if 'signatureAlgorithm' in cert:
        # The signature algorithm
        print(f"Signature Algorithm: {cert['signatureAlgorithm']}")
    else:
        print("Signature Algorithm not found in the certificate.")
    # The length of the serial number
    print(f"Length of the Serial Number: {len(cert['serialNumber'])}")

    # Check if the certificate has expired
    is_expired = datetime.utcnow() > not_after
    print(f"The certificate {'has' if is_expired else 'has not'} expired.")

    # Check if the issuer is empty
    if not cert.get('issuer', []):
        print("Issuer is empty.")

    # Check if the issuer has a common name (CN) field
    issuer_common_name = next((field[0][1] for field in cert['issuer'] if field[0][0] == 'CN'), None)
    print(f"Issuer Common Name (CN): {issuer_common_name}" if issuer_common_name else "Issuer does not have a Common Name.")

    # Check if the subject is empty
    if not cert.get('subject', []):
        print("Subject is empty.")

    # Check if the certificate has any extensions
    extensions = cert.get('extensions', [])
    print(f"The certificate {'has' if extensions else 'does not have'} extensions.")

    # Check if the serial numbers conform to requirements
    # (You would need to define the specific requirements)

    # Check if the NotBefore timestamp is higher than the NotAfter
    not_before_higher = not_before > not_after
    print(f"The NotBefore timestamp is {'higher' if not_before_higher else 'lower'} than the NotAfter.")

    # Length of the Longest Common Subsequence (LCS) of all SANs
    def longest_common_subsequence(s1, s2):
        m, n = len(s1), len(s2)
        dp = [[0] * (n + 1) for _ in range(m + 1)]

        for i in range(1, m + 1):
            for j in range(1, n + 1):
                if s1[i - 1] == s2[j - 1]:
                    dp[i][j] = dp[i - 1][j - 1] + 1
                else:
                    dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])

        return dp[m][n]

    lcs_length = max(longest_common_subsequence(san1[1], san2[1]) for san1 in sans for san2 in sans if san1 != san2)
    print(f"Length of the Longest Common Subsequence (LCS) of all SANs: {lcs_length}")

    # Length of the Longest Common Subsequence (LCS) of all SANs, normalized
    normalized_lcs_length = lcs_length / max(len(san[1]) for san in sans)
    print(f"Length of the Normalized LCS of all SANs: {normalized_lcs_length}")

    # Check if the certificate contains a SAN with a CDN pattern
    cdn_pattern = any('cdn' in san[1].lower() for san in sans)
    print(f"The certificate {'contains' if cdn_pattern else 'does not contain'} a SAN with a CDN pattern.")

    # Existence and critical status of extensions
    extension_checks = {
        'authorityInfoAccess': 'AIA',
        'certificatePolicies': 'Certificate Policies',
        'basicConstraints': 'Basic Constraints',
        'crlDistributionPoints': 'CRL Distribution Points',
        'subjectAltName': 'Subject Alternative Name',
        'extendedKeyUsage': 'Extended Key Usage',
        'authorityKeyIdentifier': 'Authority Key Identifier',
        'signedCertificateTimestamps': 'Signed Certificate Timestamps',
        'inhibitAnyPolicy': 'Inhibit Any Policy',
        'policyConstraints': 'Policy Constraints',
        'nameConstraints': 'Name Constraints',
        'issuerAltName': 'Issuer Alternative Name'
    }

    for extension_name, display_name in extension_checks.items():
        if extension_name in extensions:
            critical = extensions[extension_name].get('critical', False)
            print(f"{display_name} extension {'exists' if critical else 'exists but is not'} critical.")
        else:
            print(f"{display_name} extension does not exist.")

# Example usage
hostname = "nebulanerd.org"
context = ssl.create_default_context()

with context.wrap_socket(socket.socket(socket.AF_INET), server_hostname=hostname) as s:
    s.connect((hostname, 443))
    cert = s.getpeercert()

# Print the detailed information about the certificate
print_certificate_details(cert)


Number of fields in subject RDN: 1
Length of the subject: 42
Validity Period: 2023-11-20 12:46:59 to 2024-02-18 12:46:58
Number of Policy Identifiers: 0
OCSP Link: http://r3.o.lencr.org
No CDP Link found.
Number of Subject Alternative Names (SANs): 3
Unique TLDs of SANs: 2
Public Key information not found in the certificate.
No Key Usage found.
Certificate Version: 4
Signature Algorithm not found in the certificate.
Length of the Serial Number: 36
The certificate has not expired.
Issuer does not have a Common Name.
The certificate does not have extensions.
The NotBefore timestamp is lower than the NotAfter.
Length of the Longest Common Subsequence (LCS) of all SANs: 14
Length of the Normalized LCS of all SANs: 0.7777777777777778
The certificate does not contain a SAN with a CDN pattern.
AIA extension does not exist.
Certificate Policies extension does not exist.
Basic Constraints extension does not exist.
CRL Distribution Points extension does not exist.
Subject Alternative Name extens

In [37]:
import ssl
import socket
from cryptography import x509
from cryptography.hazmat.backends import default_backend

hostname = "techradar.com"
context = ssl.create_default_context()

with context.wrap_socket(socket.socket(socket.AF_INET), server_hostname=hostname) as s:
    s.connect((hostname, 443))
    pem_data = ssl.DER_cert_to_PEM_cert(s.getpeercert(binary_form=True))

# Convert PEM data to bytes
pem_bytes = pem_data.encode('utf-8')

# Parse the certificate using cryptography library
cert = x509.load_pem_x509_certificate(pem_bytes, default_backend())

# Extract and print public key information
public_key = cert.public_key()

# Infer public key algorithm from the type of the public key
public_key_algorithm = public_key.__class__.__name__
print(f"Public Key Algorithm: {public_key_algorithm}")

# Print public key size
public_key_size = public_key.key_size
print(f"Public Key Size: {public_key_size} bits")


Public Key Algorithm: _RSAPublicKey
Public Key Size: 2048 bits


In [39]:
!pip3 install tldextract

Collecting tldextract
  Downloading tldextract-5.1.1-py3-none-any.whl.metadata (11 kB)
Collecting idna (from tldextract)
  Using cached idna-3.6-py3-none-any.whl.metadata (9.9 kB)
Collecting requests>=2.1.0 (from tldextract)
  Using cached requests-2.31.0-py3-none-any.whl.metadata (4.6 kB)
Collecting requests-file>=1.4 (from tldextract)
  Downloading requests_file-1.5.1-py2.py3-none-any.whl (3.7 kB)
Collecting filelock>=3.0.8 (from tldextract)
  Downloading filelock-3.13.1-py3-none-any.whl.metadata (2.8 kB)
Collecting charset-normalizer<4,>=2 (from requests>=2.1.0->tldextract)
  Using cached charset_normalizer-3.3.2-cp311-cp311-macosx_11_0_arm64.whl.metadata (33 kB)
Collecting urllib3<3,>=1.21.1 (from requests>=2.1.0->tldextract)
  Using cached urllib3-2.1.0-py3-none-any.whl.metadata (6.4 kB)
Collecting certifi>=2017.4.17 (from requests>=2.1.0->tldextract)
  Using cached certifi-2023.11.17-py3-none-any.whl.metadata (2.2 kB)
Downloading tldextract-5.1.1-py3-none-any.whl (97 kB)
[2K   

In [55]:
import tldextract
import string
from collections import Counter
import math
import re
import statistics

def is_suspicious_tld(tld):
    suspicious_tlds = ["xyz", "info", "online"]  # Add more suspicious TLDs as needed
    return tld in suspicious_tlds

def is_suspicious_keyword(domain):
    suspicious_keywords = ["phishing", "malware", "attack"]  # Add more suspicious keywords as needed
    return any(keyword in domain for keyword in suspicious_keywords)

def shannon_entropy(domain):
    char_count = Counter(domain)
    total_chars = len(domain)
    entropy = -sum((count / total_chars) * math.log2(count / total_chars) for count in char_count.values())
    return entropy

def analyze_domain(domain):
    extracted_info = tldextract.extract(domain)
    print(extracted_info)
    result = tldextract.extract(domain)
    full_domain = result.subdomain + '.' + result.domain + '.' + result.suffix
    print("Full Domain:", full_domain)

    # Features
    domain_length = len(result.domain)
#     label_count = len(extracted_info)
    tld = result.suffix
    suspicious_tld = is_suspicious_tld(tld)
    suspicious_keyword = is_suspicious_keyword(domain)
    entropy = shannon_entropy(domain)
    dash_count = domain.count("-")
    token_count = len(re.findall(r'\w+', domain))
    parts_count = len(domain.split('.'))
    
    # Handle subdomain attribute correctly
    subdomain_labels = extracted_info.subdomain.split('.') if extracted_info.subdomain else []
    subdomain_label_count = len(subdomain_labels)
    
#     has_tld_token = any(token == tld for token in extracted_info)
    contains_https = "https" in domain
    special_char_fraction = sum(1 for char in domain if char in string.punctuation) / len(domain)
    is_ip_address = bool(re.match(r"\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b", domain))
    is_idn = any(ord(char) > 127 for char in domain)
    vowel_fraction = sum(1 for char in domain if char.lower() in "aeiou") / len(domain)
    digit_fraction = sum(1 for char in domain if char.isdigit()) / len(domain)
    contains_www = "www." in domain
    has_digit_subdomain_label = any(char.isdigit() for char in extracted_info.subdomain)
    mean_subdomain_label_length = statistics.mean(len(label) for label in subdomain_labels) if subdomain_labels else 0
    contains_digits = any(char.isdigit() for char in domain)
    is_valid_tld = bool(tldextract.TLDExtract().tlds)
    has_single_char_subdomain = any(len(label) == 1 for label in subdomain_labels)
    char_diversity = len(set(domain)) / len(domain)
    alphabet_size = len(set(char.lower() for char in domain if char.isalpha()))
    underscore_fraction = domain.count("_") / len(domain)

    # Add more features as needed
    
    # Print or return the results
    print("Domain Length:", domain_length)
#     print("Label Count:", label_count)
    print("TLD:", tld)
    print("Is Suspicious TLD:", suspicious_tld)
    print("Contains Suspicious Keyword:", suspicious_keyword)
    print("Shannon Entropy:", entropy)
    print("Dash Count:", dash_count)
    print("Token Count:", token_count)
    print("Parts Count:", parts_count)
    print("Subdomain Label Count:", subdomain_label_count)
#     print("Has TLD Token:", has_tld_token)
    print("Contains 'https':", contains_https)
    print("Special Character Fraction:", special_char_fraction)
    print("Is IP Address:", is_ip_address)
    print("Is IDN:", is_idn)
    print("Vowel Fraction:", vowel_fraction)
    print("Digit Fraction:", digit_fraction)
    print("Contains 'www.':", contains_www)
    print("Has Digit Subdomain Label:", has_digit_subdomain_label)
    print("Mean Subdomain Label Length:", mean_subdomain_label_length)
    print("Contains Digits:", contains_digits)
    print("Is Valid TLD:", is_valid_tld)
    print("Has Single-Char Subdomain:", has_single_char_subdomain)
    print("Character Diversity:", char_diversity)
    print("Alphabet Size:", alphabet_size)
    print("Underscore Fraction:", underscore_fraction)

# Example usage
domain = "youtube.com"
analyze_domain(domain)


ExtractResult(subdomain='', domain='youtube', suffix='com', is_private=False)
Full Domain: .youtube.com
Domain Length: 7
TLD: com
Is Suspicious TLD: False
Contains Suspicious Keyword: False
Shannon Entropy: 3.0957952550009344
Dash Count: 0
Token Count: 2
Parts Count: 2
Subdomain Label Count: 0
Contains 'https': False
Special Character Fraction: 0.09090909090909091
Is IP Address: False
Is IDN: False
Vowel Fraction: 0.45454545454545453
Digit Fraction: 0.0
Contains 'www.': False
Has Digit Subdomain Label: False
Mean Subdomain Label Length: 0
Contains Digits: False
Is Valid TLD: True
Has Single-Char Subdomain: False
Character Diversity: 0.8181818181818182
Alphabet Size: 8
Underscore Fraction: 0.0
