In [1]:
 # dga_feature_extractor.py
import pandas as pd
import numpy as np
import re
import math
import zlib
import string
from collections import Counter
from nltk.corpus import words
from nltk import download
from difflib import SequenceMatcher
from scipy.stats import entropy

# Download dictionary
download('words')
english_words = set(w.lower() for w in words.words())

VOWELS = set("aeiou")
CONSONANTS = set(string.ascii_lowercase) - VOWELS
POPULAR_TLDS = {".com", ".org", ".net", ".info", ".edu", ".gov"}
BAD_TLDS = {".xyz", ".top", ".club", ".work", ".click"}

# Example benign distribution (uniform)
BENIGN_DIST = np.ones(26) / 26

def shannon_entropy(s):
    p, _ = np.histogram(list(s), bins=range(257), density=True)
    p = p[p > 0]
    return -np.sum(p * np.log2(p))

def normalized_entropy(s):
    if not s:
        return 0
    return shannon_entropy(s) / math.log2(len(set(s)) + 1)

def compress_ratio(s):
    if not s:
        return 0
    return len(zlib.compress(s.encode())) / max(1, len(s))

def dict_word_features(domain):
    matches = [w for w in english_words if w in domain]
    longest = max((len(w) for w in matches), default=0)
    return len(matches), longest, longest / max(1, len(domain))

def sliding_dict_ratio(domain, window=4):
    n = len(domain)
    if n < window:
        return 0
    matches = 0
    for i in range(n - window + 1):
        if domain[i:i+window] in english_words:
            matches += 1
    return matches / (n - window + 1)

def vowel_consonant_alt(domain):
    domain = re.sub(r'[^a-z]', '', domain.lower())
    count = 0
    for i in range(1, len(domain)):
        if (domain[i] in VOWELS and domain[i-1] in CONSONANTS) or (domain[i] in CONSONANTS and domain[i-1] in VOWELS):
            count += 1
    return count

def char_gini(s):
    counts = Counter(s)
    N = sum(counts.values())
    if N == 0:
        return 0
    gini = 1 - sum((c/N)**2 for c in counts.values())
    return gini

def kl_divergence(s):
    counts = Counter([c for c in s if c.isalpha()])
    N = sum(counts.values())
    if N == 0:
        return 0
    dist = np.array([counts.get(chr(97+i),0)/N for i in range(26)])
    return entropy(dist, BENIGN_DIST)

def markov_chain_likelihood(domain):
    if len(domain) < 2:
        return 0
    transitions = {}
    for i in range(len(domain)-1):
        pair = (domain[i], domain[i+1])
        transitions[pair] = transitions.get(pair, 0) + 1
    total = sum(transitions.values())
    return sum(math.log((c/total)+1e-6) for c in transitions.values())

def autocorrelation_score(domain):
    if len(domain) < 2:
        return 0
    values = [ord(c) for c in domain]
    mean = np.mean(values)
    var = np.var(values)
    corr = sum((values[i]-mean)*(values[i+1]-mean) for i in range(len(values)-1)) / (var*(len(values)-1)+1e-6)
    return corr

def hyphen_word_match_ratio(domain):
    """Compute ratio of hyphen-separated parts that are valid dictionary words."""
    parts = domain.split("-")
    if not parts:
        return 0.0
    dict_matches = sum(1 for p in parts if p in english_words)
    return dict_matches / len(parts)

def renyi_entropy(s, alpha=2):
    """Rényi entropy of order alpha (default α=2)."""
    if not s:
        return 0
    counts = Counter(s)
    probs = np.array(list(counts.values())) / len(s)
    if alpha == 1:
        return -np.sum(probs * np.log2(probs))  # Shannon
    return 1 / (1 - alpha) * np.log2(np.sum(probs ** alpha))

# Load top domains (free list like Tranco or Cisco Umbrella top domains)
try:
    with open("top1k_domains.txt") as f:
        popular_domains = [line.strip().lower() for line in f.readlines()]
except FileNotFoundError:
    popular_domains = ["google.com","facebook.com","youtube.com","amazon.com","wikipedia.org"]  # fallback demo list

def min_levenshtein_to_popular(domain, topN=500):
    """Compute minimum normalized Levenshtein distance to topN popular domains."""
    domain = domain.lower()
    min_dist = 1.0
    for pd in popular_domains[:topN]:
        ratio = SequenceMatcher(None, domain, pd).ratio()
        dist = 1 - ratio  # 0 = identical, 1 = very different
        if dist < min_dist:
            min_dist = dist
    return min_dist

# Keyboard adjacency mapping
KEYBOARD_NEIGHBORS = {
    'q': "was", 'w': "qase", 'e': "wsdr", 'r': "edft", 't': "rfgy",
    'y': "tghu", 'u': "yhj", 'i': "ujk", 'o': "ikl", 'p': "ol",
    'a': "qwsz", 's': "qwedxza", 'd': "erfcxs", 'f': "rtgvcd", 'g': "tyhbvf",
    'h': "yujnbg", 'j': "uikmnh", 'k': "iolmj", 'l': "opk",
    'z': "asx", 'x': "zsdc", 'c': "xdfv", 'v': "cfgb", 'b': "vghn",
    'n': "bhjm", 'm': "njk"
}

def keyboard_distance_score(domain):
    """Average keyboard adjacency match score between consecutive characters."""
    domain = re.sub(r'[^a-z]', '', domain.lower())
    if len(domain) < 2:
        return 0
    score = 0
    for i in range(len(domain)-1):
        if domain[i+1] in KEYBOARD_NEIGHBORS.get(domain[i], ""):
            score += 1
    return score / (len(domain)-1)

# === Feature extraction ===
def extract_features(domain):
    if not isinstance(domain, str):
        domain = str(domain)
    domain = domain.lower()
    name, _, tld = domain.rpartition(".")
    if not name:
        name = domain

    feats = {}

    feats["Length"] = len(domain)
    feats["Consonant_Count"] = sum(c in CONSONANTS for c in domain)
    feats["Unique_Chars"] = len(set(domain))
    feats["Max_Cons_Cluster"] = max((len(m.group()) for m in re.finditer(r'[bcdfghjklmnpqrstvwxyz]+', domain)), default=0)

    # Info-theoretic
    feats["Dist_STD"] = np.std([domain.count(c) for c in set(domain)])
    feats["Char_Gini"] = char_gini(domain)
    feats["Char_Freq_Deviation"] = np.std(list(Counter(domain).values()))
    feats["KL_divergence"] = kl_divergence(domain)
    feats["Compression_ratio"] = compress_ratio(domain)

    # Pronounceability
    feats["Pronounceability"] = sum(c in VOWELS for c in domain) / (sum(c in CONSONANTS for c in domain) + 1)

    # N-gram / LM
    feats["Bigram_Score"] = sum(1 for i in range(len(domain) - 1) if domain[i].isalpha() and domain[i + 1].isalpha())
    feats["Trigram_Score"] = sum(1 for i in range(len(domain) - 2) if domain[i].isalpha() and domain[i + 2].isalpha())
    feats["Markov_Chain_Likelihood"] = markov_chain_likelihood(domain)
    feats["Bigram_Likelihood"] = sum(1 for i in range(len(domain)-1) if domain[i].isalpha() and domain[i+1].isalpha())/max(1,len(domain))
    feats["Ngram_LM_Perplexity"] = math.exp(-markov_chain_likelihood(domain) / max(1, len(domain)))

    # Structural/pattern
    feats["Unique_Char_Ratio"] = len(set(domain)) / max(1, len(domain))
    feats["Norm_Char_Freq_Var"] = np.std(list(Counter(domain).values())) / max(1, len(domain))
    feats["Kolmogorov_Complexity"] = compress_ratio(domain)

    # Advanced Features
    feats["Renyi_Entropy"] = renyi_entropy(domain, alpha=2)
    feats["Min_Levenshtein_to_Popular"] = min_levenshtein_to_popular(domain)
    feats["Keyboard_Distance_Score"] = keyboard_distance_score(domain)
    feats["Sliding_Word_Ratio"] = sliding_dict_ratio(domain)

    return feats

# === Run on a dataset ===
df = pd.read_csv("dga_version5.csv")  # must have "domain" column
features = df["domain"].apply(extract_features)
feat_df = pd.DataFrame(list(features))
out = pd.concat([df, feat_df], axis=1)
out.to_csv("f_dga_version1.csv", index=False)
print("✅ Features saved to dga_18_version1.csv")


[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


✅ Features saved to dga_18_version1.csv


In [2]:
from google.colab import files
df=pd.read_csv("f_dga_version1.csv")
df.describe()

Unnamed: 0,label,Length,Consonant_Count,Unique_Chars,Max_Cons_Cluster,Dist_STD,Char_Gini,Char_Freq_Deviation,KL_divergence,Compression_ratio,...,Markov_Chain_Likelihood,Bigram_Likelihood,Ngram_LM_Perplexity,Unique_Char_Ratio,Norm_Char_Freq_Var,Kolmogorov_Complexity,Renyi_Entropy,Min_Levenshtein_to_Popular,Keyboard_Distance_Score,Sliding_Word_Ratio
count,694173.0,694173.0,694173.0,694173.0,694173.0,694173.0,694173.0,694173.0,694173.0,694173.0,...,694173.0,694173.0,694173.0,694173.0,694173.0,694173.0,694173.0,694173.0,694173.0,694173.0
mean,1.0,19.226092,12.200742,12.481366,4.576519,0.712628,0.897836,0.712628,0.94343,1.469141,...,-51.45595,0.819104,13.817479,0.691829,0.03688,1.469141,3.324323,0.618239,0.141644,0.055668
std,0.0,6.938378,4.305605,2.68991,2.643387,0.338331,0.023024,0.338331,0.231497,0.180496,...,24.607247,0.067656,4.933978,0.143928,0.012843,0.180496,0.30448,0.083928,0.094402,0.082787
min,1.0,8.0,2.0,4.0,1.0,0.0,0.641975,0.0,0.26034,0.885714,...,-138.520733,0.15625,2.875434,0.305556,0.0,0.885714,1.481869,0.142857,0.0,0.0
25%,1.0,14.0,9.0,10.0,3.0,0.421325,0.887574,0.421325,0.769302,1.32,...,-68.735314,0.769231,9.912027,0.566667,0.029215,1.32,3.152952,0.567568,0.071429,0.0
50%,1.0,18.0,12.0,12.0,4.0,0.655555,0.901308,0.655555,0.919725,1.444444,...,-48.164338,0.823529,12.802302,0.6875,0.036666,1.444444,3.340923,0.615385,0.133333,0.0
75%,1.0,25.0,15.0,14.0,6.0,0.942809,0.914062,0.942809,1.094141,1.571429,...,-33.344173,0.88,16.398649,0.8125,0.042855,1.571429,3.540568,0.666667,0.212121,0.117647
max,1.0,40.0,27.0,26.0,16.0,2.661453,0.956633,2.661453,2.302397,2.0,...,-10.982275,0.925,34.614674,1.0,0.177778,2.0,4.527247,0.913043,0.857143,0.625


In [3]:
"Length","Consonant_Count","Unique_Chars","Max_Cons_Cluster","Dist_STD","Char_Gini","Char_Freq_Deviation","KL_divergence","Compression_ratio",
"Pronounceability","Bigram_Score","Trigram_Score","Markov_Chain_Likelihood","Bigram_Likelihood","Ngram_LM_Perplexity","Unique_Char_Ratio",
"Norm_Char_Freq_Var","Kolmogorov_Complexity","Renyi_Entropy","Min_Levenshtein_to_Popular","Keyboard_Distance_Score","Sliding_Word_Ratio"

('Norm_Char_Freq_Var',
 'Kolmogorov_Complexity',
 'Renyi_Entropy',
 'Min_Levenshtein_to_Popular',
 'Keyboard_Distance_Score',
 'Sliding_Word_Ratio')

In [4]:
import pandas as pd

def reorder_columns(input_csv, output_csv, desired_order):
    """
    Reorder the columns of a CSV file based on user-defined order.

    Parameters:
    - input_csv (str): Path to input CSV file.
    - output_csv (str): Path to output CSV file with reordered columns.
    - desired_order (list): List of column names in the desired order.
    """
    # Load dataset
    df = pd.read_csv(input_csv)

    # Check which desired columns exist
    available_columns = [col for col in desired_order if col in df.columns]

    # Add missing columns (if any were not in df)
    missing_columns = [col for col in desired_order if col not in df.columns]
    for col in missing_columns:
        df[col] = None  # Fill with None or default values

    # Reorder
    df = df[available_columns + [col for col in df.columns if col not in available_columns]]

    # Save output
    df.to_csv(output_csv, index=False)
    print(f"✅ Reordered CSV saved as {output_csv}")


# =====================
# Example usage
# =====================

# Suppose your dataset has 58 features + "domain" + "label"
input_csv = "f_dga_version1.csv"
output_csv = "f_dga_version2.csv"

# User-defined column order (just an example)
desired_order = [





"domain","Length","Consonant_Count","Unique_Chars","Max_Cons_Cluster","Dist_STD","Char_Gini","Char_Freq_Deviation","KL_divergence","Compression_ratio",
"Pronounceability","Bigram_Score","Trigram_Score","Markov_Chain_Likelihood","Bigram_Likelihood","Ngram_LM_Perplexity","Unique_Char_Ratio",
"Norm_Char_Freq_Var","Kolmogorov_Complexity","Renyi_Entropy","Min_Levenshtein_to_Popular","Keyboard_Distance_Score","Sliding_Word_Ratio","label"

]

# Reorder dataset
reorder_columns(input_csv, output_csv, desired_order)


✅ Reordered CSV saved as f_dga_version2.csv


In [5]:
from google.colab import files
df=pd.read_csv("f_dga_version2.csv")
df.describe()

Unnamed: 0,Length,Consonant_Count,Unique_Chars,Max_Cons_Cluster,Dist_STD,Char_Gini,Char_Freq_Deviation,KL_divergence,Compression_ratio,Pronounceability,...,Bigram_Likelihood,Ngram_LM_Perplexity,Unique_Char_Ratio,Norm_Char_Freq_Var,Kolmogorov_Complexity,Renyi_Entropy,Min_Levenshtein_to_Popular,Keyboard_Distance_Score,Sliding_Word_Ratio,label
count,694173.0,694173.0,694173.0,694173.0,694173.0,694173.0,694173.0,694173.0,694173.0,694173.0,...,694173.0,694173.0,694173.0,694173.0,694173.0,694173.0,694173.0,694173.0,694173.0,694173.0
mean,19.226092,12.200742,12.481366,4.576519,0.712628,0.897836,0.712628,0.94343,1.469141,0.466916,...,0.819104,13.817479,0.691829,0.03688,1.469141,3.324323,0.618239,0.141644,0.055668,1.0
std,6.938378,4.305605,2.68991,2.643387,0.338331,0.023024,0.338331,0.231497,0.180496,0.246663,...,0.067656,4.933978,0.143928,0.012843,0.180496,0.30448,0.083928,0.094402,0.082787,0.0
min,8.0,2.0,4.0,1.0,0.0,0.641975,0.0,0.26034,0.885714,0.0,...,0.15625,2.875434,0.305556,0.0,0.885714,1.481869,0.142857,0.0,0.0,1.0
25%,14.0,9.0,10.0,3.0,0.421325,0.887574,0.421325,0.769302,1.32,0.25,...,0.769231,9.912027,0.566667,0.029215,1.32,3.152952,0.567568,0.071429,0.0,1.0
50%,18.0,12.0,12.0,4.0,0.655555,0.901308,0.655555,0.919725,1.444444,0.47619,...,0.823529,12.802302,0.6875,0.036666,1.444444,3.340923,0.615385,0.133333,0.0,1.0
75%,25.0,15.0,14.0,6.0,0.942809,0.914062,0.942809,1.094141,1.571429,0.608696,...,0.88,16.398649,0.8125,0.042855,1.571429,3.540568,0.666667,0.212121,0.117647,1.0
max,40.0,27.0,26.0,16.0,2.661453,0.956633,2.661453,2.302397,2.0,4.0,...,0.925,34.614674,1.0,0.177778,2.0,4.527247,0.913043,0.857143,0.625,1.0


In [6]:
import pandas as pd

# Load dataset (replace with your file path)
file_path = "f_dga_version2.csv"


# Read dataset
df = pd.read_csv(file_path)

# Count number of rows
row_count = len(df)

print(f"Number of rows in the dataset: {row_count}")


Number of rows in the dataset: 694173


In [7]:
import pandas as pd

def filter_outliers(df, feature_cols, method="iqr", z_thresh=3, save_path=None):
    """
    Filters datapoints outside the lower/upper bound for each feature
    and optionally saves the datapoints that are within the bounds.

    Parameters:
    -----------
    df : pd.DataFrame
        Input dataset with features.
    feature_cols : list
        List of feature columns to check.
    method : str
        "iqr" (default) -> Interquartile Range method
        "zscore" -> Standard deviation based
    z_thresh : int
        Threshold for zscore method
    save_path : str or None
        If provided, saves the filtered dataset to this CSV path.

    Returns:
    --------
    pd.DataFrame : Filtered dataset (within bounds)
    pd.DataFrame : Bounds for each feature
    """
    bounds = {}
    df_filtered = df.copy()

    for col in feature_cols:
        if col not in df.columns:
            continue  # skip missing features

        series = df[col].dropna()

        if method == "iqr":
            Q1 = series.quantile(0.25)
            Q3 = series.quantile(0.75)
            IQR = Q3 - Q1
            lower = Q1 - 1.5 * IQR
            upper = Q3 + 1.5 * IQR

        elif method == "zscore":
            mean = series.mean()
            std = series.std()
            lower = mean - z_thresh * std
            upper = mean + z_thresh * std

        else:
            raise ValueError("Method must be 'iqr' or 'zscore'")

        bounds[col] = (lower, upper)

        # keep only rows within bounds
        df_filtered = df_filtered[(df_filtered[col] >= lower) & (df_filtered[col] <= upper)]

    bounds_df = pd.DataFrame(bounds, index=["Lower_Bound", "Upper_Bound"]).T

    # Save filtered dataset if save_path provided
    if save_path:
        df_filtered.to_csv(save_path, index=False)
        print(f"✅ Filtered dataset (within bounds) saved to {save_path}")

    return df_filtered.reset_index(drop=True), bounds_df


# ==== Example Usage ====
all_features = [


"Length","Consonant_Count","Unique_Chars","Max_Cons_Cluster","Dist_STD","Char_Gini","Char_Freq_Deviation","KL_divergence","Compression_ratio",
"Pronounceability","Bigram_Score","Trigram_Score","Markov_Chain_Likelihood","Bigram_Likelihood","Ngram_LM_Perplexity","Unique_Char_Ratio",
"Norm_Char_Freq_Var","Kolmogorov_Complexity","Renyi_Entropy","Min_Levenshtein_to_Popular","Keyboard_Distance_Score","Sliding_Word_Ratio"





]

df = pd.read_csv("f_dga_version2.csv")
# Remove duplicate columns (keep first occurrence)
df = df.loc[:, ~df.columns.duplicated()]

# Save filtered data to CSV
df_filtered, bounds = filter_outliers(
    df, all_features, method="iqr", save_path="f_dga_version3.csv"
)

print(bounds)


✅ Filtered dataset (within bounds) saved to f_dga_version3.csv
                            Lower_Bound  Upper_Bound
Length                        -2.500000    41.500000
Consonant_Count                0.000000    24.000000
Unique_Chars                   4.000000    20.000000
Max_Cons_Cluster              -1.500000    10.500000
Dist_STD                      -0.360901     1.725035
Char_Gini                      0.847841     0.953795
Char_Freq_Deviation           -0.360901     1.725035
KL_divergence                  0.282044     1.581399
Compression_ratio              0.942857     1.948571
Pronounceability              -0.288043     1.146739
Bigram_Score                  -5.500000    38.500000
Trigram_Score                 -6.500000    37.500000
Markov_Chain_Likelihood     -121.822026    19.742540
Bigram_Likelihood              0.603077     1.046154
Ngram_LM_Perplexity            0.182094    26.128583
Unique_Char_Ratio              0.197917     1.181250
Norm_Char_Freq_Var             0.008

In [8]:
from google.colab import files
df=pd.read_csv("f_dga_version3.csv")
df.describe()

Unnamed: 0,Length,Consonant_Count,Unique_Chars,Max_Cons_Cluster,Dist_STD,Char_Gini,Char_Freq_Deviation,KL_divergence,Compression_ratio,Pronounceability,...,Bigram_Likelihood,Ngram_LM_Perplexity,Unique_Char_Ratio,Norm_Char_Freq_Var,Kolmogorov_Complexity,Renyi_Entropy,Min_Levenshtein_to_Popular,Keyboard_Distance_Score,Sliding_Word_Ratio,label
count,572233.0,572233.0,572233.0,572233.0,572233.0,572233.0,572233.0,572233.0,572233.0,572233.0,...,572233.0,572233.0,572233.0,572233.0,572233.0,572233.0,572233.0,572233.0,572233.0,572233.0
mean,19.848646,12.490755,12.75215,4.33089,0.725677,0.900989,0.725677,0.91504,1.448823,0.476259,...,0.827141,14.232752,0.682994,0.03626,1.448823,3.359791,0.614898,0.139552,0.05909,1.0
std,6.775358,4.141531,2.508406,2.135666,0.317764,0.018044,0.317764,0.202885,0.167894,0.213786,...,0.061562,4.63402,0.139425,0.009227,0.167894,0.259978,0.077071,0.087475,0.081459,0.0
min,9.0,4.0,8.0,1.0,0.229061,0.85,0.229061,0.386876,0.947368,0.0,...,0.607143,3.518005,0.305556,0.012056,0.947368,2.736966,0.419355,0.0,0.0,1.0
25%,16.0,10.0,11.0,3.0,0.433013,0.89,0.433013,0.752807,1.307692,0.307692,...,0.8,10.823575,0.56,0.029463,1.307692,3.184425,0.567568,0.071429,0.0,1.0
50%,19.0,12.0,13.0,4.0,0.655555,0.903226,0.655555,0.89736,1.421053,0.5,...,0.842105,13.671606,0.684211,0.036289,1.421053,3.369234,0.615385,0.133333,0.0,1.0
75%,26.0,15.0,15.0,6.0,0.942809,0.914127,0.942809,1.070826,1.5,0.611111,...,0.884615,16.398649,0.8125,0.041667,1.5,3.541659,0.666667,0.2,0.125,1.0
max,39.0,24.0,20.0,10.0,1.724879,0.942222,1.724879,1.39611,1.888889,1.142857,...,0.923077,25.93164,0.947368,0.063311,1.888889,4.113341,0.8125,0.423077,0.294118,1.0


In [9]:
# dga_feature_extractor.py
import pandas as pd
import numpy as np
import re
import math
import zlib
import string
from collections import Counter
from nltk.corpus import words
from nltk import download
from difflib import SequenceMatcher
from scipy.stats import entropy

# Download dictionary
download('words')
english_words = set(w.lower() for w in words.words())

VOWELS = set("aeiou")
CONSONANTS = set(string.ascii_lowercase) - VOWELS
POPULAR_TLDS = {".com", ".org", ".net", ".info", ".edu", ".gov"}
BAD_TLDS = {".xyz", ".top", ".club", ".work", ".click"}

# Example benign distribution (uniform)
BENIGN_DIST = np.ones(26) / 26

def shannon_entropy(s):
    p, _ = np.histogram(list(s), bins=range(257), density=True)
    p = p[p > 0]
    return -np.sum(p * np.log2(p))

def normalized_entropy(s):
    if not s:
        return 0
    return shannon_entropy(s) / math.log2(len(set(s)) + 1)

def compress_ratio(s):
    if not s:
        return 0
    return len(zlib.compress(s.encode())) / max(1, len(s))

def dict_word_features(domain):
    matches = [w for w in english_words if w in domain]
    longest = max((len(w) for w in matches), default=0)
    return len(matches), longest, longest / max(1, len(domain))

def sliding_dict_ratio(domain, window=4):
    n = len(domain)
    if n < window:
        return 0
    matches = 0
    for i in range(n - window + 1):
        if domain[i:i+window] in english_words:
            matches += 1
    return matches / (n - window + 1)

def vowel_consonant_alt(domain):
    domain = re.sub(r'[^a-z]', '', domain.lower())
    count = 0
    for i in range(1, len(domain)):
        if (domain[i] in VOWELS and domain[i-1] in CONSONANTS) or (domain[i] in CONSONANTS and domain[i-1] in VOWELS):
            count += 1
    return count

def char_gini(s):
    counts = Counter(s)
    N = sum(counts.values())
    if N == 0:
        return 0
    gini = 1 - sum((c/N)**2 for c in counts.values())
    return gini

def kl_divergence(s):
    counts = Counter([c for c in s if c.isalpha()])
    N = sum(counts.values())
    if N == 0:
        return 0
    dist = np.array([counts.get(chr(97+i),0)/N for i in range(26)])
    return entropy(dist, BENIGN_DIST)

def markov_chain_likelihood(domain):
    if len(domain) < 2:
        return 0
    transitions = {}
    for i in range(len(domain)-1):
        pair = (domain[i], domain[i+1])
        transitions[pair] = transitions.get(pair, 0) + 1
    total = sum(transitions.values())
    return sum(math.log((c/total)+1e-6) for c in transitions.values())

def autocorrelation_score(domain):
    if len(domain) < 2:
        return 0
    values = [ord(c) for c in domain]
    mean = np.mean(values)
    var = np.var(values)
    corr = sum((values[i]-mean)*(values[i+1]-mean) for i in range(len(values)-1)) / (var*(len(values)-1)+1e-6)
    return corr

def hyphen_word_match_ratio(domain):
    """Compute ratio of hyphen-separated parts that are valid dictionary words."""
    parts = domain.split("-")
    if not parts:
        return 0.0
    dict_matches = sum(1 for p in parts if p in english_words)
    return dict_matches / len(parts)

def renyi_entropy(s, alpha=2):
    """Rényi entropy of order alpha (default α=2)."""
    if not s:
        return 0
    counts = Counter(s)
    probs = np.array(list(counts.values())) / len(s)
    if alpha == 1:
        return -np.sum(probs * np.log2(probs))  # Shannon
    return 1 / (1 - alpha) * np.log2(np.sum(probs ** alpha))

# Load top domains (free list like Tranco or Cisco Umbrella top domains)
try:
    with open("top1k_domains.txt") as f:
        popular_domains = [line.strip().lower() for line in f.readlines()]
except FileNotFoundError:
    popular_domains = ["google.com","facebook.com","youtube.com","amazon.com","wikipedia.org"]  # fallback demo list

def min_levenshtein_to_popular(domain, topN=500):
    """Compute minimum normalized Levenshtein distance to topN popular domains."""
    domain = domain.lower()
    min_dist = 1.0
    for pd in popular_domains[:topN]:
        ratio = SequenceMatcher(None, domain, pd).ratio()
        dist = 1 - ratio  # 0 = identical, 1 = very different
        if dist < min_dist:
            min_dist = dist
    return min_dist

# Keyboard adjacency mapping
KEYBOARD_NEIGHBORS = {
    'q': "was", 'w': "qase", 'e': "wsdr", 'r': "edft", 't': "rfgy",
    'y': "tghu", 'u': "yhj", 'i': "ujk", 'o': "ikl", 'p': "ol",
    'a': "qwsz", 's': "qwedxza", 'd': "erfcxs", 'f': "rtgvcd", 'g': "tyhbvf",
    'h': "yujnbg", 'j': "uikmnh", 'k': "iolmj", 'l': "opk",
    'z': "asx", 'x': "zsdc", 'c': "xdfv", 'v': "cfgb", 'b': "vghn",
    'n': "bhjm", 'm': "njk"
}

def keyboard_distance_score(domain):
    """Average keyboard adjacency match score between consecutive characters."""
    domain = re.sub(r'[^a-z]', '', domain.lower())
    if len(domain) < 2:
        return 0
    score = 0
    for i in range(len(domain)-1):
        if domain[i+1] in KEYBOARD_NEIGHBORS.get(domain[i], ""):
            score += 1
    return score / (len(domain)-1)

# === Feature extraction ===
def extract_features(domain):
    if not isinstance(domain, str):
        domain = str(domain)
    domain = domain.lower()
    name, _, tld = domain.rpartition(".")
    if not name:
        name = domain

    feats = {}

    feats["Length"] = len(domain)
    feats["Consonant_Count"] = sum(c in CONSONANTS for c in domain)
    feats["Unique_Chars"] = len(set(domain))
    feats["Max_Cons_Cluster"] = max((len(m.group()) for m in re.finditer(r'[bcdfghjklmnpqrstvwxyz]+', domain)), default=0)

    # Info-theoretic
    feats["Dist_STD"] = np.std([domain.count(c) for c in set(domain)])
    feats["Char_Gini"] = char_gini(domain)
    feats["Char_Freq_Deviation"] = np.std(list(Counter(domain).values()))
    feats["KL_divergence"] = kl_divergence(domain)
    feats["Compression_ratio"] = compress_ratio(domain)

    # Pronounceability
    feats["Pronounceability"] = sum(c in VOWELS for c in domain) / (sum(c in CONSONANTS for c in domain) + 1)

    # N-gram / LM
    feats["Bigram_Score"] = sum(1 for i in range(len(domain) - 1) if domain[i].isalpha() and domain[i + 1].isalpha())
    feats["Trigram_Score"] = sum(1 for i in range(len(domain) - 2) if domain[i].isalpha() and domain[i + 2].isalpha())
    feats["Markov_Chain_Likelihood"] = markov_chain_likelihood(domain)
    feats["Bigram_Likelihood"] = sum(1 for i in range(len(domain)-1) if domain[i].isalpha() and domain[i+1].isalpha())/max(1,len(domain))
    feats["Ngram_LM_Perplexity"] = math.exp(-markov_chain_likelihood(domain) / max(1, len(domain)))

    # Structural/pattern
    feats["Unique_Char_Ratio"] = len(set(domain)) / max(1, len(domain))
    feats["Norm_Char_Freq_Var"] = np.std(list(Counter(domain).values())) / max(1, len(domain))
    feats["Kolmogorov_Complexity"] = compress_ratio(domain)

    # Advanced Features
    feats["Renyi_Entropy"] = renyi_entropy(domain, alpha=2)
    feats["Min_Levenshtein_to_Popular"] = min_levenshtein_to_popular(domain)
    feats["Keyboard_Distance_Score"] = keyboard_distance_score(domain)
    feats["Sliding_Word_Ratio"] = sliding_dict_ratio(domain)

    return feats

# === Run on a dataset ===
df = pd.read_csv("ndga_version2.csv")  # must have "domain" column
features = df["domain"].apply(extract_features)
feat_df = pd.DataFrame(list(features))
out = pd.concat([df, feat_df], axis=1)
out.to_csv("f_ndga_version1.csv", index=False)
print("✅ Features saved to dga_18_version1.csv")


[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


✅ Features saved to dga_18_version1.csv


In [10]:

from google.colab import files

df=pd.read_csv("f_ndga_version1.csv")

df.describe()

Unnamed: 0,label,Length,Consonant_Count,Unique_Chars,Max_Cons_Cluster,Dist_STD,Char_Gini,Char_Freq_Deviation,KL_divergence,Compression_ratio,...,Markov_Chain_Likelihood,Bigram_Likelihood,Ngram_LM_Perplexity,Unique_Char_Ratio,Norm_Char_Freq_Var,Kolmogorov_Complexity,Renyi_Entropy,Min_Levenshtein_to_Popular,Keyboard_Distance_Score,Sliding_Word_Ratio
count,1000018.0,1000018.0,1000018.0,1000018.0,1000018.0,1000018.0,1000018.0,1000018.0,1000018.0,1000018.0,...,1000018.0,1000018.0,1000018.0,1000018.0,1000018.0,1000018.0,1000018.0,1000018.0,1000018.0,1000018.0
mean,0.0,13.52508,7.716392,10.27084,2.559998,0.4981125,0.8812457,0.4981125,1.171982,1.648859,...,-31.40455,0.7059835,9.797679,0.784204,0.03664792,1.648859,3.11732,0.5676884,0.1289533,0.1016527
std,0.0,4.305555,2.782675,2.394362,1.070682,0.258246,0.03121526,0.258246,0.2879947,0.2116489,...,14.74233,0.1251146,3.472379,0.1191128,0.01793356,0.2116489,0.3453143,0.122098,0.1136739,0.09450895
min,0.0,2.0,0.0,2.0,0.0,0.0,0.04253308,0.0,0.01025937,0.326087,...,-279.6816,0.04761905,0.9999995,0.04347826,0.0,0.326087,0.06270545,0.0,0.0,0.0
25%,0.0,10.0,6.0,9.0,2.0,0.3499271,0.8641975,0.3499271,0.9703233,1.5,...,-40.62053,0.6666667,7.224616,0.7,0.02886751,1.5,2.880418,0.4782609,0.0,0.0
50%,0.0,13.0,7.0,10.0,2.0,0.4615385,0.8888889,0.4615385,1.119763,1.615385,...,-29.81874,0.7272727,9.007527,0.7857143,0.03586096,1.615385,3.169925,0.5555556,0.1111111,0.1
75%,0.0,16.0,9.0,12.0,3.0,0.6555548,0.9027778,0.6555548,1.351942,1.8,...,-19.77494,0.7857143,11.74124,0.875,0.0440411,1.8,3.36257,0.6666667,0.2,0.1666667
max,0.0,75.0,45.0,37.0,43.0,22.0,0.970372,22.0,3.258097,5.0,...,9.999995e-07,0.9552239,61.99524,1.0,0.4782609,5.0,5.076896,0.9333333,1.0,0.6666667


In [11]:
import pandas as pd

def reorder_columns(input_csv, output_csv, desired_order):
    """
    Reorder the columns of a CSV file based on user-defined order.

    Parameters:
    - input_csv (str): Path to input CSV file.
    - output_csv (str): Path to output CSV file with reordered columns.
    - desired_order (list): List of column names in the desired order.
    """
    # Load dataset
    df = pd.read_csv(input_csv)

    # Check which desired columns exist
    available_columns = [col for col in desired_order if col in df.columns]

    # Add missing columns (if any were not in df)
    missing_columns = [col for col in desired_order if col not in df.columns]
    for col in missing_columns:
        df[col] = None  # Fill with None or default values

    # Reorder
    df = df[available_columns + [col for col in df.columns if col not in available_columns]]

    # Save output
    df.to_csv(output_csv, index=False)
    print(f"✅ Reordered CSV saved as {output_csv}")


# =====================
# Example usage
# =====================

# Suppose your dataset has 58 features + "domain" + "label"
input_csv = "f_ndga_version1.csv"
output_csv = "f_ndga_version2.csv"

# User-defined column order (just an example)
desired_order = [





"domain","Length","Consonant_Count","Unique_Chars","Max_Cons_Cluster","Dist_STD","Char_Gini","Char_Freq_Deviation",
"KL_divergence","Compression_ratio","Pronounceability","Bigram_Score","Trigram_Score","Markov_Chain_Likelihood",
"Bigram_Likelihood","Ngram_LM_Perplexity","Unique_Char_Ratio","Norm_Char_Freq_Var","Kolmogorov_Complexity",
 "Renyi_Entropy","Min_Levenshtein_to_Popular","Keyboard_Distance_Score","Sliding_Word_Ratio","label"

]

# Reorder dataset
reorder_columns(input_csv, output_csv, desired_order)


✅ Reordered CSV saved as f_ndga_version2.csv


In [12]:
import pandas as pd

# Load dataset (replace with your file path)
file_path = "f_ndga_version2.csv"


# Read dataset
df = pd.read_csv(file_path)

# Count number of rows
row_count = len(df)

print(f"Number of rows in the dataset: {row_count}")


Number of rows in the dataset: 1000018


In [13]:
from google.colab import files
df=pd.read_csv("f_ndga_version2.csv")
df.describe()

Unnamed: 0,Length,Consonant_Count,Unique_Chars,Max_Cons_Cluster,Dist_STD,Char_Gini,Char_Freq_Deviation,KL_divergence,Compression_ratio,Pronounceability,...,Bigram_Likelihood,Ngram_LM_Perplexity,Unique_Char_Ratio,Norm_Char_Freq_Var,Kolmogorov_Complexity,Renyi_Entropy,Min_Levenshtein_to_Popular,Keyboard_Distance_Score,Sliding_Word_Ratio,label
count,1000018.0,1000018.0,1000018.0,1000018.0,1000018.0,1000018.0,1000018.0,1000018.0,1000018.0,1000018.0,...,1000018.0,1000018.0,1000018.0,1000018.0,1000018.0,1000018.0,1000018.0,1000018.0,1000018.0,1000018.0
mean,13.52508,7.716392,10.27084,2.559998,0.4981125,0.8812457,0.4981125,1.171982,1.648859,0.5104943,...,0.7059835,9.797679,0.784204,0.03664792,1.648859,3.11732,0.5676884,0.1289533,0.1016527,0.0
std,4.305555,2.782675,2.394362,1.070682,0.258246,0.03121526,0.258246,0.2879947,0.2116489,0.2197559,...,0.1251146,3.472379,0.1191128,0.01793356,0.2116489,0.3453143,0.122098,0.1136739,0.09450895,0.0
min,2.0,0.0,2.0,0.0,0.0,0.04253308,0.0,0.01025937,0.326087,0.0,...,0.04761905,0.9999995,0.04347826,0.0,0.326087,0.06270545,0.0,0.0,0.0,0.0
25%,10.0,6.0,9.0,2.0,0.3499271,0.8641975,0.3499271,0.9703233,1.5,0.375,...,0.6666667,7.224616,0.7,0.02886751,1.5,2.880418,0.4782609,0.0,0.0,0.0
50%,13.0,7.0,10.0,2.0,0.4615385,0.8888889,0.4615385,1.119763,1.615385,0.5,...,0.7272727,9.007527,0.7857143,0.03586096,1.615385,3.169925,0.5555556,0.1111111,0.1,0.0
75%,16.0,9.0,12.0,3.0,0.6555548,0.9027778,0.6555548,1.351942,1.8,0.625,...,0.7857143,11.74124,0.875,0.0440411,1.8,3.36257,0.6666667,0.2,0.1666667,0.0
max,75.0,45.0,37.0,43.0,22.0,0.970372,22.0,3.258097,5.0,7.0,...,0.9552239,61.99524,1.0,0.4782609,5.0,5.076896,0.9333333,1.0,0.6666667,0.0


In [14]:
import pandas as pd

def filter_outliers(df, feature_cols, method="iqr", z_thresh=3, save_path=None):
    """
    Filters datapoints outside the lower/upper bound for each feature
    and optionally saves the datapoints that are within the bounds.

    Parameters:
    -----------
    df : pd.DataFrame
        Input dataset with features.
    feature_cols : list
        List of feature columns to check.
    method : str
        "iqr" (default) -> Interquartile Range method
        "zscore" -> Standard deviation based
    z_thresh : int
        Threshold for zscore method
    save_path : str or None
        If provided, saves the filtered dataset to this CSV path.

    Returns:
    --------
    pd.DataFrame : Filtered dataset (within bounds)
    pd.DataFrame : Bounds for each feature
    """
    bounds = {}
    df_filtered = df.copy()

    for col in feature_cols:
        if col not in df.columns:
            continue  # skip missing features

        series = df[col].dropna()

        if method == "iqr":
            Q1 = series.quantile(0.25)
            Q3 = series.quantile(0.75)
            IQR = Q3 - Q1
            lower = Q1 - 1.5 * IQR
            upper = Q3 + 1.5 * IQR

        elif method == "zscore":
            mean = series.mean()
            std = series.std()
            lower = mean - z_thresh * std
            upper = mean + z_thresh * std

        else:
            raise ValueError("Method must be 'iqr' or 'zscore'")

        bounds[col] = (lower, upper)

        # keep only rows within bounds
        df_filtered = df_filtered[(df_filtered[col] >= lower) & (df_filtered[col] <= upper)]

    bounds_df = pd.DataFrame(bounds, index=["Lower_Bound", "Upper_Bound"]).T

    # Save filtered dataset if save_path provided
    if save_path:
        df_filtered.to_csv(save_path, index=False)
        print(f"✅ Filtered dataset (within bounds) saved to {save_path}")

    return df_filtered.reset_index(drop=True), bounds_df


# ==== Example Usage ====
all_features = [


"Length","Consonant_Count","Unique_Chars","Max_Cons_Cluster","Dist_STD","Char_Gini","Char_Freq_Deviation","KL_divergence","Compression_ratio",
"Pronounceability","Bigram_Score","Trigram_Score","Markov_Chain_Likelihood","Bigram_Likelihood","Ngram_LM_Perplexity","Unique_Char_Ratio",
"Norm_Char_Freq_Var","Kolmogorov_Complexity","Renyi_Entropy","Min_Levenshtein_to_Popular","Keyboard_Distance_Score","Sliding_Word_Ratio"





]

df = pd.read_csv("f_ndga_version2.csv")
# Remove duplicate columns (keep first occurrence)
df = df.loc[:, ~df.columns.duplicated()]

# Save filtered data to CSV
df_filtered, bounds = filter_outliers(
    df, all_features, method="iqr", save_path="f_ndga_version3.csv"
)

print(bounds)


✅ Filtered dataset (within bounds) saved to f_ndga_version3.csv
                            Lower_Bound  Upper_Bound
Length                         1.000000    25.000000
Consonant_Count                1.500000    13.500000
Unique_Chars                   4.500000    16.500000
Max_Cons_Cluster               0.500000     4.500000
Dist_STD                      -0.108514     1.113996
Char_Gini                      0.806327     0.960648
Char_Freq_Deviation           -0.108514     1.113996
KL_divergence                  0.397896     1.924370
Compression_ratio              1.050000     2.250000
Pronounceability               0.000000     1.000000
Bigram_Score                  -0.500000    19.500000
Trigram_Score                 -1.500000    18.500000
Markov_Chain_Likelihood      -71.888910    11.493442
Bigram_Likelihood              0.488095     0.964286
Ngram_LM_Perplexity            0.449685    18.516167
Unique_Char_Ratio              0.437500     1.137500
Norm_Char_Freq_Var             0.00

In [15]:

import pandas as pd

def select_entries(file, num_entries, output_file="f_ndga_version4.csv"):
    """
    Select the user-specified number of entries (rows) from a dataset.

    Args:
        file (str): Path to the CSV file.

        num_entries (int): Number of rows to select.
        output_file (str): File to save the selected rows.
    """
    # Load dataset
    df = pd.read_csv(file)

    # Select first 'num_entries' rows
    selected_df = df.head(num_entries)

    # Save result
    selected_df.to_csv(output_file, index=False)


    return selected_df


# Example usage:
selected = select_entries("f_ndga_version3.csv", 572233 )   # get first 100 rows
print(selected)


                    domain  Length  Consonant_Count  Unique_Chars  \
0               0-0.online      10                3             8   
1           0-18klinik.com      14                6            12   
2           0-jayparts.com      14                8            13   
3             000betpk.com      12                6            10   
4           000nethost.com      14                7            10   
...                    ...     ...              ...           ...   
572228          sezane.com      10                5             9   
572229         sezar.space      11                6             8   
572230          sezeyx.com      10                6             9   
572231  seznam-autobusu.cz      18                9            13   
572232          seznam.com      10                6             9   

        Max_Cons_Cluster  Dist_STD  Char_Gini  Char_Freq_Deviation  \
0                      2  0.433013   0.860000             0.433013   
1                      2  0.372

In [16]:
from google.colab import files
df=pd.read_csv("f_ndga_version4.csv")
df.describe()

Unnamed: 0,Length,Consonant_Count,Unique_Chars,Max_Cons_Cluster,Dist_STD,Char_Gini,Char_Freq_Deviation,KL_divergence,Compression_ratio,Pronounceability,...,Bigram_Likelihood,Ngram_LM_Perplexity,Unique_Char_Ratio,Norm_Char_Freq_Var,Kolmogorov_Complexity,Renyi_Entropy,Min_Levenshtein_to_Popular,Keyboard_Distance_Score,Sliding_Word_Ratio,label
count,572233.0,572233.0,572233.0,572233.0,572233.0,572233.0,572233.0,572233.0,572233.0,572233.0,...,572233.0,572233.0,572233.0,572233.0,572233.0,572233.0,572233.0,572233.0,572233.0,572233.0
mean,13.801055,7.825887,10.45295,2.435265,0.519571,0.884966,0.519571,1.127726,1.613881,0.537828,...,0.731544,10.098747,0.77133,0.038124,1.613881,3.148503,0.556256,0.12787,0.112117,0.0
std,3.371609,2.159378,2.048881,0.781156,0.174773,0.023408,0.174773,0.219953,0.159463,0.174551,...,0.083852,2.856781,0.094892,0.010382,0.159463,0.285464,0.119785,0.104957,0.091494,0.0
min,7.0,3.0,6.0,1.0,0.242061,0.809917,0.242061,0.550046,1.05,0.0,...,0.5,2.626425,0.4375,0.014239,1.05,2.395301,0.2,0.0,0.0,0.0
25%,11.0,6.0,9.0,2.0,0.385695,0.875,0.385695,0.970323,1.5,0.428571,...,0.6875,8.111235,0.705882,0.030184,1.5,3.0,0.461538,0.055556,0.0,0.0
50%,13.0,8.0,10.0,2.0,0.471405,0.888889,0.471405,1.112255,1.615385,0.5,...,0.75,9.912027,0.777778,0.036747,1.615385,3.169925,0.545455,0.111111,0.111111,0.0
75%,16.0,9.0,12.0,3.0,0.642824,0.902778,0.642824,1.266587,1.727273,0.636364,...,0.8,11.741236,0.846154,0.044011,1.727273,3.36257,0.652174,0.2,0.166667,0.0
max,25.0,13.0,16.0,4.0,1.113404,0.934256,1.113404,1.906313,2.142857,1.0,...,0.863636,18.285621,0.941176,0.066221,2.142857,3.926998,0.913043,0.5,0.411765,0.0


In [17]:
import pandas as pd

def combine_datasets(file1, file2, output_file="combined.csv"):
    """
    Combine two datasets without altering datapoints.
    The header of the second dataset is removed automatically.
    """
    # Load first dataset normally (with header)
    df1 = pd.read_csv(file1)

    # Load second dataset as raw, then reassign columns from df1
    df2 = pd.read_csv(file2, header=None, skiprows=1)
    df2.columns = df1.columns  # assign the same header as df1

    # Concatenate without altering datapoints
    combined_df = pd.concat([df1, df2], ignore_index=True)

    # Save to CSV
    combined_df.to_csv(output_file, index=False)
    return combined_df



# Example usage:
combined = combine_datasets("f_dga_version3.csv", "f_ndga_version4.csv")
print(combined.head())


                 domain  Length  Consonant_Count  Unique_Chars  \
0   ofdhiydrrttpblp.com      19               15            14   
1    osvwkptpwqyiqen.ru      18               13            15   
2  wwcdhdhijsfsuyr.info      20               15            14   
3   kbbqiudkyyffmeq.com      19               14            13   
4   xxrdnsgxijevnij.net      19               14            12   

   Max_Cons_Cluster  Dist_STD  Char_Gini  Char_Freq_Deviation  KL_divergence  \
0                10  0.479157   0.919668             0.479157       0.752807   
1                10  0.400000   0.925926             0.400000       0.669523   
2                 7  0.494872   0.920000             0.494872       0.751435   
3                 7  0.498519   0.914127             0.498519       0.829823   
4                 8  0.759203   0.897507             0.759203       0.964978   

   Compression_ratio  ...  Bigram_Likelihood  Ngram_LM_Perplexity  \
0           1.421053  ...           0.842105         

In [18]:
!pip install xgboost
!pip install scikit-learn



In [None]:
# =========================
# DGA Detection Complete Script
# =========================

# -------------------------
# Imports
# -------------------------
import pandas as pd
import numpy as np
import re
import math
import zlib
import string
import joblib
from collections import Counter
from difflib import SequenceMatcher
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from nltk.corpus import words
from nltk import download
from scipy.stats import entropy

# -------------------------
# Download English words
# -------------------------
try:
    download('words')
    ENGLISH_WORDS = set(w.lower() for w in words.words())
except:
    ENGLISH_WORDS = {"test","shop","login","home","mail","secure"}  # fallback

# -------------------------
# Constants
# -------------------------
VOWELS = set("aeiou")
CONSONANTS = set(string.ascii_lowercase) - VOWELS
POPULAR_TLDS = {".com", ".org", ".net", ".info", ".edu", ".gov"}
BAD_TLDS = {".xyz", ".top", ".club", ".work", ".click"}
BENIGN_DIST = np.ones(26)/26  # uniform for KL divergence

# Top popular domains (fallback if file not found)
try:
    with open("top1k_domains.txt") as f:
        POPULAR_DOMAINS = [line.strip().lower() for line in f.readlines()]
except FileNotFoundError:
    POPULAR_DOMAINS = ["google.com","facebook.com","youtube.com","amazon.com","wikipedia.org"]

# -------------------------
# Feature Functions
# -------------------------
def shannon_entropy(s):
    if not s: return 0
    p, _ = np.histogram(list(s), bins=range(257), density=True)
    p = p[p>0]
    return -np.sum(p*np.log2(p))

def compress_ratio(s):
    if not s: return 0
    return len(zlib.compress(s.encode()))/max(1,len(s))

def renyi_entropy(s, alpha=2):
    if not s: return 0
    counts = Counter(s)
    probs = np.array(list(counts.values())) / len(s)
    if alpha == 1:
        return -np.sum(probs*np.log2(probs))
    return 1/(1-alpha) * np.log2(np.sum(probs**alpha))

def min_levenshtein_to_popular(domain, popular_list=None):
    if popular_list is None:
        popular_list = POPULAR_DOMAINS
    domain = domain.lower()
    min_dist = 1.0
    for pd in popular_list:
        ratio = SequenceMatcher(None, domain, pd).ratio()
        dist = 1 - ratio
        if dist < min_dist:
            min_dist = dist
    return min_dist

# Keyboard adjacency mapping
KEYBOARD_NEIGHBORS = {
    'q':"was",'w':"qase",'e':"wsdr",'r':"edft",'t':"rfgy",
    'y':"tghu",'u':"yhj",'i':"ujk",'o':"ikl",'p':"ol",
    'a':"qwsz",'s':"qwedxza",'d':"erfcxs",'f':"rtgvcd",'g':"tyhbvf",
    'h':"yujnbg",'j':"uikmnh",'k':"iolmj",'l':"opk",
    'z':"asx",'x':"zsdc",'c':"xdfv",'v':"cfgb",'b':"vghn",
    'n':"bhjm",'m':"njk"
}

def keyboard_distance_score(domain):
    domain = re.sub(r'[^a-z]','',domain.lower())
    if len(domain)<2: return 0
    score=0
    for i in range(len(domain)-1):
        if domain[i+1] in KEYBOARD_NEIGHBORS.get(domain[i],""):
            score+=1
    return score/(len(domain)-1)

# Word / lexical features
def dict_word_features(domain):
    matches = [w for w in ENGLISH_WORDS if w in domain]
    longest = max((len(w) for w in matches), default=0)
    return len(matches), longest, longest/max(1,len(domain))

def sliding_dict_ratio(domain, window=4):
    n=len(domain)
    if n<window: return 0
    matches=0
    for i in range(n-window+1):
        if domain[i:i+window] in ENGLISH_WORDS:
            matches+=1
    return matches/(n-window+1)

def max_cons_cluster(domain):
    return max((len(m.group()) for m in re.finditer(r"[bcdfghjklmnpqrstvwxyz]+", domain.lower())), default=0)

def consonant_count(domain):
    return sum(c in CONSONANTS for c in domain.lower())

def vowel_count(domain):
    return sum(c in VOWELS for c in domain.lower())

def pronouncability_score(domain):
    return vowel_count(domain)/ (consonant_count(domain)+1)

def bigram_score(domain):
    return sum(1 for i in range(len(domain)-1) if domain[i].isalpha() and domain[i+1].isalpha())

def trigram_score(domain):
    return sum(1 for i in range(len(domain)-2) if domain[i].isalpha() and domain[i+2].isalpha())

def char_gini(s):
    counts=Counter(s)
    N=sum(counts.values())
    if N==0: return 0
    return 1 - sum((c/N)**2 for c in counts.values())

def kl_divergence(s):
    counts=Counter([c for c in s if c.isalpha()])
    N=sum(counts.values())
    if N==0: return 0
    dist=np.array([counts.get(chr(97+i),0)/N for i in range(26)])
    return entropy(dist, BENIGN_DIST)

def markov_chain_likelihood(domain):
    if len(domain)<2: return 0
    transitions={}
    for i in range(len(domain)-1):
        pair=(domain[i],domain[i+1])
        transitions[pair]=transitions.get(pair,0)+1
    total=sum(transitions.values())
    return sum(math.log((c/total)+1e-6) for c in transitions.values())

def ngram_lm_perplexity(domain):
    likelihood = markov_chain_likelihood(domain)
    if len(domain)==0:
        return 0
    return math.exp(-likelihood / len(domain))

def char_freq_deviation(domain):
    counts = list(Counter(domain).values())
    return np.std(counts) if counts else 0

def normal_char_freq_variance(domain):
    counts = list(Counter(domain).values())
    return (np.std(counts)/len(domain)) if domain else 0

# -------------------------
# Feature Extraction
# -------------------------
def extract_features(domain):
    if not isinstance(domain, str): domain = str(domain)
    domain = domain.lower()
    dwc, lwl, lwr = dict_word_features(domain)
    tld = domain.split(".")[-1] if "." in domain else ""

    feats = {
        "Length": len(domain),
        "Consonant_Count": consonant_count(domain),
        "Unique_Chars": len(set(domain)),
        "Max_Cons_Cluster": max_cons_cluster(domain),
        "Dist_STD": np.std(list(Counter(domain).values())),
        "Char_Gini": char_gini(domain),
        "Char_Freq_Deviation": char_freq_deviation(domain),
        "KL_divergence": kl_divergence(domain),
        "Compression_ratio": compress_ratio(domain),
        "Pronounceability": pronouncability_score(domain),
        "Bigram_Score": bigram_score(domain),
        "Trigram_Score": trigram_score(domain),
        "Markov_Chain_Likelihood": markov_chain_likelihood(domain),
        "Bigram_Likelihood": bigram_score(domain)/max(1,len(domain)),
        "Ngram_LM_Perplexity": ngram_lm_perplexity(domain),
        "Unique_Char_Ratio": len(set(domain))/max(1,len(domain)),
        "Norm_Char_Freq_Var": normal_char_freq_variance(domain),
        "Kolmogorov_Complexity": len(zlib.compress(domain.encode())),
        "Renyi_Entropy": renyi_entropy(domain),
        "Min_Levenshtein_to_Popular": min_levenshtein_to_popular(domain),
        "Keyboard_Distance_Score": keyboard_distance_score(domain),
        "Sliding_Word_Ratio": sliding_dict_ratio(domain)
    }
    return feats

def compute_features_for_dataset(df, domain_col="domain"):
    feature_rows = [extract_features(d) for d in df[domain_col]]
    return pd.DataFrame(feature_rows)

# -------------------------
# Training Phase
# -------------------------
def train_model(train_file, label_col="label", model_type="xgboost", save_model="dga_model.pkl"):
    df = pd.read_csv(train_file)
    drop_cols = [label_col]
    if "domain" in df.columns:
        drop_cols.append("domain")
    X = df.drop(columns=drop_cols).apply(pd.to_numeric, errors="coerce").fillna(0)
    y = df[label_col]

    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    if model_type=="xgboost":
        model = XGBClassifier(use_label_encoder=False, eval_metric="logloss")
    else:
        model = RandomForestClassifier(n_estimators=200, random_state=42)

    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    print("Validation Accuracy:", accuracy_score(y_val, preds))
    print(classification_report(y_val, preds))
    joblib.dump(model, save_model)
    print(f"✅ Model saved to {save_model}")
    return model

# -------------------------
# Prediction Phase
# -------------------------
def predict_new(test_file, model_file="dga_model.pkl", output_file="predictions_dga.csv"):
    df_test = pd.read_csv(test_file)
    domain_col = "domain" if "domain" in df_test.columns else df_test.columns[0]
    X_test = compute_features_for_dataset(df_test, domain_col=domain_col).apply(pd.to_numeric, errors="coerce").fillna(0)
    model = joblib.load(model_file)
    preds = model.predict(X_test)
    df_out = df_test.copy()
    df_out["Prediction"] = preds
    df_out.to_csv(output_file, index=False)
    print(f"✅ Predictions saved to {output_file}")
    return df_out

# =========================
# Example Usage
# =========================
if __name__ == "__main__":
    # Train model
    model = train_model("combined.csv", label_col="label", model_type="xgboost")

    # Predict new domains
    results = predict_new("dataset_dga.csv", model_file="dga_model.pkl")
    print(results.head())


[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Validation Accuracy: 0.964407105472402
              precision    recall  f1-score   support

           0       0.96      0.97      0.96    114309
           1       0.97      0.96      0.96    114585

    accuracy                           0.96    228894
   macro avg       0.96      0.96      0.96    228894
weighted avg       0.96      0.96      0.96    228894

✅ Model saved to dga_model.pkl


In [None]:
import pandas as pd

from google.colab import files
pd.download("combined.csv")

In [22]:
import pandas as pd
#Min_Levenshtein_to_Popular"
def count_zero_one(df, column_name):
    """
    Count how many entries are 0 or 1 in a specific column.

    Parameters:
    -----------
    df : pd.DataFrame
        Input dataset
    column_name : str
        Column to check

    Returns:
    --------
    dict : counts of 0 and 1
    """
    if column_name not in df.columns:
        raise ValueError(f"Column '{column_name}' not found in dataset")

    counts = {
        "count_0": (df[column_name] == 0).sum(),
        "count_1": (df[column_name] == 1).sum()
    }
    return counts


# ==== Example Usage ====
df = pd.read_csv("predictions_dga.csv")

# Replace "Label" with your column name
result = count_zero_one(df, "Prediction")

print(f"Number of 0s: {result['count_0']}")
print(f"Number of 1s: {result['count_1']}")



##DGA-86%
## Benign 87%

##DGA-95%
## Benign 86%

Number of 0s: 99
Number of 1s: 2401


In [1]:
# =========================
# DGA Detection Complete Script
# =========================

# -------------------------
# Imports
# -------------------------
import pandas as pd
import numpy as np
import re
import math
import zlib
import string
import joblib
from collections import Counter
from difflib import SequenceMatcher
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from nltk.corpus import words
from nltk import download
from scipy.stats import entropy

# -------------------------
# Download English words
# -------------------------
try:
    download('words')
    ENGLISH_WORDS = set(w.lower() for w in words.words())
except:
    ENGLISH_WORDS = {"test","shop","login","home","mail","secure"}  # fallback

# -------------------------
# Constants
# -------------------------
VOWELS = set("aeiou")
CONSONANTS = set(string.ascii_lowercase) - VOWELS
POPULAR_TLDS = {".com", ".org", ".net", ".info", ".edu", ".gov"}
BAD_TLDS = {".xyz", ".top", ".club", ".work", ".click"}
BENIGN_DIST = np.ones(26)/26  # uniform for KL divergence

# Top popular domains (fallback if file not found)
try:
    with open("top1k_domains.txt") as f:
        POPULAR_DOMAINS = [line.strip().lower() for line in f.readlines()]
except FileNotFoundError:
    POPULAR_DOMAINS = ["google.com","facebook.com","youtube.com","amazon.com","wikipedia.org"]

# -------------------------
# Feature Functions
# -------------------------
def shannon_entropy(s):
    if not s: return 0
    p, _ = np.histogram(list(s), bins=range(257), density=True)
    p = p[p>0]
    return -np.sum(p*np.log2(p))

def compress_ratio(s):
    if not s: return 0
    return len(zlib.compress(s.encode()))/max(1,len(s))

def renyi_entropy(s, alpha=2):
    if not s: return 0
    counts = Counter(s)
    probs = np.array(list(counts.values())) / len(s)
    if alpha == 1:
        return -np.sum(probs*np.log2(probs))
    return 1/(1-alpha) * np.log2(np.sum(probs**alpha))

def min_levenshtein_to_popular(domain, popular_list=None):
    if popular_list is None:
        popular_list = POPULAR_DOMAINS
    domain = domain.lower()
    min_dist = 1.0
    for pd in popular_list:
        ratio = SequenceMatcher(None, domain, pd).ratio()
        dist = 1 - ratio
        if dist < min_dist:
            min_dist = dist
    return min_dist

# Keyboard adjacency mapping
KEYBOARD_NEIGHBORS = {
    'q':"was",'w':"qase",'e':"wsdr",'r':"edft",'t':"rfgy",
    'y':"tghu",'u':"yhj",'i':"ujk",'o':"ikl",'p':"ol",
    'a':"qwsz",'s':"qwedxza",'d':"erfcxs",'f':"rtgvcd",'g':"tyhbvf",
    'h':"yujnbg",'j':"uikmnh",'k':"iolmj",'l':"opk",
    'z':"asx",'x':"zsdc",'c':"xdfv",'v':"cfgb",'b':"vghn",
    'n':"bhjm",'m':"njk"
}

def keyboard_distance_score(domain):
    domain = re.sub(r'[^a-z]','',domain.lower())
    if len(domain)<2: return 0
    score=0
    for i in range(len(domain)-1):
        if domain[i+1] in KEYBOARD_NEIGHBORS.get(domain[i],""):
            score+=1
    return score/(len(domain)-1)

# Word / lexical features
def dict_word_features(domain):
    matches = [w for w in ENGLISH_WORDS if w in domain]
    longest = max((len(w) for w in matches), default=0)
    return len(matches), longest, longest/max(1,len(domain))

def sliding_dict_ratio(domain, window=4):
    n=len(domain)
    if n<window: return 0
    matches=0
    for i in range(n-window+1):
        if domain[i:i+window] in ENGLISH_WORDS:
            matches+=1
    return matches/(n-window+1)

def max_cons_cluster(domain):
    return max((len(m.group()) for m in re.finditer(r"[bcdfghjklmnpqrstvwxyz]+", domain.lower())), default=0)

def consonant_count(domain):
    return sum(c in CONSONANTS for c in domain.lower())

def vowel_count(domain):
    return sum(c in VOWELS for c in domain.lower())

def pronouncability_score(domain):
    return vowel_count(domain)/ (consonant_count(domain)+1)

def bigram_score(domain):
    return sum(1 for i in range(len(domain)-1) if domain[i].isalpha() and domain[i+1].isalpha())

def trigram_score(domain):
    return sum(1 for i in range(len(domain)-2) if domain[i].isalpha() and domain[i+2].isalpha())

def char_gini(s):
    counts=Counter(s)
    N=sum(counts.values())
    if N==0: return 0
    return 1 - sum((c/N)**2 for c in counts.values())

def kl_divergence(s):
    counts=Counter([c for c in s if c.isalpha()])
    N=sum(counts.values())
    if N==0: return 0
    dist=np.array([counts.get(chr(97+i),0)/N for i in range(26)])
    return entropy(dist, BENIGN_DIST)

def markov_chain_likelihood(domain):
    if len(domain)<2: return 0
    transitions={}
    for i in range(len(domain)-1):
        pair=(domain[i],domain[i+1])
        transitions[pair]=transitions.get(pair,0)+1
    total=sum(transitions.values())
    return sum(math.log((c/total)+1e-6) for c in transitions.values())

def ngram_lm_perplexity(domain):
    likelihood = markov_chain_likelihood(domain)
    if len(domain)==0:
        return 0
    return math.exp(-likelihood / len(domain))

def char_freq_deviation(domain):
    counts = list(Counter(domain).values())
    return np.std(counts) if counts else 0

def normal_char_freq_variance(domain):
    counts = list(Counter(domain).values())
    return (np.std(counts)/len(domain)) if domain else 0

# -------------------------
# Feature Extraction
# -------------------------
def extract_features(domain):
    if not isinstance(domain, str): domain = str(domain)
    domain = domain.lower()
    dwc, lwl, lwr = dict_word_features(domain)
    tld = domain.split(".")[-1] if "." in domain else ""

    feats = {
        "Length": len(domain),
        "Consonant_Count": consonant_count(domain),
        "Unique_Chars": len(set(domain)),
        "Max_Cons_Cluster": max_cons_cluster(domain),
        "Dist_STD": np.std(list(Counter(domain).values())),
        "Char_Gini": char_gini(domain),
        "Char_Freq_Deviation": char_freq_deviation(domain),
        "KL_divergence": kl_divergence(domain),
        "Compression_ratio": compress_ratio(domain),
        "Pronounceability": pronouncability_score(domain),
        "Bigram_Score": bigram_score(domain),
        "Trigram_Score": trigram_score(domain),
        "Markov_Chain_Likelihood": markov_chain_likelihood(domain),
        "Bigram_Likelihood": bigram_score(domain)/max(1,len(domain)),
        "Ngram_LM_Perplexity": ngram_lm_perplexity(domain),
        "Unique_Char_Ratio": len(set(domain))/max(1,len(domain)),
        "Norm_Char_Freq_Var": normal_char_freq_variance(domain),
        "Kolmogorov_Complexity": len(zlib.compress(domain.encode())),
        "Renyi_Entropy": renyi_entropy(domain),
        "Min_Levenshtein_to_Popular": min_levenshtein_to_popular(domain),
        "Keyboard_Distance_Score": keyboard_distance_score(domain),
        "Sliding_Word_Ratio": sliding_dict_ratio(domain)
    }
    return feats

def compute_features_for_dataset(df, domain_col="domain"):
    feature_rows = [extract_features(d) for d in df[domain_col]]
    return pd.DataFrame(feature_rows)

# -------------------------
# Training Phase
# -------------------------
def train_model(train_file, label_col="label", model_type="xgboost", save_model="dga_model.pkl"):
    df = pd.read_csv(train_file)
    drop_cols = [label_col]
    if "domain" in df.columns:
        drop_cols.append("domain")
    X = df.drop(columns=drop_cols).apply(pd.to_numeric, errors="coerce").fillna(0)
    y = df[label_col]

    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    if model_type=="xgboost":
        model = XGBClassifier(use_label_encoder=False, eval_metric="logloss")
    else:
        model = RandomForestClassifier(n_estimators=200, random_state=42)

    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    print("Validation Accuracy:", accuracy_score(y_val, preds))
    print(classification_report(y_val, preds))
    joblib.dump(model, save_model)
    print(f"✅ Model saved to {save_model}")
    return model

# -------------------------
# Prediction Phase
# -------------------------
def predict_new(test_file, model_file="dga_model.pkl", output_file="predictions_ndga.csv"):
    df_test = pd.read_csv(test_file)
    domain_col = "domain" if "domain" in df_test.columns else df_test.columns[0]
    X_test = compute_features_for_dataset(df_test, domain_col=domain_col).apply(pd.to_numeric, errors="coerce").fillna(0)
    model = joblib.load(model_file)
    preds = model.predict(X_test)
    df_out = df_test.copy()
    df_out["Prediction"] = preds
    df_out.to_csv(output_file, index=False)
    print(f"✅ Predictions saved to {output_file}")
    return df_out

# =========================
# Example Usage
# =========================
if __name__ == "__main__":
    # Train model
    model = train_model("combined.csv", label_col="label", model_type="xgboost")

    # Predict new domains
    results = predict_new("dataset_ndga.csv", model_file="dga_model.pkl")
    print(results.head())


[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Validation Accuracy: 0.964407105472402
              precision    recall  f1-score   support

           0       0.96      0.97      0.96    114309
           1       0.97      0.96      0.96    114585

    accuracy                           0.96    228894
   macro avg       0.96      0.96      0.96    228894
weighted avg       0.96      0.96      0.96    228894

✅ Model saved to dga_model.pkl
✅ Predictions saved to predictions_ndga.csv
              domains  Prediction
0        eldenvpn.net           0
1  mrworldpremiere.tv           1
2           xpjfw.com           1
3  kvdveganbeauty.com           0
4        officient.io           0


In [2]:
import pandas as pd

def count_zero_one(df, column_name):
    """
    Count how many entries are 0 or 1 in a specific column.

    Parameters:
    -----------
    df : pd.DataFrame
        Input dataset
    column_name : str
        Column to check

    Returns:
    --------
    dict : counts of 0 and 1
    """
    if column_name not in df.columns:
        raise ValueError(f"Column '{column_name}' not found in dataset")

    counts = {
        "count_0": (df[column_name] == 0).sum(),
        "count_1": (df[column_name] == 1).sum()
    }
    return counts


# ==== Example Usage ====
df = pd.read_csv("predictions_ndga.csv")

# Replace "Label" with your column name
result = count_zero_one(df, "Prediction")

print(f"Number of 0s: {result['count_0']}")
print(f"Number of 1s: {result['count_1']}")
 ##dga 79
 ## 68


Number of 0s: 2212
Number of 1s: 288


In [None]:
#all dga 90%
#benign 86%
#-2:compression ratio
#:Bigram score
# dga 89
#BENIGN: 86