In [2]:
#!/usr/bin/env python3
"""
Fast CPU-only DGA Feature Extraction (31 features)
Input:  domains.csv (must contain a column 'domain')
Output: features_optimized.csv
"""

import re, math, string, zlib
import pandas as pd
import numpy as np
from collections import Counter
from difflib import SequenceMatcher
from nltk.corpus import words as nltk_words
import nltk

try:
    nltk.data.find("corpora/words")
except LookupError:
    nltk.download("words")

ENGLISH_WORDS = set(w.lower() for w in nltk_words.words())
VOWELS = set("aeiou")
CONSONANTS = set(string.ascii_lowercase) - VOWELS
ALPHABET = string.ascii_lowercase
POP_DOMAINS = ["google", "facebook", "youtube", "amazon", "twitter", "instagram"]

# Precompute key maps
KEYBOARD_POS = {
    c: (i // 10, i % 10)
    for i, c in enumerate("qwertyuiopasdfghjklzxcvbnm")
}

# --- Utility functions ---
def safe(s): return str(s).lower() if isinstance(s, str) else ""

def shannon_entropy(s):
    if not s: return 0.0
    freq = Counter(s)
    probs = np.array(list(freq.values()), dtype=float) / len(s)
    return -np.sum(probs * np.log2(probs + 1e-12))

def renyi_entropy(s, alpha=2):
    if not s: return 0.0
    freq = Counter(s)
    probs = np.array(list(freq.values()), dtype=float) / len(s)
    return 1.0 / (1.0 - alpha) * np.log2(np.sum(probs ** alpha) + 1e-12)

def kolmogorov_complexity(s):
    if not s: return 0.0
    comp = zlib.compress(s.encode("utf-8"))
    return len(comp) / max(1, len(s))

def bigram_likelihood(s):
    if len(s) < 2: return 0.0
    bigrams = [s[i:i+2] for i in range(len(s)-1)]
    freq = Counter(bigrams)
    probs = np.array(list(freq.values())) / len(bigrams)
    return -np.sum(probs * np.log2(probs + 1e-12))

def trigram_score(s):
    if len(s) < 3: return 0.0
    trigrams = [s[i:i+3] for i in range(len(s)-2)]
    freq = Counter(trigrams)
    probs = np.array(list(freq.values())) / len(trigrams)
    return -np.sum(probs * np.log2(probs + 1e-12))

def char_freq_dev(s):
    if not s: return 0.0
    freq = np.array(list(Counter(s).values()))
    return np.std(freq / np.sum(freq))

def char_gini(s):
    if not s: return 0.0
    freq = np.array(list(Counter(s).values()), dtype=float)
    p = freq / freq.sum()
    return 1.0 - np.sum(p**2)

def vowel_consonant_features(s):
    v_runs = re.findall(r"[aeiou]+", s)
    c_runs = re.findall(r"[bcdfghjklmnpqrstvwxyz]+", s)
    v_run_count = len(v_runs)
    c_run_count = len(c_runs)
    v_cluster_ratio = max((len(r) for r in v_runs), default=0) / max(1, len(s))
    return v_run_count, c_run_count, v_cluster_ratio

def max_consonant_cluster(s):
    clusters = re.findall(r"[bcdfghjklmnpqrstvwxyz]+", s)
    return max((len(c) for c in clusters), default=0)

def pronounceability_score(s):
    if not s: return 0.0
    score = sum(c in VOWELS for c in s) / len(s)
    return score

def unique_char_ratio(s):
    return len(set(s)) / max(1, len(s))

def unique_char(s):
    return len(set(s))

def dict_std(s):
    if not s: return 0.0
    words_found = sum(w in ENGLISH_WORDS for w in re.findall(r"[a-z]+", s))
    return words_found / max(1, len(s.split(".")))

def markov_chain_likelihood(s):
    if len(s) < 2: return 0.0
    probs = []
    for i in range(1, len(s)):
        probs.append(1.0 if s[i] == s[i-1] else 0.5)
    return np.mean(probs)

def kl_divergence(s):
    benign_dist = np.ones(26) / 26
    if not s: return 0.0
    counts = np.array([s.count(c) for c in ALPHABET], dtype=float)
    if counts.sum() == 0: return 0.0
    p = counts / counts.sum()
    return np.sum(p * np.log2((p + 1e-12) / (benign_dist + 1e-12)))

def sliding_word_ratio(s):
    if len(s) < 4: return 0.0
    matches = sum(s[i:i+4] in ENGLISH_WORDS for i in range(len(s)-3))
    return matches / (len(s) - 3)

def keyboard_distance_score(s):
    total = 0.0
    count = 0
    for a, b in zip(s, s[1:]):
        if a in KEYBOARD_POS and b in KEYBOARD_POS:
            pa, pb = KEYBOARD_POS[a], KEYBOARD_POS[b]
            total += math.dist(pa, pb)
            count += 1
    return total / count if count else 0.0

def min_levenshtein_to_popular(s):
    return min(SequenceMatcher(None, s, p).ratio() for p in POP_DOMAINS)

def repetition_ratio(s):
    if not s: return 0.0
    freq = Counter(s)
    return max(freq.values()) / len(s)

def alphabetic_ratio(s):
    letters = sum(c.isalpha() for c in s)
    return letters / max(1, len(s))

def symbol_ratio(s):
    symbols = sum(not c.isalnum() for c in s)
    return symbols / max(1, len(s))

def entropy_per_length(s):
    e = shannon_entropy(s)
    return e / max(1, len(s))

def entropy_slope(s):
    if len(s) < 2: return 0.0
    entropies = [shannon_entropy(s[:i]) for i in range(2, len(s)+1)]
    x = np.arange(2, len(s)+1)
    slope, _ = np.polyfit(x, entropies, 1)
    return slope

def char_distribution_symmetry(s):
    if not s: return 0.0
    freq = np.array(list(Counter(s).values()), dtype=float)
    mean = freq.mean()
    return np.mean(np.abs(freq - mean)) / mean

# === MAIN ===
def main(input_csv="dga_version5.csv", output_csv="oct30_dga_v1.csv"):
    df = pd.read_csv(input_csv, dtype=str)
    domains = df["domain"].fillna("").str.lower().tolist()
    print(f"Processing {len(domains)} domains...")

    feats = []
    for d in domains:
        s = safe(d)
        v_run, c_run, v_ratio = vowel_consonant_features(s)
        feats.append({
            "Maximum_Consonants_Cluster": max_consonant_cluster(s),
            "Consonant_count": sum(c in CONSONANTS for c in s),
            "Pronounceability_Score": pronounceability_score(s),
            "Bigram-Likelihood": bigram_likelihood(s),
            "Character_Frequency_Deviation": char_freq_dev(s),
            "Unique_Character_Ratio": unique_char_ratio(s),
            "Unique_Character": unique_char(s),
            "Dictionary_Standard": dict_std(s),
            "Markov_Chain_Likelihood": markov_chain_likelihood(s),
            "Length": len(s),
            "Compression_Ratio": kolmogorov_complexity(s),
            "Bigram_Score": bigram_likelihood(s),
            "Trigram_Score": trigram_score(s),
            "N-gram_LM_perplexity": bigram_likelihood(s) + trigram_score(s),
            "Normal_Character_Frequency_varience": char_freq_dev(s),
            "Character_Gini": char_gini(s),
            "KL_Divergence": kl_divergence(s),
            "Sliding_word_ratio": sliding_word_ratio(s),
            "Kolmogorov_Complexity": kolmogorov_complexity(s),
            "Renyi_Entropy": renyi_entropy(s),
            "Keyboard_Distance_Score": keyboard_distance_score(s),
            "Min_Levenshtein_To_Popular": min_levenshtein_to_popular(s),
            "Repetition_Ratio": repetition_ratio(s),
            "Alphabetic_Ratio": alphabetic_ratio(s),
            "Symbol_Ratio": symbol_ratio(s),
            "Vowel_run_count": v_run,
            "Consonant_run_count": c_run,
            "Vowel_cluster_ratio": v_ratio,
            "Entropy_per_length": entropy_per_length(s),
            "Entropy_Slope": entropy_slope(s),
            "Character_Distribution_Symmetry": char_distribution_symmetry(s),
        })

    feat_df = pd.DataFrame(feats)
    out = pd.concat([df, feat_df], axis=1)
    out.to_csv(output_csv, index=False)
    print(f"✅ Features saved to {output_csv}")

if __name__ == "__main__":
    main()


Processing 694173 domains...
✅ Features saved to features_optimized.csv


In [None]:
from google.colab import files
files.download("oct30_dga_v1.csv")

In [4]:
df=pd.read_csv('oct30_dga_v1.csv')
df.describe()

Unnamed: 0,label,Maximum_Consonants_Cluster,Consonant_count,Pronounceability_Score,Bigram-Likelihood,Character_Frequency_Deviation,Unique_Character_Ratio,Unique_Character,Dictionary_Standard,Markov_Chain_Likelihood,...,Min_Levenshtein_To_Popular,Repetition_Ratio,Alphabetic_Ratio,Symbol_Ratio,Vowel_run_count,Consonant_run_count,Vowel_cluster_ratio,Entropy_per_length,Entropy_Slope,Character_Distribution_Symmetry
count,694173.0,694173.0,694173.0,694173.0,694173.0,694173.0,694173.0,694173.0,694173.0,694173.0,...,694173.0,694173.0,694173.0,694173.0,694173.0,694173.0,694173.0,694173.0,694173.0,694173.0
mean,1.0,4.576519,12.200742,0.298728,4.020743,0.03688,0.691829,12.481366,0.040699,0.518253,...,0.084301,0.168355,0.939118,0.060405,5.068241,6.144378,0.092197,0.200827,0.144334,0.361636
std,0.0,2.643387,4.305605,0.117054,0.521081,0.012843,0.143928,2.68991,0.14695,0.024762,...,0.036036,0.044974,0.024891,0.022564,2.953502,3.084483,0.048756,0.06046,0.060895,0.120434
min,1.0,1.0,2.0,0.0,2.370951,0.0,0.305556,4.0,0.0,0.5,...,0.0,0.052632,0.40625,0.025,0.0,1.0,0.0,0.089459,0.030941,0.0
25%,1.0,3.0,9.0,0.2,3.664498,0.029215,0.566667,10.0,0.0,0.5,...,0.068966,0.133333,0.923077,0.04,3.0,4.0,0.0625,0.146931,0.089846,0.288462
50%,1.0,4.0,12.0,0.321429,4.037401,0.036666,0.6875,12.0,0.0,0.5,...,0.086957,0.166667,0.941176,0.058824,4.0,5.0,0.076923,0.198916,0.13989,0.37594
75%,1.0,6.0,15.0,0.375,4.436605,0.042855,0.8125,14.0,0.0,0.533333,...,0.105263,0.1875,0.96,0.076923,8.0,9.0,0.105263,0.242187,0.188662,0.448276
max,1.0,16.0,27.0,0.8,5.247928,0.177778,1.0,26.0,4.0,0.8125,...,0.266667,0.555556,0.975,0.125,17.0,17.0,0.7,0.375,0.444796,0.807692


In [14]:
import pandas as pd

def reorder_columns(input_csv, output_csv, desired_order):
    """
    Reorder the columns of a CSV file based on user-defined order.

    Parameters:
    - input_csv (str): Path to input CSV file.
    - output_csv (str): Path to output CSV file with reordered columns.
    - desired_order (list): List of column names in the desired order.
    """
    # Load dataset
    df = pd.read_csv(input_csv)

    # Check which desired columns exist
    available_columns = [col for col in desired_order if col in df.columns]

    # Add missing columns (if any were not in df)
    missing_columns = [col for col in desired_order if col not in df.columns]
    for col in missing_columns:
        df[col] = None  # Fill with None or default values

    # Reorder
    df = df[available_columns + [col for col in df.columns if col not in available_columns]]

    # Save output
    df.to_csv(output_csv, index=False)
    print(f"✅ Reordered CSV saved as {output_csv}")


# =====================
# Example usage
# =====================

# Suppose your dataset has 58 features + "domain" + "label"
input_csv = "oct30_dga_v1.csv"
output_csv = "oct30_dga_v2.csv"

# User-defined column order (just an example)
desired_order = [





"domain",
"Maximum_Consonants_Cluster",
"Consonant_count",
"Pronounceability_Score",
"Bigram-Likelihood",
"Character_Frequency_Deviation",
"Unique_Character_Ratio",
"Unique_Character",
"Dictionary_Standard",
"Markov_Chain_Likelihood",
"Length",
"Compression_Ratio",
"Bigram_Score",
"Trigram_Score",
"N-gram_LM_perplexity",
"Normal_Character_Frequency_varience",
"Character_Gini",
"KL_Divergence",
"Sliding_word_ratio",
"Kolmogorov_Complexity",
"Renyi_Entropy",
"Keyboard_Distance_Score",
"Min_Levenshtein_To_Popular",
"Repetition_Ratio",
"Alphabetic_Ratio",
"Symbol_Ratio",
"Vowel_run_count",
"Consonant_run_count",
"Vowel_cluster_ratio",
"Entropy_per_length",
"Entropy_Slope",
"Character_Distribution_Symmetry",
"label"

]

# Reorder dataset
reorder_columns(input_csv, output_csv, desired_order)


✅ Reordered CSV saved as oct30_dga_v2.csv


In [15]:
df=pd.read_csv('oct30_dga_v2.csv')
df.describe()

Unnamed: 0,Maximum_Consonants_Cluster,Consonant_count,Pronounceability_Score,Bigram-Likelihood,Character_Frequency_Deviation,Unique_Character_Ratio,Unique_Character,Dictionary_Standard,Markov_Chain_Likelihood,Length,...,Repetition_Ratio,Alphabetic_Ratio,Symbol_Ratio,Vowel_run_count,Consonant_run_count,Vowel_cluster_ratio,Entropy_per_length,Entropy_Slope,Character_Distribution_Symmetry,label
count,694173.0,694173.0,694173.0,694173.0,694173.0,694173.0,694173.0,694173.0,694173.0,694173.0,...,694173.0,694173.0,694173.0,694173.0,694173.0,694173.0,694173.0,694173.0,694173.0,694173.0
mean,4.576519,12.200742,0.298728,4.020743,0.03688,0.691829,12.481366,0.040699,0.518253,19.226092,...,0.168355,0.939118,0.060405,5.068241,6.144378,0.092197,0.200827,0.144334,0.361636,1.0
std,2.643387,4.305605,0.117054,0.521081,0.012843,0.143928,2.68991,0.14695,0.024762,6.938378,...,0.044974,0.024891,0.022564,2.953502,3.084483,0.048756,0.06046,0.060895,0.120434,0.0
min,1.0,2.0,0.0,2.370951,0.0,0.305556,4.0,0.0,0.5,8.0,...,0.052632,0.40625,0.025,0.0,1.0,0.0,0.089459,0.030941,0.0,1.0
25%,3.0,9.0,0.2,3.664498,0.029215,0.566667,10.0,0.0,0.5,14.0,...,0.133333,0.923077,0.04,3.0,4.0,0.0625,0.146931,0.089846,0.288462,1.0
50%,4.0,12.0,0.321429,4.037401,0.036666,0.6875,12.0,0.0,0.5,18.0,...,0.166667,0.941176,0.058824,4.0,5.0,0.076923,0.198916,0.13989,0.37594,1.0
75%,6.0,15.0,0.375,4.436605,0.042855,0.8125,14.0,0.0,0.533333,25.0,...,0.1875,0.96,0.076923,8.0,9.0,0.105263,0.242187,0.188662,0.448276,1.0
max,16.0,27.0,0.8,5.247928,0.177778,1.0,26.0,4.0,0.8125,40.0,...,0.555556,0.975,0.125,17.0,17.0,0.7,0.375,0.444796,0.807692,1.0


In [16]:
import pandas as pd

def filter_outliers(df, feature_cols, method="iqr", z_thresh=3, save_path=None):
    """
    Filters datapoints outside the lower/upper bound for each feature
    and optionally saves the datapoints that are within the bounds.

    Parameters:
    -----------
    df : pd.DataFrame
        Input dataset with features.
    feature_cols : list
        List of feature columns to check.
    method : str
        "iqr" (default) -> Interquartile Range method
        "zscore" -> Standard deviation based
    z_thresh : int
        Threshold for zscore method
    save_path : str or None
        If provided, saves the filtered dataset to this CSV path.

    Returns:
    --------
    pd.DataFrame : Filtered dataset (within bounds)
    pd.DataFrame : Bounds for each feature
    """
    bounds = {}
    df_filtered = df.copy()

    for col in feature_cols:
        if col not in df.columns:
            continue  # skip missing features

        series = df[col].dropna()

        if method == "iqr":
            Q1 = series.quantile(0.25)
            Q3 = series.quantile(0.75)
            IQR = Q3 - Q1
            lower = Q1 - 1.5 * IQR
            upper = Q3 + 1.5 * IQR

        elif method == "zscore":
            mean = series.mean()
            std = series.std()
            lower = mean - z_thresh * std
            upper = mean + z_thresh * std

        else:
            raise ValueError("Method must be 'iqr' or 'zscore'")

        bounds[col] = (lower, upper)

        # keep only rows within bounds
        df_filtered = df_filtered[(df_filtered[col] >= lower) & (df_filtered[col] <= upper)]

    bounds_df = pd.DataFrame(bounds, index=["Lower_Bound", "Upper_Bound"]).T

    # Save filtered dataset if save_path provided
    if save_path:
        df_filtered.to_csv(save_path, index=False)
        print(f"✅ Filtered dataset (within bounds) saved to {save_path}")

    return df_filtered.reset_index(drop=True), bounds_df


# ==== Example Usage ====
all_features = [

"Maximum_Consonants_Cluster",
"Consonant_count",
"Pronounceability_Score",
"Bigram-Likelihood",
"Character_Frequency_Deviation",
"Unique_Character_Ratio",
"Unique_Character",
"Dictionary_Standard",
"Markov_Chain_Likelihood",
"Length",
"Compression_Ratio",
"Bigram_Score",
"Trigram_Score",
"N-gram_LM_perplexity",
"Normal_Character_Frequency_varience",
"Character_Gini",
"KL_Divergence",
"Sliding_word_ratio",
"Kolmogorov_Complexity",
"Renyi_Entropy",
"Keyboard_Distance_Score",
"Min_Levenshtein_To_Popular",
"Repetition_Ratio",
"Alphabetic_Ratio",
"Symbol_Ratio",
"Vowel_run_count",
"Consonant_run_count",
"Vowel_cluster_ratio",
"Entropy_per_length",
"Entropy_Slope",
"Character_Distribution_Symmetry"




]

df = pd.read_csv("oct30_dga_v2.csv")
# Remove duplicate columns (keep first occurrence)
df = df.loc[:, ~df.columns.duplicated()]

# Save filtered data to CSV
df_filtered, bounds = filter_outliers(
    df, all_features, method="iqr", save_path="oct30_dga_v3.csv"
)

print(bounds)


✅ Filtered dataset (within bounds) saved to oct30_dga_v3.csv
                                     Lower_Bound  Upper_Bound
Maximum_Consonants_Cluster             -1.500000    10.500000
Consonant_count                         0.000000    24.000000
Pronounceability_Score                 -0.062500     0.637500
Bigram-Likelihood                       2.506336     5.594767
Character_Frequency_Deviation           0.008755     0.063315
Unique_Character_Ratio                  0.197917     1.181250
Unique_Character                        4.000000    20.000000
Dictionary_Standard                     0.000000     0.000000
Markov_Chain_Likelihood                 0.450000     0.583333
Length                                 -2.500000    41.500000
Compression_Ratio                       0.942857     1.948571
Bigram_Score                            2.506336     5.594767
Trigram_Score                           2.273259     5.771135
N-gram_LM_perplexity                    4.800719    11.426541
Normal_Ch

In [17]:
df=pd.read_csv('oct30_dga_v3.csv')
df.describe()

Unnamed: 0,Maximum_Consonants_Cluster,Consonant_count,Pronounceability_Score,Bigram-Likelihood,Character_Frequency_Deviation,Unique_Character_Ratio,Unique_Character,Dictionary_Standard,Markov_Chain_Likelihood,Length,...,Repetition_Ratio,Alphabetic_Ratio,Symbol_Ratio,Vowel_run_count,Consonant_run_count,Vowel_cluster_ratio,Entropy_per_length,Entropy_Slope,Character_Distribution_Symmetry,label
count,446419.0,446419.0,446419.0,446419.0,446419.0,446419.0,446419.0,446419.0,446419.0,446419.0,...,446419.0,446419.0,446419.0,446419.0,446419.0,446419.0,446419.0,446419.0,446419.0,446419.0
mean,4.353867,13.693187,0.308617,4.219672,0.036251,0.650148,13.377294,0.0,0.516887,21.749233,...,0.160256,0.947611,0.052378,6.067665,7.1519,0.078718,0.179045,0.123232,0.388791,1.0
std,2.017143,3.842061,0.093463,0.452189,0.008934,0.134842,2.387391,0.0,0.020204,6.705792,...,0.034522,0.019341,0.019339,2.978324,3.210679,0.029853,0.052316,0.05267,0.10257,0.0
min,1.0,4.0,0.0,2.921928,0.012056,0.305556,8.0,0.0,0.5,9.0,...,0.066667,0.888889,0.025,0.0,2.0,0.0,0.089459,0.030941,0.099415,1.0
25%,3.0,11.0,0.25,3.906891,0.029463,0.545455,12.0,0.0,0.5,16.0,...,0.129032,0.9375,0.035714,4.0,4.0,0.0625,0.133325,0.078893,0.32967,1.0
50%,4.0,13.0,0.333333,4.247928,0.035977,0.62069,13.0,0.0,0.514706,20.0,...,0.15625,0.95,0.05,5.0,6.0,0.068966,0.170185,0.109683,0.4,1.0
75%,5.0,17.0,0.375,4.606739,0.04166,0.75,15.0,0.0,0.53125,28.0,...,0.1875,0.964286,0.0625,9.0,10.0,0.1,0.215801,0.158928,0.463602,1.0
max,10.0,24.0,0.6,5.247928,0.063315,0.947368,20.0,0.0,0.580645,40.0,...,0.266667,0.975,0.111111,15.0,16.0,0.166667,0.327523,0.336816,0.687831,1.0


In [8]:
#!/usr/bin/env python3
"""
Fast CPU-only DGA Feature Extraction (31 features)
Input:  domains.csv (must contain a column 'domain')
Output: features_optimized.csv
"""

import re, math, string, zlib
import pandas as pd
import numpy as np
from collections import Counter
from difflib import SequenceMatcher
from nltk.corpus import words as nltk_words
import nltk

try:
    nltk.data.find("corpora/words")
except LookupError:
    nltk.download("words")

ENGLISH_WORDS = set(w.lower() for w in nltk_words.words())
VOWELS = set("aeiou")
CONSONANTS = set(string.ascii_lowercase) - VOWELS
ALPHABET = string.ascii_lowercase
POP_DOMAINS = ["google", "facebook", "youtube", "amazon", "twitter", "instagram"]

# Precompute key maps
KEYBOARD_POS = {
    c: (i // 10, i % 10)
    for i, c in enumerate("qwertyuiopasdfghjklzxcvbnm")
}

# --- Utility functions ---
def safe(s): return str(s).lower() if isinstance(s, str) else ""

def shannon_entropy(s):
    if not s: return 0.0
    freq = Counter(s)
    probs = np.array(list(freq.values()), dtype=float) / len(s)
    return -np.sum(probs * np.log2(probs + 1e-12))

def renyi_entropy(s, alpha=2):
    if not s: return 0.0
    freq = Counter(s)
    probs = np.array(list(freq.values()), dtype=float) / len(s)
    return 1.0 / (1.0 - alpha) * np.log2(np.sum(probs ** alpha) + 1e-12)

def kolmogorov_complexity(s):
    if not s: return 0.0
    comp = zlib.compress(s.encode("utf-8"))
    return len(comp) / max(1, len(s))

def bigram_likelihood(s):
    if len(s) < 2: return 0.0
    bigrams = [s[i:i+2] for i in range(len(s)-1)]
    freq = Counter(bigrams)
    probs = np.array(list(freq.values())) / len(bigrams)
    return -np.sum(probs * np.log2(probs + 1e-12))

def trigram_score(s):
    if len(s) < 3: return 0.0
    trigrams = [s[i:i+3] for i in range(len(s)-2)]
    freq = Counter(trigrams)
    probs = np.array(list(freq.values())) / len(trigrams)
    return -np.sum(probs * np.log2(probs + 1e-12))

def char_freq_dev(s):
    if not s: return 0.0
    freq = np.array(list(Counter(s).values()))
    return np.std(freq / np.sum(freq))

def char_gini(s):
    if not s: return 0.0
    freq = np.array(list(Counter(s).values()), dtype=float)
    p = freq / freq.sum()
    return 1.0 - np.sum(p**2)

def vowel_consonant_features(s):
    v_runs = re.findall(r"[aeiou]+", s)
    c_runs = re.findall(r"[bcdfghjklmnpqrstvwxyz]+", s)
    v_run_count = len(v_runs)
    c_run_count = len(c_runs)
    v_cluster_ratio = max((len(r) for r in v_runs), default=0) / max(1, len(s))
    return v_run_count, c_run_count, v_cluster_ratio

def max_consonant_cluster(s):
    clusters = re.findall(r"[bcdfghjklmnpqrstvwxyz]+", s)
    return max((len(c) for c in clusters), default=0)

def pronounceability_score(s):
    if not s: return 0.0
    score = sum(c in VOWELS for c in s) / len(s)
    return score

def unique_char_ratio(s):
    return len(set(s)) / max(1, len(s))

def unique_char(s):
    return len(set(s))

def dict_std(s):
    if not s: return 0.0
    words_found = sum(w in ENGLISH_WORDS for w in re.findall(r"[a-z]+", s))
    return words_found / max(1, len(s.split(".")))

def markov_chain_likelihood(s):
    if len(s) < 2: return 0.0
    probs = []
    for i in range(1, len(s)):
        probs.append(1.0 if s[i] == s[i-1] else 0.5)
    return np.mean(probs)

def kl_divergence(s):
    benign_dist = np.ones(26) / 26
    if not s: return 0.0
    counts = np.array([s.count(c) for c in ALPHABET], dtype=float)
    if counts.sum() == 0: return 0.0
    p = counts / counts.sum()
    return np.sum(p * np.log2((p + 1e-12) / (benign_dist + 1e-12)))

def sliding_word_ratio(s):
    if len(s) < 4: return 0.0
    matches = sum(s[i:i+4] in ENGLISH_WORDS for i in range(len(s)-3))
    return matches / (len(s) - 3)

def keyboard_distance_score(s):
    total = 0.0
    count = 0
    for a, b in zip(s, s[1:]):
        if a in KEYBOARD_POS and b in KEYBOARD_POS:
            pa, pb = KEYBOARD_POS[a], KEYBOARD_POS[b]
            total += math.dist(pa, pb)
            count += 1
    return total / count if count else 0.0

def min_levenshtein_to_popular(s):
    return min(SequenceMatcher(None, s, p).ratio() for p in POP_DOMAINS)

def repetition_ratio(s):
    if not s: return 0.0
    freq = Counter(s)
    return max(freq.values()) / len(s)

def alphabetic_ratio(s):
    letters = sum(c.isalpha() for c in s)
    return letters / max(1, len(s))

def symbol_ratio(s):
    symbols = sum(not c.isalnum() for c in s)
    return symbols / max(1, len(s))

def entropy_per_length(s):
    e = shannon_entropy(s)
    return e / max(1, len(s))

def entropy_slope(s):
    if len(s) < 2: return 0.0
    entropies = [shannon_entropy(s[:i]) for i in range(2, len(s)+1)]
    x = np.arange(2, len(s)+1)
    slope, _ = np.polyfit(x, entropies, 1)
    return slope

def char_distribution_symmetry(s):
    if not s: return 0.0
    freq = np.array(list(Counter(s).values()), dtype=float)
    mean = freq.mean()
    return np.mean(np.abs(freq - mean)) / mean

# === MAIN ===
def main(input_csv="ndga_version2.csv", output_csv="oct30_ndga_v1.csv"):
    df = pd.read_csv(input_csv, dtype=str)
    domains = df["domain"].fillna("").str.lower().tolist()
    print(f"Processing {len(domains)} domains...")

    feats = []
    for d in domains:
        s = safe(d)
        v_run, c_run, v_ratio = vowel_consonant_features(s)
        feats.append({
            "Maximum_Consonants_Cluster": max_consonant_cluster(s),
            "Consonant_count": sum(c in CONSONANTS for c in s),
            "Pronounceability_Score": pronounceability_score(s),
            "Bigram-Likelihood": bigram_likelihood(s),
            "Character_Frequency_Deviation": char_freq_dev(s),
            "Unique_Character_Ratio": unique_char_ratio(s),
            "Unique_Character": unique_char(s),
            "Dictionary_Standard": dict_std(s),
            "Markov_Chain_Likelihood": markov_chain_likelihood(s),
            "Length": len(s),
            "Compression_Ratio": kolmogorov_complexity(s),
            "Bigram_Score": bigram_likelihood(s),
            "Trigram_Score": trigram_score(s),
            "N-gram_LM_perplexity": bigram_likelihood(s) + trigram_score(s),
            "Normal_Character_Frequency_varience": char_freq_dev(s),
            "Character_Gini": char_gini(s),
            "KL_Divergence": kl_divergence(s),
            "Sliding_word_ratio": sliding_word_ratio(s),
            "Kolmogorov_Complexity": kolmogorov_complexity(s),
            "Renyi_Entropy": renyi_entropy(s),
            "Keyboard_Distance_Score": keyboard_distance_score(s),
            "Min_Levenshtein_To_Popular": min_levenshtein_to_popular(s),
            "Repetition_Ratio": repetition_ratio(s),
            "Alphabetic_Ratio": alphabetic_ratio(s),
            "Symbol_Ratio": symbol_ratio(s),
            "Vowel_run_count": v_run,
            "Consonant_run_count": c_run,
            "Vowel_cluster_ratio": v_ratio,
            "Entropy_per_length": entropy_per_length(s),
            "Entropy_Slope": entropy_slope(s),
            "Character_Distribution_Symmetry": char_distribution_symmetry(s),
        })

    feat_df = pd.DataFrame(feats)
    out = pd.concat([df, feat_df], axis=1)
    out.to_csv(output_csv, index=False)
    print(f"✅ Features saved to {output_csv}")

if __name__ == "__main__":
    main()

Processing 1000018 domains...


  slope, _ = np.polyfit(x, entropies, 1)


✅ Features saved to oct30_ndga_v1.csv


In [9]:
df=pd.read_csv('oct30_ndga_v1.csv')
df.describe()

Unnamed: 0,label,Maximum_Consonants_Cluster,Consonant_count,Pronounceability_Score,Bigram-Likelihood,Character_Frequency_Deviation,Unique_Character_Ratio,Unique_Character,Dictionary_Standard,Markov_Chain_Likelihood,...,Min_Levenshtein_To_Popular,Repetition_Ratio,Alphabetic_Ratio,Symbol_Ratio,Vowel_run_count,Consonant_run_count,Vowel_cluster_ratio,Entropy_per_length,Entropy_Slope,Character_Distribution_Symmetry
count,1000018.0,1000018.0,1000018.0,1000018.0,1000018.0,1000018.0,1000018.0,1000018.0,1000018.0,1000018.0,...,1000018.0,1000018.0,1000018.0,1000018.0,1000018.0,1000018.0,1000018.0,1000018.0,1000018.0,1000018.0
mean,0.0,2.559998,7.716392,0.3146914,3.526768,0.03664792,0.784204,10.27084,0.2110617,0.5115228,...,0.09522608,0.1833156,0.883741,0.09447994,3.886544,5.200499,0.1085705,0.2555062,0.1958674,0.2922898
std,0.0,1.070682,2.782675,0.09763058,0.4773525,0.01793356,0.1191128,2.394362,0.3294658,0.02439352,...,0.04453127,0.05519978,0.08710253,0.03649864,1.761619,1.833195,0.04979761,0.05937606,0.06539805,0.1290987
min,0.0,0.0,0.0,0.0,-1.442823e-12,0.0,0.04347826,2.0,0.0,0.5,...,0.0,0.04477612,0.1176471,0.0,0.0,0.0,0.0,0.003284717,-0.09436094,0.0
25%,0.0,2.0,6.0,0.2666667,3.169925,0.02886751,0.7,9.0,0.0,0.5,...,0.08333333,0.1428571,0.875,0.06666667,3.0,4.0,0.07692308,0.2127573,0.148576,0.2142857
50%,0.0,2.0,7.0,0.3333333,3.546594,0.03586096,0.7857143,10.0,0.0,0.5,...,0.1,0.1764706,0.9090909,0.08695652,4.0,5.0,0.1,0.2515458,0.1880979,0.3174603
75%,0.0,3.0,9.0,0.375,3.875,0.0440411,0.875,12.0,0.5,0.5,...,0.1176471,0.2142857,0.9285714,0.1111111,5.0,6.0,0.1333333,0.2921928,0.2348486,0.3850267
max,0.0,43.0,45.0,0.875,6.044394,0.4782609,1.0,37.0,8.5,0.9777778,...,0.3157895,0.9782609,1.0,0.56,25.0,25.0,0.7692308,0.5,0.6347488,1.21393


In [10]:
import pandas as pd

def reorder_columns(input_csv, output_csv, desired_order):
    """
    Reorder the columns of a CSV file based on user-defined order.

    Parameters:
    - input_csv (str): Path to input CSV file.
    - output_csv (str): Path to output CSV file with reordered columns.
    - desired_order (list): List of column names in the desired order.
    """
    # Load dataset
    df = pd.read_csv(input_csv)

    # Check which desired columns exist
    available_columns = [col for col in desired_order if col in df.columns]

    # Add missing columns (if any were not in df)
    missing_columns = [col for col in desired_order if col not in df.columns]
    for col in missing_columns:
        df[col] = None  # Fill with None or default values

    # Reorder
    df = df[available_columns + [col for col in df.columns if col not in available_columns]]

    # Save output
    df.to_csv(output_csv, index=False)
    print(f"✅ Reordered CSV saved as {output_csv}")


# =====================
# Example usage
# =====================

# Suppose your dataset has 58 features + "domain" + "label"
input_csv = "oct30_ndga_v1.csv"
output_csv = "oct30_ndga_v2.csv"

# User-defined column order (just an example)
desired_order = [





"domain",
"Maximum_Consonants_Cluster",
"Consonant_count",
"Pronounceability_Score",
"Bigram-Likelihood",
"Character_Frequency_Deviation",
"Unique_Character_Ratio",
"Unique_Character",
"Dictionary_Standard",
"Markov_Chain_Likelihood",
"Length",
"Compression_Ratio",
"Bigram_Score",
"Trigram_Score",
"N-gram_LM_perplexity",
"Normal_Character_Frequency_varience",
"Character_Gini",
"KL_Divergence",
"Sliding_word_ratio",
"Kolmogorov_Complexity",
"Renyi_Entropy",
"Keyboard_Distance_Score",
"Min_Levenshtein_To_Popular",
"Repetition_Ratio",
"Alphabetic_Ratio",
"Symbol_Ratio",
"Vowel_run_count",
"Consonant_run_count",
"Vowel_cluster_ratio",
"Entropy_per_length",
"Entropy_Slope",
"Character_Distribution_Symmetry",
"label"

]

# Reorder dataset
reorder_columns(input_csv, output_csv, desired_order)


✅ Reordered CSV saved as oct30_ndga_v2.csv


In [11]:

df=pd.read_csv('oct30_ndga_v2.csv')
df.describe()

Unnamed: 0,Maximum_Consonants_Cluster,Consonant_count,Pronounceability_Score,Bigram-Likelihood,Character_Frequency_Deviation,Unique_Character_Ratio,Unique_Character,Dictionary_Standard,Markov_Chain_Likelihood,Length,...,Repetition_Ratio,Alphabetic_Ratio,Symbol_Ratio,Vowel_run_count,Consonant_run_count,Vowel_cluster_ratio,Entropy_per_length,Entropy_Slope,Character_Distribution_Symmetry,label
count,1000018.0,1000018.0,1000018.0,1000018.0,1000018.0,1000018.0,1000018.0,1000018.0,1000018.0,1000018.0,...,1000018.0,1000018.0,1000018.0,1000018.0,1000018.0,1000018.0,1000018.0,1000018.0,1000018.0,1000018.0
mean,2.559998,7.716392,0.3146914,3.526768,0.03664792,0.784204,10.27084,0.2110617,0.5115228,13.52508,...,0.1833156,0.883741,0.09447994,3.886544,5.200499,0.1085705,0.2555062,0.1958674,0.2922898,0.0
std,1.070682,2.782675,0.09763058,0.4773525,0.01793356,0.1191128,2.394362,0.3294658,0.02439352,4.305555,...,0.05519978,0.08710253,0.03649864,1.761619,1.833195,0.04979761,0.05937606,0.06539805,0.1290987,0.0
min,0.0,0.0,0.0,-1.442823e-12,0.0,0.04347826,2.0,0.0,0.5,2.0,...,0.04477612,0.1176471,0.0,0.0,0.0,0.0,0.003284717,-0.09436094,0.0,0.0
25%,2.0,6.0,0.2666667,3.169925,0.02886751,0.7,9.0,0.0,0.5,10.0,...,0.1428571,0.875,0.06666667,3.0,4.0,0.07692308,0.2127573,0.148576,0.2142857,0.0
50%,2.0,7.0,0.3333333,3.546594,0.03586096,0.7857143,10.0,0.0,0.5,13.0,...,0.1764706,0.9090909,0.08695652,4.0,5.0,0.1,0.2515458,0.1880979,0.3174603,0.0
75%,3.0,9.0,0.375,3.875,0.0440411,0.875,12.0,0.5,0.5,16.0,...,0.2142857,0.9285714,0.1111111,5.0,6.0,0.1333333,0.2921928,0.2348486,0.3850267,0.0
max,43.0,45.0,0.875,6.044394,0.4782609,1.0,37.0,8.5,0.9777778,75.0,...,0.9782609,1.0,0.56,25.0,25.0,0.7692308,0.5,0.6347488,1.21393,0.0


In [12]:
import pandas as pd

def filter_outliers(df, feature_cols, method="iqr", z_thresh=3, save_path=None):
    """
    Filters datapoints outside the lower/upper bound for each feature
    and optionally saves the datapoints that are within the bounds.

    Parameters:
    -----------
    df : pd.DataFrame
        Input dataset with features.
    feature_cols : list
        List of feature columns to check.
    method : str
        "iqr" (default) -> Interquartile Range method
        "zscore" -> Standard deviation based
    z_thresh : int
        Threshold for zscore method
    save_path : str or None
        If provided, saves the filtered dataset to this CSV path.

    Returns:
    --------
    pd.DataFrame : Filtered dataset (within bounds)
    pd.DataFrame : Bounds for each feature
    """
    bounds = {}
    df_filtered = df.copy()

    for col in feature_cols:
        if col not in df.columns:
            continue  # skip missing features

        series = df[col].dropna()

        if method == "iqr":
            Q1 = series.quantile(0.25)
            Q3 = series.quantile(0.75)
            IQR = Q3 - Q1
            lower = Q1 - 1.5 * IQR
            upper = Q3 + 1.5 * IQR

        elif method == "zscore":
            mean = series.mean()
            std = series.std()
            lower = mean - z_thresh * std
            upper = mean + z_thresh * std

        else:
            raise ValueError("Method must be 'iqr' or 'zscore'")

        bounds[col] = (lower, upper)

        # keep only rows within bounds
        df_filtered = df_filtered[(df_filtered[col] >= lower) & (df_filtered[col] <= upper)]

    bounds_df = pd.DataFrame(bounds, index=["Lower_Bound", "Upper_Bound"]).T

    # Save filtered dataset if save_path provided
    if save_path:
        df_filtered.to_csv(save_path, index=False)
        print(f"✅ Filtered dataset (within bounds) saved to {save_path}")

    return df_filtered.reset_index(drop=True), bounds_df


# ==== Example Usage ====
all_features = [

"Maximum_Consonants_Cluster",
"Consonant_count",
"Pronounceability_Score",
"Bigram-Likelihood",
"Character_Frequency_Deviation",
"Unique_Character_Ratio",
"Unique_Character",
"Dictionary_Standard",
"Markov_Chain_Likelihood",
"Length",
"Compression_Ratio",
"Bigram_Score",
"Trigram_Score",
"N-gram_LM_perplexity",
"Normal_Character_Frequency_varience",
"Character_Gini",
"KL_Divergence",
"Sliding_word_ratio",
"Kolmogorov_Complexity",
"Renyi_Entropy",
"Keyboard_Distance_Score",
"Min_Levenshtein_To_Popular",
"Repetition_Ratio",
"Alphabetic_Ratio",
"Symbol_Ratio",
"Vowel_run_count",
"Consonant_run_count",
"Vowel_cluster_ratio",
"Entropy_per_length",
"Entropy_Slope",
"Character_Distribution_Symmetry"




]

df = pd.read_csv("oct30_ndga_v2.csv")
# Remove duplicate columns (keep first occurrence)
df = df.loc[:, ~df.columns.duplicated()]

# Save filtered data to CSV
df_filtered, bounds = filter_outliers(
    df, all_features, method="iqr", save_path="oct30_ndga_v3.csv"
)

print(bounds)


✅ Filtered dataset (within bounds) saved to oct30_ndga_v3.csv
                                      Lower_Bound  Upper_Bound
Maximum_Consonants_Cluster           5.000000e-01     4.500000
Consonant_count                      1.500000e+00    13.500000
Pronounceability_Score               1.041667e-01     0.537500
Bigram-Likelihood                    2.112313e+00     4.932612
Character_Frequency_Deviation        6.107128e-03     0.066801
Unique_Character_Ratio               4.375000e-01     1.137500
Unique_Character                     4.500000e+00    16.500000
Dictionary_Standard                 -7.500000e-01     1.250000
Markov_Chain_Likelihood              5.000000e-01     0.500000
Length                               1.000000e+00    25.000000
Compression_Ratio                    1.050000e+00     2.250000
Bigram_Score                         2.112313e+00     4.932612
Trigram_Score                        1.788968e+00     5.018387
N-gram_LM_perplexity                 3.853444e+00    10.

In [13]:
df=pd.read_csv('oct30_ndga_v3.csv')
df.describe()

Unnamed: 0,Maximum_Consonants_Cluster,Consonant_count,Pronounceability_Score,Bigram-Likelihood,Character_Frequency_Deviation,Unique_Character_Ratio,Unique_Character,Dictionary_Standard,Markov_Chain_Likelihood,Length,...,Repetition_Ratio,Alphabetic_Ratio,Symbol_Ratio,Vowel_run_count,Consonant_run_count,Vowel_cluster_ratio,Entropy_per_length,Entropy_Slope,Character_Distribution_Symmetry,label
count,454706.0,454706.0,454706.0,454706.0,454706.0,454706.0,454706.0,454706.0,454706.0,454706.0,...,454706.0,454706.0,454706.0,454706.0,454706.0,454706.0,454706.0,454706.0,454706.0,454706.0
mean,2.436612,8.068697,0.338539,3.630005,0.036673,0.774857,10.742007,0.177044,0.5,14.110909,...,0.176914,0.910362,0.087077,4.353514,5.660161,0.101217,0.24356,0.179834,0.309662,0.0
std,0.777383,2.09311,0.069524,0.359127,0.010007,0.094647,1.974541,0.279427,0.0,3.286992,...,0.039576,0.030982,0.028032,1.339635,1.413821,0.037521,0.043275,0.045075,0.089103,0.0
min,1.0,3.0,0.111111,2.5,0.014239,0.4375,6.0,0.0,0.5,7.0,...,0.083333,0.8,0.04,1.0,1.0,0.04,0.132603,0.062547,0.090909,0.0
25%,2.0,7.0,0.294118,3.321928,0.029463,0.705882,9.0,0.0,0.5,12.0,...,0.15,0.9,0.066667,3.0,5.0,0.071429,0.210145,0.145248,0.251748,0.0
50%,2.0,8.0,0.333333,3.584963,0.035251,0.777778,11.0,0.0,0.5,14.0,...,0.166667,0.916667,0.083333,4.0,6.0,0.090909,0.241342,0.17664,0.323077,0.0
75%,3.0,10.0,0.384615,3.906891,0.043301,0.846154,12.0,0.5,0.5,16.0,...,0.2,0.933333,0.1,5.0,7.0,0.125,0.272815,0.211065,0.376923,0.0
max,4.0,13.0,0.533333,4.584963,0.066609,0.941176,16.0,1.0,0.5,25.0,...,0.3,0.96,0.176471,8.0,9.0,0.214286,0.360234,0.343718,0.637681,0.0


In [18]:

import pandas as pd

def select_entries(file, num_entries, output_file='oct30_ndga_v4.csv'):
    """
    Select the user-specified number of entries (rows) from a dataset.

    Args:
        file (str): Path to the CSV file.

        num_entries (int): Number of rows to select.
        output_file (str): File to save the selected rows.
    """
    # Load dataset
    df = pd.read_csv(file)

    # Select first 'num_entries' rows
    selected_df = df.head(num_entries)

    # Save result
    selected_df.to_csv(output_file, index=False)


    return selected_df


# Example usage:
selected = select_entries('oct30_ndga_v3.csv', 446419 )   # get first 100 rows
print(selected)


                       domain  Maximum_Consonants_Cluster  Consonant_count  \
0           01heiliaomimi.com                           1                6   
1       01nextprivate.website                           4               11   
2             01transport.com                           3                9   
3               0bigazart.com                           2                7   
4                  0catch.com                           3                6   
...                       ...                         ...              ...   
446414          yadlachim.org                           2                8   
446415       yado-sagashi.net                           2                8   
446416           yadong.party                           3                8   
446417          yadongcam.net                           3                8   
446418           yadongpan.me                           3                7   

        Pronounceability_Score  Bigram-Likelihood  \
0         

In [19]:
import pandas as pd

def combine_datasets(file1, file2, output_file="combined.csv"):
    """
    Combine two datasets without altering datapoints.
    The header of the second dataset is removed automatically.
    """
    # Load first dataset normally (with header)
    df1 = pd.read_csv(file1)

    # Load second dataset as raw, then reassign columns from df1
    df2 = pd.read_csv(file2, header=None, skiprows=1)
    df2.columns = df1.columns  # assign the same header as df1

    # Concatenate without altering datapoints
    combined_df = pd.concat([df1, df2], ignore_index=True)

    # Save to CSV
    combined_df.to_csv(output_file, index=False)
    return combined_df



# Example usage:
combined = combine_datasets('oct30_dga_v3.csv', 'oct30_ndga_v4.csv')
print(combined.head())


                 domain  Maximum_Consonants_Cluster  Consonant_count  \
0   ofdhiydrrttpblp.com                          10               15   
1    osvwkptpwqyiqen.ru                          10               13   
2  wwcdhdhijsfsuyr.info                           7               15   
3    fhhvhiqlrtwpnik.ru                           7               14   
4   gwgweakshkaxnqv.org                           4               14   

   Pronounceability_Score  Bigram-Likelihood  Character_Frequency_Deviation  \
0                0.157895           4.169925                       0.025219   
1                0.222222           4.087463                       0.022222   
2                0.200000           4.142664                       0.024744   
3                0.166667           4.087463                       0.032723   
4                0.210526           4.058814                       0.032120   

   Unique_Character_Ratio  Unique_Character  Dictionary_Standard  \
0                0.73684

In [None]:
# =========================
# DGA Detection Optimized (31 Features)
# =========================

import re, math, string, zlib
import pandas as pd
import numpy as np
from collections import Counter
from difflib import SequenceMatcher
import joblib
from nltk.corpus import words as nltk_words
import nltk
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# -------------------------
# Download English Words
# -------------------------
try:
    nltk.data.find("corpora/words")
except LookupError:
    nltk.download("words")

ENGLISH_WORDS = set(w.lower() for w in nltk_words.words())
VOWELS = set("aeiou")
CONSONANTS = set(string.ascii_lowercase) - VOWELS
ALPHABET = string.ascii_lowercase
POP_DOMAINS = ["google", "facebook", "youtube", "amazon", "twitter", "instagram"]

# Precompute keyboard positions
KEYBOARD_POS = {c: (i // 10, i % 10) for i, c in enumerate("qwertyuiopasdfghjklzxcvbnm")}

# -------------------------
# Utility Functions
# -------------------------
def safe(s): return str(s).lower() if isinstance(s, str) else ""

def shannon_entropy(s):
    if not s: return 0.0
    freq = Counter(s)
    probs = np.array(list(freq.values()), dtype=float) / len(s)
    return -np.sum(probs * np.log2(probs + 1e-12))

def renyi_entropy(s, alpha=2):
    if not s: return 0.0
    freq = Counter(s)
    probs = np.array(list(freq.values()), dtype=float) / len(s)
    return 1.0 / (1.0 - alpha) * np.log2(np.sum(probs ** alpha) + 1e-12)

def kolmogorov_complexity(s):
    if not s: return 0.0
    comp = zlib.compress(s.encode("utf-8"))
    return len(comp) / max(1, len(s))

def bigram_likelihood(s):
    if len(s) < 2: return 0.0
    bigrams = [s[i:i+2] for i in range(len(s)-1)]
    freq = Counter(bigrams)
    probs = np.array(list(freq.values())) / len(bigrams)
    return -np.sum(probs * np.log2(probs + 1e-12))

def trigram_score(s):
    if len(s) < 3: return 0.0
    trigrams = [s[i:i+3] for i in range(len(s)-2)]
    freq = Counter(trigrams)
    probs = np.array(list(freq.values())) / len(trigrams)
    return -np.sum(probs * np.log2(probs + 1e-12))

def char_freq_dev(s):
    if not s: return 0.0
    freq = np.array(list(Counter(s).values()))
    return np.std(freq / np.sum(freq))

def char_gini(s):
    if not s: return 0.0
    freq = np.array(list(Counter(s).values()), dtype=float)
    p = freq / freq.sum()
    return 1.0 - np.sum(p**2)

def vowel_consonant_features(s):
    v_runs = re.findall(r"[aeiou]+", s)
    c_runs = re.findall(r"[bcdfghjklmnpqrstvwxyz]+", s)
    v_run_count = len(v_runs)
    c_run_count = len(c_runs)
    v_cluster_ratio = max((len(r) for r in v_runs), default=0) / max(1, len(s))
    return v_run_count, c_run_count, v_cluster_ratio

def max_consonant_cluster(s):
    clusters = re.findall(r"[bcdfghjklmnpqrstvwxyz]+", s)
    return max((len(c) for c in clusters), default=0)

def pronounceability_score(s):
    if not s: return 0.0
    return sum(c in VOWELS for c in s) / max(1, len(s))

def unique_char_ratio(s): return len(set(s)) / max(1, len(s))
def unique_char(s): return len(set(s))

def dict_std(s):
    if not s: return 0.0
    words_found = sum(w in ENGLISH_WORDS for w in re.findall(r"[a-z]+", s))
    return words_found / max(1, len(s.split(".")))

def markov_chain_likelihood(s):
    if len(s) < 2: return 0.0
    probs = [1.0 if s[i] == s[i-1] else 0.5 for i in range(1, len(s))]
    return np.mean(probs)

def kl_divergence(s):
    benign_dist = np.ones(26) / 26
    if not s: return 0.0
    counts = np.array([s.count(c) for c in ALPHABET], dtype=float)
    if counts.sum() == 0: return 0.0
    p = counts / counts.sum()
    return np.sum(p * np.log2((p + 1e-12) / (benign_dist + 1e-12)))

def sliding_word_ratio(s):
    if len(s) < 4: return 0.0
    matches = sum(s[i:i+4] in ENGLISH_WORDS for i in range(len(s)-3))
    return matches / (len(s) - 3)

def keyboard_distance_score(s):
    total = count = 0.0
    for a, b in zip(s, s[1:]):
        if a in KEYBOARD_POS and b in KEYBOARD_POS:
            total += math.dist(KEYBOARD_POS[a], KEYBOARD_POS[b])
            count += 1
    return total / count if count else 0.0

def min_levenshtein_to_popular(s):
    return min(SequenceMatcher(None, s, p).ratio() for p in POP_DOMAINS)

def repetition_ratio(s):
    if not s: return 0.0
    freq = Counter(s)
    return max(freq.values()) / len(s)

def alphabetic_ratio(s):
    return sum(c.isalpha() for c in s) / max(1, len(s))

def symbol_ratio(s):
    return sum(not c.isalnum() for c in s) / max(1, len(s))

def entropy_per_length(s):
    return shannon_entropy(s) / max(1, len(s))

def entropy_slope(s):
    if len(s) < 2: return 0.0
    entropies = [shannon_entropy(s[:i]) for i in range(2, len(s)+1)]
    x = np.arange(2, len(s)+1)
    slope, _ = np.polyfit(x, entropies, 1)
    return slope

def char_distribution_symmetry(s):
    if not s: return 0.0
    freq = np.array(list(Counter(s).values()), dtype=float)
    mean = freq.mean()
    return np.mean(np.abs(freq - mean)) / mean

# -------------------------
# Feature Extraction (31 features)
# -------------------------
def extract_features(domain):
    s = safe(domain)
    v_run, c_run, v_ratio = vowel_consonant_features(s)
    return {
        "Maximum_Consonants_Cluster": max_consonant_cluster(s),
        "Consonant_count": sum(c in CONSONANTS for c in s),
        "Pronounceability_Score": pronounceability_score(s),
        "Bigram-Likelihood": bigram_likelihood(s),
        "Character_Frequency_Deviation": char_freq_dev(s),
        "Unique_Character_Ratio": unique_char_ratio(s),
        "Unique_Character": unique_char(s),
        "Dictionary_Standard": dict_std(s),
        "Markov_Chain_Likelihood": markov_chain_likelihood(s),
        "Length": len(s),
        "Compression_Ratio": kolmogorov_complexity(s),
        "Bigram_Score": bigram_likelihood(s),
        "Trigram_Score": trigram_score(s),
        "N-gram_LM_perplexity": bigram_likelihood(s) + trigram_score(s),
        "Normal_Character_Frequency_varience": char_freq_dev(s),
        "Character_Gini": char_gini(s),
        "KL_Divergence": kl_divergence(s),
        "Sliding_word_ratio": sliding_word_ratio(s),
        "Kolmogorov_Complexity": kolmogorov_complexity(s),
        "Renyi_Entropy": renyi_entropy(s),
        "Keyboard_Distance_Score": keyboard_distance_score(s),
        "Min_Levenshtein_To_Popular": min_levenshtein_to_popular(s),
        "Repetition_Ratio": repetition_ratio(s),
        "Alphabetic_Ratio": alphabetic_ratio(s),
        "Symbol_Ratio": symbol_ratio(s),
        "Vowel_run_count": v_run,
        "Consonant_run_count": c_run,
        "Vowel_cluster_ratio": v_ratio,
        "Entropy_per_length": entropy_per_length(s),
        "Entropy_Slope": entropy_slope(s),
        "Character_Distribution_Symmetry": char_distribution_symmetry(s),
    }

def compute_features_for_dataset(df, domain_col="domain"):
    feats = [extract_features(d) for d in df[domain_col]]
    return pd.DataFrame(feats)

# -------------------------
# Model Training
# -------------------------
def train_model(train_file, label_col="label", model_type="xgboost", save_model="dga_model.pkl"):
    df = pd.read_csv(train_file)
    drop_cols = [label_col]
    if "domain" in df.columns:
        drop_cols.append("domain")
    X = df.drop(columns=drop_cols).apply(pd.to_numeric, errors="coerce").fillna(0)
    y = df[label_col]

    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    model = XGBClassifier(use_label_encoder=False, eval_metric="logloss") if model_type == "xgboost" \
        else RandomForestClassifier(n_estimators=200, random_state=42)

    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    print("Validation Accuracy:", accuracy_score(y_val, preds))
    print(classification_report(y_val, preds))
    joblib.dump(model, save_model)
    print(f"✅ Model saved to {save_model}")
    return model

# -------------------------
# Prediction
# -------------------------
def predict_new(test_file, model_file="dga_model.pkl", output_file="predictions_dga.csv"):
    df_test = pd.read_csv(test_file)
    domain_col = "domain" if "domain" in df_test.columns else df_test.columns[0]
    X_test = compute_features_for_dataset(df_test, domain_col).apply(pd.to_numeric, errors="coerce").fillna(0)
    model = joblib.load(model_file)
    preds = model.predict(X_test)
    df_out = df_test.copy()
    df_out["Prediction"] = preds
    df_out.to_csv(output_file, index=False)
    print(f"✅ Predictions saved to {output_file}")
    return df_out

# -------------------------
# Example Usage
# -------------------------
if __name__ == "__main__":
    model = train_model("combined.csv", label_col="label", model_type="xgboost")
    results = predict_new("dataset_dga.csv", model_file="dga_model.pkl")
    print(results.head())


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Validation Accuracy: 0.9819676537789526
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     89347
           1       0.98      0.98      0.98     89221

    accuracy                           0.98    178568
   macro avg       0.98      0.98      0.98    178568
weighted avg       0.98      0.98      0.98    178568

✅ Model saved to dga_model.pkl


In [23]:
import pandas as pd

def count_zero_one(df, column_name):
    """
    Count how many entries are 0 or 1 in a specific column.

    Parameters:
    -----------
    df : pd.DataFrame
        Input dataset
    column_name : str
        Column to check

    Returns:
    --------
    dict : counts of 0 and 1
    """
    if column_name not in df.columns:
        raise ValueError(f"Column '{column_name}' not found in dataset")

    counts = {
        "count_0": (df[column_name] == 0).sum(),
        "count_1": (df[column_name] == 1).sum()
    }
    return counts


# ==== Example Usage ====
df = pd.read_csv("predictions_dga.csv")

# Replace "Label" with your column name
result = count_zero_one(df, "Prediction")

print(f"Number of 0s: {result['count_0']}")
print(f"Number of 1s: {result['count_1']}")



Number of 0s: 332
Number of 1s: 2168


In [20]:
files.download("combined.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [24]:
# =========================
# DGA Detection Optimized (31 Features)
# =========================

import re, math, string, zlib
import pandas as pd
import numpy as np
from collections import Counter
from difflib import SequenceMatcher
import joblib
from nltk.corpus import words as nltk_words
import nltk
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# -------------------------
# Download English Words
# -------------------------
try:
    nltk.data.find("corpora/words")
except LookupError:
    nltk.download("words")

ENGLISH_WORDS = set(w.lower() for w in nltk_words.words())
VOWELS = set("aeiou")
CONSONANTS = set(string.ascii_lowercase) - VOWELS
ALPHABET = string.ascii_lowercase
POP_DOMAINS = ["google", "facebook", "youtube", "amazon", "twitter", "instagram"]

# Precompute keyboard positions
KEYBOARD_POS = {c: (i // 10, i % 10) for i, c in enumerate("qwertyuiopasdfghjklzxcvbnm")}

# -------------------------
# Utility Functions
# -------------------------
def safe(s): return str(s).lower() if isinstance(s, str) else ""

def shannon_entropy(s):
    if not s: return 0.0
    freq = Counter(s)
    probs = np.array(list(freq.values()), dtype=float) / len(s)
    return -np.sum(probs * np.log2(probs + 1e-12))

def renyi_entropy(s, alpha=2):
    if not s: return 0.0
    freq = Counter(s)
    probs = np.array(list(freq.values()), dtype=float) / len(s)
    return 1.0 / (1.0 - alpha) * np.log2(np.sum(probs ** alpha) + 1e-12)

def kolmogorov_complexity(s):
    if not s: return 0.0
    comp = zlib.compress(s.encode("utf-8"))
    return len(comp) / max(1, len(s))

def bigram_likelihood(s):
    if len(s) < 2: return 0.0
    bigrams = [s[i:i+2] for i in range(len(s)-1)]
    freq = Counter(bigrams)
    probs = np.array(list(freq.values())) / len(bigrams)
    return -np.sum(probs * np.log2(probs + 1e-12))

def trigram_score(s):
    if len(s) < 3: return 0.0
    trigrams = [s[i:i+3] for i in range(len(s)-2)]
    freq = Counter(trigrams)
    probs = np.array(list(freq.values())) / len(trigrams)
    return -np.sum(probs * np.log2(probs + 1e-12))

def char_freq_dev(s):
    if not s: return 0.0
    freq = np.array(list(Counter(s).values()))
    return np.std(freq / np.sum(freq))

def char_gini(s):
    if not s: return 0.0
    freq = np.array(list(Counter(s).values()), dtype=float)
    p = freq / freq.sum()
    return 1.0 - np.sum(p**2)

def vowel_consonant_features(s):
    v_runs = re.findall(r"[aeiou]+", s)
    c_runs = re.findall(r"[bcdfghjklmnpqrstvwxyz]+", s)
    v_run_count = len(v_runs)
    c_run_count = len(c_runs)
    v_cluster_ratio = max((len(r) for r in v_runs), default=0) / max(1, len(s))
    return v_run_count, c_run_count, v_cluster_ratio

def max_consonant_cluster(s):
    clusters = re.findall(r"[bcdfghjklmnpqrstvwxyz]+", s)
    return max((len(c) for c in clusters), default=0)

def pronounceability_score(s):
    if not s: return 0.0
    return sum(c in VOWELS for c in s) / max(1, len(s))

def unique_char_ratio(s): return len(set(s)) / max(1, len(s))
def unique_char(s): return len(set(s))

def dict_std(s):
    if not s: return 0.0
    words_found = sum(w in ENGLISH_WORDS for w in re.findall(r"[a-z]+", s))
    return words_found / max(1, len(s.split(".")))

def markov_chain_likelihood(s):
    if len(s) < 2: return 0.0
    probs = [1.0 if s[i] == s[i-1] else 0.5 for i in range(1, len(s))]
    return np.mean(probs)

def kl_divergence(s):
    benign_dist = np.ones(26) / 26
    if not s: return 0.0
    counts = np.array([s.count(c) for c in ALPHABET], dtype=float)
    if counts.sum() == 0: return 0.0
    p = counts / counts.sum()
    return np.sum(p * np.log2((p + 1e-12) / (benign_dist + 1e-12)))

def sliding_word_ratio(s):
    if len(s) < 4: return 0.0
    matches = sum(s[i:i+4] in ENGLISH_WORDS for i in range(len(s)-3))
    return matches / (len(s) - 3)

def keyboard_distance_score(s):
    total = count = 0.0
    for a, b in zip(s, s[1:]):
        if a in KEYBOARD_POS and b in KEYBOARD_POS:
            total += math.dist(KEYBOARD_POS[a], KEYBOARD_POS[b])
            count += 1
    return total / count if count else 0.0

def min_levenshtein_to_popular(s):
    return min(SequenceMatcher(None, s, p).ratio() for p in POP_DOMAINS)

def repetition_ratio(s):
    if not s: return 0.0
    freq = Counter(s)
    return max(freq.values()) / len(s)

def alphabetic_ratio(s):
    return sum(c.isalpha() for c in s) / max(1, len(s))

def symbol_ratio(s):
    return sum(not c.isalnum() for c in s) / max(1, len(s))

def entropy_per_length(s):
    return shannon_entropy(s) / max(1, len(s))

def entropy_slope(s):
    if len(s) < 2: return 0.0
    entropies = [shannon_entropy(s[:i]) for i in range(2, len(s)+1)]
    x = np.arange(2, len(s)+1)
    slope, _ = np.polyfit(x, entropies, 1)
    return slope

def char_distribution_symmetry(s):
    if not s: return 0.0
    freq = np.array(list(Counter(s).values()), dtype=float)
    mean = freq.mean()
    return np.mean(np.abs(freq - mean)) / mean

# -------------------------
# Feature Extraction (31 features)
# -------------------------
def extract_features(domain):
    s = safe(domain)
    v_run, c_run, v_ratio = vowel_consonant_features(s)
    return {
        "Maximum_Consonants_Cluster": max_consonant_cluster(s),
        "Consonant_count": sum(c in CONSONANTS for c in s),
        "Pronounceability_Score": pronounceability_score(s),
        "Bigram-Likelihood": bigram_likelihood(s),
        "Character_Frequency_Deviation": char_freq_dev(s),
        "Unique_Character_Ratio": unique_char_ratio(s),
        "Unique_Character": unique_char(s),
        "Dictionary_Standard": dict_std(s),
        "Markov_Chain_Likelihood": markov_chain_likelihood(s),
        "Length": len(s),
        "Compression_Ratio": kolmogorov_complexity(s),
        "Bigram_Score": bigram_likelihood(s),
        "Trigram_Score": trigram_score(s),
        "N-gram_LM_perplexity": bigram_likelihood(s) + trigram_score(s),
        "Normal_Character_Frequency_varience": char_freq_dev(s),
        "Character_Gini": char_gini(s),
        "KL_Divergence": kl_divergence(s),
        "Sliding_word_ratio": sliding_word_ratio(s),
        "Kolmogorov_Complexity": kolmogorov_complexity(s),
        "Renyi_Entropy": renyi_entropy(s),
        "Keyboard_Distance_Score": keyboard_distance_score(s),
        "Min_Levenshtein_To_Popular": min_levenshtein_to_popular(s),
        "Repetition_Ratio": repetition_ratio(s),
        "Alphabetic_Ratio": alphabetic_ratio(s),
        "Symbol_Ratio": symbol_ratio(s),
        "Vowel_run_count": v_run,
        "Consonant_run_count": c_run,
        "Vowel_cluster_ratio": v_ratio,
        "Entropy_per_length": entropy_per_length(s),
        "Entropy_Slope": entropy_slope(s),
        "Character_Distribution_Symmetry": char_distribution_symmetry(s),
    }

def compute_features_for_dataset(df, domain_col="domain"):
    feats = [extract_features(d) for d in df[domain_col]]
    return pd.DataFrame(feats)

# -------------------------
# Model Training
# -------------------------
def train_model(train_file, label_col="label", model_type="xgboost", save_model="dga_model.pkl"):
    df = pd.read_csv(train_file)
    drop_cols = [label_col]
    if "domain" in df.columns:
        drop_cols.append("domain")
    X = df.drop(columns=drop_cols).apply(pd.to_numeric, errors="coerce").fillna(0)
    y = df[label_col]

    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    model = XGBClassifier(use_label_encoder=False, eval_metric="logloss") if model_type == "xgboost" \
        else RandomForestClassifier(n_estimators=200, random_state=42)

    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    print("Validation Accuracy:", accuracy_score(y_val, preds))
    print(classification_report(y_val, preds))
    joblib.dump(model, save_model)
    print(f"✅ Model saved to {save_model}")
    return model

# -------------------------
# Prediction
# -------------------------
def predict_new(test_file, model_file="dga_model.pkl", output_file="predictions_ndga.csv"):
    df_test = pd.read_csv(test_file)
    domain_col = "domain" if "domain" in df_test.columns else df_test.columns[0]
    X_test = compute_features_for_dataset(df_test, domain_col).apply(pd.to_numeric, errors="coerce").fillna(0)
    model = joblib.load(model_file)
    preds = model.predict(X_test)
    df_out = df_test.copy()
    df_out["Prediction"] = preds
    df_out.to_csv(output_file, index=False)
    print(f"✅ Predictions saved to {output_file}")
    return df_out

# -------------------------
# Example Usage
# -------------------------
if __name__ == "__main__":
    model = train_model("combined.csv", label_col="label", model_type="xgboost")
    results = predict_new("dataset_ndga.csv", model_file="dga_model.pkl")
    print(results.head())


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Validation Accuracy: 0.9819676537789526
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     89347
           1       0.98      0.98      0.98     89221

    accuracy                           0.98    178568
   macro avg       0.98      0.98      0.98    178568
weighted avg       0.98      0.98      0.98    178568

✅ Model saved to dga_model.pkl
✅ Predictions saved to predictions_ndga.csv
              domains  Prediction
0        eldenvpn.net           0
1  mrworldpremiere.tv           1
2           xpjfw.com           1
3  kvdveganbeauty.com           0
4        officient.io           0


In [25]:
import pandas as pd

def count_zero_one(df, column_name):
    """
    Count how many entries are 0 or 1 in a specific column.

    Parameters:
    -----------
    df : pd.DataFrame
        Input dataset
    column_name : str
        Column to check

    Returns:
    --------
    dict : counts of 0 and 1
    """
    if column_name not in df.columns:
        raise ValueError(f"Column '{column_name}' not found in dataset")

    counts = {
        "count_0": (df[column_name] == 0).sum(),
        "count_1": (df[column_name] == 1).sum()
    }
    return counts


# ==== Example Usage ====
df = pd.read_csv("predictions_ndga.csv")

# Replace "Label" with your column name
result = count_zero_one(df, "Prediction")

print(f"Number of 0s: {result['count_0']}")
print(f"Number of 1s: {result['count_1']}")



Number of 0s: 1959
Number of 1s: 541
