In [None]:
# ========================= Model-1======================
# PART 2: LOAD MODEL & PREDICT INTERACTIVELY
# =========================

!pip install xgboost nltk joblib pandas numpy

import pandas as pd
import numpy as np
import re
import math
from collections import Counter
from xgboost import XGBClassifier
import joblib
from nltk.corpus import words
from nltk import download

# Setup
# Make sure the NLTK "words" corpus is available, then keep it as a set so the
# dictionary-based features can do fast membership tests.
download('words')
english_words = set(words.words())

# Load external resources
# Best effort: the popular-domain list is optional. If 'top-1m.csv' cannot be
# read, fall back to an empty set (every domain then gets Popular_Domain = 0)
# but say so, instead of failing silently.
try:
    top1m = pd.read_csv('top-1m.csv', header=None)
    # The domain name is taken from the last column of the CSV.
    popular_domains = set(top1m.iloc[:, -1].str.lower())
except Exception as exc:
    print(f"⚠️ Could not load 'top-1m.csv' ({exc!r}); continuing with an empty popular-domain list.")
    popular_domains = set()

# TLDs that set the TLD_Bad_Score feature to 1.
bad_tlds = ['xyz', 'click', 'top', 'gq', 'tk', 'ml', 'cf']

# =========================
# Feature Functions
# =========================
def shannon_entropy(s):
    """Return the Shannon entropy of the characters in ``s``, in bits.

    An empty string yields 0 because there are no character frequencies
    to sum over.
    """
    total = len(s)
    # Lazy generator: nothing is divided by ``total`` when ``s`` is empty.
    shares = (count / total for count in Counter(s).values())
    return -sum(share * math.log2(share) for share in shares)

def ngram_score(s, n):
    """Return the share of distinct n-grams among all n-grams of ``s``.

    The result is ``unique n-grams / total n-grams``; it is 0.0 when ``s``
    is too short to contain any n-gram of length ``n``.
    """
    window_count = len(s) - n + 1
    grams = [s[start:start + n] for start in range(window_count)]
    if not grams:
        return 0.0
    return len(set(grams)) / len(grams)

def count_repeats(s):
    """Count the runs in ``s`` where one character repeats back-to-back.

    Each run of two or more identical consecutive characters counts once,
    regardless of its length.
    """
    return sum(1 for _ in re.finditer(r'(.)\1+', s))

def compute_word_match_ratio(s):
    """Return the fraction of tokens in ``s`` that are dictionary words.

    ``s`` is split on runs of non-word characters. A token counts as a match
    when it is in ``english_words`` and is longer than two characters. Empty
    tokens produced by the split (e.g. from a leading separator) still count
    toward the denominator.
    """
    pieces = re.split(r'\W+', s)
    if not pieces:
        return 0
    hits = 0
    for piece in pieces:
        if piece in english_words and len(piece) > 2:
            hits += 1
    return hits / len(pieces)

def sliding_word_ratio(s):
    """Return the share of substrings of ``s`` that are dictionary words.

    Every substring of length 3 through 9 is checked against
    ``english_words``. Returns 0 when ``s`` is shorter than 3 characters,
    since no window fits.
    """
    windows = [
        s[start:start + width]
        for width in range(3, 10)
        for start in range(len(s) - width + 1)
    ]
    if not windows:
        return 0
    hits = sum(1 for window in windows if window in english_words)
    return hits / len(windows)

def longest_dict_word(s):
    """Return the length of the longest token of ``s`` found in ``english_words``.

    Tokens come from splitting ``s`` on runs of non-word characters.
    Returns 0 when no token is a dictionary word.
    """
    return max(
        (len(piece) for piece in re.split(r'\W+', s) if piece in english_words),
        default=0,
    )

def char_distribution_std(s):
    """Return the standard deviation of the per-character counts of ``s``.

    Each distinct character contributes its number of occurrences; the
    population standard deviation of those counts is returned as a float.
    """
    occurrence_counts = list(Counter(s).values())
    return float(np.array(occurrence_counts).std())

def vowel_consonant_alternation(s):
    """Count switches between vowels and consonants in ``s``.

    Lowercase ``aeiou`` are vowels; any other alphabetic character is a
    consonant. Non-alphabetic characters are dropped before counting, so
    letters on either side of them are compared as neighbours.
    """
    classes = []
    for ch in s:
        if ch in 'aeiou':
            classes.append('v')
        elif ch.isalpha():
            classes.append('c')
    return sum(1 for prev, cur in zip(classes, classes[1:]) if prev != cur)

def compute_lexical_complexity(s):
    """Return a lexical-complexity score for ``s``.

    The score is the number of runs of three or more consecutive characters
    that are not lowercase vowels (digits and punctuation are part of such
    runs too), plus an "alternations" term.
    """
    cluster_total = len(re.findall(r'[^aeiou]{3,}', s))

    # NOTE(review): the alternation test requires both neighbours to be
    # alphabetic AND to differ in being alphabetic, which can never hold, so
    # this term is always 0. It is kept as-is because the score is a model
    # input; changing it presumably requires retraining -- confirm against
    # the training code before fixing.
    alternation_total = 0
    for prev, cur in zip(s, s[1:]):
        both_alpha = cur.isalpha() and prev.isalpha()
        if both_alpha and cur.isalpha() != prev.isalpha():
            alternation_total += 1

    return cluster_total + alternation_total

def compute_domain_features(domain):
    """Build the feature record for a single domain name.

    All features are computed on the lowercased domain; the ``'domains'``
    entry keeps the input exactly as given. The last dot-separated label is
    treated as the TLD. ``bad_tlds`` and ``popular_domains`` are read from
    module scope.

    Returns a dict mapping feature name to value. The key order is kept
    unchanged because the caller builds DataFrame columns from these records.
    """
    name = domain.lower()
    n_chars = len(name)

    # Character-class counts.
    n_digits = sum(1 for ch in name if ch.isdigit())
    n_vowels = sum(1 for ch in name if ch in 'aeiou')
    n_consonants = sum(1 for ch in name if ch in 'bcdfghjklmnpqrstvwxyz')
    longest_digit_run = max(map(len, re.findall(r'\d+', name)), default=0)

    # With no vowels the ratio falls back to the raw consonant count.
    if n_vowels:
        cons_per_vowel = n_consonants / n_vowels
    else:
        cons_per_vowel = n_consonants

    suffix = name.split('.')[-1]

    return {
        'domains': domain,
        'Length': n_chars,
        'Digit_Count': n_digits,
        'Max_Consec_Digits': longest_digit_run,
        'Vowel_Count': n_vowels,
        'Consonant_Count': n_consonants,
        'Unique_Chars': len(set(name)),
        'Entropy': shannon_entropy(name),
        'Dist_STD': char_distribution_std(name),
        'Word_Match_Ratio': compute_word_match_ratio(name),
        'Sliding_Word_Ratio': sliding_word_ratio(name),
        'Longest_Word_Len': longest_dict_word(name),
        'Bigram_Score': ngram_score(name, 2),
        'Trigram_Score': ngram_score(name, 3),
        'Vowel_Consonant_Alt': vowel_consonant_alternation(name),
        'Pronounceability': n_vowels / n_chars if n_chars else 0,
        'Repeat_Chars': count_repeats(name),
        'Cons_Vowel_Ratio': cons_per_vowel,
        'Has_Hyphen': int('-' in name),
        'Lexical_Complexity': compute_lexical_complexity(name),
        'TLD_Common': int(suffix in ('com', 'net', 'org', 'info', 'biz')),
        'TLD_Bad_Score': int(suffix in bad_tlds),
        'Popular_Domain': int(name in popular_domains)
    }

# =========================
# Load Model
# =========================
# Load the serialized classifier from the working directory.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code --
# only load model files that come from a trusted source.
# presumably this is the XGBClassifier saved by the training part -- TODO confirm
model = joblib.load('m_1_xgb_dga_classifier.pkl')
print("✅ Model loaded successfully.")

# =========================
# Interactive Predictions
# =========================
print("\n🔮 Enter domains for prediction (comma-separated). Type 'q' to quit.\n")

# Probabilities at or above this value are labelled 1 ("DGA").
threshold = 0.4

while True:
    try:
        user_input = input("Enter domain(s): ").strip()
    except (EOFError, KeyboardInterrupt):
        # Closed stdin or an interrupt at the prompt ends the session the
        # same way 'q' does, instead of surfacing a traceback.
        print("\n👋 Exiting program.")
        break

    if user_input.lower() == "q":
        print("👋 Exiting program.")
        break

    # Split on commas and drop blank entries.
    domains = [d.strip() for d in user_input.split(",") if d.strip()]

    if not domains:
        print("⚠️ No valid domains entered. Try again.")
        continue

    # One feature record per domain; the raw domain string is not a model input.
    test_features = [compute_domain_features(d) for d in domains]
    test_df = pd.DataFrame(test_features)
    X_test = test_df.drop(columns=['domains'])

    # Column 1 of predict_proba is used as the probability of label 1 ("DGA").
    pred_probs = model.predict_proba(X_test)[:, 1]
    pred_labels = (pred_probs >= threshold).astype(int)

    results = pd.DataFrame({
        'Domain': domains,
        'Predicted_Label': pred_labels,
        'Probability': pred_probs,
        'Classification': ["Benign" if label == 0 else "DGA" for label in pred_labels]
    })

    print("\n===== Prediction Results =====")
    print(results.to_string(index=False))
    print("\n")
