# Laboratorio #5 – Threat hunting
## Semestre I - 2025

#### Manuel Rodas 21509

#### Parte 1 – Filtrado y preprocesamiento

In [None]:

import json

with open("large_eve.json", "r") as f:
    data = [json.loads(line) for line in f]

print("Cantidad total de registros:", len(data))


Cantidad total de registros: 746909


In [None]:

dns_data = [entry for entry in data if entry.get("event_type") == "dns"]

print("Cantidad de registros DNS:", len(dns_data)) 


Cantidad de registros DNS: 15749


In [None]:

import random
random.sample(dns_data, 2)


[{'timestamp': '2017-07-22T18:29:18.461266-0500',
  'flow_id': 1540390050990347,
  'pcap_cnt': 506539,
  'event_type': 'dns',
  'vlan': 150,
  'src_ip': '192.168.207.4',
  'src_port': 53,
  'dest_ip': '192.168.205.188',
  'dest_port': 60491,
  'proto': 'UDP',
  'dns': {'type': 'answer',
   'id': 29190,
   'rcode': 'NXDOMAIN',
   'rrname': '<root>',
   'rrtype': 'SOA',
   'ttl': 20864}},
 {'timestamp': '2017-07-22T17:38:23.002765-0500',
  'flow_id': 1229116390968013,
  'pcap_cnt': 82657,
  'event_type': 'dns',
  'vlan': 150,
  'src_ip': '192.168.205.188',
  'src_port': 38094,
  'dest_ip': '192.168.207.4',
  'dest_port': 53,
  'proto': 'UDP',
  'dns': {'type': 'query',
   'id': 44125,
   'rrname': '201.26.168.192.in-addr.arpa',
   'rrtype': 'PTR',
   'tx_id': 0}}]

In [None]:

import pandas as pd
from pandas import json_normalize

dns_df = json_normalize(dns_data)
print("Shape del dataframe:", dns_df.shape) 


Shape del dataframe: (15749, 18)


In [None]:

dns_a_df = dns_df[dns_df["dns.rrtype"] == "A"]
print("Cantidad de registros tipo A:", dns_a_df.shape[0]) 


Cantidad de registros tipo A: 2849


In [None]:

unique_domains = dns_a_df["dns.rrname"].dropna().unique()
print("Cantidad de dominios únicos:", len(unique_domains))  


Cantidad de dominios únicos: 177


In [None]:

def get_tld(domain):
    parts = domain.strip('.').split('.')
    if len(parts) >= 2:
        return '.'.join(parts[-2:])
    else:
        return domain


print(get_tld("api.wunderground.com"))  
print(get_tld("safebrowsing.clients.google.com.home")) 


wunderground.com
com.home


In [None]:

domain_df = pd.DataFrame(unique_domains, columns=["domain"])
domain_df["domain_tld"] = domain_df["domain"].apply(get_tld)
print(domain_df.head())


                                       domain        domain_tld
0                        api.wunderground.com  wunderground.com
1                         stork79.dropbox.com       dropbox.com
2  hpca-tier2.office.aol.com.ad.aol.aoltw.net         aoltw.net
3        safebrowsing.clients.google.com.home          com.home
4                         fxfeeds.mozilla.com       mozilla.com


### Parte 2 – Data Science

In [26]:
!pip install --upgrade google-generativeai


Defaulting to user installation because normal site-packages is not writeable


In [42]:
import google.generativeai as genai
import os

genai.configure(api_key="AIzaSyB2wwU1jsS41_tawNFYRrHqeE3hsQiYepU")

for m in genai.list_models():
    print(f"Name: {m.name}")
    print(f"  Generation: {'Yes' if 'generateContent' in m.supported_generation_methods else 'No'}")
    print(f"  Input: {m.input_token_limit} tokens")
    print(f"  Output: {m.output_token_limit} tokens")
    print()

Name: models/chat-bison-001
  Generation: No
  Input: 4096 tokens
  Output: 1024 tokens

Name: models/text-bison-001
  Generation: No
  Input: 8196 tokens
  Output: 1024 tokens

Name: models/embedding-gecko-001
  Generation: No
  Input: 1024 tokens
  Output: 1 tokens

Name: models/gemini-1.0-pro-vision-latest
  Generation: Yes
  Input: 12288 tokens
  Output: 4096 tokens

Name: models/gemini-pro-vision
  Generation: Yes
  Input: 12288 tokens
  Output: 4096 tokens

Name: models/gemini-1.5-pro-latest
  Generation: Yes
  Input: 2000000 tokens
  Output: 8192 tokens

Name: models/gemini-1.5-pro-001
  Generation: Yes
  Input: 2000000 tokens
  Output: 8192 tokens

Name: models/gemini-1.5-pro-002
  Generation: Yes
  Input: 2000000 tokens
  Output: 8192 tokens

Name: models/gemini-1.5-pro
  Generation: Yes
  Input: 2000000 tokens
  Output: 8192 tokens

Name: models/gemini-1.5-flash-latest
  Generation: Yes
  Input: 1000000 tokens
  Output: 8192 tokens

Name: models/gemini-1.5-flash-001
  Generat

In [None]:
import os
import google.generativeai as genai

GOOGLE_API_KEY = "AIzaSyB2wwU1jsS41_tawNFYRrHqeE3hsQiYepU"
genai.configure(api_key=GOOGLE_API_KEY)

model = genai.GenerativeModel("models/gemini-1.5-pro")

import re

def classify_domain(domain):
    prompt = f"""
Eres un experto en ciberseguridad. Clasifica el siguiente dominio como DGA (1) o legítimo (0).
Solo responde con 1 o 0. Dominio: {domain}
"""
    try:
        response = model.generate_content(prompt)
        text = response.text.strip()

        match = re.search(r'[01]', text)
        if match:
            return int(match.group(0))
        else:
            print(f"[!] Respuesta inesperada para {domain}: '{text}'")
            return -1
    except Exception as e:
        print(f"[X] Error clasificando {domain}: {e}")
        return -1


In [45]:
response = model.generate_content("Clasifica el dominio api.wunderground.com como DGA (1) o legítimo (0). Solo responde con 1 o 0.")
print(response.text.strip())


0


In [46]:
domain_df["dga_label"] = domain_df["domain"].apply(classify_domain)
domain_df["dga_label"].value_counts()


dga_label
0    174
1      3
Name: count, dtype: int64

In [47]:
errores = domain_df[domain_df["dga_label"] == -1]
print(f"Errores de clasificación: {len(errores)}")


Errores de clasificación: 0


In [None]:

dga_domains = domain_df[domain_df["dga_label"] == 1]
dga_domains_unique = dga_domains.drop_duplicates(subset=["domain"])

print("Cantidad de dominios clasificados como DGA (con posibles duplicados):", dga_domains.shape[0])
print("Cantidad de dominios DGA únicos:", dga_domains_unique.shape[0])
dga_domains_unique.head()



Cantidad de dominios clasificados como DGA (con posibles duplicados): 3
Cantidad de dominios DGA únicos: 3


Unnamed: 0,domain,domain_tld,dga_label
27,proxim.ntkrnlpa.info,ntkrnlpa.info,1
160,vtlfccmfxlkgifuf.com,vtlfccmfxlkgifuf.com,1
167,ejfodfmfxlkgifuf.xyz,ejfodfmfxlkgifuf.xyz,1


### Parte 3 – Dominio experto

In [None]:

top_1m_list = set()

with open("top-1m.csv", "r") as f:
    for line in f:
        parts = line.strip().split(",")
        if len(parts) == 2:
            _, domain = parts
            top_1m_list.add(domain.lower())


In [51]:
def is_in_top_list(tld):
    return 0 if tld.lower() in top_1m_list else 1


In [None]:
dga_domains_unique["not_in_top"] = dga_domains_unique["domain_tld"].apply(is_in_top_list)


final_suspects = dga_domains_unique[dga_domains_unique["not_in_top"] == 1].drop_duplicates(subset=["domain_tld"])

print("Dominios sospechosos que no están en la lista top 1M:", final_suspects.shape[0])
final_suspects[["domain", "domain_tld"]]


Dominios sospechosos que no están en la lista top 1M: 3


Unnamed: 0,domain,domain_tld
27,proxim.ntkrnlpa.info,ntkrnlpa.info
160,vtlfccmfxlkgifuf.com,vtlfccmfxlkgifuf.com
167,ejfodfmfxlkgifuf.xyz,ejfodfmfxlkgifuf.xyz


In [53]:
!pip install python-whois


Defaulting to user installation because normal site-packages is not writeable
Collecting python-whois
  Downloading python_whois-0.9.5-py3-none-any.whl.metadata (2.6 kB)
Downloading python_whois-0.9.5-py3-none-any.whl (104 kB)
Installing collected packages: python-whois
Successfully installed python-whois-0.9.5


In [54]:
import whois
from datetime import datetime

def get_creation_date(tld):
    try:
        w = whois.whois(tld)
        creation_date = w.creation_date
        # A veces es una lista
        if isinstance(creation_date, list):
            return creation_date[0]
        return creation_date
    except Exception as e:
        print(f"No se pudo obtener fecha para {tld}: {e}")
        return None


In [55]:
final_suspects["creation_date"] = final_suspects["domain_tld"].apply(get_creation_date)
final_suspects[["domain", "domain_tld", "creation_date"]]


No se pudo obtener fecha para vtlfccmfxlkgifuf.com: No match for "VTLFCCMFXLKGIFUF.COM".
>>> Last update of whois database: 2025-03-27T04:50:27Z <<<

NOTICE: The expiration date displayed in this record is the date the
registrar's sponsorship of the domain name registration in the registry is
currently set to expire. This date does not necessarily reflect the expiration
date of the domain name registrant's agreement with the sponsoring
registrar.  Users may consult the sponsoring registrar's Whois database to
view the registrar's reported date of expiration for this registration.

TERMS OF USE: You are not authorized to access or query our Whois
database through the use of electronic processes that are high-volume and
automated except as reasonably necessary to register domain names or
modify existing registrations; the Data in VeriSign Global Registry
Services' ("VeriSign") Whois database is provided by VeriSign for
information purposes only, and to assist persons in obtaining informa

Unnamed: 0,domain,domain_tld,creation_date
27,proxim.ntkrnlpa.info,ntkrnlpa.info,
160,vtlfccmfxlkgifuf.com,vtlfccmfxlkgifuf.com,
167,ejfodfmfxlkgifuf.xyz,ejfodfmfxlkgifuf.xyz,


In [None]:
import re

def looks_random(domain):
    return 1 if re.search(r'[bcdfghjklmnpqrstvwxyz]{5,}', domain.lower()) else 0

final_suspects["pattern_dga"] = final_suspects["domain"].apply(looks_random)
final_suspects[["domain", "domain_tld", "creation_date", "pattern_dga"]]


Unnamed: 0,domain,domain_tld,creation_date,pattern_dga
27,proxim.ntkrnlpa.info,ntkrnlpa.info,,1
160,vtlfccmfxlkgifuf.com,vtlfccmfxlkgifuf.com,,1
167,ejfodfmfxlkgifuf.xyz,ejfodfmfxlkgifuf.xyz,,1
