# Threat Hunting Lab 

## Fase 1 

In [1]:
import pandas as pd
import json
from tldextract import extract

In [2]:
# Read the event log as JSON Lines: every non-empty line is parsed as its own
# JSON document and collected into the `data` list.
file_path = 'large_eve.json'
# An explicit encoding keeps the read independent of the platform default.
with open(file_path, 'r', encoding='utf-8') as file:
    # Blank lines (for example a trailing newline) are skipped because
    # json.loads would raise on an empty string.
    data = [json.loads(line) for line in file if line.strip()]

print(f'Total de registros: {len(data)}')  # Expected: 746909

Total de registros: 746909


In [3]:
# Keep only the events whose 'event_type' field equals 'dns'.
# NOTE(review): the expected count noted below (21484) does not match the
# count printed by the recorded run (15749) -- confirm which one is right.
dns_records = list(filter(lambda event: event.get('event_type') == 'dns', data))
print(f'Total de registros DNS: {len(dns_records)}')  # Expected: 21484

# 4. Show any two records (here, the first two), pretty-printed as JSON
print(json.dumps(dns_records[:2], indent=2))

Total de registros DNS: 15749
[
  {
    "timestamp": "2017-07-22T17:33:16.661646-0500",
    "flow_id": 1327836194150542,
    "pcap_cnt": 22269,
    "event_type": "dns",
    "vlan": 110,
    "src_ip": "2001:0dbb:0c18:0011:0260:6eff:fe30:0863",
    "src_port": 59680,
    "dest_ip": "2001:0500:0001:0000:0000:0000:803f:0235",
    "dest_port": 53,
    "proto": "UDP",
    "dns": {
      "type": "query",
      "id": 15529,
      "rrname": "api.wunderground.com",
      "rrtype": "A",
      "tx_id": 0
    }
  },
  {
    "timestamp": "2017-07-22T17:33:24.990320-0500",
    "flow_id": 2022925111925872,
    "pcap_cnt": 54352,
    "event_type": "dns",
    "vlan": 110,
    "src_ip": "2001:0dbb:0c18:0011:0260:6eff:fe30:0863",
    "src_port": 38051,
    "dest_ip": "2001:0500:0003:0000:0000:0000:0000:0042",
    "dest_port": 53,
    "proto": "UDP",
    "dns": {
      "type": "query",
      "id": 58278,
      "rrname": "stork79.dropbox.com",
      "rrtype": "A",
      "tx_id": 0
    }
  }
]


In [4]:
# Flatten the DNS events into a table; nested keys such as record['dns']['rrtype']
# become dotted column names ('dns.rrtype'), which the later cells rely on.
df = pd.json_normalize(data=dns_records)
print(f'Shape del DataFrame: {df.shape}')

Shape del DataFrame: (15749, 18)


In [6]:
# Restrict the DNS events to the rows whose record type is 'A'.
df_a = df.loc[df['dns.rrtype'].eq('A')]
print(f'Total de registros DNS tipo A: {len(df_a)}')

Total de registros DNS tipo A: 2849


In [7]:
# Column that holds the DNS record name; adjust it to match the data.
domain_column = 'dns.rrname'
# Drop missing names first, then keep each distinct name once.
unique_domains = pd.unique(df_a[domain_column].dropna())
print(f'Total de dominios únicos: {len(unique_domains)}')

Total de dominios únicos: 177


In [8]:
def get_tld(domain):
    """Reduce a host name to its "domain.suffix" form.

    Despite the name, this does not return only the top-level domain: it
    returns the `domain` and `suffix` parts reported by `extract`, joined with
    a dot (e.g. 'stork79.dropbox.com' -> 'dropbox.com').

    Args:
        domain: Host name to reduce.

    Returns:
        "<domain>.<suffix>" when `extract` reports a suffix; otherwise just the
        `domain` part (e.g. 'safebrowsing.clients.google.com.home' -> 'home').
    """
    parts = extract(domain)
    # No recognised suffix: there is nothing to append to the domain label.
    if not parts.suffix:
        return parts.domain
    return ".".join((parts.domain, parts.suffix))

In [None]:
# One row per unique name; 'domain_tld' holds the reduced "domain.suffix" form
# computed by get_tld for each of them.
df_domains = pd.DataFrame({'domain': unique_domains})
df_domains['domain_tld'] = df_domains['domain'].apply(get_tld)

In [10]:
# Write the domain table to disk (without the index column), then preview
# its first rows.
df_domains.to_csv(path_or_buf='filtered_domains.csv', index=False)
print(df_domains.head())


                                       domain        domain_tld
0                        api.wunderground.com  wunderground.com
1                         stork79.dropbox.com       dropbox.com
2  hpca-tier2.office.aol.com.ad.aol.aoltw.net         aoltw.net
3        safebrowsing.clients.google.com.home              home
4                         fxfeeds.mozilla.com       mozilla.com


In [24]:
# Summary statistics for the domain table.
# NOTE(review): the recorded output summarises 'dga_label', a column that is
# only added by the Fase 2 labelling cell further down, so this cell was run
# out of order; on a fresh top-to-bottom run that column does not exist yet.
df_domains.describe()

Unnamed: 0,dga_label
count,177.0
mean,-0.960452
std,0.268869
min,-1.0
25%,-1.0
50%,-1.0
75%,-1.0
max,1.0


## Fase 2

In [20]:
import os
import re
from getpass import getpass

import google.generativeai as genai

# Never store the API key in the notebook: read it from the GOOGLE_API_KEY
# environment variable and fall back to an interactive prompt when it is unset.
# NOTE(review): the key that used to be hardcoded in this cell must be treated
# as leaked and should be revoked/rotated.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY") or getpass("Google API key: ")
genai.configure(api_key=GOOGLE_API_KEY)

# Shared model handle used by classify_domain and the cells below.
model = genai.GenerativeModel("models/gemini-1.5-pro")

def classify_domain(domain):
    """Ask the Gemini model whether ``domain`` looks DGA-generated.

    Args:
        domain: Domain name to classify; it is interpolated into the prompt.

    Returns:
        int: 1 when the reply says DGA, 0 when it says legitimate, and -1 when
        the reply contains no stand-alone 0/1 or the request raised an
        exception. A diagnostic line is printed in both -1 cases.
    """
    # Ask explicitly for a bare digit so the reply can be parsed reliably.
    prompt = f"""
¿El dominio {domain} es generado por un algoritmo DGA (1) o legítimo (0)? Solo responde con 1 o 0.
"""
    try:
        response = model.generate_content(prompt)
        text = response.text.strip()

        # Accept only a stand-alone 0 or 1. Digits glued to letters, digits,
        # hyphens or dots (e.g. the "10" in an echoed "stork10.dropbox.com")
        # belong to other tokens and must not be read as the label.
        match = re.search(r'(?<![\w.-])[01](?![\w-]|\.\w)', text)
        if match:
            return int(match.group(0))
        print(f"[!] Respuesta inesperada para {domain}: '{text}'")
        return -1
    except Exception as e:
        # Best effort: one failed request must not abort labelling the whole
        # list, so report it and return the -1 sentinel instead.
        print(f"[X] Error clasificando {domain}: {e}")
        return -1

In [21]:
# Smoke-test the model with a single domain before labelling the full list.
response = model.generate_content(
    "Clasifica el dominio api.wunderground.com como DGA (1) o legítimo (0). "
    "Solo responde con 1 o 0."
)
print(response.text.strip())

0


In [None]:
# Label every domain through classify_domain (one model request per row) and
# show how many rows received each label; -1 is classify_domain's failure value.
df_domains["dga_label"] = df_domains["domain"].map(classify_domain)
df_domains["dga_label"].value_counts()