# Laboratorio 5
**Threat Hunting**

## Part 1

In [151]:
import json
import pandas as pd
import re
from urllib.parse import urlparse

In [152]:
# The file is parsed as newline-delimited JSON: one object per line.
# Read it as UTF-8 explicitly (the platform default encoding may differ) and
# skip blank lines, which json.loads would reject with a JSONDecodeError.
with open('large_eve.json', 'r', encoding='utf-8') as file:
    records = [json.loads(line) for line in file if line.strip()]
print(f"Total records: {len(records)}")

Total records: 746909


In [153]:
# Keep only the events whose 'event_type' field equals 'dns'; .get() returns
# None for records that lack the key, so they are simply filtered out.
dns_records = list(
    filter(lambda event: event.get('event_type') == 'dns', records)
)
print(f"DNS records: {len(dns_records)}")

DNS records: 15749


In [154]:
# NOTE(review): the label says "random", but the slice below always shows the
# first two DNS records in file order.
print("\nTwo random DNS records:")
for dns_event in dns_records[:2]:
    # Pretty-print each event as indented JSON
    print(json.dumps(dns_event, indent=2))


Two random DNS records:
{
  "timestamp": "2017-07-22T17:33:16.661646-0500",
  "flow_id": 1327836194150542,
  "pcap_cnt": 22269,
  "event_type": "dns",
  "vlan": 110,
  "src_ip": "2001:0dbb:0c18:0011:0260:6eff:fe30:0863",
  "src_port": 59680,
  "dest_ip": "2001:0500:0001:0000:0000:0000:803f:0235",
  "dest_port": 53,
  "proto": "UDP",
  "dns": {
    "type": "query",
    "id": 15529,
    "rrname": "api.wunderground.com",
    "rrtype": "A",
    "tx_id": 0
  }
}
{
  "timestamp": "2017-07-22T17:33:24.990320-0500",
  "flow_id": 2022925111925872,
  "pcap_cnt": 54352,
  "event_type": "dns",
  "vlan": 110,
  "src_ip": "2001:0dbb:0c18:0011:0260:6eff:fe30:0863",
  "src_port": 38051,
  "dest_ip": "2001:0500:0003:0000:0000:0000:0000:0042",
  "dest_port": 53,
  "proto": "UDP",
  "dns": {
    "type": "query",
    "id": 58278,
    "rrname": "stork79.dropbox.com",
    "rrtype": "A",
    "tx_id": 0
  }
}


In [155]:
# Flatten the DNS events into a table with one row per event; nested fields
# become dotted column names (e.g. 'dns.rrtype', which later cells filter on).
df = pd.json_normalize(data=dns_records)
print(f"\nDataFrame shape: {df.shape}")


DataFrame shape: (15749, 18)


In [156]:
# Keep only the rows whose DNS record type is 'A'
df_type_a = df.loc[df['dns.rrtype'] == 'A']
print(f"\nType A records: {len(df_type_a)}")


Type A records: 2849


In [157]:
# Distinct queried names ('dns.rrname') among the type A records
unique_domains = df_type_a.loc[:, 'dns.rrname'].unique()
print(f"\nUnique domains: {len(unique_domains)}")


Unique domains: 177


In [158]:
def is_common_tld(tld):
    """
    Tell whether ``tld`` is one of a fixed list of well-known TLD labels.

    The comparison is case-insensitive; ``tld`` must be a string.
    """
    normalized = tld.lower()
    return normalized in (
        'com', 'org', 'net', 'edu', 'gov',
        'mil', 'io', 'co', 'uk', 'ru',
        'us', 'eu', 'de', 'fr', 'jp',
    )

def get_tld(domain):
    """
    Return the suffix used to group a domain name.

    Despite the name, this is usually NOT the bare TLD:
      * one or two labels            -> the cleaned name itself ('example.com')
      * more than three labels whose
        last label is not a common
        TLD (e.g. '...google.com.home') -> that last label only ('home')
      * anything else                -> the last two labels ('wunderground.com')

    The input may be a bare name or a full URL. It is normalised first:
    scheme, userinfo, port and path are dropped, surrounding whitespace and a
    trailing root dot are removed, the name is lower-cased (DNS names are
    case-insensitive) and a leading 'www.' is stripped.

    Returns '' for empty or non-string input.

    source: originally generated with Claude Sonnet 3.7
    prompt: generate a function that extracts the TLD from a domain
    """
    # Empty or non-string input has no labels to extract
    if not domain or not isinstance(domain, str):
        return ''

    # Full URL: keep the host only. Unlike netloc, hostname excludes userinfo
    # and port; it is None when the URL has no host at all.
    if '://' in domain:
        domain = urlparse(domain).hostname or ''

    # Drop any path, then normalise whitespace, the trailing root dot and case
    clean_domain = domain.split('/')[0].strip().rstrip('.').lower()

    # Strip a leading www label
    clean_domain = re.sub(r'^www\.', '', clean_domain)

    # Split the name into its dot-separated labels
    parts = clean_domain.split('.')

    # A single label or a name like example.com is returned as-is
    if len(parts) <= 2:
        return clean_domain

    # A long name ending in a custom suffix (e.g. google.com.home): return
    # just that suffix
    if len(parts) > 3 and not is_common_tld(parts[-1]):
        return parts[-1]

    # Otherwise (e.g. api.wunderground.com) return the last two labels
    return f"{parts[-2]}.{parts[-1]}"

In [159]:
# One row per unique domain, paired with the grouping suffix from get_tld
df_tld = pd.DataFrame({
    'domain': unique_domains,
    'domain_tld': [get_tld(name) for name in unique_domains],
})
print("\nFinal DataFrame with TLDs:")
print(df_tld.head())


Final DataFrame with TLDs:
                                       domain        domain_tld
0                        api.wunderground.com  wunderground.com
1                         stork79.dropbox.com       dropbox.com
2  hpca-tier2.office.aol.com.ad.aol.aoltw.net         aoltw.net
3        safebrowsing.clients.google.com.home              home
4                         fxfeeds.mozilla.com       mozilla.com


## Part 2

In [160]:
import google.generativeai as genai
import os
from dotenv import load_dotenv

In [161]:
# Load environment variables via python-dotenv (presumably from a local .env
# file) so the API key does not have to be hard-coded in the notebook
load_dotenv()

# The key is read from GOOGLE_API_KEY; .get() yields None when it is unset
genai.configure(api_key=os.environ.get('GOOGLE_API_KEY'))

# Shared model handle used by classify_domain
model = genai.GenerativeModel(model_name='gemini-2.0-flash')

In [162]:
def classify_domain(domain):
    """
    Classify a domain as DGA (1) or legitimate (0) using Gemini.

    Args:
        domain: Domain name interpolated into the classification prompt.

    Returns:
        1 when the model answers DGA, 0 when it answers legitimate, and -1
        when the request fails or the reply is not a single 0/1 label. The
        failure is printed rather than raised, so a batch run keeps going.
    """
    prompt = f"""You are a domain security expert. Analyze this domain name and classify it as DGA (Domain Generation Algorithm) or legitimate.
    
    Domain: {domain}
    
    DGA Indicators (if ANY of these are present, classify as DGA):
    1. Random character sequences (e.g., 'x7k9m2p4')
    2. Unusual character combinations (e.g., 'qwerty123', 'abc123')
    3. Lack of meaningful words or brand names
    4. Unusual length (very long or very short)
    5. Suspicious patterns in subdomains
    6. Repetitive patterns (e.g., 'aaa', '111')
    7. Mixed case usage (e.g., 'aBcDeF')
    8. Numbers mixed randomly with letters
    9. Unusual TLD combinations
    10. Subdomains that look like random strings
    
    Examples of DGA domains:
    - x7k9m2p4.example.com
    - abc123xyz.net
    - qwertyuiop.asia
    - 1234567890.org
    - xysad.google.com
    - xysad.microsoft.com
    - xysad.amazon.com
    - xysad.github.com
    
    Examples of legitimate domains:
    - google.com
    - microsoft.com
    - amazon.com
    - github.com
    
    IMPORTANT:
    - If ANY subdomain shows DGA characteristics, classify the entire domain as DGA
    - Even if the TLD is legitimate, suspicious subdomains indicate DGA
    - Respond with ONLY '1' for DGA or '0' for legitimate
    - Be conservative: if in doubt, classify as DGA
    """
    
    try:
        response = model.generate_content(prompt)
        # Accept the label even when wrapped in whitespace, quotes or
        # punctuation ("'1'", "1."), but reject anything that is not exactly
        # one 0/1 digit so stray integers such as 2 or 10 never become labels
        label = re.fullmatch(r"\W*([01])\W*", response.text)
        if label is None:
            raise ValueError(f"unexpected model response: {response.text!r}")
        return int(label.group(1))
    except Exception as e:
        # Best effort: report the failure and return the -1 sentinel
        print(f"Error classifying {domain}: {str(e)}")
        return -1

In [177]:
# Classification results are cached in domain_tld_dga.csv.
# This workaround is needed because the dataframe is too large to be processed
# in one go: domains still marked -1 are retried on the next execution.
if os.path.exists('domain_tld_dga.csv'):
    # Resume from the results saved by a previous run
    df_tld = pd.read_csv('domain_tld_dga.csv')

# First run (no cache yet) or a cache without labels: mark every domain as
# unclassified instead of failing later on the missing 'is_dga' column
if 'is_dga' not in df_tld.columns:
    df_tld['is_dga'] = -1

# Create a mask for unclassified domains (is_dga = -1)
unclassified_mask = df_tld['is_dga'] == -1

# Only classify domains that haven't been classified yet
if unclassified_mask.any():
    df_tld.loc[unclassified_mask, 'is_dga'] = df_tld.loc[unclassified_mask, 'domain'].apply(classify_domain)
    # Save the dataframe to a csv file so the next run can pick up from here
    df_tld.to_csv('domain_tld_dga.csv', index=False)

In [178]:
# Rows labelled as DGA (is_dga == 1)
dga_domains = df_tld.loc[df_tld['is_dga'] == 1]
print("\nDGA Domains (after removing duplicates):")
print(dga_domains[['domain', 'domain_tld']].drop_duplicates())
# NOTE(review): the count below de-duplicates over all columns, whereas the
# listing above de-duplicates on (domain, domain_tld) only.
print(f"\nTotal unique DGA domains: {len(dga_domains.drop_duplicates())}")


DGA Domains (after removing duplicates):
                                             domain     domain_tld
2        hpca-tier2.office.aol.com.ad.aol.aoltw.net            net
6                        aolmtcmxm03.office.aol.com            com
7       aolmtcmxm02.office.aol.com.ad.aol.aoltw.net            net
8                        aolmtcmxm02.office.aol.com            com
10      aolmtcmxm03.office.aol.com.ad.aol.aoltw.net            net
11                       aolmtcmxm04.office.aol.com            com
15      aolmtcmxm04.office.aol.com.ad.aol.aoltw.net            net
18                         192.168.22.110phpmyadmin  110phpmyadmin
24             192.168.22.110phpmyadmin.localdomain    localdomain
27                             proxim.ntkrnlpa.info           info
32                      AOLDTCMA04.ad.aol.aoltw.net            net
42                tools.google.com.ad.aol.aoltw.net            net
43   safebrowsing.clients.google.com.hackerlabs.vpn            vpn
54      secure.infor