# Norway Analysis

In [1]:
# Enable IPython's autoreload extension in mode 2 so edited modules are
# re-imported automatically before cells run.
%load_ext autoreload
%autoreload 2

In [2]:
import collections
import datetime as dt
import enum
import gzip
import ipaddress
import itertools
import json
import os
import posixpath
from pprint import pprint
import shutil
import sys
import tempfile
from typing import Iterable, NamedTuple, Tuple, Union
from urllib.parse import urlparse

import pandas as pd
import ujson
import IPython.display as idisplay
import ipywidgets as widgets

# Install net-analysis on the fly when it cannot be imported. The shell
# commands run pip through sys.executable, i.e. the interpreter of this kernel.
try:
    import netanalysis
except ModuleNotFoundError:
    !{sys.executable} -m ensurepip
    !{sys.executable} -m pip install --upgrade git+https://github.com/Jigsaw-Code/net-analysis.git

import netanalysis.ooni.bucket as ob
import netanalysis.ooni.analysis.dns as od
from netanalysis.ooni.measurement import Measurement


# Bucket client used by later cells to list (ooni.list_files) and fetch
# (ooni.get_file) measurement files.
ooni = ob.Bucket()

## Fetch and save measurement data

In [6]:
import pathlib

# Keep downloads under /kaggle/working when that directory exists; otherwise
# fall back to the current user's home directory.
DATA_DIR = pathlib.Path('/kaggle/working')
if not DATA_DIR.is_dir():
    DATA_DIR = pathlib.Path.home()
DATA_DIR /= 'ooni_data'
# Country code: passed to the file listing and used as a sub-directory of DATA_DIR.
COUNTRY = 'NO'
# NOTE(review): the end date is dt.date.today(), so the resulting list depends
# on the day this cell is executed.
file_list = list(ooni.list_files(dt.date(2020, 1, 1), dt.date.today(), 'webconnectivity', COUNTRY))

In [7]:
# Print stats
def calculate_data_size(files: Iterable[ob.FileEntry]) -> Tuple[int, collections.Counter]:
    """Add up the sizes of `files`, overall and per test type.

    Args:
        files: entries exposing `size` and `test_type` attributes. The
            iterable is consumed in a single pass.

    Returns:
        A `(grand_total, per_type)` tuple where `per_type` is a Counter
        mapping each `test_type` to the summed `size` of its entries.
    """
    grand_total = 0
    per_type = collections.Counter()
    for file_entry in files:
        entry_size = file_entry.size
        per_type[file_entry.test_type] += entry_size
        grand_total += entry_size
    return (grand_total, per_type)
total_size, type_size = calculate_data_size(file_list)
# Per-test-type byte counts, largest first.
for test_type, size in type_size.most_common():
    print(f'{test_type}: {size:,}')
print(f'================\nData size: {total_size:,} bytes')
# Rough cost estimate: total bytes converted to GiB (2**30 bytes) times the per-GiB price.
data_cost = 0.09 * total_size / 2**30  # $0.09 per GiB
print(f'Download cost: ${data_cost:.6f}')
# Rough transfer time: bytes * 8 gives bits, divided by the link speed in bits per second.
print(f'Download time: {total_size / 85000000 * 8:.2f}s @ 85 Mbps, {total_size / 10000000 * 8:.2f}s @ 10 Mbps')

webconnectivity: 38,263,818
Data size: 38,263,818 bytes
Download cost: $0.003207
Download time: 3.60s @ 85 Mbps, 30.61s @ 10 Mbps


In [8]:
%%time
# Download files
# Each remote file is streamed, every measurement is trimmed with
# ob.trim_measurement, and the result is stored as gzipped JSON lines under
# DATA_DIR/COUNTRY/<test_type>/. Files that already exist locally are skipped.
for entry in file_list:
    local_filename = os.path.join(DATA_DIR, COUNTRY, entry.test_type, posixpath.basename(entry.filename))
    idisplay.clear_output(wait=True)
    if os.path.isfile(local_filename):
        print(f'Skipping {entry.filename}')
        continue
    print(f'Downloading {entry.filename}')
    os.makedirs(os.path.dirname(local_filename), exist_ok=True)
    # Write to a sibling ".partial" file and rename it only once the download
    # has finished. Otherwise an interrupted run would leave a truncated file
    # at local_filename, and the isfile() check above would skip it forever.
    partial_filename = local_filename + '.partial'
    try:
        with gzip.open(partial_filename, mode='wt', encoding='utf-8') as local_file:
            with ooni.get_file(entry.filename) as remote_file:
                with gzip.GzipFile(fileobj=remote_file, mode='r') as input_file:
                    for line in input_file:
                        # TODO: parallelize IO and CPU
                        measurement = ujson.loads(line)
                        ujson.dump(ob.trim_measurement(measurement,  1000), local_file)
                        local_file.write('\n')
        # Same directory, so the rename is atomic on the local filesystem.
        os.replace(partial_filename, local_filename)
    except BaseException:
        # Also covers KeyboardInterrupt (kernel interrupt): drop the incomplete
        # file so it is not picked up later, then re-raise unchanged.
        if os.path.exists(partial_filename):
            os.remove(partial_filename)
        raise

Downloading raw/20201118/23/NO/webconnectivity/2020111823_NO_webconnectivity.n0.0.jsonl.gz
CPU times: user 3.41 s, sys: 172 ms, total: 3.58 s
Wall time: 19.9 s


In [9]:
# Shell out to `du` to show the on-disk size of each directory under DATA_DIR.
!du -h $DATA_DIR

1.9M	/Users/fortuna/ooni_data/NO/webconnectivity
1.9M	/Users/fortuna/ooni_data/NO
8.0K	/Users/fortuna/ooni_data/CU/sniblocking
 40K	/Users/fortuna/ooni_data/CU/dnscheck
112K	/Users/fortuna/ooni_data/CU/facebookmessenger
 12K	/Users/fortuna/ooni_data/CU/httpinvalidrequestline
1.5M	/Users/fortuna/ooni_data/CU/webconnectivity
 20K	/Users/fortuna/ooni_data/CU/psiphon
 60K	/Users/fortuna/ooni_data/CU/tor
112K	/Users/fortuna/ooni_data/CU/telegram
 12K	/Users/fortuna/ooni_data/CU/dash
4.0K	/Users/fortuna/ooni_data/CU/urlgetter
1.1M	/Users/fortuna/ooni_data/CU/ndt
 12K	/Users/fortuna/ooni_data/CU/httpheaderfieldmanipulation
160K	/Users/fortuna/ooni_data/CU/whatsapp
3.1M	/Users/fortuna/ooni_data/CU
5.0M	/Users/fortuna/ooni_data


In [12]:
def get_local_measurements(directory: str):
    """Lazily yield every measurement stored in `directory`.

    Each entry returned by `os.scandir(directory)` is opened with gzip and
    read line by line; every line is decoded with `ujson.loads` and yielded.
    Entries are visited in the order `os.scandir` produces them.
    """
    with os.scandir(directory) as dir_entries:
        for dir_entry in dir_entries:
            with gzip.open(dir_entry.path, 'r') as jsonl_file:
                yield from map(ujson.loads, jsonl_file)

def is_ip(hostname):
    """Return True when `hostname` is an IPv4 or IPv6 address literal.

    Args:
        hostname: the value to test, typically a URL's host part.

    Returns:
        True if `ipaddress.ip_address` accepts `hostname`, False if it raises
        ValueError (for example for a regular domain name).
    """
    try:
        # Parse the argument itself. The previous code read a global named
        # `domain`, so the result depended on notebook state, not on the input.
        ipaddress.ip_address(hostname)
        return True
    except ValueError:
        return False

# DNS Analysis

In [13]:
evaluator: od.Evaluator = od.Evaluator()
rows = []
# First pass: register every measurement with the evaluator as control data
# and collect its DNS observations as plain dicts.
for measurement in get_local_measurements(os.path.join(DATA_DIR, COUNTRY, 'webconnectivity')):
    m = Measurement(measurement)
    domain = m.hostname
    # Measurements whose target is an IP literal are skipped.
    if is_ip(domain):
        continue
    # if domain == 'www.netflix.com':
    #     pprint(measurement)
    evaluator.add_control(m)
    rows.extend([o._asdict() for o in od.get_observations(m)])

# Second pass: evaluate each observation only after all controls were added above.
for row in rows:
    evaluation = evaluator.evaluate(row['domain'], row['status'], row['answers'])
    row['eval'] = evaluation
    # Evaluations whose label starts with 'BAD' are counted as interference.
    row['has_interference'] = 1 if evaluation.startswith('BAD') else 0

# One row per observation, newest first.
obs = pd.DataFrame.from_records(rows).sort_values(by='time', ascending=False)
del(rows)
obs

Unnamed: 0,time,client_country,client_asn,resolver_ip,resolver_asn,domain,query_type,failure,status,answers,explorer_url,eval,has_interference
2906,2020-11-18 23:58:52+00:00,NO,28795,127.0.0.2,2116,www.dw-world.de,A,,OK,"[92.123.155.42, 92.123.155.65]",https://explorer.ooni.org/measurement/20201118...,INCONCLUSIVE_CHECK_IPS,0
2903,2020-11-18 23:58:46+00:00,NO,28795,127.0.0.2,2116,www.ahram.org.eg,A,,OK,[196.219.246.52],https://explorer.ooni.org/measurement/20201118...,OK_MATCHES_CONTROL_IP,0
2897,2020-11-18 23:58:45+00:00,NO,28795,127.0.0.2,2116,www.gatesfoundation.org,A,,OK,[104.110.7.127],https://explorer.ooni.org/measurement/20201118...,INCONCLUSIVE_CHECK_IPS,0
2894,2020-11-18 23:58:43+00:00,NO,28795,127.0.0.2,2116,www.scribd.com,A,,OK,[151.101.238.152],https://explorer.ooni.org/measurement/20201118...,INCONCLUSIVE_CHECK_IPS,0
2892,2020-11-18 23:58:41+00:00,NO,28795,127.0.0.2,2116,www.dit-inc.us,A,,OK,[65.49.38.217],https://explorer.ooni.org/measurement/20201118...,OK_MATCHES_CONTROL_IP,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1439,2020-10-23 11:14:51+00:00,NO,12929,127.0.0.2,12929,www.mainichi.co.jp,A,,OK,"[143.204.55.109, 143.204.55.37, 143.204.55.111...",https://explorer.ooni.org/measurement/20201023...,INCONCLUSIVE_CHECK_IPS,0
1429,2020-10-23 11:14:49+00:00,NO,12929,127.0.0.2,12929,tinyurl.com,A,,OK,"[104.20.139.65, 104.20.138.65, 172.67.1.225]",https://explorer.ooni.org/measurement/20201023...,OK_MATCHES_CONTROL_IP,0
1421,2020-10-23 11:14:47+00:00,NO,12929,127.0.0.2,12929,sci-hub.se,A,,OK,[186.2.163.219],https://explorer.ooni.org/measurement/20201023...,OK_MATCHES_CONTROL_IP,0
1408,2020-10-23 11:14:46+00:00,NO,12929,127.0.0.2,12929,www.nbcnewyork.com,A,,OK,[104.75.77.45],https://explorer.ooni.org/measurement/20201023...,INCONCLUSIVE_CHECK_IPS,0


In [14]:
# Domains ranked by the number of observations flagged as interference.
top_domains = obs.groupby(by='domain')['has_interference'].sum().sort_values(ascending=False)
# Observation counts per client AS, per DNS status and per evaluation label.
top_ases = obs.value_counts(subset='client_asn')
top_statuses = obs.value_counts(subset='status')
top_evals = obs.value_counts(subset='eval')

# Render the four summaries side by side, one Output widget per summary.
display_columns = []
for d in [top_domains[:30], top_ases, top_statuses, top_evals]:
    w = widgets.Output(layout=widgets.Layout(margin='10px'))
    w.append_display_data(d)
    display_columns.append(w)
widgets.HBox(display_columns, layout=widgets.Layout(background='black'))

HBox(children=(Output(layout=Layout(margin='10px'), outputs=({'output_type': 'display_data', 'data': {'text/pl…

In [15]:
# Count observations (aggfunc=len) per domain (rows) and per (client_asn, eval)
# pair (columns); combinations with no observations become 0.
pivot = pd.pivot_table(obs, index=['domain'], columns=['client_asn', 'eval'], values=[], aggfunc=len, fill_value=0)
# NOTE(review): the disabled line below refers to `top_sites`, which is not defined in the cells shown here.
# pivot = pivot.reindex(top_sites.index).sort_index(axis='columns', level=1, ascending=False, key=lambda l: top_statuses[l])[top_ases.index]
# Order rows like top_domains and expand the columns to every (AS, eval)
# combination in top_ases x top_evals order, filling missing columns with 0.
pivot = pivot.reindex(top_domains.index).reindex(pd.MultiIndex.from_product([top_ases.index, top_evals.index]), axis='columns', fill_value=0)
# Show at most the first 100 rows and 24 columns.
with pd.option_context('display.max_rows', 150, 'display.max_columns', 24):
    display(pivot.iloc[:100, :24])


client_asn,2119,2119,2119,2119,2119,2119,2119,2119,9009,9009,9009,9009,9009,9009,9009,9009,28795,28795,28795,28795,28795,28795,28795,28795
eval,OK_MATCHES_CONTROL_IP,INCONCLUSIVE_CHECK_IPS,OK_MATCHES_CONTROL_ERROR,INCONCLUSIVE_BAD_CONTROL,BAD_NON_GLOBAL_IP,BAD_STATUS_NXDOMAIN,BAD_STATUS_generic_timeout_error,BAD_STATUS_dns_resolver_error,OK_MATCHES_CONTROL_IP,INCONCLUSIVE_CHECK_IPS,OK_MATCHES_CONTROL_ERROR,INCONCLUSIVE_BAD_CONTROL,BAD_NON_GLOBAL_IP,BAD_STATUS_NXDOMAIN,BAD_STATUS_generic_timeout_error,BAD_STATUS_dns_resolver_error,OK_MATCHES_CONTROL_IP,INCONCLUSIVE_CHECK_IPS,OK_MATCHES_CONTROL_ERROR,INCONCLUSIVE_BAD_CONTROL,BAD_NON_GLOBAL_IP,BAD_STATUS_NXDOMAIN,BAD_STATUS_generic_timeout_error,BAD_STATUS_dns_resolver_error
domain,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2
astalavista.box.sk,1,0,0,0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,0,0,0,0,0
www.blackhat.be,2,0,0,0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,0,0,0,0,0
www.darpa.mil,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
occupystreams.org,1,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
www.ahram.org.eg,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
doubleclick.net,0,2,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
webmail.sso.bluewin.ch,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
w3schools.com,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
warc.jalb.de,0,0,2,1,0,0,0,0,0,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0
web.archive.org,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [16]:
# Show every observation for one domain; the constant 'domain' column is dropped.
obs[obs['domain'] == 'www.match.com'].drop(columns=['domain'])

Unnamed: 0,time,client_country,client_asn,resolver_ip,resolver_asn,query_type,failure,status,answers,explorer_url,eval,has_interference
1448,2020-10-23 11:51:53+00:00,NO,2119,127.0.0.2,2119,A,,OK,[62.23.30.26],https://explorer.ooni.org/measurement/20201023...,INCONCLUSIVE_CHECK_IPS,0


In [20]:
# Count observations per (client AS, resolver AS, resolver IP) combination.
obs[['client_asn', 'resolver_asn', 'resolver_ip']].value_counts()

client_asn  resolver_asn  resolver_ip    
8048        8048          201.249.172.70     7215
                          201.249.172.77     5821
                          201.249.172.74     5781
                          201.249.172.71     5147
                          127.0.0.2          4603
                          201.249.172.76     3294
                          201.249.172.72     2951
                          201.249.172.75     1470
            15169         127.0.0.2          1443
21826       15169         74.125.77.75       1437
8048        8048          201.249.215.5      1437
21826       15169         172.253.242.105    1437
8048        8048          201.249.215.8      1437
                          201.249.215.6      1435
                          201.249.172.78      277
            13335         108.162.213.64       79
21826       15169         172.253.242.39       38
                          74.125.77.72         17
                          172.253.242.37       17
8048    

In [21]:
# Count observations per (client AS, resolver AS, DNS status) combination.
obs[['client_asn', 'resolver_asn', 'status']].value_counts()

client_asn  resolver_asn  status               
8048        8048          OK                       39648
21826       15169         OK                        2901
8048        15169         OK                        1414
            8048          NXDOMAIN                   736
                          generic_timeout_error      250
                          SERVFAIL                   233
            13335         OK                          95
21826       15169         NXDOMAIN                    34
8048        15169         NXDOMAIN                    17
                          SERVFAIL                    12
21826       15169         SERVFAIL                    11
                          generic_timeout_error       11
6306        6306          OK                           7
11562       13335         OK                           6
28007       3356          OK                           2
8048        0             SERVFAIL                     1
                          OK            

In [352]:
# Manual check: ask the resolver at 201.249.172.77 directly for this domain.
!dig @201.249.172.77 eldolarparalelo.info


; <<>> DiG 9.10.6 <<>> @201.249.172.77 eldolarparalelo.info
; (1 server found)
;; global options: +cmd
;; Got answer:
;; ->>HEADER<<- opcode: QUERY, status: SERVFAIL, id: 57965
;; flags: qr rd; QUERY: 1, ANSWER: 0, AUTHORITY: 0, ADDITIONAL: 1

;; OPT PSEUDOSECTION:
; EDNS: version: 0, flags:; udp: 4096
;; QUESTION SECTION:
;eldolarparalelo.info.		IN	A

;; Query time: 277 msec
;; SERVER: 201.249.172.77#53(201.249.172.77)
;; WHEN: Sat Nov 14 02:29:30 EST 2020
;; MSG SIZE  rcvd: 49



In [353]:
# Same resolver queried for example.com, for comparison with the previous lookup.
!dig @201.249.172.77 example.com


; <<>> DiG 9.10.6 <<>> @201.249.172.77 example.com
; (1 server found)
;; global options: +cmd
;; Got answer:
;; ->>HEADER<<- opcode: QUERY, status: REFUSED, id: 50572
;; flags: qr rd; QUERY: 1, ANSWER: 0, AUTHORITY: 0, ADDITIONAL: 1

;; OPT PSEUDOSECTION:
; EDNS: version: 0, flags:; udp: 4096
;; QUESTION SECTION:
;example.com.			IN	A

;; Query time: 72 msec
;; SERVER: 201.249.172.77#53(201.249.172.77)
;; WHEN: Sat Nov 14 02:29:44 EST 2020
;; MSG SIZE  rcvd: 40



In [21]:
# Distinct explorer URLs of the observations recorded for one domain.
set(obs[obs['domain'] == 'www.hrw.org']['explorer_url'].to_list())

{'https://explorer.ooni.org/measurement/20201023T114501Z_webconnectivity_NO_2119_n1_OosRi3o9udg6Ou2M?input=https%3A%2F%2Fwww.hrw.org%2F'}

In [8]:
class DomainIpValidator:
    """Checks DNS answers for a domain against previously recorded values.

    Recorded values are either IP address objects or strings. String values
    are treated as further names to look up (CNAME-style indirection) when
    searching for a match.
    """

    class Result(enum.IntEnum):
        UNKNOWN = 0
        OK_MATCHES_CONTROL = 1
        INVALID_NOT_GLOBAL = 2

    def __init__(self):
        # name -> set of recorded values (IP objects or strings).
        self._name_value = collections.defaultdict(set)
        # name -> set of recorded failure values.
        self._name_failure = collections.defaultdict(set)

    def add_answer(self, name: str, value: Union[str, ipaddress.IPv4Address, ipaddress.IPv6Address]):
        """Record `value` as a known answer for `name`."""
        self._name_value[name].add(value)

    def add_failure(self, domain, failure):
        """Record `failure` as a known failure for `domain`."""
        self._name_failure[domain].add(failure)

    def is_valid_domain_ip(self, domain: str, test_value: Union[str, ipaddress.IPv4Address, ipaddress.IPv6Address]):
        """Classify `test_value` as an answer for `domain`.

        Returns:
            Result.INVALID_NOT_GLOBAL if `test_value` is an IP address object
            that is not global; Result.OK_MATCHES_CONTROL if it equals a value
            recorded for `domain` or for any name reachable through recorded
            string values; Result.UNKNOWN otherwise.
        """
        if isinstance(test_value, (ipaddress.IPv4Address, ipaddress.IPv6Address)):
            if not test_value.is_global:
                return DomainIpValidator.Result.INVALID_NOT_GLOBAL
        # Walk the name graph iteratively with a visited set, so a cyclic or
        # self-referential chain (a -> b -> a) terminates instead of recursing
        # until RecursionError.
        visited = set()
        pending = [domain]
        while pending:
            name = pending.pop()
            if name in visited:
                continue
            visited.add(name)
            # .get() avoids the defaultdict inserting an empty set for every
            # name that is merely looked up.
            for valid_value in self._name_value.get(name, ()):
                if test_value == valid_value:
                    return DomainIpValidator.Result.OK_MATCHES_CONTROL
                if isinstance(valid_value, str):
                    pending.append(valid_value)
        return DomainIpValidator.Result.UNKNOWN


# Collect control
# Feed the validator with the control DNS result of every local measurement:
# control failures go to add_failure, control addresses go to add_answer.
domain_ip_validator = DomainIpValidator()
for m in get_local_measurements(os.path.join(DATA_DIR, COUNTRY, 'webconnectivity')):
    # Here `m` is the raw measurement dict; the domain is the host of its input URL.
    domain = urlparse(m['input']).hostname
    if is_ip(domain):
        # Skip entries like "1.1.1.1", "8.8.8.8", etc.
        continue
    try:
        dns_control = m['test_keys']['control']['dns']
    except (KeyError, TypeError):
        # Capture TypeError because some keys have value None.
        continue
    failure = dns_control.get('failure')
    if failure:
        domain_ip_validator.add_failure(domain, failure)
        continue
    if not dns_control.get('addrs'):
        continue
    for hostname in dns_control['addrs']:
        try:
            # Entries that parse as IP addresses are stored as address objects.
            ip = ipaddress.ip_address(hostname)
            domain_ip_validator.add_answer(domain, ip)
        except ValueError:
            # Anything else is stored as a string, which the validator treats as a name.
            # TODO: Also add cname -> IP
            domain_ip_validator.add_answer(domain, hostname)

In [46]:
def evaluate_observation(o):
    """Label one observation row using the notebook-level `domain_ip_validator`.

    Returns `o.failure` when it is truthy. Otherwise returns the name of the
    first non-UNKNOWN validator result among `o.answers`, or 'UNKNOWN' when no
    answer yields a conclusive result.
    """
    if o.failure:
        return o.failure
    unknown = DomainIpValidator.Result.UNKNOWN
    # Lazy generator: answers after the first conclusive one are never checked.
    verdicts = (
        domain_ip_validator.is_valid_domain_ip(o.domain, candidate)
        for candidate in o.answers
    )
    first_conclusive = next((verdict for verdict in verdicts if verdict != unknown), unknown)
    return first_conclusive.name

# Copy of obs whose 'eval' column holds the evaluate_observation label of each row.
# NOTE(review): the variable name `eval` shadows Python's builtin eval() for
# the rest of the session; later cells refer to it by this name.
eval = obs.assign(eval=obs.apply(evaluate_observation, axis='columns'))
eval

Unnamed: 0,time,client_country,client_asn,resolver_asn,domain,query_type,failure,answers,eval
0,2020-11-03 05:19:47+00:00,VE,8048,,www.sputniknews.cn,A,,[195.93.247.59],OK_MATCHES_CONTROL
1,2020-11-03 05:27:50+00:00,VE,8048,,twitter.com,A,,"[104.244.42.65, 104.244.42.129]",OK_MATCHES_CONTROL
2,2020-11-03 05:29:49+00:00,VE,8048,,rapidgator.net,A,,[195.211.222.116],OK_MATCHES_CONTROL
3,2020-11-03 05:30:48+00:00,VE,8048,,www.mizzima.com,A,,"[172.67.73.200, 104.26.2.233, 104.26.3.233]",OK_MATCHES_CONTROL
4,2020-11-03 05:31:49+00:00,VE,8048,,www.cesr.org,A,,[208.90.215.75],OK_MATCHES_CONTROL
...,...,...,...,...,...,...,...,...,...
38082,2020-11-01 03:01:29+00:00,VE,8048,,www.democracycaucus.net,A,,[173.214.172.75],OK_MATCHES_CONTROL
38083,2020-11-01 03:01:32+00:00,VE,8048,,www.shroomery.org,A,,"[104.27.196.89, 104.27.195.89]",OK_MATCHES_CONTROL
38084,2020-11-01 03:01:41+00:00,VE,8048,,visionvenezuela.com.ve,A,NXDOMAIN,,NXDOMAIN
38085,2020-11-01 03:01:44+00:00,VE,8048,,miamidiario.com,A,,"[104.27.143.209, 172.67.168.240, 104.27.142.209]",OK_MATCHES_CONTROL


In [47]:
# For each client AS, and for each evaluation label within it, list how many
# observations each domain has.
for asn, asn_df in eval.groupby(by='client_asn'):
    print(f'======= AS{asn} ======')
    for result, result_df in asn_df.groupby(by='eval'):
        print(f'----- {result} ------')
        # Raise min_rows so truncated listings still show up to 50 rows.
        with pd.option_context('display.min_rows', 50):
            display(result_df['domain'].value_counts())
        print()
    print()
    

----- NXDOMAIN ------


dollar.nu    1
Name: domain, dtype: int64


----- OK_MATCHES_CONTROL ------


www.beerinfo.com             1
laopinion.com                1
www.mail2web.com             1
www.positive.org             1
www.helvetas.ch              1
www.accesoalajusticia.org    1
as.com                       1
Name: domain, dtype: int64



----- INVALID_NOT_GLOBAL ------


www.belmont.ag        1
shareaza.com          1
www.webspawner.com    1
sayhichat.com         1
Name: domain, dtype: int64


----- NXDOMAIN ------


www.malware.com                        28
voice.yahoo.jajah.com                  27
imesh.com                              26
warc.jalb.de                           26
mitm.watch                             26
sci-hub.tw                             26
debate.org.uk                          26
www.pgp.com                            26
deoxy.org                              25
www.eln-voces.com                      25
www.wallpapergate.com                  24
webmail.sso.bluewin.ch                 24
thepiratebay.se                        24
www.bearshare.com                      24
www.blacksandjews.com                  23
www.hitler.org                         22
seniat.gob.ve                           5
nuvipa.org                              4
tupamaro.org.ve                         4
shareaza.com                            4
cnbv.org.ve                             4
dolar-paralelo.net                      4
www.layevangelism.com                   4
dolar-permuta.com                 


----- OK_MATCHES_CONTROL ------


twitter.com                  104
www.un.org                    77
www.ohchr.org                 75
www.eea.europa.eu             56
addons.mozilla.org            56
www.worldrtd.net              55
www.advocate.com              54
www.netaddress.com            53
sourceforge.net               52
www.worldlingo.com            52
www.sexandu.ca                52
archive.org                   52
www.tialsoft.com              52
icao.maps.arcgis.com          51
www.linkedin.com              51
www.jmarshall.com             51
www.worldwildlife.org         51
www.unfpa.org                 50
www.ectaco.com                50
www.xroxy.com                 50
ultrasurf.us                  50
www.bing.com                  49
www.backtrack-linux.org       49
www.omct.org                  49
www.well.com                  49
                            ... 
www.coinbase.com               1
es.panampost.com               1
animeflv.net                   1
www.lemonde.fr                 1
www.elinfo


----- SERVFAIL ------


crackspider.net            1
marijuana.nl               1
www.islamdoor.com          1
www.videogamereview.com    1
shareaza.com               1
www.layevangelism.com      1
www.belmont.ag             1
www.latinmail.com          1
www.eelam.com              1
www.ihrc.org               1
occupystreams.org          1
delicious.com              1
Name: domain, dtype: int64


----- UNKNOWN ------


www.google.com                           157
www.bbc.com                              101
en.wikipedia.org                          98
www.who.int                               75
www.state.gov                             52
www.dailymail.co.uk                       52
www.facebook.com                          52
www.newnownext.com                        51
www.haaretz.com                           51
www.unwomen.org                           51
www.xbox.com                              50
www.bacardi.com                           50
www.economist.com                         50
www.wordreference.com                     49
www.nytimes.com                           48
www.viber.com                             48
www.cdc.gov                               48
www.cfr.org                               32
abs.twimg.com                             30
ar.m.wikipedia.org                        29
www.yahoo.com                             29
www.dw.com                                29
www.worldb


----- generic_timeout_error ------


marijuana.nl               22
www.belmont.ag             21
www.teenhealthfx.com       21
www.eelam.com              21
www.opioids.com             8
www.babylon-x.com           7
www.latinmail.com           5
moqavemat.ir                5
shareaza.com                5
www.layevangelism.com       5
www.videogamereview.com     4
www.tunnelbear.com          3
delicious.com               3
www.tripod.lycos.com        3
exitinternational.net       2
www.coinbase.com            2
psiphon.ca                  2
crackspider.net             2
www.engenderhealth.org      1
www.datingdirect.com        1
www.warchild.org            1
www.rockstargames.com       1
www.icj.org                 1
76crimes.com                1
www.gayscape.com            1
                           ..
www.circumcision.org        1
occupystreams.org           1
www.onlinedating.com        1
www.greennet.org.uk         1
www.slsknet.org             1
www.acdi-cida.gc.ca         1
dl.bintray.com              1
www.euthan


----- unknown_failure: lookup [DOMAIN] on [scrubbed]: server misbehaving ------


www.videogamereview.com        21
crackspider.net                21
psiphon.ca                     20
www.coinbase.com               19
www.tunnelbear.com             19
delicious.com                  19
www.layevangelism.com          19
occupystreams.org              17
shareaza.com                   17
www.latinmail.com              17
www.tripod.lycos.com           10
www.islamdoor.com               5
venezuelaaidlive.mdstrm.com     1
nuevaprensa.com.ve              1
vdebate.blogspot.com            1
www.eelam.com                   1
minuto30.com                    1
ovario2.com                     1
www.vpitv.com                   1
heroesdesaludve.org             1
dolar-permuta.com               1
dolarparalelo.tk                1
dolarparalelovenezuela.com      1
dollarparalelovenezuela.com     1
elpitazo.ml                     1
alekboyd.blogspot.co.uk         1
Name: domain, dtype: int64



----- OK_MATCHES_CONTROL ------


www.winespectator.com      1
www.grindr.com             1
www.oxfam.org              1
www.ariannelingerie.com    1
www.goodreads.com          1
www.ifc.org                1
Name: domain, dtype: int64



----- OK_MATCHES_CONTROL ------


www.photobucket.com          1
theglobalobservatory.org     1
www.towleroad.com            1
wikileaks.org                1
icao.maps.arcgis.com         1
www.ariannelingerie.com      1
www.dfid.gov.uk              1
archive.org                  1
rapidgator.net               1
www.blackhat.be              1
www.liveleak.com             1
www.ifad.org                 1
peacefire.org                1
forum.grasscity.com          1
www.frc.org                  1
www.kcna.kp                  1
krishna.com                  1
www.interactworldwide.org    1
upload.twitter.com           1
www.hivandhepatitis.com      1
www.medecinsdumonde.org      1
hotmail.msn.com              1
www.buddhanet.net            1
gitlab.com                   1
cannabis.com                 1
www.kraken.com               1
site.voicepulse.com          1
common-fund.org              1
www.wikia.com                1
www.ectaco.com               1
www.hanes.com                1
t.co                         1
www.beer


----- UNKNOWN ------


www.xinhuanet.com          1
www.apple.com              1
www.akdn.org               1
www.bittorrent.com         1
www.gamespot.com           1
www.messenger.com          1
store.steampowered.com     1
www.absolut.com            1
www.interpol.int           1
github.com                 1
translate.reference.com    1
www.state.gov              1
Name: domain, dtype: int64


----- unknown_failure: lookup [DOMAIN] on [scrubbed]: server misbehaving ------


marijuana.nl    1
Name: domain, dtype: int64



----- OK_MATCHES_CONTROL ------


efectococuyo.com    1
Name: domain, dtype: int64



----- OK_MATCHES_CONTROL ------


urijijami.com        1
www.urijijami.com    1
Name: domain, dtype: int64


----- UNKNOWN ------


www.urijijami.com    1
Name: domain, dtype: int64





In [123]:
# Show the rows whose evaluation label is 'UNKNOWN'.
with pd.option_context('display.min_rows', 50):
    display(eval[eval['eval'] == 'UNKNOWN'])

Unnamed: 0,time,client_country,client_asn,resolver_asn,domain,query_type,failure,answers,eval
12,2020-10-22 19:10:35+00:00,CU,27725,,www.interpol.int,A,,[69.192.180.105],UNKNOWN
18,2020-10-22 19:46:35+00:00,CU,27725,,www.quora.com,A,,[151.101.5.2],UNKNOWN
24,2020-10-22 19:30:38+00:00,CU,27725,,allo.google.com,A,,[172.217.0.174],UNKNOWN
27,2020-10-22 19:56:18+00:00,CU,27725,,www.dailymail.co.uk,A,,[23.195.96.180],UNKNOWN
28,2020-10-22 19:10:39+00:00,CU,27725,,www.dea.gov,A,,[23.78.96.241],UNKNOWN
31,2020-10-22 19:29:35+00:00,CU,27725,,kids.yahoo.com,A,,[74.6.136.150],UNKNOWN
32,2020-10-22 19:34:10+00:00,CU,27725,,www.snapchat.com,A,,[172.217.2.211],UNKNOWN
34,2020-10-22 19:49:40+00:00,CU,27725,,www.facebook.com,A,,[31.13.67.35],UNKNOWN
41,2020-10-22 19:28:30+00:00,CU,27725,,news.google.com,A,,[172.217.8.110],UNKNOWN
46,2020-10-22 19:18:40+00:00,CU,27725,,wikidata.org,A,,[208.80.154.224],UNKNOWN


In [138]:
# Look at the values recorded for one domain (reads the validator's private
# _name_value mapping directly).
# Control and experiment errors don't match :-(
# Experiment gets dns_nxdomain_error for this one
domain_ip_validator._name_value['voice.yahoo.jajah.com']

{'dns_name_error'}