# Cuba Analysis

This report analyzes censorship in Cuba using OONI data.

In [None]:
# Reload edited modules automatically before each cell is executed (autoreload mode 2).
%load_ext autoreload
%autoreload 2

In [108]:
import collections
import datetime as dt
import enum
import gzip
import itertools
import ipaddress
import json
import os
import posixpath
from pprint import pprint
import shutil
import sys
import tempfile
from typing import Iterable, NamedTuple, Tuple, Union
from urllib.parse import urlparse

import pandas as pd
import ujson
import IPython.display as idisplay
import ipywidgets as widgets

# Make sure the netanalysis package is importable. When it is missing,
# bootstrap pip for the interpreter running this kernel and install the
# package straight from its GitHub repository.
try:
    import netanalysis
except ModuleNotFoundError:
    !{sys.executable} -m ensurepip
    !{sys.executable} -m pip install --upgrade git+https://github.com/Jigsaw-Code/net-analysis.git

import netanalysis.ooni.bucket as ob
import netanalysis.ooni.analysis.dns as od
from netanalysis.ooni.measurement import Measurement


# Bucket object used by the download cell below through list_files() and get_file().
ooni = ob.Bucket()

## Fetch and save measurement data

In [173]:
import pathlib

# Keep the data under '/kaggle/working' when that directory exists, otherwise
# under the current user's home directory; either way in an 'ooni_data' folder.
DATA_DIR = pathlib.Path('/kaggle/working')
DATA_DIR = (DATA_DIR if DATA_DIR.is_dir() else pathlib.Path.home()) / 'ooni_data'
# Country code passed to the OONI file listing and used as a sub-directory name.
COUNTRY = 'CU'

In [10]:
%%time
# Download files
#
# Lists the OONI files for COUNTRY from 2020-10-01 until today and stores a
# trimmed, gzip-compressed copy of each one under
# DATA_DIR/COUNTRY/<test_type>/<basename>. Files that already exist locally are
# skipped, and the loop stops once the next file would exceed the download
# budget derived from the two cost constants below.
cost_usd_limit = 1.00
cost_usd_per_gib = 0.09  # presumably the bucket's egress price -- TODO confirm
data_limit_bytes = cost_usd_limit / cost_usd_per_gib * 2**30
downloaded_bytes = 0
for entry in ooni.list_files(dt.date(2020, 10, 1), dt.date.today(), None, COUNTRY):
    local_filename = os.path.join(DATA_DIR, COUNTRY, entry.test_type, posixpath.basename(entry.filename))
    # Keep only the latest status line visible instead of flooding the output.
    idisplay.clear_output(wait=True)
    if os.path.isfile(local_filename):
        print(f'Skipping {entry.filename}')
        continue
    if downloaded_bytes + entry.size > data_limit_bytes:
        print(f'Stopping: hit data limit of {data_limit_bytes / 2**30} GiB')
        break
    print(f'Downloading {entry.filename}')
    os.makedirs(os.path.dirname(local_filename), exist_ok=True)
    # Write to a side file and rename it only once it is complete. Otherwise an
    # interrupted download would leave a truncated archive at the final path,
    # which later runs would skip as "already downloaded".
    partial_filename = local_filename + '.partial'
    try:
        with gzip.open(partial_filename, mode='wt', encoding='utf-8') as local_file:
            with ooni.get_file(entry.filename) as remote_file:
                with gzip.GzipFile(fileobj=remote_file, mode='r') as input_file:
                    for line in input_file:
                        # TODO: parallelize IO and CPU
                        measurement = ujson.loads(line)
                        # NOTE(review): 1000 looks like a maximum field size for trimming -- confirm.
                        ujson.dump(ob.trim_measurement(measurement,  1000), local_file)
                        local_file.write('\n')
        os.replace(partial_filename, local_filename)
    finally:
        # After a successful rename there is nothing left to remove; after an
        # error or interrupt this drops the incomplete file.
        if os.path.exists(partial_filename):
            os.remove(partial_filename)
    downloaded_bytes += entry.size

idisplay.clear_output(wait=True)
# Cost is GiB downloaded multiplied by the price per GiB.
print(f'Downloaded {downloaded_bytes:,} bytes.\nEstimated cost: ${downloaded_bytes / 2**30 * cost_usd_per_gib:.02f}')

Downloaded 0 bytes.
Estimated cost: $0.00
CPU times: user 2.76 s, sys: 296 ms, total: 3.05 s
Wall time: 1min 15s


In [7]:
# Show how much disk space the downloaded data takes, per directory.
!du -h $DATA_DIR

8.0K	/Users/fortuna/ooni_data/CU/sniblocking
 40K	/Users/fortuna/ooni_data/CU/dnscheck
112K	/Users/fortuna/ooni_data/CU/facebookmessenger
 12K	/Users/fortuna/ooni_data/CU/httpinvalidrequestline
1.6M	/Users/fortuna/ooni_data/CU/webconnectivity
 20K	/Users/fortuna/ooni_data/CU/psiphon
 60K	/Users/fortuna/ooni_data/CU/tor
112K	/Users/fortuna/ooni_data/CU/telegram
 12K	/Users/fortuna/ooni_data/CU/dash
4.0K	/Users/fortuna/ooni_data/CU/urlgetter
1.1M	/Users/fortuna/ooni_data/CU/ndt
 12K	/Users/fortuna/ooni_data/CU/httpheaderfieldmanipulation
160K	/Users/fortuna/ooni_data/CU/whatsapp
3.2M	/Users/fortuna/ooni_data/CU
3.2M	/Users/fortuna/ooni_data


In [11]:
def get_local_measurements(directory: str):
    """Yield the measurements stored in the gzip files inside ``directory``.

    Every entry found directly in ``directory`` is opened as a gzip file and
    each of its lines is decoded with ``ujson.loads``; the decoded objects are
    yielded one at a time.
    """
    with os.scandir(directory) as dir_entries:
        for dir_entry in dir_entries:
            with gzip.open(dir_entry.path, 'r') as measurement_lines:
                yield from map(ujson.loads, measurement_lines)

def is_ip(hostname):
    """Return True when ``hostname`` is a literal IPv4 or IPv6 address.

    Args:
        hostname: The host string to test.

    Returns:
        True if ``ipaddress.ip_address`` accepts the value, False otherwise.
    """
    try:
        # Check the argument itself, not a same-named variable from the notebook namespace.
        ipaddress.ip_address(hostname)
    except ValueError:
        return False
    return True

## DNS Analysis

Here we analyze the DNS measurements. We load all the web connectivity measurements, extract each DNS query as a "DNS Observation", and put the observations in a DataFrame. We also build an Evaluator that collects the control observations.

In [118]:
evaluator: od.Evaluator = od.Evaluator()
rows = []
# First pass: register every control with the evaluator and collect the raw
# DNS observations. Measurements whose hostname is an IP address are skipped.
for measurement in get_local_measurements(os.path.join(DATA_DIR, COUNTRY, 'webconnectivity')):
    m = Measurement(measurement)
    domain = m.hostname
    if not is_ip(domain):
        evaluator.add_control(m)
        rows.extend(observation._asdict() for observation in od.get_observations(m))

# Second pass: evaluate each observation now that all controls are registered.
for row in rows:
    row['eval'] = evaluator.evaluate(row['domain'], row['status'], row['answers'])
    # Evaluations starting with 'BAD' count as interference.
    row['has_interference'] = int(row['eval'].startswith('BAD'))

# Newest observations first.
obs = pd.DataFrame.from_records(rows).sort_values(by='time', ascending=False)
del rows
obs

Unnamed: 0,time,client_country,client_asn,resolver_ip,resolver_asn,domain,query_type,failure,status,answers,explorer_url,eval,has_interference
1402,2020-11-06 17:26:31+00:00,CU,27725,127.0.0.2,27725,14ymedio.com,A,,OK,[176.34.179.218],https://explorer.ooni.org/measurement/20201106...,OK_MATCHES_CONTROL_IP,0
1806,2020-11-05 22:27:35+00:00,CU,27725,127.0.0.2,27725,rsf.org,A,,OK,"[172.67.66.183, 104.25.94.108, 104.25.93.108]",https://explorer.ooni.org/measurement/20201105...,OK_MATCHES_CONTROL_IP,0
1807,2020-11-05 22:27:35+00:00,CU,27725,127.0.0.2,27725,rsf.org,AAAA,,OK,"[2606:4700:20::6819:5e6c, 2606:4700:20::ac43:4...",https://explorer.ooni.org/measurement/20201105...,INCONCLUSIVE_CHECK_IPS,0
1802,2020-11-05 22:27:22+00:00,CU,27725,127.0.0.2,27725,medicinacubana.blogspot.com,A,,OK,[142.250.64.225],https://explorer.ooni.org/measurement/20201105...,INCONCLUSIVE_CHECK_IPS,0
1803,2020-11-05 22:27:22+00:00,CU,27725,127.0.0.2,27725,medicinacubana.blogspot.com,AAAA,,OK,[2607:f8b0:4008:800::2001],https://explorer.ooni.org/measurement/20201105...,INCONCLUSIVE_CHECK_IPS,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
604,2020-10-20 14:36:44+00:00,CU,27725,200.55.128.252,0,www.vatican.va,A,,OK,[185.152.70.33],https://explorer.ooni.org/measurement/20201020...,OK_MATCHES_CONTROL_IP,0
602,2020-10-20 14:36:44+00:00,CU,27725,200.55.128.252,0,howtogrowmarijuana.com,A,,OK,"[104.26.8.152, 172.67.71.153, 104.26.9.152]",https://explorer.ooni.org/measurement/20201020...,OK_MATCHES_CONTROL_IP,0
605,2020-10-20 14:36:39+00:00,CU,27725,200.55.128.252,0,www.om.org,A,,OK,[34.248.104.12],https://explorer.ooni.org/measurement/20201020...,INCONCLUSIVE_CHECK_IPS,0
599,2020-10-20 14:36:39+00:00,CU,27725,200.55.128.252,0,www.asstr.org,A,,OK,[64.71.155.222],https://explorer.ooni.org/measurement/20201020...,OK_MATCHES_CONTROL_IP,0


### Overview

Here you can see the top domains with detected interference, the ASes analyzed, the top response statuses from DNS queries, and the top evaluation results of the observations.

We can see that the bad results are dominated by NXDOMAIN responses.

In [126]:
# Domains ranked by the number of observations flagged as interference.
top_domains = (
    obs.groupby(by='domain')['has_interference']
    .sum()
    .sort_values(ascending=False)
)
# Observation counts per client AS, per DNS status and per evaluation result.
top_ases, top_statuses, top_evals = (
    obs.value_counts(subset=column) for column in ('client_asn', 'status', 'eval')
)

# Render the four summaries side by side, one Output widget per summary.
display_columns = []
for summary in (top_domains[:30], top_ases, top_statuses, top_evals):
    panel = widgets.Output(layout=widgets.Layout(margin='10px'))
    panel.append_display_data(summary)
    display_columns.append(panel)
widgets.HBox(display_columns, layout=widgets.Layout(background='black'))

HBox(children=(Output(layout=Layout(margin='10px'), outputs=({'output_type': 'display_data', 'data': {'text/pl…

### Domain Result Table

This pivot table shows the query count for each domain and evaluation. The domains are sorted by number of interferences detected, and the evaluation results are sorted by frequency.

In [120]:
# Count observations per domain (rows) and per (client_asn, eval) pair (columns).
pivot = pd.pivot_table(
    obs,
    index=['domain'],
    columns=['client_asn', 'eval'],
    values=[],
    aggfunc=len,
    fill_value=0,
)
# Order rows like top_domains and columns like top_ases x top_evals; column
# combinations with no observations are filled with 0.
pivot = pivot.reindex(top_domains.index).reindex(
    pd.MultiIndex.from_product([top_ases.index, top_evals.index]),
    axis='columns',
    fill_value=0,
)
with pd.option_context('display.max_rows', 150, 'display.max_columns', 24):
    display(pivot.iloc[:100, :24])


client_asn,27725,27725,27725,27725,27725,27725
eval,OK_MATCHES_CONTROL_IP,INCONCLUSIVE_CHECK_IPS,INCONCLUSIVE_BAD_CONTROL,BAD_STATUS_NXDOMAIN,OK_MATCHES_CONTROL_ERROR,BAD_STATUS_SERVFAIL
domain,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
www.dharmanet.org,0,0,0,4,0,0
www.topdrawers.com,0,0,0,2,0,2
www.3wishes.com,0,0,0,2,0,0
www.mytrans.com.tw,0,0,0,2,0,0
www.fidh.org,1,0,0,2,0,0
www.911truth.org,0,0,0,2,0,0
www.nsa.gov,0,2,0,2,0,0
www.ilhr.org,0,0,0,2,0,0
www.tiktok.com,0,0,0,2,0,0
www.gearthblog.com,0,0,0,2,0,0


### Top domains per evaluation status

In [133]:
# For each client AS, show the most frequently observed domains (up to 50)
# under every evaluation result.
with pd.option_context('display.max_rows', 50):
    for asn, asn_obs in obs.groupby(by='client_asn'):
        print(f'======= AS{asn} ======')
        for result, result_obs in asn_obs.groupby(by='eval'):
            print(f'----- {result} ------')
            display(result_obs['domain'].value_counts().head(50))
            print()
        print()
    

----- BAD_STATUS_NXDOMAIN ------


www.dharmanet.org            4
occupystreams.org            2
www.fidh.org                 2
www.gearthblog.com           2
www.911truth.org             2
www.judaismconversion.org    2
www.netflix.com              2
www.3wishes.com              2
www.ilhr.org                 2
www.tiktok.com               2
krishna.com                  2
www.hackhull.com             2
www.ushmm.org                2
www.feedtheminds.org         2
www.mytrans.com.tw           2
www.nsa.gov                  2
www.topdrawers.com           2
Name: domain, dtype: int64


----- BAD_STATUS_SERVFAIL ------


www.topdrawers.com    2
Name: domain, dtype: int64


----- INCONCLUSIVE_BAD_CONTROL ------


crackspider.net            6
www.islamdoor.com          4
www.latinmail.com          4
shareaza.com               4
marijuana.nl               2
www.videogamereview.com    2
thepiratebay.se            2
delicious.com              2
imesh.com                  2
www.eelam.com              2
www.belmont.ag             2
www.layevangelism.com      2
mitm.watch                 2
www.modemmujer.org         1
www.diariodecuba.co        1
www.primaveradecuba.org    1
Name: domain, dtype: int64


----- INCONCLUSIVE_CHECK_IPS ------


www.google.com              13
en.wikipedia.org             8
www.state.gov                8
www.facebook.com             7
www.xbox.com                 6
www.who.int                  6
www.cdc.gov                  5
www.viber.com                5
www.unwomen.org              5
www.bacardi.com              5
www.wordreference.com        4
www.newnownext.com           4
www.nytimes.com              4
www.dailymail.co.uk          4
www.economist.com            4
translate.google.com         4
www.bbc.com                  4
teenadvice.about.com         4
groups.google.com            4
www.whatsapp.com             4
www.last.fm                  3
www.epa.gov                  3
online.wsj.com               3
www.casinotropez.com         3
mail.yahoo.com               3
www.nbcnews.com              3
www.cia.gov                  3
www.bittorrent.com           3
www.rackspace.com            3
www.aljazeera.net            3
ja.wikipedia.org             3
docs.google.com              3
www.nato


----- OK_MATCHES_CONTROL_ERROR ------


www.blacksandjews.com     4
voice.yahoo.jajah.com     4
www.wallpapergate.com     4
www.eln-voces.com         2
webmail.sso.bluewin.ch    2
warc.jalb.de              2
sci-hub.tw                2
www.bearshare.com         2
debate.org.uk             2
www.pgp.com               2
www.malware.com           2
deoxy.org                 2
www.ahram.org.eg          2
Name: domain, dtype: int64


----- OK_MATCHES_CONTROL_IP ------


twitter.com                   8
www.un.org                    6
www.ohchr.org                 6
www.unfpa.org                 5
www.worldrtd.net              4
www.vatican.va                4
www.lgbtqnation.com           4
www.xroxy.com                 4
www.sexandu.ca                4
www.backtrack-linux.org       3
lambdalegal.org               3
www3.iaisite.org              3
www.oic-oci.org               3
www.eea.europa.eu             3
www.democracynow.org          3
www.ananzi.co.za              3
www.premaritalsex.info        3
www.well.com                  3
common-fund.org               3
www.worldwildlife.org         3
www.jhr.ca                    3
www.linkedin.com              3
www.tawk.to                   3
www.msf.org                   3
instinctmagazine.com          3
www.clubdicecasino.com        3
www.agentprovocateur.com      3
xxx.lanl.gov                  3
www.tialsoft.com              3
www.ectaco.com                3
www.wftucentral.org           3
www.ifex





### No interference for telegram.org

There was no DNS interference for the 3 `telegram.org` queries from 2 tests. The IPv4 results match the control and we validate the IPv6 one using curl.

In [128]:
# Observations for telegram.org, limited to the columns needed to judge them.
obs.loc[
    obs['domain'] == 'telegram.org',
    ['time', 'client_asn', 'resolver_asn', 'query_type', 'failure', 'answers', 'eval'],
]

Unnamed: 0,time,client_asn,resolver_asn,query_type,failure,answers,eval
1877,2020-10-26 05:41:13+00:00,27725,27725,A,,"[149.154.167.99, 149.154.167.99]",OK_MATCHES_CONTROL_IP
2383,2020-10-22 20:52:44+00:00,27725,27725,AAAA,,"[2001:67c:4e8:1033:4:100:0:a, 2001:67c:4e8:103...",INCONCLUSIVE_CHECK_IPS
2382,2020-10-22 20:52:44+00:00,27725,27725,A,,[149.154.167.99],OK_MATCHES_CONTROL_IP


In [132]:
# Fetch https://telegram.org while forcing the connection to the IPv6 address
# seen in the AAAA answer; prints OK when curl succeeds and FAIL otherwise.
!curl -s --connect-to ::[2001:67c:4e8:1033:4:100:0:a]: https://telegram.org > /dev/null && echo 'OK' || echo 'FAIL'

OK


### One domain with SERVFAIL

The domain `www.topdrawers.com` was the only one with a good control to show a non-NXDOMAIN error, even though it also had a clear NXDOMAIN. We are not 100% sure, but the failure message suggests it's SERVFAIL because it says "usually a temporary error" and "the local server did not receive a response from an authoritative server", neither of which is the case for NXDOMAIN.

In [142]:
# Observations for www.topdrawers.com, with wide columns so the full failure
# message is readable.
with pd.option_context('display.max_colwidth', 300):
    display(
        obs.loc[
            obs['domain'] == 'www.topdrawers.com',
            ['time', 'client_asn', 'resolver_asn', 'query_type', 'failure', 'answers', 'eval'],
        ]
    )

Unnamed: 0,time,client_asn,resolver_asn,query_type,failure,answers,eval
1897,2020-10-26 05:51:29+00:00,27725,27725,A,unknown_failure: lookup www.topdrawers.com: getaddrinfow: This is usually a temporary error during hostname resolution and means that the local server did not receive a response from an authoritative server.,,BAD_STATUS_SERVFAIL
1898,2020-10-26 05:51:29+00:00,27725,27725,AAAA,unknown_failure: lookup www.topdrawers.com: getaddrinfow: This is usually a temporary error during hostname resolution and means that the local server did not receive a response from an authoritative server.,,BAD_STATUS_SERVFAIL
98,2020-10-22 19:06:14+00:00,27725,27725,A,dns_nxdomain_error,,BAD_STATUS_NXDOMAIN
99,2020-10-22 19:06:14+00:00,27725,27725,AAAA,dns_nxdomain_error,,BAD_STATUS_NXDOMAIN


Neither of the two measurements opens on the OONI Explorer.

In [138]:
# Distinct OONI Explorer URLs of the www.topdrawers.com observations.
set(obs.loc[obs['domain'] == 'www.topdrawers.com', 'explorer_url'])

{'https://explorer.ooni.org/measurement/20201022T184640Z_webconnectivity_CU_27725_n1_zZuyMmN3UrmI0ng4?input=http%3A%2F%2Fwww.topdrawers.com%2F',
 'https://explorer.ooni.org/measurement/20201026T053844Z_webconnectivity_CU_27725_n1_g5rXUtF8oUXfAqVV?input=http%3A%2F%2Fwww.topdrawers.com%2F'}

### Inconclusive IPs

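The domains below have observations evaluated as `INCONCLUSIVE_CHECK_IPS`. We spot-check a few of them manually.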

In [172]:
# Domains with the most INCONCLUSIVE_CHECK_IPS observations (up to 200).
with pd.option_context('display.max_rows', 200):
    display(
        obs.loc[obs['eval'] == 'INCONCLUSIVE_CHECK_IPS', 'domain']
        .value_counts()
        .head(200)
    )

www.google.com                   13
en.wikipedia.org                  8
www.state.gov                     8
www.facebook.com                  7
www.xbox.com                      6
www.who.int                       6
www.cdc.gov                       5
www.viber.com                     5
www.unwomen.org                   5
www.bacardi.com                   5
www.wordreference.com             4
www.newnownext.com                4
www.nytimes.com                   4
www.dailymail.co.uk               4
www.economist.com                 4
translate.google.com              4
www.bbc.com                       4
teenadvice.about.com              4
groups.google.com                 4
www.whatsapp.com                  4
www.last.fm                       3
www.epa.gov                       3
online.wsj.com                    3
www.casinotropez.com              3
mail.yahoo.com                    3
www.nbcnews.com                   3
www.cia.gov                       3
www.bittorrent.com          

In [168]:
# Observations for psiphon.ca, limited to the columns needed to judge them.
obs.loc[
    obs['domain'] == 'psiphon.ca',
    ['time', 'client_asn', 'resolver_asn', 'query_type', 'failure', 'answers', 'eval'],
]

Unnamed: 0,time,client_asn,resolver_asn,query_type,failure,answers,eval
1760,2020-10-28 23:59:49+00:00,27725,27725,A,,"[104.18.88.225, 104.18.87.225]",OK_MATCHES_CONTROL_IP
803,2020-10-22 21:10:14+00:00,27725,27725,AAAA,,"[2606:4700::6812:58e1, 2606:4700::6812:57e1]",INCONCLUSIVE_CHECK_IPS
802,2020-10-22 21:10:14+00:00,27725,27725,A,,"[104.18.87.225, 104.18.88.225]",OK_MATCHES_CONTROL_IP


In [163]:
# Observations for www.whatsapp.com, limited to the columns needed to judge them.
obs.loc[
    obs['domain'] == 'www.whatsapp.com',
    ['time', 'client_asn', 'resolver_asn', 'query_type', 'failure', 'answers', 'eval'],
]

Unnamed: 0,time,client_asn,resolver_asn,query_type,failure,answers,eval
2575,2020-10-31 07:20:55+00:00,27725,0,A,,[157.240.14.52],INCONCLUSIVE_CHECK_IPS
1254,2020-10-23 13:23:25+00:00,27725,27725,A,,[31.13.67.52],INCONCLUSIVE_CHECK_IPS
2483,2020-10-22 20:54:23+00:00,27725,27725,A,,[157.240.14.52],INCONCLUSIVE_CHECK_IPS
2484,2020-10-22 20:54:23+00:00,27725,27725,AAAA,,[2a03:2880:f22c:1c5:face:b00c:0:167],INCONCLUSIVE_CHECK_IPS


In [164]:
# Fetch https://www.whatsapp.com while forcing the connection to one of the
# observed IPv4 answers; prints OK when curl succeeds and FAIL otherwise.
!curl -s --connect-to ::157.240.14.52: https://www.whatsapp.com > /dev/null && echo 'OK' || echo 'FAIL'

OK


### Analyzing the resolvers

We try to query the resolvers directly, but unfortunately they are not accessible from outside the network, so we can't do tests from the outside.

In [143]:
# Number of observations per (client AS, resolver AS, resolver IP) combination.
obs.value_counts(subset=['client_asn', 'resolver_asn', 'resolver_ip'])

client_asn  resolver_asn  resolver_ip   
27725       27725         127.0.0.2         2481
            0             200.55.128.252      31
                          200.55.128.187      26
                          200.55.128.162      18
            27725         200.55.128.245      17
            0             200.55.128.155       3
dtype: int64

In [144]:
# Query one of the observed resolvers directly from the machine running this notebook.
!dig @200.55.128.252 www.tiktok.com


; <<>> DiG 9.10.6 <<>> @200.55.128.252 www.tiktok.com
; (1 server found)
;; global options: +cmd
;; connection timed out; no servers could be reached


We observe that the resolvers used are ISP resolvers. 

In [154]:
# Look up ownership and location information for the resolver's IP address.
! curl https://ipinfo.io/200.55.128.252

{
  "ip": "200.55.128.252",
  "city": "Havana",
  "region": "Havana",
  "country": "CU",
  "loc": "23.1330,-82.3830",
  "org": "AS27725 Empresa de Telecomunicaciones de Cuba, S.A.",
  "timezone": "America/Havana",
  "readme": "https://ipinfo.io/missingauth"
}

In [153]:
# Number of observations per (resolver IP, resolver AS, DNS status), left in group order.
obs.value_counts(subset=['resolver_ip', 'resolver_asn', 'status'], sort=False)

resolver_ip     resolver_asn  status                                       
127.0.0.2       27725         NXDOMAIN                                           94
                              OK                                               2375
                              SERVFAIL                                           12
200.55.128.155  0             OK                                                  3
200.55.128.162  0             OK                                                 16
                              dns_host_or_service_not_provided_or_not_known       2
200.55.128.187  0             OK                                                 26
200.55.128.245  27725         OK                                                 17
200.55.128.252  0             OK                                                 30
                              dns_resolver_error                                  1
dtype: int64