In [15]:
import logging
import coloredlogs
import requests
from time import sleep

In [21]:
logging.basicConfig()
logger = logging.getLogger("poll-ooni")
# logger.setLevel(logging.DEBUG)
coloredlogs.install()
coloredlogs.install(level='DEBUG')
# coloredlogs.install(level='INFO')

In [17]:
config = {
    "sleep-times": {
        "ooni-poll": 60*5,
        "ooni-paginate": 2,
    }
}


In [314]:
# TODO make dry!
def is_nonempty_str(my_str: str) -> bool:
    return (type(my_str) == str) & (len(my_str) > 0)

class Alpha2 ():
    '''
    Represents an ISO alpha-2 country code.
    '''
    def __init__(self,
                 country_code: str):
        assert(is_nonempty_str(country_code))
        assert(len(country_code)==2)
        self.country_code = country_code
        
    def __str__(self):
        return f'{self.country_code}'
    
    def __repr__(self):
        return self.__str__()


# Getting the data we need

## Measuremnets from OONI

In [39]:
def api_query (query: str, results=[], queries=1, max_queries=None) -> list:
    '''Recursively query the API, up to `max_queries`. (If `max_queries=None`, we
    will paginate through the results as long as they run).
    '''
    base_url = 'https://api.ooni.io/api/v1/'
    query = '{!s}{!s}'.format(base_url, query)
    try:
        resp =  requests.get(query).json()
        results = results + resp['results']
        next_url = resp['metadata']['next_url']
        if max_queries is not None and queries > max_queries:
            return results
        if next_url:
            # sleep so as to not overwhelm the endpoint
            sleep(config['sleep-times']['ooni-paginate'])
            # remove base url to perfrom the query
            next_url = next_url.split('api/v1')[1]
            return api_query(next_url, results, queries+1, max_queries)
        return results
    except Exception as inst:
        # if we have an error,
        logger.warning("Error querying API: {!s}".format(inst))
        # just return what we've collected
        # (at worst, `results` will be `[]`)
        return results


In [309]:
def query_recent_measurements (max_queries=5) -> list:
    '''Queries all recent measurements, up to specified maximum number of queries.'''
    return api_query('measurements?test_name=web_connectivity&anomaly=true&order_by=test_start_time&limit=1000', max_queries=max_queries)

# def query_measurements_after (time) -> list:
#     '''Queries all measurements after time.'''
#     return api_query('measurements?test_name=web_connectivity&anomaly=true&order_by=test_start_time&limit=1000&since={!s}'.format(time))

# def get_measurement_time (measurement) -> str:
#     '''Returns a time format that can be queried'''
#     return measurement['measurement_start_time'][:-1]

# def most_recent_measurement_time (measurements) -> str:
#     return get_measurement_time(measurements[0])

def get_blocking_type (measurement) -> str:
    '''Get blocking type, if available.'''
    try:
        return measurement['scores']['analysis']['blocking_type']
    except:
        return None

In [269]:
measurements = query_recent_measurements()

2021-05-24 14:42:05 congratulations urllib3.connectionpool[30919] DEBUG Starting new HTTPS connection (1): api.ooni.io:443
2021-05-24 14:42:06 congratulations urllib3.connectionpool[30919] DEBUG https://api.ooni.io:443 "GET /api/v1/measurements?test_name=web_connectivity&anomaly=true&order_by=test_start_time&limit=1000 HTTP/1.1" 200 831636
2021-05-24 14:42:10 congratulations urllib3.connectionpool[30919] DEBUG Starting new HTTPS connection (1): api.ooni.io:443
2021-05-24 14:42:11 congratulations urllib3.connectionpool[30919] DEBUG https://api.ooni.io:443 "GET /api/v1//measurements?test_name=web_connectivity&anomaly=true&order_by=test_start_time&limit=1000&offset=1000 HTTP/1.1" 404 None


In [270]:
m = measurements[505]
m

{'anomaly': True,
 'confirmed': False,
 'failure': False,
 'input': 'http://www.xfocus.org/',
 'measurement_start_time': '2021-05-24T20:36:06Z',
 'measurement_url': 'https://ams-pg.ooni.org/api/v1/raw_measurement?report_id=20210524T185956Z_webconnectivity_US_30600_n1_IEQbhkwfoID1oz0z&input=http%3A%2F%2Fwww.xfocus.org%2F',
 'probe_asn': 'AS30600',
 'probe_cc': 'US',
 'report_id': '20210524T185956Z_webconnectivity_US_30600_n1_IEQbhkwfoID1oz0z',
 'scores': {'analysis': {'blocking_type': 'http-diff'},
  'blocking_country': 0.0,
  'blocking_general': 1.0,
  'blocking_global': 0.0,
  'blocking_isp': 0.0,
  'blocking_local': 0.0},
 'test_name': 'web_connectivity'}

In [143]:
m_detail = requests.get(m['measurement_url']).json()

m_detail['test_keys']['queries']
m_detail

2021-05-24 12:09:02 congratulations urllib3.connectionpool[30919] DEBUG Starting new HTTPS connection (1): ams-pg.ooni.org:443
2021-05-24 12:09:02 congratulations urllib3.connectionpool[30919] DEBUG https://ams-pg.ooni.org:443 "GET /api/v1/raw_measurement?report_id=20210524T153727Z_webconnectivity_BR_271354_n1_v6EcCDifm3IJDoaO&input=http%3A%2F%2Fwww.newnownext.com%2Ffranchise%2Fthe-backlot%2F HTTP/1.1" 200 129010


{'annotations': {'engine_name': 'ooniprobe-engine',
  'engine_version': '3.9.1',
  'platform': 'linux'},
 'data_format_version': '0.2.0',
 'input': 'http://www.newnownext.com/franchise/the-backlot/',
 'measurement_start_time': '2021-05-24 16:56:23',
 'probe_asn': 'AS271354',
 'probe_cc': 'BR',
 'probe_ip': '127.0.0.1',
 'probe_network_name': '<unknown>',
 'report_id': '20210524T153727Z_webconnectivity_BR_271354_n1_v6EcCDifm3IJDoaO',
 'resolver_asn': 'AS271354',
 'resolver_ip': '150.164.0.80',
 'resolver_network_name': '<unknown>',
 'software_name': 'ooniprobe-cli',
 'software_version': '3.9.1',
 'test_helpers': {'backend': {'address': 'https://wcth.ooni.io',
   'type': 'https'}},
 'test_keys': {'agent': 'redirect',
  'client_resolver': '150.164.0.80',
  'retries': None,
  'socksproxy': None,
  'network_events': [{'address': '200.143.247.42:80',
    'failure': None,
    'operation': 'connect',
    'proto': 'tcp',
    't': 2.219990817,
    'tags': ['tcptls_experiment']},
   {'address': '

## Get IP from URL

In [89]:
inputs = [m['input'] for m in measurements]

In [357]:
import socket
import urllib.parse

def get_hostname (url):
    return urllib.parse.urlparse(url).netloc

def get_ip (url: str) ->  Optional[str]:
    hostname = get_hostname(url)
    try:
        hostname = socket.gethostbyname(hostname)
        if hostname == '127.0.0.1':
            return None
        return hostname
    except Exception as inst:
            logger.warning(f"Error looking up IP of hostname {hostname}: {inst}")
            return None


In [174]:
my_ip = get_ip(inputs[505])
my_ip

'157.131.218.41'

## Get geolocation from IP

In [352]:
import geoip2.database
from typing import Optional

def ip_to_alpha2 (ip: str) -> Optional[Alpha2]:
    with geoip2.database.Reader('dbip-country-lite-2021-05.mmdb') as reader:
        try:
            response = reader.country(ip)
            return Alpha2(response.country.iso_code)
        except Exception as inst:
            # if we have an error,
            logger.warning(f"Error looking up country code of IP {ip}: {inst}")
            return None
    
get_alpha2(my_ip)

'US'

## Putting it all together

In [355]:
def url_to_alpha2 (url):
    maybe_ip = get_ip(url)
    if maybe_ip is None:
        return None
    return ip_to_alpha2(maybe_ip)


## Get TLD jurisdiction


In [359]:
from imp import reload
from src.w3techs import utils as w3techs_utils
reload(w3techs_utils)
from tldextract import extract

In [342]:
def get_tld_jurisdiction (url: str) -> Optional[Alpha2]:
    '''
    Takes a URL and gets an Alpha 2
    representing the jurisdiction of the URL's top-level domain.
    '''
    tld = extract(url)
    # get last item in url
    # e.g., '.com.br' should be '.br'
    tld = tld.suffix
    tld = tld.split('.')[-1]
    # put it
    tld_str = f'.{tld}'
#     return tld_str
    # TODO put htis logic in get_country
    cc =  w3techs_utils.get_country(tld_str)
    if cc is not None:
        return Alpha2(cc)
    return None

get_tld_jurisdiction('mycool.com.br')

BR

In [260]:
for m in measurements:
    url = m['input']
    print(url, end=' - ')
    juris = get_tld_jurisdiction(url)
    print(juris)

2021-05-24 13:50:39 congratulations root[30919] INFO Cannot find country for .
2021-05-24 13:50:39 congratulations root[30919] INFO Cannot find country for .
2021-05-24 13:50:39 congratulations root[30919] INFO Cannot find country for .


https://www.cna.com.tw/ - TW
https://nordvpn.com/ - US
https://www.change.org/ - US
http://www.tiananmenmother.org/ - US
http://www.ftchinese.com/ - US
https://www.ndi.org/ - US
http://www.hrea.org/ - US
https://blog.mozilla.org/ - US
http://www.ifeminists.com/ - US
http://ocsp.int-x3.letsencrypt.org/ - US
https://www.bbc.com/burmese/ - US
http://www.theepochtimes.com/ - US
https://www.brookings.edu/center/john-l-thornton-china-center/ - US
https://fedoramagazine.org/ - US
https://doubleclick.net/ - US
http://bancariosclassistas.blogspot.com/ - US
http://doubleclick.net/ - US
https://weblog.savetibet.org/ - US
http://www.animalliberationfront.com/ - US
https://bridges.torproject.org/ - US
http://www.zensur.freerk.com/ - US
https://bisexual.org/ - US
http://de.lirio.us/ - US
http://www.dailymail.co.uk/ - GB
http://peacefire.org/ - US
http://juntosomos-fortes.blogspot.com/ - US
http://www.goarch.org/ - US
https://git.io/ - US
http://guampanews.blogspot.com/ - US
http://polentanews.blogsp

http://www.mizzima.com/ - US
http://www.strana.ru/ - RU
https://hronikatm.com/ - US
http://www.newnownext.com/ - US
https://www.tunnelbear.com/ - US
http://www.deti-404.com/ - US
http://www.radioislam.org/ - US
http://www.radioislam.org/ - US
http://gaysdedireita.blogspot.com/ - US
http://nossacarasp.blogspot.com/ - US
http://de.lirio.us/ - US
http://www.proxyweb.net/ - US


# Model the datatype

In [302]:
def now () -> pd.Timestamp:
    return pd.Timestamp.utcnow()

def is_in_future (timestamp: pd.Timestamp) -> bool:
    return timestamp >  now()

is_in_future(  pd.Timestamp('2021-06-24T20:36:06Z'))

True

In [345]:
from psycopg2.extensions import cursor
from psycopg2.extensions import connection
import pandas as pd

class OONIWebConnectivityTest():
    '''
    Class to capture results of an OONI web connectivity test.
      - https://ooni.org/nettest/web-connectivity/
    See README for more details on these fields.
    
    This is where validation happens.
    TODO - Check for SQL injection attacks.
    '''
    def __init__(self,
                  blocking_type: str,
                  probe_alpha2: Alpha2,
                  input_url: str,
                  anomaly: bool,
                  confirmed: bool,
                  report_id: str,
                  input_ip_alpha2: Alpha2,
                  tld_jurisdiction_alpha2: Alpha2,
                  measurement_start_time: pd.Timestamp):
            # we only want stuff where blocking actually happened 
            assert(blocking_type != False)
            self.blocking_type = blocking_type
            
            assert(type(probe_alpha2) == Alpha2)
            self.probe_alpha2 = probe_alpha2
            
            assert(is_nonempty_str(input_url))
            self.input_url = input_url
            
            assert(type(anomaly) == bool)
            self.anomaly = anomaly
            
            assert(type(confirmed) == bool)
            self.confirmed = confirmed
            
            assert(is_nonempty_str(report_id))
            self.report_id = report_id
            
            # type is optional
            assert((type(input_ip_alpha2) == Alpha2) or 
                   (input_ip_alpha2 == None))
            self.input_ip_alpha2 = input_ip_alpha2
            
            # type is optional
            assert((type(tld_jurisdiction_alpha2) == Alpha2) or
                   (tld_jurisdiction_alpha2 == None))
            self.tld_jurisdiction_alpha2 = tld_jurisdiction_alpha2
            
            assert(type(measurement_start_time) == pd.Timestamp)
            # if the timestamp is in the future...
            if is_in_future(measurement_start_time):
                # set the time to now.
                self.measurement_start_time = now()
            # otherwise
            else:
                # set it to whenever it was reported
                self.measurement_start_time = measurement_start_time
            
    def create_table(
            self,
            cur: cursor,
            conn: connection):
        cmd = '''
          CREATE TABLE ooni_web_connectivity_test (
             blocking_type             VARCHAR,
             probe_alpha2              CHAR(2) NOT NULL,
             input_url                 VARCHAR NOT NULL,
             anomaly                   BOOLEAN NOT NULL,
             confirmed                 BOOLEAN NOT NULL,
             report_id                 VARCHAR NOT NULL,
             input_ip_alpha2           CHAR(2),
             tld_jurisdiction_alpha2   CHAR(2),
             measurement_start_time    TIMESTAMPZ NOT NULL,
          )
        '''
        cur.execute(cmd)
        conn.commit()

    def write_to_db(
            self,
            cur: cursor,
            conn: connection,
            commit=True,
    ):
        cur.execute(
            """
            INSERT INTO ooni_web_connectivity_test
            (blocking_type, probe_alpha2, input_url, anomaly, confirmed, report_id, 
            input_ip_alpha2, tld_jurisdiction_alpha2, measurement_start_time)   
            VALUES
            (%s, %s, %s, %s, %s, %s, %s, %s, %s)   
            """, (self.blocking_type,
                  self.probe_alpha2,
                  self.input_url,
                  self.anomaly,
                  self.confirmed,
                  self.report_id,
                  self.input_ip_alpha2,
                  self.tld_jurisdiction_alpha2,
                  self.measurement_start_time))
        if commit:
            return conn.commit()
        return

    def __str__(self):
        # TODO make DRY with write_to_db?
        # TODO do this in general?
        return f'{self.measurement_start_time} - {self.probe_alpha2} -> {self.input_ip_alpha2}, {self.tld_jurisdiction_alpha2} ({self.blocking_type} {self.input_url})'

    def __repr__(self):
        return self.__str__()


# Marshall from datatype

In [346]:
def ingest_api_measurement (measurement: dict) -> OONIWebConnectivityTest:
    blocking_type = get_blocking_type(measurement)
    probe_alpha2 = Alpha2(measurement['probe_cc'])
    input_url = measurement['input']
    anomaly = measurement['anomaly']
    confirmed = measurement['confirmed']
    report_id = measurement['report_id']
    input_ip_alpha2 = url_to_alpha2(input_url) 
    tld_jurisdiction_alpha2 = get_tld_jurisdiction(input_url)
    measurement_start_time = pd.Timestamp(measurement['measurement_start_time'])
    return OONIWebConnectivityTest(
        blocking_type,
        probe_alpha2,
        input_url,
        anomaly,
        confirmed,
        report_id,
        input_ip_alpha2,
        tld_jurisdiction_alpha2,
        measurement_start_time
    )

ingest_api_measurement(measurements[0])

2021-05-24 21:41:37+00:00 - CN -> US, US (http-failure http://hotgaylist.com/)

In [358]:
from multiprocessing import Pool

with Pool() as p:
    print(p.map(ingest_api_measurement, measurements))

2021-05-24 16:50:54 congratulations root[17841] INFO Cannot find country for .
2021-05-24 16:50:55 congratulations root[17841] INFO Cannot find country for .asia


[2021-05-24 21:41:37+00:00 - CN -> US, US (http-failure http://hotgaylist.com/), 2021-05-24 21:41:29+00:00 - JO -> US, US (http-failure https://www.clubhouseapi.com/), 2021-05-24 21:41:24+00:00 - CN -> CH, US (tcp_ip https://autistici.org/), 2021-05-24 21:41:23+00:00 - JO -> US, US (http-failure https://www.clubhouseapi.com/), 2021-05-24 21:41:13+00:00 - BR -> US, US (http-diff http://proxytools.sourceforge.net/), 2021-05-24 21:41:10+00:00 - BR -> CA, US (http-diff http://escombroshablaneros.blogspot.com/), 2021-05-24 21:41:01+00:00 - CN -> US, US (tcp_ip https://dns.google.com/), 2021-05-24 21:41:00+00:00 - US -> None, US (tcp_ip http://www.iwantim.com/), 2021-05-24 21:41:00+00:00 - TH -> US, US (http-diff http://www.no-porn.com/), 2021-05-24 21:41:00+00:00 - KZ -> US, US (http-failure https://www.pubg.com/), 2021-05-24 21:40:56+00:00 - CN -> US, US (dns http://www.ifex.org/), 2021-05-24 21:40:51+00:00 - CN -> US, US (http-failure http://www.partypoker.com/), 2021-05-24 21:40:39+00:00