In [1]:
import geopandas as gpd
import pandas as pd
import os
import matplotlib.pyplot as plt
plt.rcParams["font.family"] = "Serif"
import pyipmeta as ipm
import requests
import wget
import shutil
import gzip
import json

from sklearn.linear_model import LinearRegression
import numpy as np

import geoplot.crs as gcrs
import geoplot
import os
from datetime import datetime
import socket 
import sys
import netaddr

In [2]:


import dask.dataframe as dd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import pandas as pd
import json
from dask.distributed import Client, LocalCluster


In [3]:
cluster = LocalCluster(n_workers=4, threads_per_worker=2, dashboard_address=':37581')  # Launches a scheduler and workers locally
client = Client(cluster) 
cluster.dashboard_link

'http://127.0.0.1:37581/status'

In [4]:
def get_files_no_pfx(dirname):
    
    files = [os.path.join(dirname, f) for f in os.listdir(dirname)]
    return files
def get_dirs(dirname):
    dirs = [os.path.join(dirname, f) for f in os.listdir(dirname) if os.path.isdir(os.path.join(dirname, f))]
    return dirs
def get_files(dirname, pfx):
    files = [os.path.join(dirname, f) for f in os.listdir(dirname) if os.path.isfile(os.path.join(dirname, f)) and pfx in f]
    return files

In [5]:
def int2ip24(x):
    ip =  str(netaddr.IPAddress(int(x)))
    return '.'.join(ip.split('.')[0:3]) + '.0'

def get_24_from_int(df):
    return df['ipint'].apply(lambda x: ip2int(int2ip24(x)))

def get_24(df):
    return df['ip'].apply(lambda x: '.'.join(x.split('.')[0:3]) + '.0')

def ip2int(x):
    return netaddr.IPAddress(x).value

def get_int_from_ip(df):
    return df['ip'].apply(lambda x: ip2int(x))

## Load ANT

In [18]:
dirname = '/data1/manasvini/internet_address_history_it101w-20221101/last10withipint'

files = list(set(get_files_no_pfx(dirname)))

ant_df = dd.read_parquet(get_files(dirname, 'parquet'), blocksize='200MB')


## Compute ever-alive IPs (set of IPs that responded in the last 9 surveys)

In [21]:
for s in range(81,92):
    ant_df['survey_'+str(s)] = ant_df['survey_'+str(s)].astype(int)

In [23]:
ant_filtered_df = ant_df[(ant_df['survey_83']==1)|(ant_df['survey_84']==1)|(ant_df['survey_85']==1)|(ant_df['survey_86']==1)|\
                (ant_df['survey_87']==1)|(ant_df['survey_88']==1)|(ant_df['survey_89']==1)|\
               (ant_df['survey_90']==1)|(ant_df['survey_91']==1)]

In [27]:
ant_filtered_df.to_parquet('/data1/manasvini/internet_address_history_it101w-20221101/last10withipint/everalive/', compression={"name": "gzip", "values": "lz4"}, write_index=False, overwrite=True)

## Load ANT everactive

In [6]:
dirname = '/data1/manasvini/internet_address_history_it101w-20221101/last10withipint/everalive'

files = list(set(get_files_no_pfx(dirname)))

ant_df = dd.read_parquet(get_files(dirname, 'parquet'), blocksize='200MB')


## Load Censys

In [8]:
survey_dirs = get_dirs('/data1/manasvini/censys_output_new/intermediate/surveywise')


survey_dfs = []
 
for d in sorted(survey_dirs):
    files = get_files(d, 'parquet')
    print(d)
    if len(files) > 0 and len(survey_dfs)==0:
        survey_dfs.append(dd.read_parquet(files, blocksize='200MB'))
    elif len(files) > 0:
        survey_num = d.split('_')[-1]
        survey_dfs.append(dd.read_parquet(files, blocksize='200MB')['censys_survey_'+survey_num])

censys_ddf = dd.concat(survey_dfs, axis=1, ignore_unknown_divisions=True).fillna(0)

censys_ddf['/24']  =  censys_ddf.map_partitions(get_24_from_int)



/data1/manasvini/censys_output_new/intermediate/surveywise/survey_83
/data1/manasvini/censys_output_new/intermediate/surveywise/survey_84
/data1/manasvini/censys_output_new/intermediate/surveywise/survey_85
/data1/manasvini/censys_output_new/intermediate/surveywise/survey_86
/data1/manasvini/censys_output_new/intermediate/surveywise/survey_87
/data1/manasvini/censys_output_new/intermediate/surveywise/survey_88
/data1/manasvini/censys_output_new/intermediate/surveywise/survey_89
/data1/manasvini/censys_output_new/intermediate/surveywise/survey_90
/data1/manasvini/censys_output_new/intermediate/surveywise/survey_91


In [10]:
censys_ddf.to_parquet('/data1/manasvini/censys_address_history_new/', compression='gzip', write_index=False, overwrite=True )

## Load Censys index

In [11]:
files = get_files('/data1/manasvini/censys_address_history_new/', 'parquet')
censys_ddf = dd.read_parquet(files, blocksize='200MB')

In [12]:
censys_ddf.count().compute()

ipint               467313791
censys_survey_83    467313791
censys_survey_84    467313791
censys_survey_85    467313791
censys_survey_86    467313791
censys_survey_87    467313791
censys_survey_88    467313791
censys_survey_89    467313791
censys_survey_90    467313791
censys_survey_91    467313791
/24                 467313791
dtype: int64

## Merge indices

In [13]:

def merge_dfs(dfs):
    print(dfs[0].columns)
    return dd.concat(dfs).drop_duplicates(subset='ipint')

In [14]:
for p in range(10):
    print(p)
    os.makedirs('/data1/manasvini/censys_output_new/intermediate_censys_ant/ipints_'+ str(p), exist_ok=True)
    filtered_ant_df = ant_df[ant_df['ipint'].astype(int)%10 == p]['ipint'].astype(int).to_frame()
    filtered_censys_df = censys_ddf[censys_ddf['ipint'].astype(int)%10 == p]['ipint'].astype(int).to_frame()
    #total1 = filtered_ant_df.count().compute()
    #total2 = filtered_censys_df.count().compute()
    #total = total1 + total2
    merged_ddf = merge_dfs([filtered_censys_df, filtered_ant_df])
    merged_ddf = merged_ddf.dropna()
    merged_ddf['ipint'] = merged_ddf['ipint'].astype(int)
    #print('merged', p, 'before', 'censys=', total2, 'ant=', total1, 'overall=', total, 'after', merged_ddf.count().compute())

    merged_ddf.to_parquet('/data1/manasvini/censys_output_new/intermediate_censys_ant/ipints_'+str(p), write_index=False, overwrite=True, compression={"name": "gzip", "values": "lz4"},)
    del merged_ddf

0
Index(['ipint'], dtype='object')
1
Index(['ipint'], dtype='object')
2
Index(['ipint'], dtype='object')
3
Index(['ipint'], dtype='object')
4
Index(['ipint'], dtype='object')
5
Index(['ipint'], dtype='object')
6
Index(['ipint'], dtype='object')
7
Index(['ipint'], dtype='object')
8
Index(['ipint'], dtype='object')
9
Index(['ipint'], dtype='object')


## Load merged index  
In the previous steps, we create a survey-wise response table for ANT and Censys separately. We next merge the two so that we can look at the combined responses. Due to memory constraints, we have to do it in a roundabout way: I group IP addresses % 10 for each survey and do the merging per group and then concatenate everything together. Groupby/merge is expensive, but concatenate is not.   

In [16]:
ipdirs = get_dirs('/data1/manasvini/censys_output_new/intermediate_censys_ant/')

In [17]:
ipdfs = []
for d in ipdirs:
    files = get_files(d, 'parquet')
    ipdfs.append(dd.read_parquet(files, blocksize='200MB'))

In [18]:
ant_censys_ip_df = dd.concat(ipdfs)

In [23]:
for surveynum in range(83, 92):
    for p in range(10):
    
        surveystr = str(surveynum)
#         if surveynum == 84:
#             continue
        filtered_censys_ddf = censys_ddf[censys_ddf['ipint']%10==p]
        filtered_ant_censys_df =ant_censys_ip_df[ant_censys_ip_df['ipint']%10==p]
        filtered_ant_df = ant_df[ant_df['ipint']% 10 == p]
        
        filtered_ant_censys_df['survey_'+ surveystr] = int(0)
        filtered_censys_ddf['survey_'+ surveystr] = filtered_censys_ddf['censys_survey_'+surveystr].astype(int)
        filtered_ant_df['survey_'+surveystr] = filtered_ant_df['survey_'+surveystr].astype(int)
        #print(filtered_censys_ddf.columns, filtered_ant_ddf.columns)
        #print(filtered_ant_censys_ddf.count().compute())
        merged_df = dd.concat([filtered_ant_censys_df, \
                               filtered_censys_ddf[['ipint', 'survey_'+surveystr]],\
                               filtered_ant_df[['ipint', 'survey_'+surveystr]]])\
                    .groupby('ipint')['survey_'+surveystr].max().reset_index()

        print(merged_df.columns)
        print(merged_df.dtypes)
        os.makedirs('/data1/manasvini/ant_censys_merged_new/ant_censys_merged_'+str(p) + '/' + 'survey_' + surveystr, exist_ok=True)
        merged_df.to_parquet('/data1/manasvini/ant_censys_merged_new/ant_censys_merged_'+str(p) + '/' + 'survey_' + surveystr, write_index=False, overwrite=True, compression={"name": "gzip", "values": "lz4"},)
    

Index(['ipint', 'survey_83'], dtype='object')
ipint        int64
survey_83    int64
dtype: object
Index(['ipint', 'survey_83'], dtype='object')
ipint        int64
survey_83    int64
dtype: object
Index(['ipint', 'survey_83'], dtype='object')
ipint        int64
survey_83    int64
dtype: object
Index(['ipint', 'survey_83'], dtype='object')
ipint        int64
survey_83    int64
dtype: object
Index(['ipint', 'survey_83'], dtype='object')
ipint        int64
survey_83    int64
dtype: object
Index(['ipint', 'survey_83'], dtype='object')
ipint        int64
survey_83    int64
dtype: object
Index(['ipint', 'survey_83'], dtype='object')
ipint        int64
survey_83    int64
dtype: object
Index(['ipint', 'survey_83'], dtype='object')
ipint        int64
survey_83    int64
dtype: object
Index(['ipint', 'survey_83'], dtype='object')
ipint        int64
survey_83    int64
dtype: object
Index(['ipint', 'survey_83'], dtype='object')
ipint        int64
survey_83    int64
dtype: object


## Update ANT IP index with all IPs  
We want to make sure that the set of IPs we use for ANT and ANT + Censys are the same so we update the ANT dataset with the full set of IPs from the previous step

In [24]:
for p in range(10):
    for surveynum in range(83, 92):
        surveystr = str(surveynum)
#         if surveynum == 84:
#             continue
        #filtered_censys_ddf = censys_ddf[censys_ddf['ipint']%10==0]
        filtered_ant_censys_df =ant_censys_ip_df[ant_censys_ip_df['ipint']%10==p]
        filtered_ant_df = ant_df[ant_df['ipint']% 10 == p]
        
        filtered_ant_censys_df['survey_'+ surveystr] = int(0)
        #filtered_censys_ddf['survey_'+ surveystr] = filtered_censys_ddf['censys_survey_'+surveystr].astype(int)
        filtered_ant_df['survey_'+surveystr] = filtered_ant_df['survey_'+surveystr].astype(int)
        
        merged_df = dd.concat([filtered_ant_censys_df, \
                               #filtered_censys_ddf[['ipint', 'survey_'+surveystr]],\
                               filtered_ant_df[['ipint', 'survey_'+surveystr]]])\
                    .groupby('ipint')['survey_'+surveystr].max().reset_index()

        print(merged_df.columns)
        print(merged_df.dtypes)
        os.makedirs('/data1/manasvini/ant_merged_new/ant_merged_'+str(p) + '/' + 'survey_' + surveystr, exist_ok=True)
        merged_df.to_parquet('/data1/manasvini/ant_merged_new/ant_merged_'+str(p) + '/' + 'survey_' + surveystr, write_index=False, overwrite=True, compression={"name": "gzip", "values": "lz4"},)
        break

Index(['ipint', 'survey_83'], dtype='object')
ipint        int64
survey_83    int64
dtype: object
Index(['ipint', 'survey_83'], dtype='object')
ipint        int64
survey_83    int64
dtype: object
Index(['ipint', 'survey_83'], dtype='object')
ipint        int64
survey_83    int64
dtype: object
Index(['ipint', 'survey_83'], dtype='object')
ipint        int64
survey_83    int64
dtype: object
Index(['ipint', 'survey_83'], dtype='object')
ipint        int64
survey_83    int64
dtype: object
Index(['ipint', 'survey_83'], dtype='object')
ipint        int64
survey_83    int64
dtype: object
Index(['ipint', 'survey_83'], dtype='object')
ipint        int64
survey_83    int64
dtype: object
Index(['ipint', 'survey_83'], dtype='object')
ipint        int64
survey_83    int64
dtype: object
Index(['ipint', 'survey_83'], dtype='object')
ipint        int64
survey_83    int64
dtype: object
Index(['ipint', 'survey_83'], dtype='object')
ipint        int64
survey_83    int64
dtype: object
