In [668]:
#import datapane as dp
import pandas as pd
from pathlib import Path
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from functools import reduce
import numpy as np

In [669]:
#dp.login(token="c5e23cba0104dc599cc7b00360f57adfbcfb5d52")

# Need a list of features to aggregate a final score.
a1 = web censorshp block fraction
b1 = circumvention tool usage (tor relay user count)
b2 = VPN and Proxy usage
c1 = Internet shutdown duration
d1 = radar IPv6 adoption rate
d2 = radar TLS1_3 adoption rate
e1 = CTI index
f1 = data localisation ranking

In [728]:
a1_path = "/Users/coes3/phd-labs/fraganal/data_source/ooni/"

In [740]:
def a1_data_prep(path):
    files = Path(path).glob('*.csv')
    dfs = []

    for i in files:
        data = pd.read_csv(i, header=0)
        data['cc'] = i.name.split('_')[-1].strip('.csv')
        dfs.append(data)

    df = pd.concat(dfs, ignore_index=True)
    df.rename(columns={"measurement_start_day":"date", "confirmed_count":"blocked_count", "ok_count":"success_count"}, inplace = True)
    
    df = df[['date', 'blocked_count', 'success_count', 'cc']]
    
    df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
    
    #  filter for only 2022 data
    df = df.loc[(df['date'] >= '2022-01-01') & (df['date'] <='2022-12-31')]
    
    df['pct_block'] = (df['blocked_count']/(df['success_count'] + df['blocked_count']) * 100)
    
    df = df[['date', 'cc', 'pct_block']]
                       
    df['month'] = df['date'].dt.to_period('M')
    
    #Get average monthly blocked sites as a percentage of none-blocked sites and clip to a max of 100 points
    #same number of test carried out, therefore a sum will over inflate figures. A Mean is a more accurate calculation.
    
    mon_df = df.groupby(['cc', 'month'])['pct_block'].mean().round(2).reset_index(name='a1')
    mon_df['a1'].clip(0, 100, inplace=True)
    
    return mon_df

In [741]:
a1_df = a1_data_prep(a1_path)

In [767]:
a1_df.sample

<bound method NDFrame.sample of     cc    month   a1
0   BR  2022-01  0.0
1   BR  2022-02  0.0
2   BR  2022-03  0.0
3   BR  2022-04  0.0
4   BR  2022-05  0.0
..  ..      ...  ...
62  RW  2022-04  0.0
63  RW  2022-05  0.0
64  RW  2022-08  0.0
65  RW  2022-10  0.0
66  RW  2022-11  0.0

[67 rows x 3 columns]>

In [1089]:
b1_path = "/Users/coes3/phd-labs/fraganal/data_source/tor/"
b1_filepath = "/Users/coes3/phd-labs/fraganal/data_source/wdi_pop/wdi_pop.csv"
cc_filepath = "/Users/coes3/phd-labs/fraganal/data_source/cc_alpha2_3.csv"

In [1090]:
def get_inet_pop(path):
  
    pop_df = pd.read_csv(path, header=0)
    pop_df.rename(columns={"2021 [YR2021] - Individuals using the Internet (% of population) [IT.NET.USER.ZS]":"pct", "2021 [YR2021] - Population, total [SP.POP.TOTL]":"pop"}, inplace = True)

    pop_df['pct'] = pop_df['pct'].replace('..', np.nan)
    pop_df['pop'] = pop_df['pop'].replace('..', np.nan)

    pop_df['pct'] = pd.to_numeric(pop_df['pct'])
    pop_df['pop'] = pd.to_numeric(pop_df['pop'])

    pop_df['pct'].fillna(pop_df['pct'].std(), inplace=True)
    pop_df['pct'] = round(pop_df['pct'])

    pop_df['inet_users'] = pop_df['pop'] * (pop_df['pct']/100)

    cc_df = pd.read_csv(cc_filepath)
    pop_df = pop_df.join(cc_df[['alpha-3', 'alpha-2']].set_index('alpha-3'), on='Country Code')
    pop_df = pop_df[['inet_users', 'alpha-2']]
    pop_df.rename(columns = {'alpha-2':'cc'}, inplace = True)

    return pop_df

In [1102]:
def b1_data_prep(path):
    #  function to prep tor data in pandas
    
    files = Path(path).glob('*.csv')
    dfs = []
    
    for i in files:
        data = pd.read_csv(i, header=5)
        dfs.append(data)
        
    df = pd.concat(dfs, ignore_index=True)
    df.dropna(subset = ['country'], inplace = True)
    df = df[(df["country"] != '??')]
    
    df['country']= df['country'].str.upper()
    df.rename(columns = {'country':'cc'}, inplace = True)
    
    columns = ['upper', 'lower', 'frac']
    df.drop(columns, inplace=True, axis=1)
    
    #filter df with only these countries
    cc_list = ['CN','BR','DE','IN','IQ', 'RW']
    df = df[df['cc'].isin(cc_list)]
    
    df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
    
    #  filter for only 2022 data
    df = df.loc[(df['date'] >= '2022-01-01') & (df['date'] <='2022-12-31')]
    df['month'] = df['date'].dt.to_period('M')
    
    inet_df = get_inet_pop(b1_filepath)
    
    df = df.join(inet_df.set_index('cc'), on='cc')
    
    df['pct'] = (df['users']/ df['inet_users']) * 100
                        
    #Get average monthly users as a percentage of total internet users and clip to a max of 100 points
    #The number of users measures is best averaged out of a time span as a total sum will over inflate figures. A Mean is a more accurate calculation.
    
    
    mon_df = df.groupby(['cc', 'month'])['pct'].mean().round(1).reset_index(name='b1')
    

    
    return mon_df

In [1103]:
b1_df = b1_data_prep(b1_path)

In [1104]:
b1_df

Unnamed: 0,cc,month,b1
0,BR,2022-01,0.2
1,BR,2022-02,0.2
2,BR,2022-03,0.2
3,BR,2022-04,0.2
4,BR,2022-05,0.2
5,BR,2022-06,0.2
6,BR,2022-07,0.2
7,CN,2022-01,0.0
8,CN,2022-02,0.0
9,CN,2022-03,0.0


In [747]:
c1_path = "/Users/coes3/phd-labs/fraganal/data_source/shut/"

In [1111]:
def c1_data_prep(path):
    files = Path(path).glob('*.csv')
    dfs = []

    for i in files:
        data = pd.read_csv(i, header=0)
        dfs.append(data)

    df = pd.concat(dfs, ignore_index=True)

    cc_df = pd.read_csv(cc_filepath)
    df = df.join(cc_df[['name', 'alpha-2']].set_index('name'), on='country')
    df.rename(columns = {'alpha-2':'cc'}, inplace = True)

    cc_list = ['CN','BR','DE','IN','IQ', 'RW']
    df = df[df['cc'].isin(cc_list)]

    df['start_date'] = pd.to_datetime(df['start_date'])
    df['end_date'] = pd.to_datetime(df['end_date'])

    #  filter for only 2022 data 
    df = df.loc[(df['start_date'] >= '2022-01-01') & (df['start_date'] <='2022-12-31')]
    
    #  drop any row without a start or end date
    df.dropna(subset=['start_date', 'end_date'], inplace=True)
    
    #  create a duration column with day numbers betwee start and end date
    df['duration'] = df['end_date'] - df['start_date']
    df['duration'] = df['duration'].astype('timedelta64[D]')
    df.drop(columns=['end_date'], inplace=True)
    df.rename(columns = {'start_date':'date'}, inplace = True)
    df['event_count'] = 1
    #df.set_index('date', inplace=True)
    df = df[['date', 'cc', 'duration']]
    df['month'] = df['date'].dt.to_period('M')
   
    
    #Get sum of monthly shutdowns as a percentage of 365 days and clip to a max of 100 points
    mon_df = df.groupby(['cc', 'month'])['duration'].sum().reset_index(name='c1')
    mon_df['c1'] = round((mon_df['c1']/365) * 100)
    mon_df['c1'].clip(0, 100, inplace=True)
    
    
    return mon_df

In [1112]:
c1_df = c1_data_prep(c1_path)

In [1113]:
c1_df

Unnamed: 0,cc,month,c1
0,BR,2022-03,1.0
1,IN,2022-01,6.0
2,IN,2022-02,5.0
3,IN,2022-03,3.0
4,IN,2022-04,6.0
5,IN,2022-05,4.0
6,IN,2022-06,10.0
7,IN,2022-07,0.0
8,IN,2022-08,4.0
9,IN,2022-09,0.0


In [1114]:
d_path = "/Users/coes3/phd-labs/fraganal/data_source/radar/"

In [1158]:
def d_data_prep(proto_type, path):
    #  function to prep radar data in pandas
    
    files = Path(path).glob(f'{proto_type}Versions*.csv')
    dfs = []
    
    for i in files:
        data = pd.read_csv(i, index_col=None, header=0)
        data['cc'] = i.name.split('_')[-1].strip('.csv')
        dfs.append(data)
        
    df = pd.concat(dfs, ignore_index=True)

    df['date'] = pd.to_datetime(df['Serie_0 timestamps'], format='%Y-%m-%d')
    df['date'] = df['date'].dt.tz_convert(None)
    df['month'] = df['date'].dt.to_period('M')
    #  filter for only 2022 data
    df = df.loc[(df['date'] >= '2022-01-01') & (df['date'] <='2022-12-31')]
    
    if proto_type == "IP":
        df.rename(columns={"Serie_0  I Pv4":"ipv4", "Serie_0  I Pv6":"ipv6"}, inplace = True)
        df = df[['date', 'cc', 'ipv6', 'month']]
        df['pct'] = round(100 - df['ipv6'])
        
        #  Get average of monthly pct of version usage, clip to a max of 100 points
        mon_df = df.groupby(['cc', 'month'])['pct'].mean().round().reset_index(name='d1')
        mon_df['d1'].clip(0, 100, inplace=True)
    
    
    else:
        df.rename(columns={"Serie_0 timestamps":"date", "Serie_0  T L S 1.3":"tlsv1_3", "Serie_0  T L S 1.2":"tlsv1_2", 
                           "Serie_0  T L S  Q U I C":"tlsvquic", "Serie_0  T L S 1.0":"tlsv1_0", "Serie_0  T L S 1.1":"tlsv1_1"}, inplace = True)
        df['tls1_3'] = df['tlsv1_3'] + df['tlsvquic'] 
        df = df[['date', 'cc','tls1_3', 'month']]
        df['pct'] = round(100 - df['tls1_3'])
    
        #  Get aggregate of monthly pct of version usage, clip to a max of 100 points
        mon_df = df.groupby(['cc', 'month'])['pct'].mean().round().reset_index(name='d2')
        mon_df['d2'].clip(0, 100, inplace=True)
    
    return mon_df

In [1159]:
d1_df = d_data_prep('IP', d_path)

In [1160]:
d2_df = d_data_prep('TLS', d_path)

In [1161]:
d1_df

Unnamed: 0,cc,month,d1
0,BR,2022-01,73.0
1,BR,2022-02,73.0
2,BR,2022-03,72.0
3,BR,2022-04,73.0
4,BR,2022-05,73.0
...,...,...,...
67,RW,2022-08,90.0
68,RW,2022-09,91.0
69,RW,2022-10,92.0
70,RW,2022-11,90.0


In [1162]:
d2_df.sample(6)

Unnamed: 0,cc,month,d2
51,IQ,2022-04,25.0
33,DE,2022-10,17.0
45,IN,2022-10,17.0
36,IN,2022-01,24.0
26,DE,2022-03,18.0
3,BR,2022-04,21.0


In [1163]:
data_frames = [a1_df, b1_df, c1_df, d1_df, d2_df]

In [1164]:
df_merged = reduce(lambda  left,right: pd.merge(left, right, on=['month', 'cc'],
                                            how='left'), data_frames)

In [1165]:
df_merged

Unnamed: 0,cc,month,a1,b1,c1,d1,d2
0,BR,2022-01,0.0,0.2,,73.0,22.0
1,BR,2022-02,0.0,0.2,,73.0,24.0
2,BR,2022-03,0.0,0.2,1.0,72.0,21.0
3,BR,2022-04,0.0,0.2,,73.0,21.0
4,BR,2022-05,0.0,0.2,,73.0,21.0
...,...,...,...,...,...,...,...
62,RW,2022-04,0.0,0.0,,90.0,29.0
63,RW,2022-05,0.0,0.1,,91.0,30.0
64,RW,2022-08,0.0,,,90.0,21.0
65,RW,2022-10,0.0,,,92.0,21.0


In [1166]:
ann_df = df_merged.groupby('cc').agg({'a1': 'mean','b1': 'mean', 'c1':'sum', 'd1':'mean', 'd2':'mean'}).reset_index()

In [1167]:
mon_df = df_merged.groupby(['month', 'cc']).agg({'a1': 'mean', 'b1': 'mean', 'c1':'sum', 'd1':'mean', 'd2':'mean'}).reset_index()

In [1168]:
mon_df

Unnamed: 0,month,cc,a1,b1,c1,d1,d2
0,2022-01,BR,0.00,0.2,0.0,73.0,22.0
1,2022-01,CN,0.75,0.0,0.0,83.0,35.0
2,2022-01,DE,0.06,0.3,0.0,70.0,20.0
3,2022-01,IN,0.01,0.1,6.0,44.0,24.0
4,2022-01,IQ,0.00,0.0,0.0,99.0,19.0
...,...,...,...,...,...,...,...
62,2022-12,BR,0.00,,0.0,77.0,18.0
63,2022-12,CN,0.98,,0.0,82.0,23.0
64,2022-12,DE,0.00,,0.0,74.0,17.0
65,2022-12,IN,0.00,,1.0,52.0,16.0


In [1169]:
ann_df

Unnamed: 0,cc,a1,b1,c1,d1,d2
0,BR,0.0,0.2,1.0,74.166667,19.833333
1,CN,0.609167,0.0,0.0,82.25,32.583333
2,DE,0.01,0.385714,0.0,73.416667,19.083333
3,IN,0.001667,0.1,42.0,45.916667,20.5
4,IQ,0.0,0.028571,7.0,99.833333,19.833333
5,RW,0.0,0.025,0.0,91.0,25.571429


In [1170]:
def get_findex(tdf, index_type='year'):
    df = tdf.copy()
    
    if index_type == 'mon':     
        df['frag_index'] = round((df.sum(axis=1, numeric_only=True) / len(df.select_dtypes('number').columns)))
        findex_df = df.copy()
        findex_df = findex_df[['month', 'cc', 'frag_index']]
        findex_df.sort_values(by=['frag_index'], inplace=True)
        return findex_df
    elif index_type == 'year':
        df['frag_index'] = round((df.sum(axis=1, numeric_only=True) / len(df.select_dtypes('number').columns)))
        findex_df = df.copy()
        findex_df = findex_df[['cc', 'frag_index']]
        findex_df.sort_values(by=['frag_index'], inplace=True)
        return findex_df
    elif index_type == 'total':
        df['frag_index'] = round((df.sum(axis=1, numeric_only=True) / len(df.select_dtypes('number').columns)))
        findex_df = df.copy()
        findex_df = findex_df[['cc', 'frag_index']]     
        gfn = len(findex_df)
        global_findex = (findex_df['frag_index'].sum() / gfn)
        return global_findex
    

In [1171]:
cc_ann_findex_tbl = get_findex(ann_df, 'year' )

In [1172]:
cc_ann_findex_tbl

Unnamed: 0,cc,frag_index
0,BR,19.0
2,DE,19.0
3,IN,22.0
1,CN,23.0
5,RW,23.0
4,IQ,25.0


In [1173]:
cc_mon_findex_tbl = get_findex(mon_df,'mon')

In [1174]:
cc_mon_findex_tbl

Unnamed: 0,month,cc,frag_index
53,2022-10,IN,13.0
37,2022-07,IN,13.0
48,2022-09,IN,13.0
20,2022-04,IN,14.0
14,2022-03,IN,14.0
...,...,...,...
27,2022-05,IQ,25.0
35,2022-07,CN,25.0
38,2022-07,IQ,25.0
15,2022-03,IQ,25.0


In [1175]:
global_findex = get_findex(ann_df,'total')

In [1176]:
global_findex

21.833333333333332

In [1177]:
fig = go.Figure(data=[go.Table(
    header=dict(values=['Country Code', 'Fragmentation Index'],
                fill_color='paleturquoise',
                align='left'),
    cells=dict(values=cc_ann_findex_tbl.transpose().values.tolist(),
               fill_color='lavender',
               align='left'))
])

fig.show()

In [1021]:
def plot_graph(df):
    fig = px.line(df, x="month", y="frag_index", color="cc", hover_data=['frag_index'], template='simple_white')
    #fig.update_xaxes(dtick="M1", tickformat="%b\n%Y")
    fig.show()

In [None]:
plot_line_graph(cc_mon_findex_tbl)

In [996]:
fotn_filepath = "/Users/coes3/phd-labs/fraganal/data_source/FOTN_2022_Country_Score_Data.xlsx"

In [997]:
def fotn_score(path):
    def note(df):
        if df.Status == 'PF':
            return 'Partially Free'
        elif df.Status == 'NF':
            return 'Not Free'
        else:
            return 'Free'

    df = pd.read_excel(path, header=1, usecols="A:C, AB", engine='openpyxl')

    cc_df = pd.read_csv(c1_filepath)
    df = df.join(cc_df.set_index('Name'), on='Country')
    df.rename(columns = {'Code':'cc'}, inplace = True)

    cc_list = ['CN','BR','DE','IN','IQ', 'RW']
    df = df[df['cc'].isin(cc_list)]
    #df['description'] = df.apply(note, axis=1)

    #Add country code ISO alpha2
    #Join country name to df
    #cc_df = pd.read_csv(filepath_or_buffer="gs://etl-tooling/fotn_country_list.csv", storage_options={"token": credentials})
    #df = df.join(cc_df.set_index('Name'), on='Country')

    #df.rename(columns = {'Code':'country_code'}, inplace = True)
    #df.columns = df.columns.str.lower()
    #df = df[['country', 'country_code', 'edition', 'status', 'description', 'total']]
    return df

Compare with FOTN Score and state

In [1150]:
fotn_df = fotn_score(fotn_filepath)

In [1178]:
cc_ann_findex_tbl = cc_ann_findex_tbl.join(fotn_df.set_index('cc'), on='cc')

In [1179]:
cc_ann_findex_tbl

Unnamed: 0,cc,frag_index,Country,Edition,Status,Total
0,BR,19.0,Brazil,2022,PF,65
2,DE,19.0,Germany,2022,F,77
3,IN,22.0,India,2022,PF,51
1,CN,23.0,China,2022,NF,10
5,RW,23.0,Rwanda,2022,NF,37
4,IQ,25.0,Iraq,2022,PF,42
