## Connection to table in GDrive

In [1]:
from oauth2client.service_account import ServiceAccountCredentials
from df2gspread import df2gspread as d2g
from pandas.io.json import json_normalize
import pandas as pd
import numpy as np
import gspread
import warnings
warnings.filterwarnings("ignore")

---
### Extracted table from automatic sources

In [2]:
table1 = pd.read_csv('../datasets/Arxiv_results.csv', header=None).rename(columns={0:'url',1:'description',2:'topic'})
table1['source'] = 'Arxiv'
table1['source_type'] = 'Research'
table2 = pd.read_csv('../datasets/DBLP_results.csv', header=None).rename(columns={0:'url',1:'description',2:'topic'})
table2['source'] = 'DBLP'
table2['source_type'] = 'Research'
table3 = pd.read_csv('../datasets/Gitlab_results.csv', header=None).rename(columns={0:'url',1:'description',2:'topic'})
table3['source'] = 'Gitlab'
table3['source_type'] = 'Software registry'
table4 = pd.read_csv('../datasets/Googleplay_results.csv', header=None).rename(columns={0:'url',1:'description',2:'topic'})
table4['source'] = 'Google play'
table4['source_type'] = 'Software registry'
table5 = pd.read_csv('../datasets/Googlescholar_results.csv', header=None).rename(columns={0:'url',1:'description',2:'topic'})
table5['source'] = 'Google scholar'
table5['source_type'] = 'Research'
table6 = pd.read_excel('../datasets/github_monitoring_tools.xlsx')[['url','description','topic']]
table6['source'] = 'Github'
table6['source_type'] = 'Software registry'
table7 = pd.read_excel('../datasets/patents_monitoring_tools.xlsx')[['url','description','topic']]
table7['source'] = 'EU Patents Database'
table7['source_type'] = 'Software registry'

In [3]:
# Merge the tables
tableA = table1.append(table2, sort=True).reset_index()
tableB = tableA.append(table3, sort=True).reset_index()[['url','description','topic','source','source_type']]
tableC = tableB.append(table4, sort=True).reset_index()[['url','description','topic','source','source_type']]
tableD = tableC.append(table5, sort=True).reset_index()[['url','description','topic','source','source_type']]
tableE = tableD.append(table6, sort=True).reset_index()[['url','description','topic','source','source_type']]
tableF = tableE.append(table7, sort=True).reset_index()[['url','description','topic','source','source_type']]

In [4]:
automathic_table = tableF
automathic_table['name'] = 'NA'
automathic_table['method'] = 'Crawler search'

In [5]:
automathic_table.head()

Unnamed: 0,url,description,topic,source,source_type,name,method
0,http://arxiv.org/abs/1305.5959v2,Archiving the web is socially and culturally c...,wayback machine,Arxiv,Research,,Crawler search
1,http://arxiv.org/abs/1309.4016v1,The Internet Archive's (IA) Wayback Machine is...,wayback machine,Arxiv,Research,,Crawler search
2,http://arxiv.org/abs/1904.12636v1,"In designing a distributed service, three desi...",wayback machine,Arxiv,Research,,Crawler search
3,http://arxiv.org/abs/1801.10396v2,Web archiving services play an increasingly im...,wayback machine,Arxiv,Research,,Crawler search
4,http://arxiv.org/abs/1604.05923v1,This paper describes how born digital primary ...,wayback machine,Arxiv,Research,,Crawler search


---
### Connect with the table form manual search

In [6]:
## Connect to our service account
project = 
scope = ['https://spreadsheets.google.com/feeds']
credentials = ServiceAccountCredentials.from_json_keyfile_name(project, scope)
gc = gspread.authorize(credentials)

In [7]:
##Get candidate data sheet from Google Drive
spreadsheet_key = 
worksheet_name = 'manual_search'

In [8]:
book = gc.open_by_key(spreadsheet_key)
worksheet = book.worksheet(worksheet_name)
data = worksheet.get_all_values()
manual_table = pd.DataFrame(data[1:], columns=data[0])

---
### Merge the tables and remove duplictes

In [9]:
manual_table.drop(columns='comment', inplace=True)
manual_table.head(2)

Unnamed: 0,name,url,description,topic,method,source,source_type
0,Detection of Zombie PCs Based on Email Spam An...,https://apps.webofknowledge.com/full_record.do...,we propose a system that detects botnets and z...,"""IP blocking""",Research search,Web of Science Database,Research
1,Engineering an Agent-based System for Product ...,https://users.isc.tuc.gr/~nispanoudakis/resour...,For developing pricing by companies,"""individual pricing""",Research search,Web of Science Database,Research


In [10]:
len(manual_table)

72

In [11]:
automathic_table.head(2)

Unnamed: 0,url,description,topic,source,source_type,name,method
0,http://arxiv.org/abs/1305.5959v2,Archiving the web is socially and culturally c...,wayback machine,Arxiv,Research,,Crawler search
1,http://arxiv.org/abs/1309.4016v1,The Internet Archive's (IA) Wayback Machine is...,wayback machine,Arxiv,Research,,Crawler search


In [12]:
len(automathic_table)

6306

In [15]:
#just taking into account the automatic table for the moment
table = automathic_table
#table = manual_table.append(
#    automathic_table, sort=True).reset_index()[['url','name','description','topic','method','source','source_type']]

In [16]:
#duplicates identification
list_id = list(table.url)
list_uniques = list(table.url.unique())
print('Total tools: {}\nUnique tools: {}\n======='.format(len(list_id),len(list_uniques)))
for i in range(0,15):
    sets = list(set([x for x in list_id if list_id.count(x) > i]))
    print('url with {} duplicates: {}'.format(i, len(sets)))
print('=======\nlines to remove: {}'.format(len(list_id) - len(list_uniques)))

Total tools: 6306
Unique tools: 5004
url with 0 duplicates: 5004
url with 1 duplicates: 676
url with 2 duplicates: 260
url with 3 duplicates: 134
url with 4 duplicates: 75
url with 5 duplicates: 46
url with 6 duplicates: 36
url with 7 duplicates: 31
url with 8 duplicates: 25
url with 9 duplicates: 11
url with 10 duplicates: 4
url with 11 duplicates: 2
url with 12 duplicates: 1
url with 13 duplicates: 1
url with 14 duplicates: 0
lines to remove: 1302


In [17]:
## Save the data back to a new sheet in the dataframe
table = table.drop_duplicates(subset ='url')

In [18]:
table['topic'] = [row.replace("'","") for row in table.topic]

In [19]:
len(table)

5004

In [20]:
table.head()

Unnamed: 0,url,description,topic,source,source_type,name,method
0,http://arxiv.org/abs/1305.5959v2,Archiving the web is socially and culturally c...,wayback machine,Arxiv,Research,,Crawler search
1,http://arxiv.org/abs/1309.4016v1,The Internet Archive's (IA) Wayback Machine is...,wayback machine,Arxiv,Research,,Crawler search
2,http://arxiv.org/abs/1904.12636v1,"In designing a distributed service, three desi...",wayback machine,Arxiv,Research,,Crawler search
3,http://arxiv.org/abs/1801.10396v2,Web archiving services play an increasingly im...,wayback machine,Arxiv,Research,,Crawler search
4,http://arxiv.org/abs/1604.05923v1,This paper describes how born digital primary ...,wayback machine,Arxiv,Research,,Crawler search


---
## Sampling

In [40]:
def sampleSize(population_size, margin_error=.05, confidence_level=.99, sigma=1/2):
    alpha = 1 - (confidence_level)
    # dictionary of confidence levels and corresponding z-scores
    # computed via norm.ppf(1 - (alpha/2)), where norm is
    zdict = {
        .90: 1.645,
        .91: 1.695,
        .99: 2.576,
        .97: 2.17,
        .94: 1.881,
        .93: 1.812,
        .95: 1.96,
        .98: 2.326,
        .96: 2.054,
        .92: 1.751
    }
    if confidence_level in zdict:
        z = zdict[confidence_level]
    else:
        from scipy.stats import norm
        z = norm.ppf(1 - (alpha/2))
    N = population_size
    M = margin_error
    a = z**2 * sigma**2 * (N / (N-1))
    b = M**2 + ((z**2 * sigma**2)/(N-1))
    return int(round(a/b,0))

In [41]:
sample_size = sampleSize(population_size=len(table),
                         margin_error=.05, confidence_level=.95, sigma=1/2)
print('Sample Size: {}'.format(sample_size))

Sample Size: 357


In [42]:
catalogue = pd.read_csv('catalogue.csv').fillna(method='ffill')
catalogue.columns = ['group','desciption','subgroup','terms','keywords']
catalogue = catalogue[['group','subgroup','keywords']]
catalogue.head(3)

Unnamed: 0,group,subgroup,keywords
0,Web Evidence,Internet snaps and internet archives,wayback machine
1,Web Evidence,Internet snaps and internet archives,internet snap
2,Web Evidence,Internet snaps and internet archives,webpage snap


In [43]:
table_augmented = pd.merge(table, catalogue, left_on='topic', right_on='keywords', how='left')
table_augmented['stratum'] = table_augmented['source'] + ' - ' + table_augmented['group']
#table_augmented['stratum'] = table_augmented['group']

In [44]:
groups = table_augmented.groupby('group').count()['url'].reset_index()
groups['proportion'] = groups.url/groups.url.sum()
groups['selection'] = [int(i) for i in groups.proportion*sample_size]

In [45]:
groups

Unnamed: 0,group,url,proportion,selection
0,Advertising,1647,0.329598,117
1,Compliance,284,0.056834,20
2,Dark Patterns,348,0.069642,24
3,Geoblocking,305,0.061037,21
4,Price Transparency Issues,285,0.057034,20
5,Scam (Online Fraud),985,0.197118,70
6,Unfair terms issues,334,0.06684,23
7,Web Evidence,809,0.161897,57


In [46]:
stratums = table_augmented.groupby('stratum').count()['url'].reset_index()
stratums['proportion'] = stratums.url/stratums.url.sum()
stratums['selection'] = [int(i) for i in stratums.proportion*sample_size]

In [47]:
stratums

Unnamed: 0,stratum,url,proportion,selection
0,Arxiv - Advertising,493,0.098659,35
1,Arxiv - Compliance,140,0.028017,10
2,Arxiv - Dark Patterns,304,0.060837,21
3,Arxiv - Geoblocking,147,0.029418,10
4,Arxiv - Price Transparency Issues,146,0.029218,10
5,Arxiv - Scam (Online Fraud),392,0.078447,28
6,Arxiv - Unfair terms issues,126,0.025215,9
7,Arxiv - Web Evidence,228,0.045627,16
8,DBLP - Advertising,375,0.075045,26
9,DBLP - Compliance,74,0.014809,5


In [48]:
np.random.seed(32)
matrix = []
for source in stratums.stratum:
    range_group = table_augmented[table_augmented.stratum == str(source)].index
    selection = stratums[stratums.stratum == str(source)].selection.values[0]
    #print(source, ' ', selection)
    group_sample = list(np.random.choice(range_group, selection))
    matrix.append(group_sample)
stratified_sample = sorted(set([item for sublist in matrix for item in sublist]))

In [49]:
table_augmented['first_sample'] = [1 if i in stratified_sample else 0 for i in table_augmented.index]        

In [50]:
table_augmented['first_sample'].sum()

329

In [51]:
#these are going to be excluded in the nex round of sam[ling]
index_first_sample = table_augmented[table_augmented['first_sample'] == 1].index

In [52]:
index_first_sample

Int64Index([   6,    7,   30,   38,   41,   48,   53,   58,   59,   62,
            ...
            4888, 4910, 4925, 4940, 4950, 4971, 4974, 4982, 4987, 4996],
           dtype='int64', length=329)

---

In [53]:
wks_name = 'clean_database'
d2g.upload(table_augmented, spreadsheet_key, wks_name, credentials=credentials, row_names=False)

<Worksheet 'clean_database' id:1326443644>

In [54]:
#d2g.upload(table_augmented[['topic','source']].drop_duplicates(subset ='topic'), spreadsheet_key,
#           'terms', credentials=credentials, row_names=False)

In [55]:
len(table_augmented)

5004