## Preliminary Results

In [1]:
import os
import warnings

import gspread
import numpy as np
import pandas as pd
from df2gspread import df2gspread as d2g
from oauth2client.service_account import ServiceAccountCredentials
from pandas.io.json import json_normalize
warnings.filterwarnings("ignore")

---
### Connect to table

In [2]:
## Connect to our service account
# Path to the service-account JSON key file. It is taken from the environment
# so that the notebook runs from a fresh kernel without a hard-coded (or
# blanked-out) credential path; a missing variable fails with a KeyError that
# names it.
project = os.environ["GOOGLE_SERVICE_ACCOUNT_KEYFILE"]
scope = ['https://spreadsheets.google.com/feeds']
# Build the OAuth2 credentials from the key file and authorise the gspread client.
credentials = ServiceAccountCredentials.from_json_keyfile_name(project, scope)
gc = gspread.authorize(credentials)

In [3]:
# Key of the Google Sheets workbook, read from the environment so the cell is
# runnable without editing the notebook.
spreadsheet_key = os.environ["GOOGLE_SPREADSHEET_KEY"]
book = gc.open_by_key(spreadsheet_key)


def _worksheet_to_frame(workbook, sheet_name):
    """Load one worksheet into a DataFrame, using its first row as the header.

    Parameters
    ----------
    workbook : object returned by ``gc.open_by_key``
    sheet_name : str
        Title of the worksheet to read.

    Returns
    -------
    pandas.DataFrame
        One row per worksheet row after the header; cell values are kept
        exactly as returned by ``get_all_values()``.

    Raises
    ------
    ValueError
        If the worksheet has no rows at all (so there is no header row).
    """
    rows = workbook.worksheet(sheet_name).get_all_values()
    if not rows:
        raise ValueError("worksheet {!r} is empty: no header row".format(sheet_name))
    return pd.DataFrame(rows[1:], columns=rows[0])


# One DataFrame per data-collection worksheet.
web_extraction = _worksheet_to_frame(book, 'web_extraction')
manual_search = _worksheet_to_frame(book, 'manual_search')
external_sources = _worksheet_to_frame(book, 'external_sources')
questionnaires = _worksheet_to_frame(book, 'questionnaires')

In [4]:
# Columns shared by every source table; `consensus` is the flag that decides
# whether a tool is relevant.
columns = [
    'url',
    'description',
    'group',
    'topic',
    'method',
    'source',
    'source_type',
    'consensus',
]

In [5]:
# Keep only the rows whose `first_sample` flag is '1', restricted to the
# shared column set, then preview the first two rows.
in_first_sample = web_extraction['first_sample'] == '1'
web_extraction = web_extraction.loc[in_first_sample, columns]
web_extraction.head(2)

Unnamed: 0,url,description,group,topic,method,source,source_type,consensus
0,https://worldwide.espacenet.com/publicationDet...,A method of delivering content from a content ...,Advertising,Behavioural tracking,Crawler search,EU Patents Database,Software registry,1
1,http://arxiv.org/abs/1407.0697v1,SLA (Service level agreement) is defined by an...,Advertising,Behavioural tracking,Crawler search,Arxiv,Research,1


In [6]:
# Restrict the manual-search table to the shared columns and preview it.
manual_search = manual_search.loc[:, columns]
manual_search.head(2)

Unnamed: 0,url,description,group,topic,method,source,source_type,consensus
0,http://apps.webofknowledge.com/full_record.do?...,According to a sample of over thirty thousand ...,Advertising,Web beacon tracker,Research search,Web of Science Database,Research,1
1,http://apps.webofknowledge.com/full_record.do?...,This paper proposes a novel technique that uti...,Advertising,Web beacon tracker,Research search,Web of Science Database,Research,1


In [7]:
# Restrict the external-sources table to the shared columns and preview it.
external_sources = external_sources.loc[:, columns]
external_sources.head(2)

Unnamed: 0,url,description,group,topic,method,source,source_type,consensus
0,https://www.ghostery.com,,Advertising,Behavioural tracking,Research search,Other,Word of mouth,1
1,https://adverifai.com,,Advertising,Behavioural tracking,Research search,Other,Word of mouth,0


In [8]:
# Restrict the questionnaire table to the shared columns and preview it.
# NOTE(review): for this table the `source` column holds respondent e-mail
# addresses, which end up in the rendered preview -- consider clearing the
# output before sharing the notebook.
questionnaires = questionnaires.loc[:, columns]
questionnaires.head(2)

Unnamed: 0,url,description,group,topic,method,source,source_type,consensus
0,,Consumer complaints,Consumer complaints,Consumer complaints,Questionnaire,riikka.rosendahl@kkv.fi,Consumer Agency,0
1,https://vico-research.com/,,Web Evidence,Digital Investigations,Questionnaire,marktwaechter@vzbv.de,Consumer Agency,0


In [9]:
# Stack the four source tables into one. A single `pd.concat` replaces the
# chained `DataFrame.append` calls (removed in pandas 2.0, and each call copied
# the accumulated frame). Row labels of the inputs are kept as they are, just
# as the append chain did.
table = pd.concat(
    [web_extraction, manual_search, external_sources, questionnaires]
)

In [10]:
# Count, per source table, the rows whose `consensus` flag is '1'.
relevant_counts = [
    int((frame['consensus'] == '1').sum())
    for frame in (web_extraction, manual_search, external_sources, questionnaires)
]
n1, n2, n3, n4 = relevant_counts

# Report the overall total followed by the per-source breakdown.
summary_template = (
    'Total number of relevant tools: {}\n\n'
    'Web extraction and sample open databases: {}\n'
    'Manual extraction private databases: {}\n'
    'External Sources and desk search: {}\n'
    'Consumer agencies responses: {}'
)
print(summary_template.format(sum(relevant_counts), n1, n2, n3, n4))


Total number of relevant tools: 76

Web extraction and sample open databases: 27
Manual extraction private databases: 8
External Sources and desk search: 22
Consumer agencies responses: 19


In [11]:
# n1 = len(web_extraction)
# n2 = len(manual_search)
# n3 = len(external_sources)
# n4 = len(questionnaires)
# print('Total number of tools: {}\n\nWeb extraction and sample open databases: {}\nManual extraction closed databases: {}\nWord of mouth: {}\nQuestionnaires responses: {}'.format(n1+n2+n3+n4,n1, n2, n3, n4))


In [12]:
# Display the column labels of the combined table as a quick sanity check.
table.columns

Index(['url', 'description', 'group', 'topic', 'method', 'source',
       'source_type', 'consensus'],
      dtype='object')

In [13]:
# Keep only the rows flagged relevant (`consensus` == '1') and renumber the
# index from zero.
is_relevant = table['consensus'] == '1'
relevant_tools = table.loc[is_relevant].reset_index(drop=True)
#raw_tools = table.reset_index(drop=True)

In [14]:
# Number of rows in `relevant_tools`.
relevant_tools.shape[0]
#len(raw_tools)

76

In [15]:
# Number of relevant tools per (group, topic), largest count first.
# The chain is wrapped in parentheses instead of backslash continuations, so
# no line continuation is left dangling after the last call, and `url` is
# selected before counting so only that column is aggregated.
(
    relevant_tools
    .groupby(['group', 'topic'])['url']
    .count()
    .reset_index()
    .sort_values('url', ascending=False)
    .reset_index(drop=True)
)

Unnamed: 0,group,topic,url
0,Web Evidence,Domain registries,11
1,Scam (Online Fraud),General scam & Pishing,9
2,Web Evidence,Webpage snaps,8
3,Advertising,Fake reviews,8
4,Scam (Online Fraud),Financial online fraud,4
5,Price Transparency Issues,Price discrimination,4
6,Geoblocking,VPN,4
7,Unfair terms issues,General terms and conditions,3
8,Advertising,Behavioural tracking,3
9,Advertising,Web beacon tracker,3


In [16]:
# pd.pivot_table(
#     relevant_tools,
#     values=None,
#     index='topic',
#     columns='source_type',
#     aggfunc='count',
#     fill_value=0,
#     margins=False,
#     dropna=True,
#     margins_name='All',
# )['url'].to_csv('table.csv')

In [17]:
# Write the relevant tools to an Excel workbook at a relative path.
output_path = "relevant_tools.xlsx"
relevant_tools.to_excel(output_path)