## Connection to table in GDrive

In [1]:
from oauth2client.service_account import ServiceAccountCredentials
from df2gspread import df2gspread as d2g
from pandas.io.json import json_normalize
import pandas as pd
import numpy as np
import gspread

---
### Extracted table from automatic sources

In [2]:
table1 = pd.read_csv('../datasets/Arxiv_results.csv', header=None).rename(columns={0:'url',1:'description',2:'topic'})
table1['source'] = 'Arxiv'
table1['source_type'] = 'Research'
table2 = pd.read_csv('../datasets/DBLP_results.csv', header=None).rename(columns={0:'url',1:'description',2:'topic'})
table2['source'] = 'DBLP'
table2['source_type'] = 'Research'
table3 = pd.read_csv('../datasets/Gitlab_results.csv', header=None).rename(columns={0:'url',1:'description',2:'topic'})
table3['source'] = 'Gitlab'
table3['source_type'] = 'Software registry'
table4 = pd.read_csv('../datasets/Googleplay_results.csv', header=None).rename(columns={0:'url',1:'description',2:'topic'})
table4['source'] = 'Google play'
table4['source_type'] = 'Software registry'
table5 = pd.read_csv('../datasets/Googlescholar_results.csv', header=None).rename(columns={0:'url',1:'description',2:'topic'})
table5['source'] = 'Google scholar'
table5['source_type'] = 'Research'
table6 = pd.read_excel('../datasets/github_monitoring_tools.xlsx')[['url','description','topic']]
table6['source'] = 'Github'
table6['source_type'] = 'Software registry'
table7 = pd.read_excel('../datasets/patents_monitoring_tools.xlsx')[['url','description','topic']]
table7['source'] = 'EU Patents Database'
table7['source_type'] = 'Software registry'

In [3]:
# Merge the tables
tableA = table1.append(table2, sort=True).reset_index()
tableB = tableA.append(table3, sort=True).reset_index()[['url','description','topic','source','source_type']]
tableC = tableB.append(table4, sort=True).reset_index()[['url','description','topic','source','source_type']]
tableD = tableC.append(table5, sort=True).reset_index()[['url','description','topic','source','source_type']]
tableE = tableD.append(table6, sort=True).reset_index()[['url','description','topic','source','source_type']]
tableF = tableE.append(table7, sort=True).reset_index()[['url','description','topic','source','source_type']]

In [4]:
automathic_table = tableF
automathic_table['name'] = 'NA'
automathic_table['method'] = 'Crawler search'

In [5]:
automathic_table.head()

Unnamed: 0,url,description,topic,source,source_type,name,method
0,http://arxiv.org/abs/1706.01560v1,The profitability of fraud in online systems s...,ai consumer fraud online,Arxiv,Research,,Crawler search
1,http://arxiv.org/abs/1805.00464v1,The e-commerce share in the global retail spen...,ai consumer fraud online,Arxiv,Research,,Crawler search
2,http://arxiv.org/abs/1906.04272v2,Given the magnitude of online auction transact...,ai consumer fraud online,Arxiv,Research,,Crawler search
3,http://arxiv.org/abs/1906.07974v1,Providers of online marketplaces are constantl...,ai consumer fraud online,Arxiv,Research,,Crawler search
4,http://arxiv.org/abs/1806.08910v1,We introduce the fraud de-anonymization proble...,ai consumer fraud online,Arxiv,Research,,Crawler search


---
### Connect with the table form manual search

In [6]:
## Connect to our service account
project = 
scope = ['https://spreadsheets.google.com/feeds']
credentials = ServiceAccountCredentials.from_json_keyfile_name(project, scope)
gc = gspread.authorize(credentials)

In [7]:
##Get candidate data sheet from Google Drive
spreadsheet_key = 
worksheet_name = 'manual_search'

In [8]:
book = gc.open_by_key(spreadsheet_key)
worksheet = book.worksheet(worksheet_name)
data = worksheet.get_all_values()
manual_table = pd.DataFrame(data[1:], columns=data[0])

In [9]:
manual_table.head(2)

Unnamed: 0,name,url,description,topic,method,source,source_type
0,Detection of Zombie PCs Based on Email Spam An...,https://apps.webofknowledge.com/full_record.do...,we propose a system that detects botnets and z...,"""IP blocking""",Research search,Web of Science Database,Research
1,Engineering an Agent-based System for Product ...,https://users.isc.tuc.gr/~nispanoudakis/resour...,For developing pricing by companies,"""individual pricing""",Research search,Web of Science Database,Research


---
### Merge the tables and remove duplictes

In [10]:
manual_table.head()

Unnamed: 0,name,url,description,topic,method,source,source_type
0,Detection of Zombie PCs Based on Email Spam An...,https://apps.webofknowledge.com/full_record.do...,we propose a system that detects botnets and z...,"""IP blocking""",Research search,Web of Science Database,Research
1,Engineering an Agent-based System for Product ...,https://users.isc.tuc.gr/~nispanoudakis/resour...,For developing pricing by companies,"""individual pricing""",Research search,Web of Science Database,Research
2,Automatic Detection of Airline Ticket Price an...,https://ieeexplore.ieee.org/stamp/stamp.jsp?tp...,Might include references to tools detecting pr...,"""price discrimination"" AND detection",Research search,Web of Science Database,Research
3,Web services security proxy: A centralized thi...,https://apps.webofknowledge.com/full_record.do...,In this paper a thin-client framework based on...,"""IP blocking""",Research search,Web of Science Database,Research
4,Share or Not: Investigating the Presence of La...,https://ieeexplore.ieee.org/stamp/stamp.jsp?tp...,We investigate the presence of large-scale add...,"""geoblocking""",Research search,Web of Science Database,Research


In [11]:
automathic_table.head()

Unnamed: 0,url,description,topic,source,source_type,name,method
0,http://arxiv.org/abs/1706.01560v1,The profitability of fraud in online systems s...,ai consumer fraud online,Arxiv,Research,,Crawler search
1,http://arxiv.org/abs/1805.00464v1,The e-commerce share in the global retail spen...,ai consumer fraud online,Arxiv,Research,,Crawler search
2,http://arxiv.org/abs/1906.04272v2,Given the magnitude of online auction transact...,ai consumer fraud online,Arxiv,Research,,Crawler search
3,http://arxiv.org/abs/1906.07974v1,Providers of online marketplaces are constantl...,ai consumer fraud online,Arxiv,Research,,Crawler search
4,http://arxiv.org/abs/1806.08910v1,We introduce the fraud de-anonymization proble...,ai consumer fraud online,Arxiv,Research,,Crawler search


In [12]:
table = manual_table.append(
    automathic_table, sort=True).reset_index()[['url','name','description','topic','method','source','source_type']]

In [13]:
#duplicates identification
list_id = list(table.url)
list_uniques = list(table.url.unique())
print('Total tools: {}\nUnique tools: {}\n======='.format(len(list_id),len(list_uniques)))
for i in range(0,10):
    sets = list(set([x for x in list_id if list_id.count(x) > i]))
    print('url with {} duplicates: {}'.format(i, len(sets)))
print('=======\nlines to remove: {}'.format(len(list_id) - len(list_uniques)))

Total tools: 2127
Unique tools: 1944
url with 0 duplicates: 1944
url with 1 duplicates: 145
url with 2 duplicates: 30
url with 3 duplicates: 6
url with 4 duplicates: 2
url with 5 duplicates: 0
url with 6 duplicates: 0
url with 7 duplicates: 0
url with 8 duplicates: 0
url with 9 duplicates: 0
lines to remove: 183


In [14]:
## Save the data back to a new sheet in the dataframe
table = table.drop_duplicates(subset ='url')

---
## Sampling

In [15]:
sample_size = 80

In [16]:
groups = table.groupby('source').count()['url'].reset_index()
groups['proportion'] = groups.url/groups.url.sum()
groups['selection'] = [int(i) for i in groups.proportion*sample_size]

In [17]:
groups

Unnamed: 0,source,url,proportion,selection
0,Arxiv,436,0.22428,17
1,DBLP,219,0.112654,9
2,EU Patents Database,151,0.077675,6
3,Github,177,0.091049,7
4,Gitlab,67,0.034465,2
5,Google play,413,0.212449,16
6,Google scholar,418,0.215021,17
7,HeinOnline database,17,0.008745,0
8,Web of Science Database,46,0.023663,1


In [18]:
np.random.seed(32)
matrix = []
for source in groups.source:
    range_group = table[table.source == str(source)].index
    selection = groups[groups.source == str(source)].selection.values[0]
    print(source, ' ', selection)
    group_sample = list(np.random.choice(range_group, selection))
    matrix.append(group_sample)
stratified_sample = sorted(set([item for sublist in matrix for item in sublist]))

Arxiv   17
DBLP   9
EU Patents Database   6
Github   7
Gitlab   2
Google play   16
Google scholar   17
HeinOnline database   0
Web of Science Database   1


In [19]:
table['sample'] = [1 if i in stratified_sample else 0 for i in table.index]        

In [23]:
table['sample'].sum()

74

---

In [21]:
wks_name = 'clean_database'
d2g.upload(table, spreadsheet_key, wks_name, credentials=credentials, row_names=False)

<Worksheet 'clean_database' id:753128839>

In [22]:
d2g.upload(table[['topic','source']].drop_duplicates(subset ='topic'), spreadsheet_key,
           'terms', credentials=credentials, row_names=False)

<Worksheet 'terms' id:1326733241>