## Exploring IT/AI tools for monitoring online markets (EU Patents DB)

In [2]:
from flashtext import KeywordProcessor
import time
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

### Time of execution
start = time.time()

# Phrases searched for in the patent abstracts, grouped by theme.
monitoring_terms = [
    # personalised pricing / price discrimination
    'monitoring personal pricing', 'personal pricing',
    'monitoring individual pricing', 'individual pricing',
    'monitoring price discrimination', 'price discrimination',
    # geoblocking
    'ip blocking', 'geoblocking', 'geoblocking monitoring',
    # scams and identity theft
    'scam', 'scam monitoring', 'monitoring identity theft', 'identity theft',
    'do not call registry', 'auction scam', 'telemarketing scam', 'credit scam',
    'loan scam', 'sweepstakes scam', 'refund scam', 'web beacon',
    # advertising and tracking
    # NOTE(review): 'behavioural online+tracking' contains a literal '+', so it
    # only matches text spelled exactly that way -- confirm the intended phrase.
    'ad profiling', 'behavioural online+tracking',
    # Trailing space removed from 'influencer marketing': the matcher treats it
    # as part of the keyword, which keeps the phrase from matching when it is
    # followed directly by another word.
    'influencer marketing', 'fake followers', 'misleading advertising',
    'cookie monitoring', 'cookie tracking', 'ad blocking', 'market monitoring',
    # fake reviews
    'fake reviews', 'fake reviews detection', 'tracking fake reviews',
    # scarcity cues
    'scarcity cues', 'website timer', 'monitoring scarcity cues', 'monitoring website timer',
]

print('terms to use for monitoring: {}'.format(len(monitoring_terms)))

patents_url = 'http://data.patentsview.org/20190312/download/patent.tsv.zip'
try:
    # pandas >= 1.3: 'warn' skips malformed rows and reports them, which is what
    # error_bad_lines=False did (error_bad_lines was removed in pandas 2.0).
    patents = pd.read_csv(patents_url, sep='\t', on_bad_lines='warn')
except TypeError:
    # pandas < 1.3 does not know on_bad_lines and rejects the keyword before
    # reading anything; fall back to the legacy spelling.
    patents = pd.read_csv(patents_url, sep='\t', error_bad_lines=False)


print('total patents database: {}'.format(len(patents)))

def extract(vec, dictionary, info=False):
    """Run the keyword matcher over every element of ``vec``.

    Each element is converted with ``str()`` before matching, so non-string
    values are matched against their string form.

    Args:
        vec: iterable of texts to scan.
        dictionary: object exposing ``extract_keywords(text, span_info=...)``.
        info: forwarded to ``extract_keywords`` as ``span_info``.

    Returns:
        A list with one entry per element of ``vec``, each entry being whatever
        ``dictionary.extract_keywords`` returned for that element.
    """
    return [dictionary.extract_keywords(str(text), span_info=info) for text in vec]

# Process: tag every patent abstract with the monitoring terms it mentions.
data = patents  # same object as `patents`: the columns added below appear under both names
dictionary = KeywordProcessor()
dictionary.add_keywords_from_list(monitoring_terms)
extracted = extract(data.abstract, dictionary)

# Distinct terms per abstract; '' (rather than an empty list) marks "no match".
row = []
for found in extracted:
    row.append(list(set(found)) if len(found) > 0 else '')

# Store the terms as text: the list's repr with the square brackets stripped.
matches_text = []
for terms in row:
    matches_text.append(str(terms).replace('[', '').replace(']', ''))
data['matches'] = matches_text

# Total hits (repeats included) and number of distinct terms per abstract.
data['count_matches'] = list(map(len, extracted))
data['count_unique_matches'] = [len(set(found)) for found in extracted]

#get the data matches
# .copy() makes data_matches an independent frame, so the columns added below
# are written to it and not to a slice of `data` (which triggers pandas'
# SettingWithCopyWarning -- silenced globally in this notebook).
data_matches = data[data['count_matches'] > 0].copy()

# Strip the dashes from the date (a value like '2019-03-12' becomes '20190312').
# Going through str() keeps a missing date from raising an AttributeError.
data_matches['date_str'] = [str(d).replace('-', '') for d in data_matches['date']]

# One Espacenet bibliographic-details URL per matching patent, built from the
# country, id, kind and compact date columns.
links = [
    "https://worldwide.espacenet.com/publicationDetails/biblio?"
    + "CC=" + str(country)
    + "&NR=" + str(number)
    + "&KC=" + str(kind)
    + "&date=" + str(date_str)
    + "&locale=en_EP"
    for country, number, kind, date_str in zip(
        data_matches['country'],
        data_matches['id'],
        data_matches['kind'],
        data_matches['date_str'],
    )
]
data_matches['link'] = links

# Share of patents with at least one match; guarded so an empty input frame
# reports 0.0% instead of raising ZeroDivisionError.
match_share = round(100*len(data_matches)/len(data), 5) if len(data) else 0.0
print('number of matches found: {}, {}%'.format(len(data_matches), match_share))

# Export: patents with the most keyword hits first, one row per distinct link.
ranked = data_matches.sort_values('count_matches', ascending=False)
export = ranked[['title', 'link', 'abstract', 'matches']].drop_duplicates(subset='link')
# Rename the columns to the headers used in the exported spreadsheet.
export = export.rename(columns={
    'title': 'name',
    'link': 'url',
    'abstract': 'description',
    'matches': 'topic',
})
export.to_excel('../datasets/patents_monitoring_tools.xlsx')
print('Exported in /datasets/patents_monitoring_tools.xlsx')

# Report the wall-clock duration of the whole cell as HH:MM:SS.
end = time.time()
elapsed = time.strftime("%H:%M:%S", time.gmtime(end - start))
print('Elapsed time: {}'.format(elapsed))

terms to use for monitoring: 37


b'Skipping line 4505264: expected 11 fields, saw 12\n'
b'Skipping line 4540085: expected 11 fields, saw 12\nSkipping line 4570473: expected 11 fields, saw 12\n'
b'Skipping line 4610402: expected 11 fields, saw 12\nSkipping line 4652985: expected 11 fields, saw 12\n'
b'Skipping line 4662863: expected 11 fields, saw 12\n'


total patents database: 6957999
number of matches found: 151, 0.00217%
Exported in /datasets/patents_monitoring_tools.xlsx
Elapsed time: 00:15:07
