In [1]:
# Install the third-party packages used by this notebook (--quiet suppresses pip's progress output).
! pip install --quiet warcio ftfy langid blingfire bs4 fastwarc fastcore fasttext
# Upgrade scikit-learn to the newest release available to this environment.
# NOTE(review): presumably needed for TfidfVectorizer.get_feature_names_out, which is called further down — confirm.
! pip install --upgrade scikit-learn
# NOTE(review): fastwarc is already listed in the first install command, so this line looks redundant.
! pip install fastwarc

Requirement already up-to-date: scikit-learn in /anaconda/envs/azureml_py38/lib/python3.8/site-packages (1.2.2)


In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from bs4 import BeautifulSoup, SoupStrainer
from fastcore.parallel import parallel
import blingfire
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import pairwise_distances
import re
import requests
import json
import os
import glob
from datetime import datetime
from scipy.sparse import csr_matrix
import itertools
from multiprocessing import Pool

## Methodology
- Construct a vocabulary of 6-12 character n-grams from the seed websites
- Using the above vocabulary, create a TF-IDF matrix of all of the data
- Create a single reference document by averaging the TF-IDF vectors of the seed data
- Find the closest neighbors of this single reference document

In [3]:
# Functions to read WARC files

from fastwarc import ArchiveIterator
from fastwarc.stream_io import PythonIOStreamAdapter
from io import BytesIO, StringIO

def retrieve_record(offset, stream):
    """Read the single WARC record that starts at byte ``offset`` of ``stream``.

    Returns a ``(headers, payload)`` pair, where ``payload`` is a ``BytesIO``
    containing the record body, or ``None`` (after printing the exception)
    when the record cannot be read.
    """
    stream.seek(offset)
    body = BytesIO()
    try:
        rec = next(ArchiveIterator(stream))
        hdrs = rec.headers
        # Drain the record reader in 4096-byte chunks until it yields nothing.
        while True:
            chunk = rec.reader.read(4096)
            if not chunk:
                break
            body.write(chunk)
    except Exception as err:
        print(err) # sensible handling here
        return None
    return hdrs, body

def extract(offset,f):
    """Return one WARC record as a flat dict, or ``{"error": True}`` on failure.

    Parameters
    ----------
    offset : byte offset of the record inside the open WARC file ``f``.
    f      : binary file object, passed straight through to ``retrieve_record``.

    Returns
    -------
    dict
        The record headers (via ``dict(headers)``) plus ``'offset'`` (the
        offset as a string) and ``'payload'`` (the raw body bytes). If the
        record could not be read, the dict is ``{"error": True}`` instead.
    """
    try:
        record = retrieve_record(offset, f)
        # retrieve_record signals a read failure by returning None.
        if record is None:
            return {"error": True}
        headers, payload = record
        extracted = dict(headers)
        extracted['offset'] = str(offset)
        # getvalue() returns the whole buffer regardless of the stream position.
        extracted['payload'] = payload.getvalue()
    except Exception:
        # Best-effort: a malformed record must not abort the whole load, but
        # KeyboardInterrupt/SystemExit are deliberately allowed to propagate.
        extracted = {"error": True}

    return extracted

In [50]:
def SubmitToScorer(dataframe,accessKey:str, scoringURL:str, teamName:str='NoTeamGiven', verbose=False):
    """POST a submission to the scoring endpoint and return its result.

    Parameters
    ----------
    dataframe  : object with a ``to_dict()`` method; its result is sent as the JSON body.
    accessKey  : sent as the ``code`` query parameter.
    scoringURL : URL the request is posted to.
    teamName   : sent in the ``teamName`` request header.
    verbose    : when true, print the status and response text.

    Returns
    -------
    list
        ``[score, status_code, response_text]``. ``score`` is a float parsed
        from the text after the first ``':'`` of a 200 response, and ``None``
        for a non-200 response or a 200 response whose body cannot be parsed.
    """
    import requests

    response = requests.post(scoringURL,
        params={"code":accessKey},
        headers={'teamName':teamName},
        json=dataframe.to_dict())

    score = None
    if response.status_code == 200:
        try:
            score = float(response.text.split(':')[1].strip('}'))
        except (IndexError, ValueError):
            # Unexpected body: keep score as None so the caller still receives
            # the status code and raw text instead of an exception.
            if verbose:
                print('Could not parse a score from the response body.')
        if verbose:
            print('Submission returning 200. Our score:')
            print(response.text)
    elif verbose:
        print('Non-200 status code returned:')
        print(response.status_code)
        print(response.text)

    return [score, response.status_code, response.text]

In [7]:
def get_score(fnames):
    """Submit the URLs stored in result TSV files and return the score.

    Each file in ``fnames`` is expected to hold tab-separated
    ``score, url, sample_content`` lines, and its name must end in
    ``-<theme>.tsv`` (the theme number is taken from the text after the last
    ``'-'``). Only the ``urls`` and ``theme`` columns are submitted.

    Returns the score reported by ``SubmitToScorer`` (``None`` on failure).
    """
    rows = []
    for fname in fnames:
        # Theme number is encoded in the file name: "...-theme-<N>.tsv".
        theme_num = os.path.splitext(fname.split('-')[-1])[0]
        with open(fname, 'r') as fh:
            for ln in fh:
                # Strip only the line terminator: a full strip() would also drop
                # an empty trailing field and shift the columns of that row.
                ln = ln.rstrip('\r\n')
                if ln:
                    rows.append(ln.split('\t') + [theme_num])

    df = pd.DataFrame(rows)
    df.columns = ['score', 'urls', 'sample_content', 'theme']
    df = df.drop(columns=['score', 'sample_content'])
    df[["theme"]] = df[["theme"]].apply(pd.to_numeric)

    # SECURITY: prefer supplying the key through the environment; the literal is
    # kept only as a backward-compatible fallback and should be removed/rotated.
    accessKey = os.environ.get("DQ23_TRAIN_ACCESS_KEY",
                               "xCYVOL7dV4ow4XeMbqC6rIyAtsdkbp_twfdIaG8aLDXaAzFuKq4PaA==")
    scoringURL = 'https://dq23grandfinal.azurewebsites.net/api/DQ23Train'

    score, statuscode, responseText = SubmitToScorer(df,accessKey,scoringURL,teamName='SAY NO TO TECHNICAL DEBT',verbose=True)
    return score

In [4]:
import fasttext

class LanguageIdentification:
    """Thin wrapper around a fastText language-identification model."""

    def __init__(self, pretrained_lang_model="/home/azureuser/cloudfiles/code/Examples/models/lid.176.bin"):
        """Load the fastText model stored at ``pretrained_lang_model``."""
        self.model = fasttext.load_model(pretrained_lang_model)

    def predict_lang(self, text):
        """Return the top predicted label for ``text`` with the ``__label__`` prefix removed."""
        # fastText's predict() handles one line at a time and rejects input that
        # contains a newline, so flatten newlines to spaces first.
        predictions = self.model.predict(text.replace("\n", " "), k=1) # k=1: only the best-matching language
        return predictions[0][0].split("__")[-1]

# Shared detector instance used by parse_html; constructing it loads the fastText model from disk.
LANGUAGE = LanguageIdentification()



In [5]:
def parse_html(content, do_langid = True, do_ftfy = False):
    """Convert a raw HTML payload into lower-cased, whitespace-normalised text.

    Parameters
    ----------
    content   : payload, normally ``bytes``; anything that cannot be decoded
                as UTF-8 is converted with ``str()`` instead.
    do_langid : when true, add a ``'lang'`` entry predicted by ``LANGUAGE``
                (``None`` if prediction fails).
    do_ftfy   : when true, pass the extracted text through ``ftfy.fix_text``.

    Returns
    -------
    dict with key ``'content'`` and, if ``do_langid``, key ``'lang'``.
    """
    try:
        content = content.decode("UTF-8")
    except Exception:
        # Not bytes, or not valid UTF-8: fall back to the string representation.
        content = str(content)

    # get_text() returns a plain str; split/join collapses all whitespace runs.
    text = ' '.join(BeautifulSoup(content).get_text().lower().split())
    if do_ftfy:
        # Imported lazily: ftfy is only required when this option is enabled.
        import ftfy
        text = ftfy.fix_text(text)
    ret = {"content": text}
    if do_langid:
        try:
            ret['lang'] = LANGUAGE.predict_lang(ret['content'])
        except Exception:
            ret['lang'] = None
    return ret

In [6]:
def load_offsets(fname):
    """Read record offsets from ``fname``.

    Every line contributes one integer: the text before its first ``':'``.
    """
    with open(fname, "rt") as handle:
        return [int(line.partition(":")[0]) for line in handle]

def load_data(fname, offsets):
    """Extract the records at ``offsets`` from the WARC file ``fname`` into a DataFrame.

    Hyphens in column names are replaced by underscores and a ``url`` column
    is added as a copy of ``WARC_Target_URI``.
    """
    records = []
    with open(fname, 'rb') as warc:
        for off in tqdm(offsets):
            records.append(extract(off, warc))
    frame = pd.DataFrame(records)
    # Hyphenated names are awkward to use as pandas attributes.
    frame.columns = frame.columns.str.replace("-","_")
    # Expose the target URI under the shorter name used by the rest of the notebook.
    frame['url'] = frame['WARC_Target_URI']
    return frame

def parse_data(df):
    """Replace each raw payload in ``df`` with its parsed text, in place.

    ``parse_html`` is applied to every payload through ``parallel``. The
    ``payload`` column is overwritten with the parsed content and a ``lang``
    column is added when the parser produced one. Returns the same ``df``.
    """
    results = pd.DataFrame(
        parallel(parse_html, df.payload.values, progress=False, do_langid=True)
    )
    df['payload'] = results['content']
    if 'lang' in results.columns:
        df['lang'] = results['lang']
    return df

def filter_data(df, filter_en=True):
    """Optionally keep only rows whose ``lang`` is ``'en'``, then drop duplicate URLs."""
    kept = df[df.lang == 'en'] if filter_en else df
    return kept.drop_duplicates(subset=['url'])

def create_tfidf(df, min_df=5, max_df=0.05, ngram_range=(1,1), analyzer='word', vocabulary=None):
    """Fit a ``TfidfVectorizer`` on ``df['payload']``.

    Returns ``(X, vectorizer)``: the fitted-and-transformed matrix and the
    vectorizer that produced it.
    """
    options = dict(
        min_df=min_df,
        max_df=max_df,
        analyzer=analyzer,
        vocabulary=vocabulary,
        ngram_range=ngram_range,
    )
    vectorizer = TfidfVectorizer(**options)
    matrix = vectorizer.fit_transform(df['payload'])
    return matrix, vectorizer
    
def create_common(df, vectorizer, num_overlap=2, theme_num=None):
    """Build one averaged TF-IDF vector from the seed documents.

    Parameters
    ----------
    df          : DataFrame with ``payload`` and boolean ``label`` columns;
                  rows with ``label == True`` are the seed documents.
    vectorizer  : fitted object whose ``transform(docs)`` returns a sparse matrix.
    num_overlap : a feature is kept only if it is non-zero in at least this
                  many seed documents.
    theme_num   : key into the module-level ``d['theme']`` dict; when given,
                  that theme's lower-cased query is appended to every seed
                  document before vectorising. ``None`` skips this step.

    Returns
    -------
    scipy.sparse.csr_matrix of shape ``(1, n_features)``. Each kept feature
    holds the mean of its non-zero values across the seed documents.
    """
    docs = list(df[df['label'] == True]['payload'].values)
    if theme_num is not None:
        query = d['theme'][theme_num]['query'].lower()
        docs = [doc + ' ' + query for doc in docs]
    docs_vec = vectorizer.transform(docs)
    n_features = docs_vec.shape[1]

    # Work on the (column, value) pairs of the non-zero entries.
    coo = docs_vec.tocoo()
    present = coo.data != 0
    cols = coo.col[present]
    vals = coo.data[present]

    # Per feature: number of seed documents containing it and the sum of its weights.
    doc_counts = np.bincount(cols, minlength=n_features)
    weight_sums = np.bincount(cols, weights=vals, minlength=n_features)

    # doc_counts > 0 guards the division when num_overlap <= 0.
    keep = np.flatnonzero((doc_counts > 0) & (doc_counts >= num_overlap))
    means = weight_sums[keep] / doc_counts[keep]

    # Build the row vector in one shot; assigning element by element into a
    # CSR matrix is slow and triggers SparseEfficiencyWarning.
    row_idx = np.zeros(len(keep), dtype=np.intp)
    return csr_matrix((means, (row_idx, keep)), shape=(1, n_features), dtype=docs_vec.dtype)

def find_nearest(X, common):
    """Rank the rows of ``X`` by cosine similarity to the single vector ``common``.

    Returns ``(inds, dists)``: ``dists`` holds ``1 - cosine distance`` for
    every row and ``inds`` lists the row positions from most to least similar.
    """
    similarity = 1 - pairwise_distances(X, common, metric='cosine')  # 1 is similar, 0 is far
    similarity = similarity[:, 0]  # only one reference column, so flatten to 1-D
    ranking = np.argsort(similarity)[::-1]
    return ranking, similarity

def write_results(inds, dists, df, fname_base, theme_num, max_results=100):
    """Write the seed rows followed by the best-ranked non-seed rows to a TSV file.

    Parameters
    ----------
    inds        : row positions of ``df`` ordered from most to least similar.
    dists       : similarity per row position of ``df``.
    df          : DataFrame with ``url``, ``payload`` and ``label`` columns.
    fname_base  : output path prefix; ``'theme-<theme_num>.tsv'`` is appended.
    theme_num   : key into the module-level ``d['theme']`` dict (for the seed URLs).
    max_results : stop adding ranked rows once this many lines are collected.

    Each output line is ``similarity<TAB>url<TAB>first 100 payload characters``.
    """
    seeds = set(d['theme'][theme_num]['seeds'])
    urls = df['url'].to_numpy()
    payloads = df['payload'].to_numpy()

    def _line(pos):
        return str(dists[pos]) + '\t' + urls[pos] + '\t' + payloads[pos][0:100]

    # Seed rows come first. They are addressed by POSITION, because dists is
    # positional and df's index labels need not match positions after filtering.
    out = [_line(pos) for pos in np.flatnonzero(df['label'].to_numpy() == True)]
    for pos in inds:
        if urls[pos] not in seeds:
            out.append(_line(pos))
        if len(out) >= max_results:
            break

    fname = fname_base + 'theme-' + str(theme_num) + '.tsv'
    with open(fname, 'w') as fh:
        fh.write('\n'.join(out))

In [8]:
# Input locations.
# NOTE(review): absolute, machine-specific paths — adjust when running elsewhere.
data_dir = '/home/azureuser/cloudfiles/code/Data/test'
submission_template_fpath = '/home/azureuser/cloudfiles/code/Data/test/test.json'

# Output directories, created on demand; exist_ok makes re-running the cell safe.
parsed_dir = './parsed'
os.makedirs(parsed_dir, exist_ok=True)

results_dir = './results'
os.makedirs(results_dir, exist_ok=True)

In [18]:
# Load the submission template; it maps each theme number to its WARC file,
# offsets file, query and seed URLs (see the printed structure below).
with open(submission_template_fpath) as template_file:
    d = json.load(template_file)

theme_nums = list(d['theme'].keys())
print(d)

{'theme': {'3': {'fname': '/home/azureuser/cloudfiles/code/Data/test/full3.warc', 'offset': '/home/azureuser/cloudfiles/code/Data/test/full3_offsets.txt', 'query': '3.5mm headphone alternatives', 'seeds': ['https://www.whathifi.com/advice/apple-lightning-headphones-everything-you-need-to-know', 'https://www.soundguys.com/was-ditching-the-headphone-jack-a-good-idea-13825/', 'https://www.pcmag.com/picks/the-best-phones-with-a-headphone-jack', 'https://www.macworld.com/article/668694/best-lightning-headphones-for-iphone-ipad.html', 'https://www.digitaltrends.com/home-theater/sony-wh-1000xm4-alternatives-under-100-dollars/']}, '4': {'fname': '/home/azureuser/cloudfiles/code/Data/test/full4.warc', 'offset': '/home/azureuser/cloudfiles/code/Data/test/full4_offsets.txt', 'query': 'coastal garden considerations', 'seeds': ['https://www.rhs.org.uk/plants/for-places/coastal-areas', 'https://www.gardenersworld.com/how-to/grow-plants/plants-for-a-coastal-garden/', 'https://www.themiddlesizedgarden

In [None]:
#%%capture

# For every theme: read its WARC records, extract the text, keep unique English
# pages, flag the seed URLs and cache the result as a TSV under parsed_dir.
for theme_num in theme_nums:
    theme = d['theme'][theme_num]
    offsets = load_offsets(theme['offset'])
    df = load_data(theme['fname'], offsets)
    df = parse_data(df)
    df = filter_data(df, filter_en=True)
    # .copy() so the new column is added to an independent frame rather than
    # to a slice of the filtered one (avoids SettingWithCopyWarning).
    df = df[['url', 'payload']].copy()
    df['label'] = df['url'].isin(theme['seeds'])

    fname = parsed_dir + '/parsed-' + theme_num + '.tsv'
    df.to_csv(fname, sep='\t', index=False, header=True)

In [21]:
# Sanity check

# Reload every cached theme file and show which rows are flagged as seeds.
for theme_num in theme_nums:
    fname = '{}/parsed-{}.tsv'.format(parsed_dir, theme_num)
    df = pd.read_csv(fname, sep='\t')
    # Keep only rows that have both a URL and a payload.
    df = df[df['url'].notna()]
    df = df[df['payload'].notna()]
    print(df.loc[df['label'] == True, 'url'])

1574     https://www.digitaltrends.com/home-theater/son...
4519     https://www.whathifi.com/advice/apple-lightnin...
8879     https://www.macworld.com/article/668694/best-l...
15360    https://www.soundguys.com/was-ditching-the-hea...
33720    https://www.pcmag.com/picks/the-best-phones-wi...
Name: url, dtype: object
119      https://www.thespruce.com/rock-garden-design-2...
8146     https://www.themiddlesizedgarden.co.uk/how-to-...
14210    https://www.gardenersworld.com/how-to/grow-pla...
22095    https://www.daviddomoney.com/garden-by-the-coa...
37613    https://www.rhs.org.uk/plants/for-places/coast...
Name: url, dtype: object
8861     https://www.wordstream.com/blog/ws/2022/03/10/...
15627    https://copyhackers.com/2016/02/how-to-write-e...
26060    https://www.outbrain.com/help/advertisers/enga...
29093          https://venngage.com/blog/engaging-content/
29489    https://www.semrush.com/blog/content-writing-h...
Name: url, dtype: object
13018    https://www.gemsociety.org/arti

In [27]:
# Parameter grid for the experiment. Every list may hold several values; all
# combinations are evaluated by myprocess.
clean_bools = [False]
parsed_dirs = [parsed_dir]

min_dfs_vocab = [3]
min_dfs = [5]
max_dfs = [0.2]
num_overlaps = [1]
ngram_ranges = [(6,12)]

# Materialise both products as lists: a bare itertools.product object is a
# one-shot iterator, so a second myprocess call in the same process would
# find it exhausted and silently do nothing.
params1 = list(itertools.product(*[clean_bools, parsed_dirs]))
params2 = list(itertools.product(*[min_dfs_vocab, min_dfs, max_dfs, num_overlaps, ngram_ranges]))

def myprocess(theme_num):
    """Run the seed-similarity search for one theme and write its result files.

    For every combination in the module-level ``params1`` / ``params2``:
    build a character n-gram vocabulary from the seed pages, compute TF-IDF
    over all pages with that vocabulary, average the seed vectors, rank all
    pages by cosine similarity and write the ranking under ``results_dir``.
    """
    print(theme_num)
    for clean_bool, parse_dir in params1:
        fname = parse_dir + '/parsed-' + theme_num + '.tsv'
        df = pd.read_csv(fname, sep='\t')
        # Drop incomplete rows and renumber, so index labels equal the row
        # positions used by the TF-IDF matrix and the ranking.
        df = df[df['url'].notna() & df['payload'].notna()].reset_index(drop=True)
        if clean_bool:
            df['payload'] = df['payload'].str.replace(r'[^a-z0-9]', ' ', regex=True).str.replace(r'\s+',' ', regex=True)
        seed_docs = df[df['label'] == True]
        for min_df_vocab, min_df, max_df, num_overlap, ngram_range in params2:
            # Pass 1: vocabulary = n-grams occurring in at least min_df_vocab seed pages.
            _, vectorizer = create_tfidf(seed_docs, min_df=min_df_vocab, max_df=1.0, ngram_range=ngram_range, analyzer='char')
            words = vectorizer.get_feature_names_out()
            vocabulary = {word: idx for idx, word in enumerate(words)}
            print(len(vocabulary))
            # Pass 2: TF-IDF over all pages, restricted to that vocabulary.
            # NOTE(review): scikit-learn documents min_df/max_df as ignored when a
            # vocabulary is supplied, so these two values likely have no effect here.
            X, vectorizer = create_tfidf(df, min_df=min_df, max_df=max_df, ngram_range=ngram_range, analyzer='char', vocabulary=vocabulary)
            common_vector = create_common(df, vectorizer, num_overlap=num_overlap, theme_num=theme_num)
            inds, dists = find_nearest(X, common_vector)
            fname_base = results_dir + '/' + '-'.join([str(x) for x in [clean_bool, min_df_vocab, min_df, max_df, num_overlap, ngram_range]]) + '-'
            write_results(inds, dists, df, fname_base, theme_num)

# Process the themes in parallel with four worker processes. The context
# manager shuts the workers down once map() has returned all results.
with Pool(4) as p:
    res = p.map(myprocess, theme_nums)

3564



7338
9541
3126
16703


  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])


In [47]:
# Collect every result file into one submission frame with columns urls/theme.
# sorted() makes the row order reproducible (glob order is arbitrary).
fnames = sorted(glob.glob(results_dir + '/*.tsv'))
if not fnames:
    raise FileNotFoundError('No result files found in ' + results_dir)

frames = []
for fname in fnames:
    # Theme number is encoded in the file name ("...-theme-<N>.tsv"); take all
    # characters before the extension so multi-digit themes are parsed correctly.
    theme_num = os.path.splitext(fname.split('-')[-1])[0]
    # quoting=3 is csv.QUOTE_NONE: the files are written without CSV quoting,
    # so a quote character inside the content must be treated as plain text.
    df_temp = pd.read_csv(fname, sep='\t', header=None, quoting=3)
    df_temp.columns = ['score', 'urls', 'content']
    df_temp['theme'] = int(theme_num)
    frames.append(df_temp)

# Concatenate once instead of growing the frame inside the loop.
df_sub = pd.concat(frames, axis=0).reset_index(drop=True)
df_sub = df_sub.drop(columns=['score', 'content'])

In [69]:
# Submit the assembled results to the score-upload endpoint.
teamName = 'SAY NO TO TECHNICAL DEBT'
scoringURL = "https://dq23grandfinal.azurewebsites.net/api/DQ23TrainScoreUpload"
# SECURITY: prefer supplying the key through the environment; the literal is
# kept only as a backward-compatible fallback and should be removed/rotated.
accessKey = os.environ.get("DQ23_SCORE_UPLOAD_ACCESS_KEY",
                           "dz6Gi3PMJC6Smi3kCbelHIY4Cbh_lcxGnPsG17e_75tnAzFu7aG_ag==")

score, statuscode, responseText = SubmitToScorer(df_sub, accessKey, scoringURL, teamName=teamName, verbose=True)

Non-200 status code returned:
500
DQ23 Error: Errored in custom code. Please speak to an admin.


In [68]:
# Save the submission frame as a comma-separated file with a header row and no index column.
df_sub.to_csv('SAY_NO_TO_TECHNICAL_DEBT.csv', sep=',', header=True, index=False)

In [75]:
# Submit the contents of the seed-submission JSON file to the score-upload endpoint.
with open('../test_seed_submission.json', 'r') as f1:
    js = json.load(f1)

df = pd.DataFrame.from_dict(js)

scoringURL="https://dq23grandfinal.azurewebsites.net/api/DQ23TrainScoreUpload"
# SECURITY: prefer supplying the key through the environment; the literal is
# kept only as a backward-compatible fallback and should be removed/rotated.
accessKey = os.environ.get("DQ23_SCORE_UPLOAD_ACCESS_KEY",
                           "dz6Gi3PMJC6Smi3kCbelHIY4Cbh_lcxGnPsG17e_75tnAzFu7aG_ag==")

score, statuscode, responseText = SubmitToScorer(df, accessKey, scoringURL, teamName='test', verbose=True)

Submission returning 200. Our score:
{'score': 0.09264491119558448}
