In [None]:
from selectolax.parser import HTMLParser
import sagemaker
from sagemaker import get_execution_role
import boto3
import json
from warcio.archiveiterator import ArchiveIterator
import re
import pandas as pd
from time import time
from collections import defaultdict

In [None]:
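# The cells below use the SageMaker Python SDK v1 API (e.g. sagemaker.RealTimePredictor).
# Uncomment to install the pinned version:
# !pip install sagemaker==1.7.0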

In [None]:
# Open a SageMaker session and look up the execution role of this notebook.
sess = sagemaker.Session()
role = get_execution_role()

# This is the role that SageMaker would use to leverage AWS resources on your behalf.
print(role)

In [None]:
# Region configured for the default boto3 session.
region_name = boto3.Session().region_name

In [None]:
# Name of the S3 bucket that receives the model archive and the extracted texts.
# TODO: replace the placeholder below with a bucket you own before running the notebook.
bucket = "your-bucket"
print(bucket)
# Key prefix under which the model archive is uploaded.
prefix = "models/lang_ident/"

In [None]:
# Create model endpoint

In [None]:
# Download the pretrained fastText language-identification model and save it as model.bin.
!wget -O model.bin https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin
# Package model.bin as a gzipped tarball and upload it under `prefix` in `bucket`.
!tar -czvf langid.tar.gz model.bin
model_location = sess.upload_data("langid.tar.gz", bucket=bucket, key_prefix=prefix)
# Remove the local copies once the upload call has returned.
!rm langid.tar.gz model.bin

In [None]:
# Display the value returned by sess.upload_data for the model archive.
model_location

In [None]:
# Resolve the BlazingText container image for the region of the current boto3
# session instead of a hard-coded region, so the image matches where the model
# is registered and hosted.
container = sagemaker.amazon.amazon_estimator.get_image_uri(region_name, "blazingtext", "latest")

In [None]:
# Display the value returned by get_image_uri.
container

In [None]:
# Register the uploaded model archive with SageMaker.
# CreateModel is an operation of the low-level boto3 SageMaker client (the
# `sagemaker` module itself has no `create_model` function), and
# PrimaryContainer must describe both the serving image and the model artefact.
model_name = "blazing-text-cc-news"
sagemaker_client = boto3.client("sagemaker")
create_model_response = sagemaker_client.create_model(
    ModelName=model_name,
    ExecutionRoleArn=role,
    PrimaryContainer={
        "Image": container,
        "ModelDataUrl": model_location,
    },
)

In [None]:
# Display the name under which the model was registered.
model_name

In [None]:
# NOTE(review): this binds the `sagemaker.predictor` module itself, not a predictor
# instance; `predictor` is rebound to a RealTimePredictor in the next cell.
predictor = sagemaker.predictor

In [None]:
# Name of the SageMaker endpoint the predictor sends requests to.
endpoint_name = "my-endpoint-name"

# Requests are serialised with json.dumps and responses are decoded with the
# SDK's JSON deserializer.
predictor = sagemaker.RealTimePredictor(
    endpoint=endpoint_name,
    sagemaker_session=sess,
    serializer=json.dumps,
    deserializer=sagemaker.predictor.json_deserializer,
)

In [None]:
# Smoke test: send a handful of sentences in different languages to the endpoint
# and print whatever the predictor returns.
sentences = ["hi which language is this?",
             "en que idioma es esto",
             "mon nom est Pierre",
             "Dem Jungen gab ich einen Ball.",
             "আমি বাড়ি যাবো."]
# The request body is a dict with the sentences under the "instances" key.
payload = {"instances" : sentences}
predictions = predictor.predict(payload)
print(predictions)

In [None]:
# boto3 session and S3 client; `s3` is used below to list, download and upload objects.
session = boto3.Session()
s3 = session.client("s3")

In [None]:
def get_keys_from_prefix(session, bucket, prefix):
    """
    Collect every object key stored under ``prefix`` in ``bucket``.

    Listings are fetched page by page with ``session.list_objects``; while a
    page reports ``IsTruncated``, the next request resumes after the last key
    collected so far (passed as ``Marker``).

    :param session: object exposing ``list_objects(Bucket=..., Prefix=..., Marker=...)``
        (the notebook passes a boto3 S3 client)
    :param bucket: string, name of the bucket to list
    :param prefix: string, prefix to search for the keys
    :return: list of keys, or None when a page lacks an expected field
        (e.g. no "Contents" because nothing matches the prefix)
    """
    found = []
    request = {"Bucket": bucket, "Prefix": prefix}

    try:
        while True:
            page = session.list_objects(**request)
            more_pages = page["IsTruncated"]
            found.extend(entry["Key"] for entry in page["Contents"])

            if not more_pages:
                break
            # Resume the listing right after the last key seen so far.
            request["Marker"] = found[-1]
    except KeyError:
        print("Prefix doesn't exist")
        found = None

    return found

In [None]:
# List every object key under the 2019 CC-NEWS prefix of the "commoncrawl" bucket.
# NOTE: get_keys_from_prefix returns None (after printing a message) when the
# listing response has no "Contents".
crawl_2019 = get_keys_from_prefix(session=s3, bucket="commoncrawl", prefix="crawl-data/CC-NEWS/2019/")

In [None]:
def get_tree(html):
    """Parse ``html`` with ``HTMLParser`` and return the resulting tree object."""
    parsed = HTMLParser(html)
    return parsed

In [None]:
def get_h1(tree):
    """
    Return the document's ``h1`` element when there is exactly one.

    :param tree: parsed document exposing ``tags(name)``
    :return: the single ``h1`` result, or None when there are zero or several
    """
    headings = tree.tags("h1")
    return headings[0] if len(headings) == 1 else None

In [None]:
def get_base_text(tree):
    """
    Extract the text of the document body with scripts and styles removed.

    Every ``script`` and ``style`` node is decomposed (removed from ``tree``)
    before the text is read, so the tree is modified in place.

    :param tree: parsed document exposing ``body`` and ``css(selector)``
    :return: body text with node texts separated by newlines, or None when the
        document has no body
    """
    if tree.body is None:
        return None

    for selector in ('script', 'style'):
        for node in tree.css(selector):
            node.decompose()

    return tree.body.text(separator='\n')

def match_domain(domain):
    """
    Match a URL-like prefix at the start of ``domain``.

    :param domain: string to test (a URL, or arbitrary text)
    :return: ``re.Match`` whose group 0 is the text up to and including the
        first ``/`` after the host, with the last two dot-separated host parts
        in groups 1 and 2; None when the string does not start with such a URL
    """
    url_pattern = r"^[^\/]+:\/\/[^\/]*?\.?([^\/.]+)(\.[^\/.]+)(?::\d+)?\/"
    return re.match(url_pattern, domain)

def doc_preprocess(doc):
    """
    Normalise raw document text.

    Removes every pair of consecutive newlines, then collapses each run of
    spaces into a single space.
    """
    without_blank_lines = doc.replace("\n\n","")
    return re.sub(' +', ' ', without_blank_lines)

def text_preprocess(text):
    """Return ``text`` with leading and trailing whitespace removed."""
    return text.strip()

def get_n_words(text):
    """Count the words in ``text``, a word being a maximal run of word characters."""
    return sum(1 for _ in re.finditer(r'\w+', text))

def is_lang_1(model, text, e_lang, threshold):
    """
    Check the language of ``text`` with a model whose ``predict`` returns a
    ``(labels, probabilities)`` pair.

    :param model: object exposing ``predict(text)``
    :param text: string to classify; newlines are removed before prediction
    :param e_lang: expected language code, compared as ``__label__<e_lang>``
    :param threshold: the top probability must be strictly greater than this
    :return: truthy when the top label is the expected one and its probability
        exceeds ``threshold``
    """
    labels, scores = model.predict(text.replace("\n", ""))
    wanted = f"__label__{e_lang}"
    label_matches = labels[0] == wanted
    confident = scores[0] > threshold
    return label_matches & confident

def is_lang_2(model, text, e_lang, threshold):
    """
    Check the language of ``text`` with a predictor that takes an
    ``{"instances": [...]}`` payload.

    The first element of the response is read as a mapping with "prob" and
    "label" entries, each indexable, and only their first items are used.

    :param model: object exposing ``predict(payload)``
    :param text: string to classify; newlines are removed before prediction
    :param e_lang: expected language code, compared as ``__label__<e_lang>``
    :param threshold: the top probability must be strictly greater than this
    :return: truthy when the top label is the expected one and its probability
        exceeds ``threshold``
    """
    payload = {"instances":[text.replace("\n", "")]}
    first_result = model.predict(payload)[0]
    scores = first_result["prob"]
    labels = first_result["label"]
    wanted = f"__label__{e_lang}"
    label_matches = labels[0] == wanted
    confident = scores[0] > threshold
    return label_matches & confident

def make_doc(o_text, min_words, model, lang="es", threshold=0.5):
    """
    Build a document from the pieces of ``o_text`` that pass all filters.

    The text is normalised with ``doc_preprocess``, split on tabs and then on
    newlines, and each stripped piece is kept only when it

    * has more than ``min_words`` words,
    * does not start with a URL (``match_domain`` returns None), and
    * is classified as ``lang`` by ``is_lang_2`` above ``threshold``.

    :param o_text: raw text of the page
    :param min_words: pieces need strictly more words than this
    :param model: predictor handed to ``is_lang_2``
    :param lang: expected language code
    :param threshold: minimum (exclusive) classifier probability
    :return: the kept pieces, each followed by a tab, concatenated in order
        ("" when nothing is kept)
    """
    kept = []
    for section in doc_preprocess(o_text).split("\t"):
        for raw_piece in section.split("\n"):
            candidate = text_preprocess(raw_piece)
            if get_n_words(candidate) <= min_words:
                continue
            if match_domain(candidate) is not None:
                continue
            # The language check runs last, only for pieces that passed the other filters.
            if is_lang_2(model, candidate, lang, threshold):
                kept.append(candidate + "\t")
    return "".join(kept)

In [None]:
def process_file(path, model):
    """
    Extract documents in the target language from the WARC file at ``path``.

    Only records of type ``response`` are considered. A record contributes a
    document when its ``WARC-Target-URI`` matches ``match_domain``, the page has
    a body and exactly one ``h1``, and ``make_doc`` returns non-empty text.

    :param path: path of the WARC file to read
    :param model: predictor forwarded to ``make_doc`` for language identification
    :return: ``defaultdict(list)`` keyed by the second capture group of
        ``match_domain`` (the last dot-separated part of the host); each value
        is a list of ``(target_uri, matched_url_prefix, document_text)`` tuples
    """
    texts = defaultdict(list)
    with open(path, 'rb') as stream:
        for record in ArchiveIterator(stream):
            if record.rec_type != 'response':
                continue
            try:
                domain = record.rec_headers.get_header('WARC-Target-URI')
                match = match_domain(domain) if domain else None
                if match is None:
                    # The result is keyed by the matched URL parts, so a record
                    # without a match can never be stored: skip it before the
                    # expensive HTML parsing and language-identification calls.
                    continue
                content = record.content_stream().read()
                tree = get_tree(content)
                text = get_base_text(tree)
                title = get_h1(tree)
                if text is None or title is None:
                    continue
                doc = make_doc(text, min_words=50, model=model, lang="es", threshold=0.5)
                if len(doc) > 0:
                    texts[match[2]].append((domain, match[0], doc))
            except Exception as e:
                # Best effort: report the failure and move on to the next record.
                print(e)
    return texts

In [None]:
# Local file name each archive is downloaded to; the same file is reused for every archive.
warctemp = "tmp1.warc.gz"

In [None]:
# Process every listed archive: download it, extract the matching documents and
# store them as JSON in `bucket` under the same key with a .json extension.
total = len(crawl_2019)
for position, url in enumerate(crawl_2019):
    # Countdown of the archives still to be processed, including this one.
    print(total - position)
    s3.download_file("commoncrawl", url, warctemp)
    texts = process_file(warctemp, predictor)
    new_url = url.replace(".warc.gz", ".json")
    s3.put_object(
        # str.encode already returns bytes, so no extra bytes() wrapper is needed.
        Body=json.dumps(texts).encode('UTF-8'),
        Bucket=bucket,
        Key=new_url
    )

In [None]:
# Marker printed once the cells above have run.
print("Finished!")