In [86]:
import re
import pymongo
from collections import Counter
import numpy as np
import matplotlib as mpl
from matplotlib import pyplot as plt

In [2]:
client = pymongo.MongoClient()
db = client['test']
tldr = db['tldr']
tldrcl = db['tldr_clean']

In [138]:
def cleanstr(text):
    """Lower-case ``text`` and normalise it through a fixed chain of regex rewrites.

    The substitutions run in this order, each on the previous one's output:

    1. every parenthesis (with any whitespace before it) becomes `` , ``;
    2. every run of characters other than letters, comma, full stop,
       question mark and apostrophe collapses to a single space;
    3. a run of sign/space characters (two or more, or a single one at the
       very end of the text) is replaced by one captured character padded
       with spaces;
    4. each run of letters is re-emitted padded with spaces, dropping all
       non-letter characters adjacent to it;
    5. runs of two or more whitespace characters collapse to one space.

    As a consequence of step 4, any text that contains at least one letter
    comes out as lower-case words separated by single spaces.

    Returns the result with leading and trailing whitespace stripped.
    """
    rewrites = (
        # replace parenthesis by comma
        (r'\s*[\(\)]', ' , '),
        # remove annoying signs
        (r'[^A-Za-z\,\.\?\']+', r" "),
        # keep one sign in a sequence of signs
        (r'\s*([\,\.\?\' ])([\,\.\?\' ]+|$)', r" \1 "),
        # remove signs surrounding words
        (r'[^A-Za-z]*([A-Za-z]+)[^A-Za-z]*', r" \1 "),
        # compress spaces
        (r'\s\s+', r" "),
    )
    cleaned = text.lower()
    for pattern, replacement in rewrites:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

class LanguageIndex():
    """Word <-> integer-id vocabulary built from whitespace-tokenised text.

    Typical use: call ``update_vocab`` for every text to accumulate word
    frequencies, then ``create_index`` to build the lookup tables.

    Attributes:
        word2idx: dict mapping a word to its integer id.
        idx2word: dict mapping an integer id back to its word.
        vocab: a ``Counter`` of word frequencies until ``create_index`` is
            called; afterwards the sorted list of words kept in the index.
            It may also be assigned a word -> count mapping from outside
            before calling ``create_index``.
    """

    # Reserved tokens; the position in this tuple is the token's id.
    SPECIAL_TOKENS = ("<pad>", "<noword>", "<start>", "<end>")

    def __init__(self):
        self.word2idx = {}
        self.idx2word = {}
        self.vocab = Counter()
        # Raw word frequencies. Tracked separately because create_index()
        # replaces ``vocab`` with a plain list, which would otherwise make
        # the counts unrecoverable.
        self._counts = self.vocab

    def _word_counts(self):
        """Return the raw frequency Counter, adopting ``vocab`` when it holds counts."""
        if isinstance(self.vocab, dict):
            # ``vocab`` is still (or was externally set to) a word -> count
            # mapping; normalise plain dicts so that .update() counts words.
            if not isinstance(self.vocab, Counter):
                self.vocab = Counter(self.vocab)
            self._counts = self.vocab
        return self._counts

    def create_index(self, count_threshold=18500):
        """(Re)build ``word2idx`` / ``idx2word`` from the accumulated counts.

        Args:
            count_threshold: a word is kept only if it occurs strictly more
                than this many times.

        Ids 0-3 are reserved for ``SPECIAL_TOKENS``; the kept words follow in
        sorted order starting at id 4. ``vocab`` is replaced by the sorted
        list of kept words. The raw counts are retained internally, so this
        method can be called again (e.g. with another threshold) and
        ``update_vocab`` keeps working afterwards.
        """
        counts = self._word_counts()
        self.word2idx = {}
        self.idx2word = {}
        self.vocab = sorted(
            word
            for word, num in counts.items()
            if num > count_threshold
        )
        for i, word in enumerate(list(self.SPECIAL_TOKENS) + self.vocab):
            self.word2idx[word] = i
            self.idx2word[i] = word

    def update_vocab(self, text):
        """Add the whitespace-separated words of ``text`` to the frequency counts.

        New counts only affect the lookup tables on the next ``create_index`` call.
        """
        self._word_counts().update(text.split())
        
# Notebook-wide vocabulary index; later cells update and query this instance.
LangIdx = LanguageIndex()

In [12]:
import time, sys
from IPython.display import clear_output

def update_progress(progress):
    """Redraw a 20-character text progress bar in the cell output.

    Args:
        progress: completed fraction. Ints are converted to float, any other
            non-float value is treated as 0, and the value is clamped to
            the range 0..1 before drawing.

    The previous cell output is cleared (deferred until new output arrives)
    and a line such as ``Progress: [#####---------------] 25.0%`` is printed.
    """
    width = 20
    if isinstance(progress, int):
        fraction = float(progress)
    elif isinstance(progress, float):
        fraction = progress
    else:
        fraction = 0

    if fraction < 0:
        fraction = 0
    elif fraction >= 1:
        fraction = 1

    filled = int(round(width * fraction))
    clear_output(wait=True)
    bar = "#" * filled + "-" * (width - filled)
    print("Progress: [{0}] {1:.1f}%".format(bar, fraction * 100))

In [103]:
# Clean every raw document, feed its words into LangIdx, and store the
# cleaned text in tldr_clean under the same _id.
# Running tldrcl.drop() first additionally discards cleaned documents whose
# raw counterpart no longer exists.
# NOTE(review): each run adds the same words to LangIdx's counts again, and
# the next cell counts the cleaned texts as well; run only one of them per
# fresh LangIdx, or the frequencies are inflated.
datalen = tldr.estimated_document_count()

for idx, data in enumerate(tldr.find({})):
    dataid = data['_id']
    content = cleanstr(data['content'])
    summary = cleanstr(data['summary'])

    LangIdx.update_vocab(content)
    LangIdx.update_vocab(summary)

    # Upsert rather than insert: re-running the cell replaces the existing
    # cleaned document instead of raising DuplicateKeyError on the reused _id.
    tldrcl.replace_one(
        {"_id": dataid},
        {
            "_id": dataid,
            "content": content,
            "summary": summary
        },
        upsert=True
    )
    if (idx % 1000 == 0):
        update_progress(idx / datalen)

Progress: [####################] 100.0%


In [73]:
# Feed the summary and content of every cleaned document into LangIdx's
# word counts, refreshing the progress bar every 1000 documents.
datalen = tldrcl.estimated_document_count()

for idx, data in enumerate(tldrcl.find({})):
    for field in ('summary', 'content'):
        LangIdx.update_vocab(data[field])
    if not idx % 1000:
        update_progress(idx / datalen)

Progress: [####################] 100.0%


In [158]:
# NOTE(review): `store` is not defined in any cell shown here — presumably a
# saved copy of the word counts; confirm where it comes from, otherwise this
# cell fails on a fresh kernel.
LangIdx.vocab = store
# Builds word2idx / idx2word and turns LangIdx.vocab into the sorted list of kept words.
LangIdx.create_index()
# Number of kept words (the four special tokens are not part of vocab).
print(len(LangIdx.vocab))

2460


In [134]:
import json
# Serialise `store` as JSON into vocab.txt (the content is JSON despite the extension).
# NOTE(review): `store` is not defined in any cell shown here — confirm its origin.
with open('vocab.txt', 'w') as f:
    json.dump(store, f)

In [173]:
def looseword2idx(word):
    """Return the id of ``word`` in ``LangIdx.word2idx``, or 1 when it is absent.

    Id 1 is the slot that ``LanguageIndex.create_index`` assigns to "<noword>".
    """
    table = LangIdx.word2idx
    if word in table:
        return table[word]
    return 1

def _known_word_ids(text):
    """Return the ids of the whitespace-separated words of ``text`` that are in the vocabulary."""
    # Look each word up once, then drop id 1, which looseword2idx returns
    # for words missing from the index.
    ids = (looseword2idx(w) for w in text.split())
    return [word_id for word_id in ids if word_id != 1]


datalen = tldrcl.estimated_document_count()

# Store, for every cleaned document, the id sequences of its in-vocabulary
# words. Only the two text fields are fetched (_id is returned by default),
# so vectors written by an earlier run are not transferred again.
for idx, data in enumerate(tldrcl.find({}, projection={'summary': 1, 'content': 1})):
    tldrcl.update_one(
        {'_id': data['_id']},
        {'$set': {
            'summary_vec_must_word': _known_word_ids(data['summary']),
            'content_vec_must_word': _known_word_ids(data['content']),
        }}
    )

    if (idx % 1000 == 0):
        update_progress(idx / datalen)

Progress: [####################] 100.0%


In [156]:
# Delete cleaned documents whose content_len is below 125.
# NOTE(review): no cell shown here writes content_len — confirm the field
# exists on the documents before relying on this filter.
too_short = {'content_len': {'$lt': 125}}
tldrcl.delete_many(too_short)

<pymongo.results.DeleteResult at 0x7fef31eb71c8>

In [147]:
# Delete cleaned documents whose summary_len exceeds 28 or whose content_len
# exceeds 295.
# NOTE(review): no cell shown here writes summary_len / content_len —
# confirm the fields exist on the documents.
too_long = {
    '$or': [
        {'summary_len': {'$gt': 28}},
        {'content_len': {'$gt': 295}},
    ]
}
tldrcl.delete_many(too_long)

  """


{'n': 719266, 'ok': 1.0}

In [157]:
# Show the collection's document count (estimated from collection metadata).
tldrcl.estimated_document_count()

1699783

In [170]:
# Scratch list holding the integers 0 through 9.
t = list(range(10))

In [171]:
# Scratch check: list.remove deletes the first element equal to 1, in place.
t.remove(1)

In [172]:
# Display the scratch list after the removal.
t

[0, 2, 3, 4, 5, 6, 7, 8, 9]