In [80]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [81]:
import title_maker_pro.urban_dictionary_scraper
import logging
import pickle
from scipy import stats
import pandas as pd
import stanza
from tqdm.notebook import tqdm
from collections import OrderedDict
from functools import partial
from concurrent.futures import ThreadPoolExecutor
from multiprocessing.pool import ThreadPool
import io
import itertools
import numpy as np
import re
import random
from bs4 import BeautifulSoup

In [82]:
# Show INFO-level log messages in the notebook output.
logging.basicConfig(level=logging.INFO)

# `import title_maker_pro.urban_dictionary_scraper` binds only the top-level
# name `title_maker_pro`, so the scraper module must be reached through it;
# the bare name `urban_dictionary_scraper` is undefined on a fresh kernel.
# expiry is 7 * 24 * 3600 (one week if the unit is seconds -- TODO confirm
# against get_session).
session = title_maker_pro.urban_dictionary_scraper.get_session(
    throttle=0.1,
    expiry=(7 * 24 * 3600),
)

In [83]:
# Download the default Stanza packages for English.
stanza.download("en")

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.0.0.json: 120kB [00:00, 9.53MB/s]                    
2023-01-06 23:23:31 INFO: Downloading default packages for language: en (English)...
2023-01-06 23:23:31 INFO: File exists: /Users/jingchengyang/stanza_resources/en/default.zip.
2023-01-06 23:23:34 INFO: Finished downloading models and saved to /Users/jingchengyang/stanza_resources.


In [84]:
# Collect every word URL through the scraper module. The module is referenced
# via `title_maker_pro` because that is the only name the import cell binds.
all_urls = title_maker_pro.urban_dictionary_scraper.fetch_all_word_urls(session)

# Persist the result so later cells can resume from it.
with open("all_urls.pickle", "wb") as f:
    pickle.dump(all_urls, f, pickle.HIGHEST_PROTOCOL)

FeatureNotFound: Couldn't find a tree builder with the features you requested: html.parser. Do you need to install a parser library?

In [None]:
# NOTE: pickle.load can execute arbitrary code; only load files you produced.
with open("all_urls.pickle", "rb") as f:
    to_fetch = pickle.load(f)

with open("all_words.pickle", "rb") as f:
    already_done = pickle.load(f)

# Drop everything that has already been fetched. pop() with a default
# tolerates finished keys that are absent from the current URL list, where
# `del` would raise KeyError.
for key in already_done:
    to_fetch.pop(key, None)

# Guard the percentage against an empty workload (both collections empty).
total = len(already_done) + len(to_fetch)
done = 100 * len(already_done) / total if total else 0.0
print(f"Done {done:.2f} percent")

In [None]:
# Fetch the remaining definitions using a pool of 5 worker threads.
t = ThreadPool(5)
try:
    # Called through the module: the bare name `fetch_all_definitions` is
    # never imported, so it would raise NameError.
    title_maker_pro.urban_dictionary_scraper.fetch_all_definitions(
        session,
        to_fetch,
        already_done,
        save_interval=10000,
        executor=t,
    )
finally:
    # Always stop the workers, even if the fetch fails or is interrupted.
    t.terminate()
    t.join()

In [None]:


# Load the pickled word collection used by the cleaning and analysis cells.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open("data/all_words.pickle", "rb") as words_file:
    words = pickle.load(words_file)
    
    

In [None]:
def is_clean(word, min_upvotes=20, max_word_length=40, max_symbols=2, allow_upper=False, min_word_length=4):
    """Return True when a definition passes every quality filter.

    Args:
        word: object exposing ``upvotes`` (number) and ``word`` (str).
        min_upvotes: minimum number of upvotes required.
        max_word_length: longest accepted ``word.word``, inclusive.
        max_symbols: maximum count of characters that are not word
            characters, spaces or dots.
        allow_upper: when falsy, reject text for which ``str.isupper()``
            is true.
        min_word_length: shortest accepted ``word.word``, inclusive.

    Returns:
        bool: False as soon as one filter rejects the entry, True otherwise.
    """
    if word.upvotes < min_upvotes:
        return False

    text = word.word
    length = len(text)
    if length > max_word_length or length < min_word_length:
        return False

    # Count every character that is neither a word character, a space nor a dot.
    symbol_count = sum(1 for _ in re.finditer(r"[^\w .]", text))
    if symbol_count > max_symbols:
        return False

    return not (text.isupper() and not allow_upper)
    
# Keep only the definitions accepted by is_clean() and drop words left with
# none. Each definition is checked once; the kept list doubles as the
# "has at least one clean definition" test.
clean_list = []
for key, entry in words.items():
    kept_definitions = [d for d in entry.definitions if is_clean(d)]
    if not kept_definitions:
        continue
    clean_list.append(
        (
            key,
            title_maker_pro.urban_dictionary_scraper.UrbanDictionaryWord(
                title=entry.title,
                url=entry.url,
                definitions=kept_definitions,
            ),
        )
    )

# NOTE(review): the shuffle is unseeded, so the saved order differs between
# runs; seed `random` first if a reproducible ordering is needed.
random.shuffle(clean_list)
cleaned_words = OrderedDict(clean_list)

# The ratio is the fraction of words that were kept; guard against an empty input.
kept_fraction = len(cleaned_words) / len(words) if words else 0.0
print(f"Kept {len(cleaned_words)} of {len(words)} words ({kept_fraction:.2%})")

with open("data/cleaned_words_all_def_min_upvotes_20_max_len_40_min_len_4_no_upper_randomized.pickle", "wb") as f:
    pickle.dump(cleaned_words, f, pickle.HIGHEST_PROTOCOL)

In [None]:
# Stanza pipeline limited to the tokenizer and the part-of-speech tagger.
nlp = stanza.Pipeline(processors="tokenize,pos")


def proper_noun_guess(word):
    """Guess whether a word's title is used as a proper noun in its examples.

    The first example of each definition is tagged with the module-level
    ``nlp`` pipeline. The word counts as a proper noun when a run of
    consecutive ``PROPN`` tokens, upper-cased and joined with single spaces,
    equals the normalised title.

    Args:
        word: object exposing ``title`` (str) and ``definitions``; each
            definition exposes ``examples`` (a sequence of str).

    Returns:
        bool: True on the first match, False when no example matches.
    """
    # Normalise the title: upper-case, trimmed, surrounding double quotes removed.
    query = word.title.upper().strip().strip("\"").strip()
    for definition in word.definitions:
        # A definition without examples cannot be tagged. Skip it instead of
        # aborting, so the remaining definitions are still inspected.
        if not definition.examples:
            print(f"{query}: INDEX ERROR")
            continue
        doc = nlp(definition.examples[0])
        for sentence in doc.sentences:
            proper_run = []
            # `token` rather than `word`, to avoid shadowing the parameter.
            for token in sentence.words:
                if token.upos == "PROPN":
                    proper_run.append(token.text.upper())
                    if query == " ".join(proper_run):
                        return True
                else:
                    # Any other tag ends the current run of proper nouns.
                    proper_run = []
    return False
               
# Spot-check a prefix of the cleaned words rather than tagging all of them.
# 1002 matches the number of items the previous `if i > 1000: break` guard
# processed (indices 0..1001).
max_items = 1002
sample = itertools.islice(cleaned_words.values(), max_items)

# Wrapping the iterable lets tqdm close the bar by itself, and the total now
# reflects the number of items actually visited.
for item in tqdm(sample, total=min(max_items, len(cleaned_words))):
    is_proper = proper_noun_guess(item)
    if is_proper:
        print(f"{item.title}: {is_proper}")

proper_noun_guess(next(iter(words.values())))

In [None]:
# One row per definition, flattened across every word.
defns = pd.DataFrame(
    [
        [
            definition.word,
            definition.meaning,
            # Definitions can have no examples (proper_noun_guess handles the
            # same case); fall back to an empty string instead of raising
            # IndexError so the string-length analysis still works.
            definition.examples[0] if definition.examples else "",
            definition.creation_epoch,
            definition.upvotes,
            definition.downvotes,
        ]
        for entry in words.values()
        for definition in entry.definitions
    ],
    columns=["word", "meaning", "example", "creation_epoch", "upvotes", "downvotes"],
)

In [None]:
# Upvote share with a constant added to the denominator, which pulls the
# score of definitions with few votes towards zero.
smoothing_prior = 20
smoothed_denominator = defns["upvotes"] + defns["downvotes"] + smoothing_prior
defns["smoothed_upvotes"] = defns["upvotes"] / smoothed_denominator

In [None]:
# Quantiles of the smoothed score at 0.1, 0.2, ..., 1.0.
quantile_levels = np.linspace(0.1, 1, 10)
defns["smoothed_upvotes"].quantile(quantile_levels)

In [None]:
# DataFrame version of the is_clean() filters, using the same thresholds as
# that function's defaults. The filters are applied one after another on the
# progressively smaller frame.
cleaned_defs = defns[:]
# cleaned_defs = cleaned_defs[cleaned_defs["smoothed_upvotes"] >= 0.2]
cleaned_defs = cleaned_defs[cleaned_defs["upvotes"] >= 20]
cleaned_defs = cleaned_defs[cleaned_defs.word.str.len() <= 40]
cleaned_defs = cleaned_defs[cleaned_defs.word.str.len() >= 4]
cleaned_defs = cleaned_defs[~cleaned_defs.word.str.isupper()]

# Raw string: "\w" in a normal string literal is an invalid escape sequence.
cleaned_defs = cleaned_defs[cleaned_defs.word.str.count(r"[^\w .]") <= 2]

# Guard the ratio against an empty input frame.
kept_ratio = len(cleaned_defs) / len(defns) if len(defns) else 0.0
print(f"Reduction from {len(defns)} to {len(cleaned_defs)} ({kept_ratio})")

In [None]:
# Random sample of 20 cleaned definitions whose word contains a comma.
has_comma = cleaned_defs.word.str.upper().str.contains(",")
cleaned_defs[has_comma].sample(20)

In [None]:
# Distribution of the per-word symbol count. The character class matches the
# one used by is_clean() and the other symbol filters in this notebook: the
# previous pattern had the "." outside the brackets, so it counted
# "symbol followed by any character" pairs instead. Raw string avoids the
# invalid "\w" escape.
defns.word.str.count(r"[^\w .]").describe()

In [None]:
# Random sample of 20 definitions whose word is longer than 40 characters.
is_too_long = defns.word.str.len() > 40
defns[is_too_long].sample(n=20)

In [None]:
# Random sample of 20 definitions whose word has more than 2 symbol
# characters. Raw string: "\w" in a normal literal is an invalid escape.
defns[defns.word.str.count(r"[^\w .]") > 2].sample(n=20)

In [None]:
# Percentiles (1%..100%) of the combined meaning + example length in characters.
combined_length = defns["meaning"].str.len() + defns["example"].str.len()
combined_length.quantile(np.linspace(0.01, 1, 100))

In [None]:
# Vote-count summary for definitions whose meaning exceeds 985 characters.
meaning_is_long = defns["meaning"].str.len() > 985
lng_defs = defns[meaning_is_long]
long_vote_total = lng_defs["upvotes"] + lng_defs["downvotes"]
long_vote_total.describe()

In [None]:
# Vote-count summary for definitions whose meaning is shorter than 985
# characters. NOTE(review): despite its name, `lng_defs` holds the *shorter*
# definitions here, and meanings of exactly 985 characters are excluded.
meaning_is_short = defns["meaning"].str.len() < 985
lng_defs = defns[meaning_is_short]
short_vote_total = lng_defs["upvotes"] + lng_defs["downvotes"]
short_vote_total.describe()