In [1]:
from typing import List
import datetime as dt
import re
import os
from collections import defaultdict
from pymystem3 import Mystem
from sklearn.feature_extraction.text import TfidfVectorizer
from enum import Enum
from random import choices
from random import randint
import json

# Mystem instance; nothing in the visible cells of this notebook references it.
mystem = Mystem()

import nltk
# Make sure the NLTK stopword corpus is available locally before it is read below.
nltk.download("stopwords")
#--------#

from nltk.corpus import stopwords
# English stopword list; used later to filter candidate tag words.
english_stopwords = stopwords.words("english")

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/annikura/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load the joke dump; only its top-level "posts" entry is used by later cells.
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# depending on the platform's locale default encoding.
with open('data_jokes_clean.json', 'r', encoding='utf-8') as file:
    json_jokes = json.load(file)['posts']

In [3]:
def process_to_tokens(text):
    """Split raw joke text into lowercase tokens.

    Runs of alphabetic characters become word tokens, runs of numeric
    characters become number tokens, and every other character is emitted as
    its own single-character token. Whitespace, backslashes and apostrophes
    only separate tokens and are never emitted. Runs of two or more dots are
    collapsed into a single ``'.'`` before tokenizing.

    Args:
        text: The string to tokenize.

    Returns:
        A list of token strings in order of appearance.
    """
    # Literal backslash-n sequences (two characters) are turned into spaces;
    # real newlines are handled as ordinary whitespace further down.
    text = ' '.join(text.split(r'\n')).lower()

    # The pattern is greedy, so a single pass collapses every run of dots.
    text = re.sub(r"\.\.+", ".", text)

    is_whitespace = re.compile(r'\s')
    # Characters that end the current token but are dropped from the output.
    # "\\" is a single backslash character.
    silent_separators = ("\\", "'")

    tokens = []
    pending = ''              # the word or number currently being accumulated
    pending_is_word = False   # kind of `pending`; only meaningful when it is non-empty

    for sym in text:
        if sym.isalpha() or sym.isnumeric():
            # A character that is both alphabetic and numeric counts as a letter.
            sym_is_word = sym.isalpha()
            if pending and sym_is_word != pending_is_word:
                # Switching between letters and digits ends the current token.
                tokens.append(pending)
                pending = ''
            pending += sym
            pending_is_word = sym_is_word
            continue

        if pending:
            tokens.append(pending)
            pending = ''
        if is_whitespace.match(sym) or sym in silent_separators:
            continue
        tokens.append(sym)

    if pending:
        tokens.append(pending)
    return tokens

In [4]:
# Quick check of the tokenizer: contractions split at the apostrophe and runs of dots collapse to one '.'.
process_to_tokens("I hope you're all getting your Walter Cronkite jokes in order...... \nHe's next.. ")

['i',
 'hope',
 'you',
 're',
 'all',
 'getting',
 'your',
 'walter',
 'cronkite',
 'jokes',
 'in',
 'order',
 '.',
 'he',
 's',
 'next',
 '.']

In [5]:
# Choose the tag vocabulary: the most frequent long, non-stopword words found in short jokes.
MAX_TAG_SOURCE_LENGTH = 100  # only jokes whose title + text is shorter than this are counted
MIN_TAG_WORD_LENGTH = 5      # candidate words need at least this many characters
TAG_COUNT = 10               # number of top words kept as tags

# Set membership instead of scanning the stopword list once per token.
stopword_set = set(english_stopwords)

words = defaultdict(int)

for joke in json_jokes:
    if len(joke['title']) + len(joke['text']) >= MAX_TAG_SOURCE_LENGTH:
        continue
    for word in process_to_tokens(joke['title'] + " " + joke['text']):
        if len(word) >= MIN_TAG_WORD_LENGTH and word.isalpha() and word not in stopword_set:
            words[word] += 1

# sorted() is stable, so words with equal counts keep their first-seen order.
most_frequent = sorted(words.items(), key=lambda item: item[1], reverse=True)[:TAG_COUNT]
tags = dict(most_frequent)

In [6]:
# Build the training set as (context, tokens) pairs and remember the source post of each joke.
jokes = []

# Maps the space-joined token string of a joke back to its original post.
archive = {}

words = defaultdict(int)

for joke in json_jokes:
    title, body = joke['title'], joke['text']
    if len(title) + len(body) >= 300 or "edit :" in body:
        continue

    text = process_to_tokens(title + " " + body)
    archive[' '.join(text)] = joke

    # Every kept joke is stored under its own tag ...
    jokes.append((joke['tag'], text))
    # ... and once more under each selected tag word that occurs among its tokens.
    for tag in tags:
        if tag in text:
            jokes.append((tag, text))

In [7]:
class Dictionary:
    """Two-way mapping between tokens and consecutive integer ids starting at 0."""

    def __init__(self):
        self.cnt = 0      # number of distinct tokens registered; also the next free id
        self.d = {}       # token -> id
        self.rev_d = {}   # id -> token

    def add_tokens(self, tokens):
        """Register every token not seen before, assigning ids in order of first appearance."""
        for token in tokens:
            if token in self.d:
                continue
            new_id = self.cnt
            self.d[token] = new_id
            self.rev_d[new_id] = token
            self.cnt = new_id + 1

    def get_id(self, token):
        """Return the id of ``token``; raises KeyError if it was never added."""
        return self.d[token]

    def get_token(self, i):
        """Return the token stored under id ``i``; raises KeyError for an unknown id."""
        return self.rev_d[i]

    def get_cnt(self):
        """Return how many distinct tokens have been registered."""
        return self.cnt

In [8]:
# Index every distinct token that occurs in the training set.
dictionary = Dictionary()
for _tag, joke_tokens in jokes:
    dictionary.add_tokens(joke_tokens)

In [9]:
# Number of distinct tokens across all training jokes.
dictionary.get_cnt()

25243

In [10]:
class SpecialTokens(Enum):
    """Sentinel tokens marking the beginning and the end of a token sequence."""
    # No trailing comma here: `0,` would make the member's value the tuple (0,).
    START = 0
    END = 1
    
    
class UniGrammGenerator:
    """Unigram model: per-context token frequencies with no dependence on previous tokens.

    It exposes the same methods as NGrammGenerator so it can serve as the
    last link of that class's fallback chain.
    """

    def __init__(self):
        # table[context][token] -> how often `token` was seen under `context`
        self.table = defaultdict(lambda: defaultdict(int))
        # total number of tokens learned, summed over all contexts
        self.cnt = 0

    def generate_next_word(self, context, _):
        """Sample one token for ``context`` in proportion to its learned frequency.

        The second argument (the preceding tokens) is ignored. The result is a
        one-element list, exactly as returned by ``random.choices``. Sampling
        for a context that was never learned fails inside ``random.choices``
        because the population is empty.
        """
        # .get() avoids inserting an empty entry for a context that was never learned.
        possible_words = self.table.get(context, {})
        return choices(list(possible_words.keys()), weights=list(possible_words.values()))

    def generate(self, context):
        """Sample tokens for ``context`` until END is drawn; return them space-joined without END."""
        text = []
        # The list starts empty, so test for emptiness before looking at text[-1];
        # indexing first would raise IndexError before a single token is drawn.
        while not text or text[-1] != SpecialTokens.END:
            text += self.generate_next_word(context, None)
        return " ".join(text[:-1])

    def get_prob(self, context, last_n):
        """Return the total number of learned tokens; both arguments are ignored."""
        return self.cnt

    def add_ngram(self, context, ngram):
        """Count one occurrence of token ``ngram`` under ``context``."""
        self.table[context][ngram] += 1
        self.cnt += 1

    def add_ngrams(self, context, ngrams):
        """Count every token of ``ngrams`` under ``context``."""
        for ngram in ngrams:
            self.add_ngram(context, ngram)

    def learn_one_text(self, context, text):
        """Learn the tokens of one text (a list) followed by the END marker."""
        tokens = text + [SpecialTokens.END]
        self.add_ngrams(context, tokens)

    def learn(self, data):
        """Learn from an iterable of ``(context, token_list)`` pairs."""
        for context, text in data:
            self.learn_one_text(context, text)
            
    
class NGrammGenerator:
    """Context-conditioned N-gram text generator with a chain of lower-order fallbacks.

    A model of order N predicts the next token from the previous N - 1 tokens
    (most recent first, padded with None). It owns a "worse" model of order
    N - 1 (a UniGrammGenerator when N == 2) and, for every token, randomly
    decides between sampling from its own statistics and delegating to that
    fallback.
    """

    # Factor applied to this model's own count for a history before it is
    # compared with the fallback's count when choosing which model samples.
    CONTEXT_WEIGHT_SCALE = 1000

    def __init__(self, N):
        assert N > 1
        self.N = N
        # table[history][token] -> count, independent of context
        self.table = defaultdict(lambda: defaultdict(int))
        # context_table[context][history][token] -> count
        self.context_table = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
        # context_sums[context][history] -> number of n-grams seen with that history
        self.context_sums = defaultdict(lambda: defaultdict(int))
        # context_weights[context] -> number of n-grams learned for that context
        self.context_weights = defaultdict(int)
        # every plain token seen during training
        self.dict = set()
        self.d = Dictionary()

        # Lower-order model that generation falls back to.
        self.worse = NGrammGenerator(N - 1) if N > 2 else UniGrammGenerator()

    def get_prob(self, context, last_n):
        """Return how many n-grams with history ``last_n`` were learned for ``context``.

        Returns 0 for an unseen context or history. Lookups use ``.get`` so
        that querying never inserts empty entries into the count tables.
        """
        sums = self.context_sums.get(context)
        if sums is None:
            return 0
        return sums.get(last_n, 0)

    def generate_next_word(self, context, last_n):
        """Sample the next token given ``context`` and the history tuple ``last_n``.

        Either this model or the fallback (called with the history shortened
        by its oldest element) does the sampling. This model's share of the
        draw is its count for the history, scaled by CONTEXT_WEIGHT_SCALE and
        divided by N; the fallback's share is its ``get_prob`` value.

        Returns:
            A one-element list, as produced by ``random.choices``.
        """
        context_size = self.get_prob(context, last_n) * self.CONTEXT_WEIGHT_SCALE // self.N
        shorter_history = last_n[:-1]
        wider_size = self.worse.get_prob(context, shorter_history)

        total = context_size + wider_size
        if total - 1 <= 0:
            return self.worse.generate_next_word(context, shorter_history)
        if randint(0, total - 1) >= context_size:
            return self.worse.generate_next_word(context, shorter_history)

        # Reaching this point implies context_size > 0, so this entry exists.
        possible_words = self.context_table[context][last_n]
        return choices(list(possible_words.keys()), weights=list(possible_words.values()))

    def generate(self, context):
        """Generate one text for ``context`` and return its tokens joined by spaces.

        Generation starts from START and stops once END is drawn; neither
        marker is included in the result. A drawn token is rejected and
        redrawn when both it and the previous token are non-alphabetic; the
        first token after START is always accepted.
        """
        text = [SpecialTokens.START]
        history_size = self.N - 1
        while text[-1] != SpecialTokens.END:
            # Most recent token first, padded with None up to N - 1 entries.
            last_n = text[-history_size:][::-1]
            last_n += [None] * (history_size - len(last_n))
            history = tuple(last_n)
            while True:
                new_word = self.generate_next_word(context, history)
                candidate = new_word[0]
                # END is compared first so .isalpha() is never called on the enum member.
                if (candidate == SpecialTokens.END or candidate.isalpha()
                        or len(text) == 1 or text[-1].isalpha()):
                    break
            text += new_word
        return " ".join(text[1:-1])

    def add_ngram(self, context, ngram):
        """Count one n-gram: ``ngram[0]`` is the token, ``ngram[1:]`` its history (most recent first)."""
        token, history = ngram[0], ngram[1:]
        self.table[history][token] += 1
        self.context_table[context][history][token] += 1
        self.context_sums[context][history] += 1
        self.context_weights[context] += 1

    def add_ngrams(self, context, ngrams):
        """Count every n-gram of ``ngrams`` under ``context``."""
        for ngram in ngrams:
            self.add_ngram(context, ngram)

    def learn_one_text(self, context, text):
        """Learn all n-grams of one token list, framed by START and END."""
        self.dict.update(text)
        tokens = [SpecialTokens.START] + text + [SpecialTokens.END]
        # Copy i is shifted right by i positions, so zipping the copies yields
        # (token, previous token, token before that, ...) with None where the
        # history reaches back past the start of the text.
        shifted = [[None] * i + tokens for i in range(self.N)]
        self.add_ngrams(context, zip(*shifted))

    def learn(self, data):
        """Train this model and its whole fallback chain.

        Args:
            data: Iterable of ``(context, token_list)`` pairs.
        """
        # The data is traversed twice (fallback first, then this model), so
        # materialize it once; a one-shot iterator would otherwise be
        # exhausted by the fallback and leave this model untrained.
        data = list(data)
        self.worse.learn(data)
        for context, text in data:
            self.d.add_tokens(text)
            self.learn_one_text(context, text)
            

In [11]:
# 4-gram model; its constructor also builds the 3-gram, 2-gram and unigram fallbacks.
generator = NGrammGenerator(4)

In [12]:
# Train the model and its fallback chain on the (context, tokens) pairs.
generator.learn(jokes)

In [13]:
# Contexts for which the top-level model holds n-gram statistics.
generator.context_table.keys()

dict_keys([None, 'jokes', 'would', 'difference', 'black', 'women', 'walks', 'never', 'people', 'always', 'favorite'])

In [25]:
# Sample jokes for the 'black' context until one is sufficiently novel: even
# compared with its closest training joke, at least MIN_NOVEL_TOKENS of its
# distinct tokens must not occur in that training joke.
MIN_NOVEL_TOKENS = 6

# The training jokes do not change between attempts, so build their token sets once.
training_token_sets = [(set(tokens), tokens) for _, tokens in jokes]

dist = 0

while dist < MIN_NOVEL_TOKENS:
    joke = generator.generate('black')
    f_joke = set(process_to_tokens(joke))
    closest = ''
    dist = 100000000

    for token_set, tokens in training_token_sets:
        # Number of generated tokens missing from this training joke. Taking
        # the minimum of this and the symmetric difference would give the same
        # value, because the symmetric difference is never the smaller one.
        d = len(f_joke - token_set)
        if dist > d:
            dist = d
            closest = ' '.join(tokens)

print(joke)
print(dist, closest)
archive[closest]

a black man walking along carrying a tv down the street . when we see a black guy walks into a bar . a black guy a mexican and a jew walk in to a bar the bartender sees the parrot and says , she got that from me , my wife on the other side .
21 so my friend and i are walking down the street . when we see a black man across the road carrying a large , expensive looking television . i look over to my friend and say , " holy shit dude ! is that yours ? " and he says , " no . mine is back home cleaning my shoes . "


{'id': '101iqt',
 'tag': None,
 'score': 17,
 'title': 'So my friend and I are walking down the street...',
 'text': 'when we see a black man across the road carrying a large, expensive looking television. I look over to my friend and say, "Holy shit dude! Is that yours?" And he says, "No. Mine is back home cleaning my shoes."'}

In [24]:
# Print ten jokes, drawing each context in proportion to the number of n-grams learned for it.
context_names = list(generator.context_weights.keys())
context_counts = list(generator.context_weights.values())
for i in range(10):
    name = choices(context_names, context_counts)[0]
    print("{}:{}".format(name, generator.generate(name)))

None:i hate the fucking eagles , man !
None:how many mosquitoes does it take to change a lightbulb ? two , but i came here to get into ? the time between when i get home .
None:why does waldo wear stripes ? because he was looking for .
None:there ’ s a new wheelchair party forming but it doesn t have any patients
black:a black hole was telling me a story . it sucked me right in .
None:my girlfriend left me i came home to find that complex branching pathways have been cut into his field ?
None:i rang my boss and asked how he was proud that his son had sex . he was trying to get a comb from a policeman pulls him over . the stoned driver waits for it to burn some calories .
never:why should you never trust an atom they make up everything .
None:best asian joke im writing a speech for best friend s 21 st he is asian points for immaturity and vulgarity must be short help me r / askreddit / comments / skeh 8 i m funny i started my new job , my boss offered me a nice stable job . i want to go