In [234]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from collections import Counter
import heapq
import nltk
from nltk.corpus import stopwords
import itertools
import gensim
%matplotlib inline

In [235]:
# Fetch the NLTK 'stopwords' package that stopwords.words('english') reads from.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/MarkPotanin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [236]:
# Kept as a set because it is used for a membership test on every token
# in the preprocessing loop.
english_stopwords = set(stopwords.words('english'))

In [237]:
# Load the 20 Newsgroups corpus, asking scikit-learn to strip headers,
# footers and quoted text from each post.
dataset = fetch_20newsgroups(
    remove=('headers', 'footers', 'quotes')
)

In [238]:
pured_documents = []
for i, doc in enumerate(dataset.data):
    tokens = gensim.utils.lemmatize(doc)
    document = []
    for token in tokens:
        word = token.split('/')[0]
        if word not in english_stopwords:
            document.append(word)
    pured_documents.append(document)    
    if i % 500 == 0:
        print 'Processed: ', i, 'documents from', len(dataset.data)

Processed:  0 documents from 11314
Processed:  500 documents from 11314
Processed:  1000 documents from 11314
Processed:  1500 documents from 11314
Processed:  2000 documents from 11314
Processed:  2500 documents from 11314
Processed:  3000 documents from 11314
Processed:  3500 documents from 11314
Processed:  4000 documents from 11314
Processed:  4500 documents from 11314
Processed:  5000 documents from 11314
Processed:  5500 documents from 11314
Processed:  6000 documents from 11314
Processed:  6500 documents from 11314
Processed:  7000 documents from 11314
Processed:  7500 documents from 11314
Processed:  8000 documents from 11314
Processed:  8500 documents from 11314
Processed:  9000 documents from 11314
Processed:  9500 documents from 11314
Processed:  10000 documents from 11314
Processed:  10500 documents from 11314
Processed:  11000 documents from 11314


In [239]:
# Flatten all tokenized documents into a single token list and wrap it in
# nltk.Text; the players query its _word_context_index for similar words.
text = nltk.Text(list(itertools.chain.from_iterable(pured_documents)))

In [226]:
class NltkHatPlayer(object):
    """Hat-game player that relies on the context index of the global ``text``."""

    def __init__(self):
        pass

    def guess(self, words):
        """Rank candidate answers for the hint ``words``.

        Every hint votes for the first five words of its own ``explain`` list.
        The hints themselves are removed from the tally and the remaining
        candidates are returned, most-voted first.
        """
        votes = Counter()
        for hint in words:
            votes.update(self.explain(hint)[:5])
        for hint in words:
            votes.pop(hint, None)
        return [candidate for candidate, _ in votes.most_common()]

    def explain(self, word):
        """Return the words the context index of ``text`` lists as similar to ``word``."""
        return text._word_context_index.similar_words(word)

    def guess_result(self, position):
        """Feedback hook for a guessing attempt; this player ignores it."""
        pass

    def explanation_result(self, position):
        """Feedback hook for an explaining attempt; this player ignores it."""
        pass

In [281]:
class mixed(object):
    """Hat-game player that combines word2vec neighbours (global ``model``)
    with context-similar words from the global nltk ``text``."""

    def __init__(self):
        pass

    @staticmethod
    def merge_two_dicts(x, y):
        """Return a new dict holding the items of ``x`` updated with those of ``y``.

        On a shared key the value from ``y`` wins. Neither input is modified.
        """
        z = x.copy()
        z.update(y)
        return z

    def guess(self, words):
        """Rank candidate answers for the hint ``words``.

        Two score sources are merged into one dict and its keys are returned
        sorted by score, highest first:

        * ``model.most_similar(positive=words)`` -> word: similarity
        * votes from the first five ``explain`` words of every hint, with the
          hints themselves removed -> word: vote count

        On a shared word the vote count replaces the similarity.
        NOTE(review): the two sources are on different scales (vote counts
        are >= 1), so voted words presumably always outrank similarity-only
        words -- confirm this is intended.
        """
        candidates = Counter()
        for word in words:
            for w in self.explain(word)[:5]:
                candidates[w] += 1
        for word in words:
            if word in candidates:
                del candidates[word]
        a = dict(model.most_similar(positive=words))
        b = dict(candidates.most_common(len(candidates)))
        # Must go through self: a name defined in the class body is not
        # visible as a bare global from inside a method.
        c = self.merge_two_dicts(a, b)

        return sorted(c, key=c.get, reverse=True)

    def explain(self, word):
        """Return a random sample drawn from the pooled word2vec and nltk neighbours.

        NOTE(review): np.random.choice(a, len(a)) samples with replacement by
        default, so the result may contain duplicates and omit some
        candidates; it also uses the unseeded global NumPy RNG. If a plain
        shuffle was intended, np.random.permutation(a) would do that.
        """
        a = [x for x, _ in model.most_similar(positive=[word])] + text._word_context_index.similar_words(word)
        return np.random.choice(a, len(a))

    def guess_result(self, position):
        """Feedback hook for a guessing attempt; this player ignores it."""
        pass

    def explanation_result(self, position):
        """Feedback hook for an explaining attempt; this player ignores it."""
        pass

In [228]:
from gensim.models import Word2Vec

In [286]:
# Train word2vec on the lemmatized documents: 100-dimensional vectors,
# context window of 5, words seen fewer than 5 times are dropped, 4 worker
# threads. Per the gensim Word2Vec API, sg=1 selects the skip-gram algorithm.
# NOTE(review): no seed is passed, so repeated runs presumably give different
# vectors -- confirm whether reproducibility matters here.
model = Word2Vec(pured_documents, size=100, window=5, min_count=5, workers=4,sg=1)

In [230]:
class GensimHatPlayer(object):
    """Hat-game player that answers purely from the global word2vec ``model``."""

    def __init__(self):
        pass

    def _neighbours(self, positive):
        """Return only the words from ``model.most_similar(positive=...)``, in order."""
        ranked = model.most_similar(positive=positive)
        return [term for term, _ in ranked]

    def guess(self, words):
        """Return the model's neighbours of all hint ``words`` taken together."""
        return self._neighbours(words)

    def explain(self, word):
        """Return the model's neighbours of the single ``word``."""
        return self._neighbours([word])

    def guess_result(self, position):
        """Feedback hook for a guessing attempt; this player ignores it."""
        pass

    def explanation_result(self, position):
        """Feedback hook for an explaining attempt; this player ignores it."""
        pass

In [None]:
# Scratch cell (never executed): top five keys of a dict `d`, highest value first.
# The original `key=.get` was a SyntaxError, and `d` is not defined in the
# visible part of this notebook, so the corrected expression stays commented
# out to keep "Restart & Run All" working.
# sorted(d, key=d.get, reverse=True)[:5]

In [231]:
# Vocabulary of the trained model: a sequence for random sampling in play()
# and a set for the membership filter in calc_score().
words_universe = model.vocab.keys()
words_universe_set = set(words_universe)

In [232]:
def calc_score(player1, player2, word):
    """Score one round in which ``player1`` explains ``word`` and ``player2`` guesses.

    The explanation is the first 10 words of ``player1.explain(word)`` that
    are present in ``words_universe_set``. ``player2`` then makes ten
    attempts, seeing the first 1, 2, ..., 10 explanation words; when the
    explanation is shorter than 10 words the later attempts repeat the full
    explanation. Only the first 10 guesses of each attempt count. A hit at
    index ``pos`` adds ``0.9 ** pos`` to both scores. The explainer loses 1.0
    if the explanation contains ``word`` itself.

    After every attempt both players are notified through
    ``explanation_result`` / ``guess_result`` with the hit index, or with
    ``None`` on a miss.

    Returns:
        Tuple ``(explainer_score, guesser_score)`` of floats.
    """
    hints = [w for w in player1.explain(word) if w in words_universe_set][:10]
    exp_score = -1.0 if word in hints else 0.
    guess_score = 0.
    if len(hints) == 0:
        # Nothing usable to show the guesser: no attempts, no callbacks.
        return exp_score, guess_score

    for prefix_len in xrange(1, 11):
        ranked = player2.guess(hints[:prefix_len])[:10]
        try:
            pos = ranked.index(word)
            player1.explanation_result(pos)
            player2.guess_result(pos)
            reward = 0.9 ** pos
            guess_score += reward
            exp_score += reward
        except ValueError:
            player1.explanation_result(None)
            player2.guess_result(None)
    return exp_score, guess_score


def play(player1, player2, rounds, seed=42):
    """Let two players play ``rounds`` rounds and return their accumulated scores.

    In each round a word is drawn from ``words_universe`` with a
    ``RandomState(seed)`` generator and scored twice via ``calc_score``:
    once with ``player1`` explaining to ``player2``, once the other way round.

    Args:
        player1, player2: objects passed through to ``calc_score``.
        rounds: number of words to play.
        seed: seed of the word-sampling generator (default 42).

    Returns:
        ``((p1_explain, p1_guess, p1_total), (p2_explain, p2_guess, p2_total))``.
    """
    random_gen = np.random.RandomState(seed)
    player1_exp_score = 0.
    player1_guess_score = 0.
    player2_exp_score = 0.
    player2_guess_score = 0.
    for _ in xrange(rounds):
        word = random_gen.choice(words_universe)
        # calc_score asks the explainer for its explanation itself, so no
        # separate explain() calls are needed here.
        player1_exp, player2_guess = calc_score(player1, player2, word)
        player2_exp, player1_guess = calc_score(player2, player1, word)

        player1_exp_score += player1_exp
        player2_exp_score += player2_exp
        player1_guess_score += player1_guess
        player2_guess_score += player2_guess

    return (
        (player1_exp_score, player1_guess_score, player1_guess_score + player1_exp_score),
        (player2_exp_score, player2_guess_score, player2_guess_score + player2_exp_score)
    )

In [285]:

%%time
# 1000 rounds: mixed player against the word2vec-only player.
# Each result tuple is (explain score, guess score, total), player 1 first.
play(mixed(), GensimHatPlayer(), 1000)

CPU times: user 2min 47s, sys: 1.69 s, total: 2min 48s
Wall time: 1min 25s


((2627.0703817790027, 3238.650122383999, 5865.720504163002),
 (3238.650122383999, 2627.0703817790027, 5865.720504163002))

In [240]:
# Probe with a made-up token: nltk prints "No matches" when it finds no similar words.
text.similar('sdas')

No matches


In [282]:
%%time
# 1000 rounds: mixed player against the nltk context-index player.
# Each result tuple is (explain score, guess score, total), player 1 first.
play(mixed(), NltkHatPlayer(), 1000)

CPU times: user 8min 18s, sys: 6.5 s, total: 8min 25s
Wall time: 4min 19s


((817.2088766669998, 869.6168147179999, 1686.8256913849996),
 (869.6168147179999, 817.2088766669998, 1686.8256913849996))

In [243]:
%%time
# 1000 rounds: word2vec-only player against the nltk context-index player.
# Each result tuple is (explain score, guess score, total), player 1 first.
play(GensimHatPlayer(), NltkHatPlayer(), 1000)

CPU times: user 48.3 s, sys: 1.19 s, total: 49.5 s
Wall time: 25.7 s


((130.590049074, 135.362607879, 265.952656953),
 (135.362607879, 130.590049074, 265.952656953))