In [49]:
from collections import namedtuple, OrderedDict, defaultdict
import gensim.utils 
import pandas as pd 
import numpy as np 
import logging

import statsmodels.api as sm
from random import sample

import multiprocessing

import gensim.models.doc2vec
assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"
from gensim.models.doc2vec import Doc2Vec
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [180]:
import glob 
review_file = "/Users/jakubmojsiejuk/Documents/agh/filmweb-nlp/reviews_for_bert*.csv"

reviews = pd.DataFrame()
for rev_file in glob.glob(review_file):
    r = pd.read_csv(rev_file)
    reviews = pd.concat([reviews, r])
    
reviews = reviews.loc[(reviews['rating'] <= 3) | (reviews['rating'] >= 8)]
# msk = np.random.rand(len(radical_reviews)) < 0.7
# radical_reviews['sent'] = radical_reviews['rating'].apply(lambda x: 'pos' if x > 5 else 'neg')
# X_train, y_train = radical_reviews['content'][msk].tolist(), radical_reviews['sent'][msk].tolist()
# X_test, y_test = radical_reviews['content'][~msk].tolist(), radical_reviews['sent'][~msk].tolist()

In [189]:
TRAIN_TEST_RATIO = 0.7

def create_sentiment_document(review):
    split = 'train' if np.random.rand() < TRAIN_TEST_RATIO else 'test'
    index = review.id
    sentiment = int(review['rating'] > 5)
#     sentiment = review['rating']
    tokens = gensim.utils.to_unicode(review['content']).split()
    return SentimentDoc(tokens, [index], split, sentiment)


reviews['id'] = range(1, len(reviews)+1)
SentimentDoc = namedtuple('sentiment_doc', 'words tags split sentiment')

reviews['docs'] = reviews.apply(create_sentiment_document, axis=1)


# msk = np.random.rand(len(reviews)) < TRAIN_TEST_RATIO
# train = reviews['docs'][msk].tolist()
# test = reviews['docs'][~msk].tolist()
all_docs = reviews['docs'].tolist()
train_docs = [doc for doc in all_docs if doc.split == 'train']
test_docs = [doc for doc in all_docs if doc.split == 'test']

In [190]:
print(len(train_docs), len(test_docs), len(all_docs))

945 413 1358


In [191]:
common_kwargs = dict(
    vector_size=200, epochs=30, min_count=15,
    sample=0, workers=multiprocessing.cpu_count(), negative=5, hs=1,
)

simple_models = [
    # PV-DBOW plain
    Doc2Vec(dm=0, **common_kwargs),
    # PV-DM w/ default averaging; a higher starting alpha may improve CBOW/PV-DM modes
    Doc2Vec(dm=1, window=10, alpha=0.05, comment='alpha=0.05', **common_kwargs),
    # PV-DM w/ concatenation - big, slow, experimental mode
    # window=5 (both sides) approximates paper's apparent 10-word total window size
    Doc2Vec(dm=1, dm_concat=1, window=5, **common_kwargs),
]

for model in simple_models:
    model.build_vocab(all_docs)
    print("%s vocabulary scanned & state initialized" % model)

models_by_name = OrderedDict((str(model), model) for model in simple_models)

Doc2Vec(dbow,d200,n5,hs,mc15,t8) vocabulary scanned & state initialized
Doc2Vec("alpha=0.05",dm/m,d200,n5,hs,w10,mc15,t8) vocabulary scanned & state initialized
Doc2Vec(dm/c,d200,n5,hs,w5,mc15,t8) vocabulary scanned & state initialized


In [192]:
models_by_name['dbow+dmm'] = ConcatenatedDoc2Vec([simple_models[0], simple_models[1]])
models_by_name['dbow+dmc'] = ConcatenatedDoc2Vec([simple_models[0], simple_models[2]])

In [193]:
logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)

In [194]:
from sklearn.linear_model import LogisticRegression
def logistic_predictor_from_data(train_targets, train_regressors):
    """Fit a statsmodel logistic predictor on supplied data"""
    logit = sm.Logit(train_targets, train_regressors)
    try:
        predictor = logit.fit(disp=0, method='bfgs')
    except Exception as e:
        print(f"ERROR encountered {e}")
        print(train_targets)
        return None
#     print(predictor.summary())
    return predictor

def multinomial_predictor_from_data(train_targets, train_regressors):
    """Fit a statsmodel logistic predictor on supplied data"""
    clf = LogisticRegression(random_state=-0).fit(train_targets, train_regressors)
    return clf 
#     logit = sm.MNLogit(train_targets, train_regressors)
#     try:
#         predictor = logit.fit(disp=0, method='bfgs')
#     except Exception as e:
#         print(f"ERROR encountered {e}")
#         print(train_targets)
#         return None
# #     print(predictor.summary())
#     return predictor

def error_rate_for_model(test_model, train_set, test_set):
    """Report error rate on test_doc sentiments, using supplied model and train_docs"""

    train_targets = [doc.sentiment for doc in train_set]
    train_regressors = [test_model.docvecs[doc.tags[0]] for doc in train_set]
    train_regressors = sm.add_constant(train_regressors)
    predictor = logistic_predictor_from_data(train_targets, train_regressors)

    test_regressors = [test_model.docvecs[doc.tags[0]] for doc in test_set]
    test_regressors = sm.add_constant(test_regressors)

#     if predictor is None:
#         return (1.0, len(test_regressors), len(test_regressors), None)
    
#     Predict & evaluate
    test_predictions = predictor.predict(test_regressors)
#     p = np.rint(test_predictions)
#     clf = LogisticRegression(random_state=0, solver='lbfgs', max_iter=200, penalty='l1',
#                              multi_class='multinomial').fit(train_regressors, train_targets)
#     print(clf.score(train_regressors, train_targets))
    corrects = sum(np.rint(test_predictions) == [doc.sentiment for doc in test_set])
    errors = len(test_predictions) - corrects
    error_rate = float(errors) / len(test_predictions)
    return (error_rate, errors, len(test_predictions), predictor)

In [195]:
from collections import defaultdict
error_rates = defaultdict(lambda: 1.0) 

from random import shuffle
shuffled_alldocs = all_docs[:]
shuffle(shuffled_alldocs)

for model in simple_models:
    print("Training %s" % model)
    model.train(shuffled_alldocs, total_examples=len(shuffled_alldocs), epochs=model.epochs)

    print("\nEvaluating %s" % model)
    err_rate, err_count, test_count, predictor = error_rate_for_model(model, train_docs, test_docs)
    error_rates[str(model)] = err_rate
    print("\n%f %s\n" % (err_rate, model))

for model in [models_by_name['dbow+dmm'], models_by_name['dbow+dmc']]:
    print("\nEvaluating complex %s" % model)
    err_rate, err_count, test_count, predictor = error_rate_for_model(model, train_docs, test_docs)
    error_rates[str(model)] = err_rate
    print("\n%f %s\n" % (err_rate, model))

Training Doc2Vec(dbow,d200,n5,hs,mc15,t8)

Evaluating Doc2Vec(dbow,d200,n5,hs,mc15,t8)

0.184019 Doc2Vec(dbow,d200,n5,hs,mc15,t8)

Training Doc2Vec("alpha=0.05",dm/m,d200,n5,hs,w10,mc15,t8)





Evaluating Doc2Vec("alpha=0.05",dm/m,d200,n5,hs,w10,mc15,t8)

0.259080 Doc2Vec("alpha=0.05",dm/m,d200,n5,hs,w10,mc15,t8)

Training Doc2Vec(dm/c,d200,n5,hs,w5,mc15,t8)





Evaluating Doc2Vec(dm/c,d200,n5,hs,w5,mc15,t8)

0.370460 Doc2Vec(dm/c,d200,n5,hs,w5,mc15,t8)


Evaluating complex Doc2Vec(dbow,d200,n5,hs,mc15,t8)+Doc2Vec("alpha=0.05",dm/m,d200,n5,hs,w10,mc15,t8)





0.246973 Doc2Vec(dbow,d200,n5,hs,mc15,t8)+Doc2Vec("alpha=0.05",dm/m,d200,n5,hs,w10,mc15,t8)


Evaluating complex Doc2Vec(dbow,d200,n5,hs,mc15,t8)+Doc2Vec(dm/c,d200,n5,hs,w5,mc15,t8)

0.208232 Doc2Vec(dbow,d200,n5,hs,mc15,t8)+Doc2Vec(dm/c,d200,n5,hs,w5,mc15,t8)





In [196]:
print("Err_rate Model")
for rate, name in sorted((rate, name) for name, rate in error_rates.items()):
    print("%f %s" % (rate, name))

Err_rate Model
0.184019 Doc2Vec(dbow,d200,n5,hs,mc15,t8)
0.208232 Doc2Vec(dbow,d200,n5,hs,mc15,t8)+Doc2Vec(dm/c,d200,n5,hs,w5,mc15,t8)
0.246973 Doc2Vec(dbow,d200,n5,hs,mc15,t8)+Doc2Vec("alpha=0.05",dm/m,d200,n5,hs,w10,mc15,t8)
0.259080 Doc2Vec("alpha=0.05",dm/m,d200,n5,hs,w10,mc15,t8)
0.370460 Doc2Vec(dm/c,d200,n5,hs,w5,mc15,t8)


In [141]:
doc_id = np.random.randint(simple_models[0].docvecs.count)  # Pick random doc; re-run cell for more examples
print('for doc %d...' % doc_id)
for model in simple_models:
    inferred_docvec = model.infer_vector(all_docs[doc_id].words)
    print('%s:\n %s' % (model, model.docvecs.most_similar([inferred_docvec], topn=3)))

for doc 16...
Doc2Vec(dbow,d200,n5,hs,mc15,t8):
 [(17, 0.9894669055938721), (3223, 0.30570924282073975), (230, 0.3021594285964966)]
Doc2Vec("alpha=0.05",dm/m,d200,n5,hs,w10,mc15,t8):
 [(17, 0.95763099193573), (1876, 0.3645714223384857), (4726, 0.3634454607963562)]
Doc2Vec(dm/c,d200,n5,hs,w5,mc15,t8):
 [(17, 0.9145972728729248), (1232, 0.2573408782482147), (2882, 0.2285965383052826)]


In [99]:
import random

doc_id = np.random.randint(simple_models[0].docvecs.count)  # pick random doc, re-run cell for more examples
model = random.choice(simple_models)  # and a random model
sims = model.docvecs.most_similar(doc_id, topn=model.docvecs.count)  # get *all* similar documents
print(u'TARGET (%d): «%s»\n' % (doc_id, ' '.join(all_docs[doc_id].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    s = sims[index]
    i = sims[index][0]
    words = ' '.join(all_docs[i].words)
    print(u'%s %s: «%s»\n' % (label, s, words))

TARGET (3146): «"Green Hornet 3D" to filmowy hołd złożony wszystkim tym, którzy nie chcą dorosnąć, którzy marzą o niesamowitych przygodach, bohaterskich wyczynach i zabawkach od których włosy staną dęba. Zamiast standardowej adaptacji komiksu twórcy postanowili zrobić film fanowski i o fanach, zarazić widzów entuzjazmem do kolorowych historyjek, które rozbudzają wyobraźnię milionów chłopców na całym świecie. Ogólny zarys fabuły jest rzecz jasna wzięty z komiksu o Zielonym Szerszeniu. Brit Reid jest zepsutym dziedzicem medialnej fortuny, dla którego życie to jedna, nieustająca balanga. Zmienia się to, kiedy jego ojciec umiera w wyniku reakcji uczuleniowej na pszczeli jad. Wtedy Brit poznaje mechanika ojca, Kato, w którym odnajduje bratnią duszę. Razem wyruszają nocą na wyprawę, która ma być dziecinnym dowcipem, a kończy się narodzinami superbohatera udającego superzłoczyńcę. Panowie nie wzięli pod uwagę, że ta posada jest już zajęta, co będzie miało swoje brutalnie bolesne konsekwencje.

In [102]:
import random

word_models = simple_models[:]

def pick_random_word(model, threshold=10):
    # pick a random word with a suitable number of occurences
    while True:
        word = random.choice(model.wv.index2word)
        if model.wv.vocab[word].count > threshold:
            return word

target_word = pick_random_word(word_models[0])
# or uncomment below line, to just pick a word from the relevant domain:
# target_word = 'comedy/drama'

for model in word_models:
    print('target_word: %r model: %s similar words:' % (target_word, model))
    for i, (word, sim) in enumerate(model.wv.most_similar(target_word, topn=10), 1):
        print('    %d. %.2f %r' % (i, sim, word))
    print()

target_word: 'wszelką' model: Doc2Vec(dbow,d100,n5,mc2,t8) similar words:
    1. 0.43 'ludzie:'
    2. 0.41 'zamknięty'
    3. 0.40 'określili),'
    4. 0.38 '(MGM)Screen'
    5. 0.38 '9.'
    6. 0.38 'świętej'
    7. 0.38 'Zawstydzony'
    8. 0.38 'szukasz'
    9. 0.38 'dryfującym'
    10. 0.37 'rozróba'

target_word: 'wszelką' model: Doc2Vec("alpha=0.05",dm/m,d100,n5,w10,mc2,t8) similar words:
    1. 0.61 'cenę'
    2. 0.50 'zapłacić'
    3. 0.48 'odpowiedzialność'
    4. 0.47 'przestrzec'
    5. 0.47 'młodu'
    6. 0.46 'kraty'
    7. 0.46 'sądem'
    8. 0.46 'stylizację,'
    9. 0.46 'karę'
    10. 0.44 'uchronić'

target_word: 'wszelką' model: Doc2Vec(dm/c,d100,n5,w5,mc2,t8) similar words:
    1. 0.72 'nieudolną'
    2. 0.70 'granicą'
    3. 0.69 'efektowną,'
    4. 0.69 'dobrą'
    5. 0.69 'fasadą'
    6. 0.68 'sprayem'
    7. 0.68 'wiktoriańską'
    8. 0.67 'sprzeniewierzonymi'
    9. 0.67 'dotykiem,'
    10. 0.67 'cudzą'

