In [1]:
import numpy as np
import pandas as pd

from datasets import load_dataset, Split
from nltk import FreqDist
from common.util import preprocess, create_co_matrix, most_similar, ppmi

from common.util import compute_ppmi_matrix
from common.gensim_fns import *

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train_dataset = load_dataset(path='csv', data_files='dataset/yelp/train_light.csv',
                             quotechar='"', split=Split.TRAIN)

test_dataset = load_dataset(path='csv', data_files='dataset/yelp/test_light.csv',
                            quotechar='"', split=Split.TRAIN)

Generating train split: 45500 examples [00:00, 167081.21 examples/s]
Generating train split: 9750 examples [00:00, 178839.29 examples/s]


In [3]:
train_dataset, test_dataset

(Dataset({
     features: ['rating', 'review', 'split'],
     num_rows: 45500
 }),
 Dataset({
     features: ['rating', 'review', 'split'],
     num_rows: 9750
 }))

In [4]:
train_dataset.unique('rating')

[1, 2, 3, 4, 5]

In [5]:
if False:
    def is_positive_label(row):
        is_positive = int(row['rating'] >= 4)
        return {'labels': is_positive}

    train_dataset = train_dataset.map(is_positive_label)
    test_dataset = test_dataset.map(is_positive_label)

    make_vocab(train_dataset['review'], 'dataset/yelp/', 
            special_tokens=['[UNK]', '[SEP]', '[CLS]'],
            min_freq=2)

In [6]:
docs = train_dataset['review']
print(type(docs))
print()


tokens = [simple_preprocess(doc) for doc in docs]
print(type(tokens))
print()


dictionary = corpora.Dictionary(tokens)
print(type(dictionary))
print()


vocab = list(dictionary.token2id.keys())

<class 'list'>

<class 'list'>

<class 'gensim.corpora.dictionary.Dictionary'>



In [7]:
len(docs)

45500

In [8]:
len(tokens)

45500

In [9]:
dictionary.token2id['delicious'], dictionary.token2id['mediocre'], dictionary.token2id['positive'], dictionary.token2id['negative']

(3000, 2172, 1098, 2604)

In [10]:
len(vocab)

56276

In [11]:
word_freq = FreqDist([word for review in tokens for word in review])

window_size = 2
co_occurrences = {}
pmis = {}

for review in tokens:
    for i in range(len(review)):
        for j in range(i + 1, min(i + window_size + 1, len(review))):
            pair = (review[i], review[j])
            co_occurrences[pair] = co_occurrences.get(pair, 0) + 1

total_co = sum(co_occurrences.values())
for (word_i, word_j), count in co_occurrences.items():
    p_i = word_freq[word_i] / total_co
    p_j = word_freq[word_j] / total_co
    p_ij = count / total_co
    pmi = np.log2(p_ij / (p_i * p_j))
    pmis[(word_i, word_j)] = pmi

In [12]:
vocab_size = len(vocab)
co_matrix = np.zeros((vocab_size, vocab_size), dtype=np.int16)
word_to_id = dictionary.token2id

for (word_i, word_j), count in co_occurrences.items():
    i, j = word_to_id[word_i], word_to_id[word_j]
    co_matrix[i, j] = count
    co_matrix[j, i] = count

co_matrix

array([[8, 5, 0, ..., 0, 0, 0],
       [5, 1, 1, ..., 0, 0, 0],
       [0, 1, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int16)

In [13]:
pd.DataFrame(co_matrix, index=vocab, columns=vocab)

Unnamed: 0,been,believe,cannot,care,days,for,has,havent,is,life,...,fsg,knave,smlz,duane,iplanned,ingersoll,rollerblades,vacuous,grieved,manacotti
been,8,5,0,2,2,9,11,11,15,1,...,0,0,0,0,0,0,0,0,0,0
believe,5,1,1,0,1,11,1,1,7,1,...,0,0,0,0,0,0,0,0,0,0
cannot,0,1,1,2,1,7,0,0,2,0,...,0,0,0,0,0,0,0,0,0,0
care,2,0,2,2,1,24,3,1,34,2,...,0,0,0,0,0,0,0,0,0,0
days,2,1,1,1,10,66,2,0,9,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ingersoll,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
rollerblades,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
vacuous,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
grieved,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
ppmi_matrix = compute_ppmi_matrix(co_occurrences, word_freq, 
                                  len(vocab), word_to_id)

np.set_printoptions(precision=2)
ppmi_matrix

array([[0.  , 1.6 , 0.  , ..., 0.  , 0.  , 0.  ],
       [1.6 , 2.71, 3.6 , ..., 0.  , 0.  , 0.  ],
       [0.  , 3.6 , 4.48, ..., 0.  , 0.  , 0.  ],
       ...,
       [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ]], dtype=float16)

In [15]:
pd.DataFrame(ppmi_matrix, index=vocab, columns=vocab)

Unnamed: 0,been,believe,cannot,care,days,for,has,havent,is,life,...,fsg,knave,smlz,duane,iplanned,ingersoll,rollerblades,vacuous,grieved,manacotti
been,0.00000,1.596680,0.000000,0.000000,0.000000,0.000000,0.000000,7.968750,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
believe,1.59668,2.714844,3.599609,0.000000,2.068359,0.424561,0.000000,7.949219,0.000000,2.925781,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
cannot,0.00000,3.599609,4.484375,3.564453,2.953125,0.657715,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
care,0.00000,0.000000,3.564453,1.645508,1.033203,0.515625,0.419922,6.914062,1.013672,2.890625,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
days,0.00000,2.068359,2.953125,1.033203,4.742188,2.363281,0.222778,0.000000,0.000000,2.279297,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ingersoll,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,7.281250,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
rollerblades,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
vacuous,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
grieved,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# SVD
from sklearn.utils.extmath import randomized_svd

wordvec_size = 100
U, S, V = randomized_svd(ppmi_matrix, n_components=wordvec_size, 
                         n_iter='auto', random_state=42)

In [17]:
word_vecs = U[:, :wordvec_size]

In [26]:
querys = ['positive', 'negative', 'expensive', 'cheap']
for query in querys:
    most_similar(query, word_to_id, vocab, word_vecs, top=5)


[query] positive
 negative: 0.7555453181266785
 complaints: 0.6026131510734558
 attention: 0.5990016460418701
 poor: 0.5978432297706604
 lacking: 0.5527915954589844

[query] negative
 positive: 0.7555453181266785
 rating: 0.7044532895088196
 complaint: 0.6831194758415222
 reviewers: 0.6232730746269226
 reviews: 0.6072635054588318

[query] expensive
 priced: 0.7846865653991699
 overpriced: 0.7298570275306702
 high: 0.7000605463981628
 cheap: 0.6866163015365601
 reasonable: 0.6536356210708618

[query] cheap
 priced: 0.7016828656196594
 expensive: 0.6866163015365601
 inexpensive: 0.6564647555351257
 overpriced: 0.646662175655365
 high: 0.6287281513214111
