In [1]:
from keybert import KeyBERT

# Sample passage (a short description of supervised learning) that is passed to
# `kw_model.extract_keywords` later in this cell as the demo input document.
doc = """
         Supervised learning is the machine learning task of learning a function that
         maps an input to an output based on example input-output pairs. It infers a
         function from labeled training data consisting of a set of training examples.
         In supervised learning, each example is a pair consisting of an input object
         (typically a vector) and a desired output value (also called the supervisory signal).
         A supervised learning algorithm analyzes the training data and produces an inferred function,
         which can be used for mapping new examples. An optimal scenario will allow for the
         algorithm to correctly determine the class labels for unseen instances. This requires
         the learning algorithm to generalize from the training data to unseen situations in a
         'reasonable' way (see inductive bias).
      """

# word embedding tutorial:
# https://github.com/MaartenGr/KeyBERT/blob/master/docs/guides/embeddings.md

import gensim.downloader as api
import spacy

# --- Backend 1: spaCy transformer pipeline ---------------------------------
# installation: https://spacy.io/usage
spacy.prefer_gpu()
# The components named in `exclude` are not loaded into the pipeline.
nlp = spacy.load("en_core_web_trf", exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'])
kw_model_spacy = KeyBERT(model=nlp)

# --- Backend 2: gensim pre-trained word vectors ----------------------------
# list of available models: https://github.com/RaRe-Technologies/gensim-data
ft = api.load('fasttext-wiki-news-subwords-300')
kw_model_gensim = KeyBERT(model=ft)

# Previously both backends were assigned to `kw_model` in turn, so the spaCy
# instance was silently thrown away right after being built. Each backend now
# has its own name; `kw_model` stays bound to the gensim-backed instance, which
# is what it ended up as before, so cells that use `kw_model` are unaffected.
kw_model = kw_model_gensim

keywords = kw_model.extract_keywords(doc)

print(keywords)

  from .autonotebook import tqdm as notebook_tqdm
  incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask


[('labeled', 0.7936), ('situations', 0.7866), ('instances', 0.786), ('examples', 0.7758), ('algorithm', 0.7721)]


In [9]:
# Imports for the data-loading step (stdlib first, then third-party).
import random
from pathlib import Path

import pandas as pd

# Seed Python's global `random` generator so any later use of it is repeatable.
SEED = 40
random.seed(SEED)

# NOTE(review): `p` is not used in the cells visible here; presumably a
# sampling fraction for a later step — confirm or remove.
p = 0.1

# Load the reviews table from a CSV file located relative to the working directory.
dataset_path = Path('dataset.csv')
review = pd.read_csv(dataset_path)

In [18]:
# Cast the review text column to `str` so the `.str` accessor used for the
# sort key below applies to every row.
review["review_text"] = review["review_text"].astype("str")

# Order reviews by text length, longest first. The `key` callable receives the
# whole column and returns per-row character counts to sort on.
review_sorted = review.sort_values(
    by="review_text",
    key=lambda texts: texts.str.len(),
    ascending=False,
)


In [27]:
# Display rows at positions 11 through 29 of the length-sorted frame
# (i.e. skip the 11 longest reviews, then show the next 19).
review_sorted.iloc[11:30]

Unnamed: 0,app_id,app_name,review_text,review_score,review_votes
5772010,483980,Mad Father,Mad Father is a Wolf Editor Horror Game (think...,1,1
5934657,527710,Crown Champion: Legends of the Arena,This game is beyond awesome. It has SO much po...,1,0
2512879,246090,Spacebase DF-9,"When I first saw it, I was amazed. I love Dwar...",-1,0
2512868,246090,Spacebase DF-9,"When I first saw it, I was amazed. I love Dwar...",-1,0
2316987,242050,Assassin's Creed IV Black Flag,Let me first say that this is a terrible port....,1,1
5761496,48190,Assassin's Creed Brotherhood,Assassin's Creed: Brotherhood is the second ch...,1,0
2150631,238750,Might & Magic X - Legacy,Greets. To anyone who have doubts about purcha...,-1,1
2609879,24980,Mass Effect 2 (2010),I have a hard time speaking in a balanced mann...,1,1
2609897,24980,Mass Effect 2 (2010),I have a hard time speaking in a balanced mann...,1,1
4857956,374320,DARK SOULS™ III,This review won't focus much on story since ...,1,0


In [24]:
# Show the text of the review whose index label is 4911650. A single
# (row, column) `.loc` lookup avoids building the whole row first, as the
# chained `.loc[...][...]` form did.
review_sorted.loc[4911650, 'review_text']

"'Thea' is definitely good for several play-throughs, and worth the money at full price.&nbsp;The quest writing, the music and the voice talent are all really, really excellent. I give it 4 out of 5 stars. The game has the potential of becoming a classic, 5 stars, a really great game, if they add more quests and if they balance the early, mid- and end-game better.&nbsp;By the end-game, I am winning all fights/quests easily on auto-resolve, however this may be due to the AI fighting my side using the best card combos? Also, it would be great if you could start play as the other races, e.g., Orcs, Goblins, Elves and Dwarves and not always have to start as humans (this would requiring play balancing as the other races are typically stronger and end-game characters). Try it, you will like it if you like games like Civ, and especially if you like card combat games.  I have played 'Thea' all the way through three times now.&nbsp;It took me about a day each time, but I was progressively makin

In [26]:
# First long-review sample: fetch its text by index label, report its length
# in characters, then extract keywords from it with the KeyBERT model.
# The text is stored as `sample_1` to match `keywords_sample_1`; it used to be
# held in `sample_2`, a name the next sample also uses.
sample_1 = review_sorted.loc[4911650, 'review_text']

print(len(sample_1))

keywords_sample_1 = kw_model.extract_keywords(sample_1)

print(keywords_sample_1)

8173
[('quest', 0.3093), ('thea', 0.3045), ('rpg', 0.2822), ('quests', 0.2803), ('combat', 0.2363)]


In [28]:
# Second long-review sample: fetch its text by index label with one
# (row, column) `.loc` lookup rather than chained indexing.
sample_2 = review_sorted.loc[1031183, 'review_text']

# Print the full review text followed by its length in characters.
print(sample_2)
print(len(sample_2))

# Extract keywords from the review with the KeyBERT model and show them.
keywords_sample_2 = kw_model.extract_keywords(sample_2)

print(keywords_sample_2)

8015
[('phalanx', 0.4516), ('tactics', 0.3749), ('flank', 0.3249), ('armies', 0.317), ('battle', 0.3087)]
