# User Review Corpus from RPG Games Published in Steam

## Scraping Data

In [1]:
from bs4 import BeautifulSoup
import requests

def get_reviews(appid, params={'json':1}, lang="english", filter="toprated"):
        url=f'https://store.steampowered.com/appreviews/{appid}'
        response = requests.get(url=url, params=params)
        return response.json()
    
def get_n_reviews(appid, n):
    reviews = []
    cursor = '*'
    params = {
            'json' : 1,
            'filter' : 'recent',
            'language' : 'english',
            'day_range' : 28,
            'review_type' : 'all',
            'purchase_type' : 'steam'
            }

    while n > 0:
        params['cursor'] = cursor.encode()
        params['num_per_page'] = 100
        n -= 1

        response = get_reviews(str(appid))
        cursor = response['cursor']
        reviews += response['reviews']

        if len(response['reviews']) < 100: break

    return reviews

n=100
reviews = get_n_reviews(1086940,100)

## Text Vectorization Process

In [2]:
import re, string, unicodedata
 
def remove_non_ascii(text):
    return re.sub(r'[^\x00-\x7F]+', '', text)

documents = []

def tokenize(s):
    s = remove_non_ascii(s)
    regex = re.compile('[%s]' % re.escape(string.punctuation))
    res = regex.sub('', s)
    # res = res.split()
    return res

for review in reviews:
    # clean_review = tokenize(review['review'])
    # for clean in clean_review:
        # documents.append(clean)
    clean = remove_non_ascii(review['review'])
    documents.append(tokenize(clean))

documents

['Its DnD for people with no friends',
 'Karlach',
 'Im not any fancy reviewer however I am a gamer and consumer like the rest of you kindly requesting you offer a few minutes to hear what I have to share\nLet me start being brief Larian Studios has blown me away\n\nFor some context of who I am and where I come from I have loved the divinity games for a few years now and was hyped for them being announced to make the successor to Balldurs Gate I  II \nI am a also a Gamemaster and have been for Dungeons  Dragons nearly fourteen years now Ive been in the DnD sphere of games and RPGs since the beginnings of 2nd Edition when I was really young I have experienced each system across the board numerous titles accompanying the array of editions published \n\nTo speak of BG III itself What Larian did here not only blew away my expectations as a DD experience but honored many of the greatest markers of DD and RPG and CRPG history that can be found as DD has evolved over its many years What Laria

In [3]:
from gensim import corpora
import spacy, re, string
from spacy.tokens import Token

tag_getter = lambda token: token.text in ("h1", "h2", "i", "\n", "\n\n", "\u3000", "\u3000\u3000", '\n\u3000')
Token.set_extension("is_tag", getter=tag_getter, force=True)
nlp = spacy.load('en_core_web_sm')

texts = []
for document in documents:
    text = []
    doc = nlp(document)
    for w in doc:
        if not w.is_stop and not w.is_punct and not w.like_num and not w._.is_tag:
            text.append(w.lemma_.strip())
    texts.append(text)
print(texts)

[['dnd', 'people', 'friend'], ['Karlach'], ['m', 'fancy', 'reviewer', 'gamer', 'consumer', 'like', 'rest', 'kindly', 'request', 'offer', 'minute', 'hear', 'share', 'let', 'start', 'brief', 'Larian', 'Studios', 'blow', 'away', 'context', 'come', 'love', 'divinity', 'game', 'year', 'hype', 'announce', 'successor', 'Balldurs', 'Gate', '', 'II', 'Gamemaster', 'dungeon', '', 'dragon', 'nearly', 'year', 've', 'DnD', 'sphere', 'game', 'rpg', 'beginning', 'Edition', 'young', 'experience', 'system', 'board', 'numerous', 'title', 'accompany', 'array', 'edition', 'publish', 'speak', 'BG', 'III', 'Larian', 'blow', 'away', 'expectation', 'dd', 'experience', 'honor', 'great', 'marker', 'DD', 'RPG', 'CRPG', 'history', 'find', 'dd', 'evolve', 'year', 'Larian', 'create', 'perfect', 'gateway', 'Baldurs', 'gate', 'old', 'classic', 'fervor', 'easy', 'accessibility', 'edition', 'DD', 'Baldurs', 'Gate', 'III', 'impossible', 'perfect', 'place', 'begin', 'play', 'DD', 'like', 'feel', 'like', 'access', 'dd', '

In [4]:
dictionary = corpora.Dictionary(texts)
print(dictionary.token2id)

{'dnd': 0, 'friend': 1, 'people': 2, 'Karlach': 3, '': 4, 'BG': 5, 'Baldurs': 6, 'Balldurs': 7, 'CRPG': 8, 'DD': 9, 'DnD': 10, 'Edition': 11, 'Gamemaster': 12, 'Gaming': 13, 'Gate': 14, 'II': 15, 'III': 16, 'Larian': 17, 'RPG': 18, 'Studios': 19, 'access': 20, 'accessibility': 21, 'accompany': 22, 'adoration': 23, 'adore': 24, 'ago': 25, 'amount': 26, 'announce': 27, 'array': 28, 'away': 29, 'begin': 30, 'beginning': 31, 'blow': 32, 'board': 33, 'brief': 34, 'captivate': 35, 'care': 36, 'catch': 37, 'category': 38, 'choose': 39, 'classic': 40, 'close': 41, 'closely': 42, 'come': 43, 'comfortable': 44, 'consumer': 45, 'context': 46, 'create': 47, 'creation': 48, 'curate': 49, 'dd': 50, 'deal': 51, 'developer': 52, 'difficult': 53, 'divinity': 54, 'dragon': 55, 'dungeon': 56, 'easy': 57, 'edition': 58, 'enjoy': 59, 'evolve': 60, 'excruciating': 61, 'exemplary': 62, 'expectation': 63, 'experience': 64, 'eye': 65, 'fan': 66, 'fancy': 67, 'feel': 68, 'fervor': 69, 'finally': 70, 'find': 71,

## Utilizing Doc2Bow to Convert Doc to Bag of Words

In [5]:
corpus = [dictionary.doc2bow(text) for text in texts]
corpus

[[(0, 1), (1, 1), (2, 1)],
 [(3, 1)],
 [(4, 2),
  (5, 3),
  (6, 3),
  (7, 1),
  (8, 1),
  (9, 3),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 1),
  (14, 3),
  (15, 1),
  (16, 5),
  (17, 4),
  (18, 1),
  (19, 2),
  (20, 1),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 2),
  (30, 1),
  (31, 1),
  (32, 2),
  (33, 1),
  (34, 1),
  (35, 1),
  (36, 2),
  (37, 1),
  (38, 1),
  (39, 1),
  (40, 2),
  (41, 1),
  (42, 1),
  (43, 3),
  (44, 1),
  (45, 1),
  (46, 1),
  (47, 1),
  (48, 1),
  (49, 1),
  (50, 3),
  (51, 1),
  (52, 1),
  (53, 1),
  (54, 1),
  (55, 1),
  (56, 1),
  (57, 1),
  (58, 2),
  (59, 1),
  (60, 1),
  (61, 1),
  (62, 1),
  (63, 1),
  (64, 3),
  (65, 1),
  (66, 1),
  (67, 1),
  (68, 1),
  (69, 1),
  (70, 1),
  (71, 3),
  (72, 1),
  (73, 1),
  (74, 7),
  (75, 2),
  (76, 1),
  (77, 1),
  (78, 1),
  (79, 1),
  (80, 1),
  (81, 1),
  (82, 1),
  (83, 1),
  (84, 1),
  (85, 1),
  (86, 1),
  (87, 1),
  (88, 1),
  (89, 1),
  (90, 1),
  (91, 1),


## Utilizing TF-IDF

In [6]:
from gensim import models
tfidf = models.TfidfModel(corpus)

for document in tfidf[corpus]:
       print(document)

[(0, 0.7449272763999774), (1, 0.4717432314698918), (2, 0.4717432314698918)]
[(3, 1.0)]
[(4, 0.060101307189314314), (5, 0.19481514606071504), (6, 0.07829542706418977), (7, 0.06493838202023834), (8, 0.049913055222909765), (9, 0.19481514606071504), (10, 0.06493838202023834), (11, 0.06493838202023834), (12, 0.06493838202023834), (13, 0.06493838202023834), (14, 0.09015196078397147), (15, 0.06493838202023834), (16, 0.3246919101011917), (17, 0.1395509137023247), (18, 0.049913055222909765), (19, 0.08224760497078366), (20, 0.06493838202023834), (21, 0.06493838202023834), (22, 0.06493838202023834), (23, 0.06493838202023834), (24, 0.06493838202023834), (25, 0.06493838202023834), (26, 0.06493838202023834), (27, 0.06493838202023834), (28, 0.049913055222909765), (29, 0.12987676404047668), (30, 0.06493838202023834), (31, 0.049913055222909765), (32, 0.12987676404047668), (33, 0.06493838202023834), (34, 0.06493838202023834), (35, 0.06493838202023834), (36, 0.12987676404047668), (37, 0.06493838202023834

## Utilizing Bi-gramming

In [7]:
import gensim
bigram = gensim.models.Phrases(texts)
texts = [bigram[line] for line in texts]
texts

[['dnd', 'people', 'friend'],
 ['Karlach'],
 ['m',
  'fancy',
  'reviewer',
  'gamer',
  'consumer',
  'like',
  'rest',
  'kindly',
  'request',
  'offer',
  'minute',
  'hear',
  'share',
  'let',
  'start',
  'brief',
  'Larian',
  'Studios',
  'blow',
  'away',
  'context',
  'come',
  'love',
  'divinity',
  'game',
  'year',
  'hype',
  'announce',
  'successor',
  'Balldurs',
  'Gate',
  'II',
  'Gamemaster',
  'dungeon',
  'dragon',
  'nearly',
  'year',
  've',
  'DnD',
  'sphere',
  'game',
  'rpg',
  'beginning',
  'Edition',
  'young',
  'experience',
  'system',
  'board',
  'numerous',
  'title',
  'accompany',
  'array',
  'edition',
  'publish',
  'speak',
  'BG',
  'III',
  'Larian',
  'blow',
  'away',
  'expectation',
  'dd',
  'experience',
  'honor',
  'great',
  'marker',
  'DD',
  'RPG',
  'CRPG',
  'history',
  'find',
  'dd',
  'evolve',
  'year',
  'Larian',
  'create',
  'perfect',
  'gateway',
  'Baldurs',
  'gate',
  'old',
  'classic',
  'fervor',
  'easy'

In [8]:
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

In [9]:
# Save generated corpus

corpora.MmCorpus.serialize('data/gensim_corpus.mm', corpus)