# User Review Corpus from RPG Games Published in Steam

## Scraping Data

In [48]:
import requests

def get_reviews(appid, params={'json':1}, lang="english", filter="toprated"):
        url=f'https://store.steampowered.com/appreviews/{appid}'
        response = requests.get(url=url, params=params)
        return response.json()
    
def get_n_reviews(appid, n):
    reviews = []
    cursor = '*'
    params = {
            'json' : 1,
            'filter' : 'recent',
            'language' : 'english',
            'day_range' : 28,
            'review_type' : 'all',
            'purchase_type' : 'steam'
            }

    while n > 0:
        params['cursor'] = cursor.encode()
        params['num_per_page'] = 100
        n -= 1

        response = get_reviews(str(appid))
        cursor = response['cursor']
        reviews += response['reviews']

        if len(response['reviews']) < 100: break

    return reviews

n=100
reviews = get_n_reviews(1086940,100)

## Text Vectorization Process

In [49]:
import re, string
 
def remove_non_ascii(text):
    return re.sub(r'[^\x00-\x7F]+', '', text)

documents = []

def tokenize(s):
    s = remove_non_ascii(s)
    regex = re.compile('[%s]' % re.escape(string.punctuation))
    res = regex.sub('', s)
    return res

for review in reviews:
    clean = remove_non_ascii(review['review'])
    documents.append(tokenize(clean))

documents

['Its DnD for people with no friends',
 'Karlach',
 'This game ruined my life',
 'Im not any fancy reviewer however I am a gamer and consumer like the rest of you kindly requesting you offer a few minutes to hear what I have to share\nLet me start being brief Larian Studios has blown me away\n\nFor some context of who I am and where I come from I have loved the divinity games for a few years now and was hyped for them being announced to make the successor to Balldurs Gate I  II \nI am a also a Gamemaster and have been for Dungeons  Dragons nearly fourteen years now Ive been in the DnD sphere of games and RPGs since the beginnings of 2nd Edition when I was really young I have experienced each system across the board numerous titles accompanying the array of editions published \n\nTo speak of BG III itself What Larian did here not only blew away my expectations as a DD experience but honored many of the greatest markers of DD and RPG and CRPG history that can be found as DD has evolved o

In [50]:
# Saving review list in a .txt file
with open("BG3_reviews.txt", "w") as output:
    for row in documents:
        output.write(str(row) + "\n")

In [51]:
 # opening the file in read mode 
test_file = open("BG3_reviews.txt", "r") 
  
# reading the file
data = test_file.read()

# Split file elements when newline("\n") is seen
data_into_list = data.split("\n") 
print(data_into_list)

test_file.close()

['Its DnD for people with no friends', 'Karlach', 'This game ruined my life', 'Im not any fancy reviewer however I am a gamer and consumer like the rest of you kindly requesting you offer a few minutes to hear what I have to share', 'Let me start being brief Larian Studios has blown me away', '', 'For some context of who I am and where I come from I have loved the divinity games for a few years now and was hyped for them being announced to make the successor to Balldurs Gate I  II ', 'I am a also a Gamemaster and have been for Dungeons  Dragons nearly fourteen years now Ive been in the DnD sphere of games and RPGs since the beginnings of 2nd Edition when I was really young I have experienced each system across the board numerous titles accompanying the array of editions published ', '', 'To speak of BG III itself What Larian did here not only blew away my expectations as a DD experience but honored many of the greatest markers of DD and RPG and CRPG history that can be found as DD has 

In [52]:
from gensim import corpora
import spacy
from spacy.tokens import Token

tag_getter = lambda token: token.text in ("h1", "h2", "i", "\n", "\n\n", "\u3000", "\u3000\u3000", '\n\u3000')
Token.set_extension("is_tag", getter=tag_getter, force=True)
nlp = spacy.load('en_core_web_sm')

texts = []
for document in documents:
    text = []
    doc = nlp(document)
    for w in doc:
        if not w.is_stop and not w.is_punct and not w.like_num and not w._.is_tag:
            text.append(w.lemma_.strip())
    texts.append(text)
print(texts)

[['dnd', 'people', 'friend'], ['Karlach'], ['game', 'ruin', 'life'], ['m', 'fancy', 'reviewer', 'gamer', 'consumer', 'like', 'rest', 'kindly', 'request', 'offer', 'minute', 'hear', 'share', 'let', 'start', 'brief', 'Larian', 'Studios', 'blow', 'away', 'context', 'come', 'love', 'divinity', 'game', 'year', 'hype', 'announce', 'successor', 'Balldurs', 'Gate', '', 'II', 'Gamemaster', 'dungeon', '', 'dragon', 'nearly', 'year', 've', 'DnD', 'sphere', 'game', 'rpg', 'beginning', 'Edition', 'young', 'experience', 'system', 'board', 'numerous', 'title', 'accompany', 'array', 'edition', 'publish', 'speak', 'BG', 'III', 'Larian', 'blow', 'away', 'expectation', 'dd', 'experience', 'honor', 'great', 'marker', 'DD', 'RPG', 'CRPG', 'history', 'find', 'dd', 'evolve', 'year', 'Larian', 'create', 'perfect', 'gateway', 'Baldurs', 'gate', 'old', 'classic', 'fervor', 'easy', 'accessibility', 'edition', 'DD', 'Baldurs', 'Gate', 'III', 'impossible', 'perfect', 'place', 'begin', 'play', 'DD', 'like', 'feel',

In [53]:
dictionary = corpora.Dictionary(texts)
print(dictionary.token2id)

{'dnd': 0, 'friend': 1, 'people': 2, 'Karlach': 3, 'game': 4, 'life': 5, 'ruin': 6, '': 7, 'BG': 8, 'Baldurs': 9, 'Balldurs': 10, 'CRPG': 11, 'DD': 12, 'DnD': 13, 'Edition': 14, 'Gamemaster': 15, 'Gaming': 16, 'Gate': 17, 'II': 18, 'III': 19, 'Larian': 20, 'RPG': 21, 'Studios': 22, 'access': 23, 'accessibility': 24, 'accompany': 25, 'adoration': 26, 'adore': 27, 'ago': 28, 'amount': 29, 'announce': 30, 'array': 31, 'away': 32, 'begin': 33, 'beginning': 34, 'blow': 35, 'board': 36, 'brief': 37, 'captivate': 38, 'care': 39, 'catch': 40, 'category': 41, 'choose': 42, 'classic': 43, 'close': 44, 'closely': 45, 'come': 46, 'comfortable': 47, 'consumer': 48, 'context': 49, 'create': 50, 'creation': 51, 'curate': 52, 'dd': 53, 'deal': 54, 'developer': 55, 'difficult': 56, 'divinity': 57, 'dragon': 58, 'dungeon': 59, 'easy': 60, 'edition': 61, 'enjoy': 62, 'evolve': 63, 'excruciating': 64, 'exemplary': 65, 'expectation': 66, 'experience': 67, 'eye': 68, 'fan': 69, 'fancy': 70, 'feel': 71, 'fer

## Utilizing Doc2Bow to Convert Doc to Bag of Words

In [54]:
corpus = [dictionary.doc2bow(text) for text in texts]
corpus

[[(0, 1), (1, 1), (2, 1)],
 [(3, 1)],
 [(4, 1), (5, 1), (6, 1)],
 [(4, 7),
  (7, 2),
  (8, 3),
  (9, 3),
  (10, 1),
  (11, 1),
  (12, 3),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 3),
  (18, 1),
  (19, 5),
  (20, 4),
  (21, 1),
  (22, 2),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1),
  (30, 1),
  (31, 1),
  (32, 2),
  (33, 1),
  (34, 1),
  (35, 2),
  (36, 1),
  (37, 1),
  (38, 1),
  (39, 2),
  (40, 1),
  (41, 1),
  (42, 1),
  (43, 2),
  (44, 1),
  (45, 1),
  (46, 3),
  (47, 1),
  (48, 1),
  (49, 1),
  (50, 1),
  (51, 1),
  (52, 1),
  (53, 3),
  (54, 1),
  (55, 1),
  (56, 1),
  (57, 1),
  (58, 1),
  (59, 1),
  (60, 1),
  (61, 2),
  (62, 1),
  (63, 1),
  (64, 1),
  (65, 1),
  (66, 1),
  (67, 3),
  (68, 1),
  (69, 1),
  (70, 1),
  (71, 1),
  (72, 1),
  (73, 1),
  (74, 3),
  (75, 1),
  (76, 1),
  (77, 2),
  (78, 1),
  (79, 1),
  (80, 1),
  (81, 1),
  (82, 1),
  (83, 1),
  (84, 1),
  (85, 1),
  (86, 1),
  (87, 1),
  (88, 1),
  (89, 1),
  (90, 1),
  (9

## Utilizing TF-IDF

In [55]:
from gensim import models
tfidf = models.TfidfModel(corpus)

for document in tfidf[corpus]:
       print(document)

[(0, 0.7085580347880611), (1, 0.44871152876861703), (2, 0.544613142778809)]
[(3, 1.0)]
[(4, 0.25854569685973533), (5, 0.6142610908349744), (6, 0.7455450589478421)]
[(4, 0.12370069092907361), (7, 0.06135930493983475), (8, 0.19889287792667457), (9, 0.09203895740975213), (10, 0.06629762597555819), (11, 0.050957799740599496), (12, 0.19889287792667457), (13, 0.050957799740599496), (14, 0.06629762597555819), (15, 0.06629762597555819), (16, 0.06629762597555819), (17, 0.10685392051692241), (18, 0.050957799740599496), (19, 0.2547889987029975), (20, 0.14247189402256322), (21, 0.050957799740599496), (22, 0.08396915325114013), (23, 0.050957799740599496), (24, 0.06629762597555819), (25, 0.06629762597555819), (26, 0.06629762597555819), (27, 0.06629762597555819), (28, 0.050957799740599496), (29, 0.06629762597555819), (30, 0.06629762597555819), (31, 0.050957799740599496), (32, 0.13259525195111638), (33, 0.06629762597555819), (34, 0.050957799740599496), (35, 0.13259525195111638), (36, 0.066297625975558

## Utilizing Bi-gramming

In [56]:
import gensim
bigram = gensim.models.Phrases(texts)
texts = [bigram[line] for line in texts]
texts

[['dnd', 'people', 'friend'],
 ['Karlach'],
 ['game', 'ruin', 'life'],
 ['m',
  'fancy',
  'reviewer',
  'gamer',
  'consumer',
  'like',
  'rest',
  'kindly',
  'request',
  'offer',
  'minute',
  'hear',
  'share',
  'let',
  'start',
  'brief',
  'Larian',
  'Studios',
  'blow',
  'away',
  'context',
  'come',
  'love',
  'divinity',
  'game',
  'year',
  'hype',
  'announce',
  'successor',
  'Balldurs',
  'Gate',
  'II',
  'Gamemaster',
  'dungeon',
  'dragon',
  'nearly',
  'year',
  've',
  'DnD',
  'sphere',
  'game',
  'rpg',
  'beginning',
  'Edition',
  'young',
  'experience',
  'system',
  'board',
  'numerous',
  'title',
  'accompany',
  'array',
  'edition',
  'publish',
  'speak',
  'BG',
  'III',
  'Larian',
  'blow',
  'away',
  'expectation',
  'dd',
  'experience',
  'honor',
  'great',
  'marker',
  'DD',
  'RPG',
  'CRPG',
  'history',
  'find',
  'dd',
  'evolve',
  'year',
  'Larian',
  'create',
  'perfect',
  'gateway',
  'Baldurs',
  'gate',
  'old',
  'cla

In [57]:
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

## Save generated corpus

In [58]:
corpora.MmCorpus.serialize('data/steamreview_corpus.mm', corpus)