In [None]:
from requests import Request, Session
from requests.exceptions import ConnectionError, Timeout, TooManyRedirects
import json
import pprint 
import os

from alive_progress import alive_bar
import time
import sys

from boardgamegeek import BGGClient

In [None]:
bgg = BGGClient()

# DATA EXTRACTION

In [None]:
hot_items = bgg.hot_items('boardgame')
miss = {}
data = []
for item in hot_items:
    try:
        game = bgg.game(game_id=item.id, comments=True)
        with alive_bar(len(game.comments), force_tty=True) as bar:
            for comment in game.comments:
                com_data = {}
                com_data["id"] = item.id
                com_data["title"] = item.name
                com_data["user"] = comment.commenter
                com_data["comment"] = comment.comment
                com_data["rating"] = comment.rating
                data.append(com_data)
                time.sleep(.01)
                bar()
    except:
        print('error')
        miss[item.id] = item.name

In [None]:
with open("comment_data_demo.json", 'w') as f:
    json.dump(data, f, indent=2) # indent=2 is not needed but makes the file human-readable if the data is nested

# DATA WRANGLING

In [None]:
import pandas as pd
pd.set_option('display.max_columns', None)
import json
import os
import numpy as np

with open("comment_data_demo.json", 'r') as f:
    post_list = json.load(f)
    
print(f'Amount of comments before any formatting: {len(post_list)}')
    
df = pd.json_normalize(post_list)    

In [None]:
df.head(1)

## Reading raw data
#### File formating
Not all fields are readable and need to be modified.

In [None]:
#There are no unredable data
path_original_data=r'C:\Users\Usuario\Documents\JupyterFolder\unimi_files\IR'
df.to_csv(os.path.join(path_original_data,'comment_data_demo.csv'), index=False)

## Data cleaning and restructuring procedures

In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
import json
import os
import numpy as np
path_original_data=r'C:\Users\Usuario\Documents\JupyterFolder\unimi_files\IR'
df=pd.read_csv(os.path.join(path_original_data,'comment_data_demo.csv'), low_memory=False)

In [2]:
df.head(1)

Unnamed: 0,id,title,user,comment,rating
0,387866,Star Wars: Unlimited,8janek8,pl,


Perform a quick superficial analysis of the data.

In [3]:
# Print some statitiscts of the 'comment' field

print(str(round(len(df[df.comment.notnull()])/len(df)*100,2))+'%')
print(df.comment.str.len().max())
print(df.comment.str.len().min())
print(df.comment.str.len().mean())

#Search the number of comments containing searched words within the text of the message
print()

pattern = "random"
print(len(df[df.comment.str.contains(pattern) & df.comment.notnull()]))
print(len(df[df.comment.str.match(pattern) & df.comment.notnull()]))
print(len(df[df.comment.str.fullmatch(pattern) & df.comment.notnull()]))

99.99%
18588.0
1.0
202.89663664400902

2806
1
0


In [4]:
df = df[df.comment.notnull()]
df = df.reset_index(drop=True)

In [5]:
# Print the first messages that constains the pattern

print(df.loc[df.comment.str.contains(pattern) & df.comment.notnull(), 'comment'])

87        After just one play I get the feeling I have s...
129       I really don’t get the hype around Heat, but t...
165       An okay "deck management” game with a relatabl...
295       Update: After some more plays I feel that some...
394       [imageID=6940449small inline] This really feel...
                                ...                        
129937    More plays with a different game group have si...
129963    Interesting legacy role play co-op board game....
130050    Played solo. The different scenarios keeps the...
130201    Digital version available (PC) Playing solo be...
130344    Too hard under the rules in the game and a bit...
Name: comment, Length: 2806, dtype: object


#### Transformation
Modify values to be easier to read.

In [6]:
# Nothing to transform

#### Filtering the data
Limit the results to one country or language.

In [7]:
from guess_language import guess_language
import enchant
import string
import re

# Function to check if a comment is in English
def is_english_batch(batch):
    # Create a batch of processed texts
    processed_texts = batch['comment'].str.lower().apply(lambda text: re.findall(r"[a-zA-Z0-9']+", text))

    # Create an English dictionary
    english_dictionary = enchant.Dict("en_US")

    # Check if any comment in the batch is in English
    is_english = processed_texts.apply(lambda text: sum(english_dictionary.check(word) for word in text) >= len(text) / 2)

    # Return a boolean Series indicating if each comment is in English
    return is_english

In [15]:
from tqdm import tqdm
import time
import sys

from alive_progress import alive_bar
from IPython.display import display, HTML

import pandas as pd

# Display the CSS styling
# display(HTML(bar_styling))

# Batch processing
batch_size = 1000  # Number of rows to process in each batch
num_rows = len(df)
result = pd.Series([], dtype='float64')  # Store the results

bar_stat = round(num_rows/batch_size)+1

with tqdm(total=bar_stat, ncols=bar_stat) as pbar:
    for i in range(0, num_rows, batch_size):
        batch = df.iloc[i:i+batch_size]
        batch_english = batch.loc[is_english_batch(batch)]
        result = pd.concat([result, batch_english])
        pbar.update(1)

# Reset the index of the DataFrame
result.reset_index(drop=True, inplace=True)

100%|████████████████████████████████████████████████████████████████████████████████████████████| 131/131 [06:31<00:00,  2.99s/it]


In [16]:
# Print the updated DataFrame
result.head(10)

Unnamed: 0,0,id,title,user,comment,rating
0,,387866.0,Star Wars: Unlimited,8janek8,pl,
1,,387866.0,Star Wars: Unlimited,Irsaan,"Should have been an LCG, not a TCG.",1.0
2,,387866.0,Star Wars: Unlimited,ObiKKa,Big surprise news in 2023 of this 2024 Star Wa...,
3,,387866.0,Star Wars: Unlimited,Stephen Glenn,Collecting and trading is fun!,
4,,366013.0,Heat: Pedal to the Metal,100pcBlade,I was a little disappointed with Heat after my...,7.5
5,,366013.0,Heat: Pedal to the Metal,360_Piranha,"Just, wow.",10.0
6,,366013.0,Heat: Pedal to the Metal,Aardvarkius,Shrinkwrap BIN: Not for sale/trade at this time,
7,,366013.0,Heat: Pedal to the Metal,abernath,It's fine. I would rather play Automobiles or ...,6.0
8,,366013.0,Heat: Pedal to the Metal,abjohnuf,Boarding School Games back order,
9,,366013.0,Heat: Pedal to the Metal,aby407,23.08,


In [17]:
print(len(result))
print(len(df))
print(len(df)-len(result))

df1 = df
df2 = result

import pandas as pd

# Assuming you have two DataFrames: df1 and df2 representing the two databases

# Find rows with differing 'comment' in df1 compared to df2
diff_df1 = df1[~df1['comment'].isin(df2['comment'])]

# Find rows with differing 'comment' in df2 compared to df1
diff_df2 = df2[~df2['comment'].isin(df1['comment'])]

# Concatenate the differing rows into a single DataFrame
diff_combined = pd.concat([diff_df1, diff_df2])

# Reset the index of the resulting DataFrame
diff_combined.reset_index(drop=True, inplace=True)

# Print the differences
diff_combined.head(10)

119472
130346
10874


Unnamed: 0,id,title,user,comment,rating,0
0,366013.0,Heat: Pedal to the Metal,a2greg,nyp,,
1,366013.0,Heat: Pedal to the Metal,alexbatbee,zatu,,
2,366013.0,Heat: Pedal to the Metal,ANDREWSOFT,Jugadas varias partidas en solitario con el mó...,8.2,
3,366013.0,Heat: Pedal to the Metal,angeltg,Partidas rápidas y sencillas con toda la emoci...,10.0,
4,366013.0,Heat: Pedal to the Metal,ankara,Juegaken,10.0,
5,366013.0,Heat: Pedal to the Metal,Anxelox,Por xogar,,
6,366013.0,Heat: Pedal to the Metal,Argantonio,Muy buen juego de carreras que exige un par de...,8.0,
7,366013.0,Heat: Pedal to the Metal,Arnar146,Gulli,8.0,
8,366013.0,Heat: Pedal to the Metal,Avantyr,PnP,8.5,
9,366013.0,Heat: Pedal to the Metal,badzacz,H20,,


In [33]:
result['text_length'] = result['comment'].apply(lambda x: len(x))
result['word_count'] = result['comment'].apply(lambda x: len(x.split()))
result = result[result['word_count'] > 5]
result = result.drop(result.columns[0], axis=1)
result.head(10)

Unnamed: 0,id,title,user,comment,rating,text_length,word_count
1,387866.0,Star Wars: Unlimited,Irsaan,"Should have been an LCG, not a TCG.",1.0,35,8
2,387866.0,Star Wars: Unlimited,ObiKKa,Big surprise news in 2023 of this 2024 Star Wa...,,467,79
4,366013.0,Heat: Pedal to the Metal,100pcBlade,I was a little disappointed with Heat after my...,7.5,629,121
6,366013.0,Heat: Pedal to the Metal,Aardvarkius,Shrinkwrap BIN: Not for sale/trade at this time,,48,8
7,366013.0,Heat: Pedal to the Metal,abernath,It's fine. I would rather play Automobiles or ...,6.0,61,10
10,366013.0,Heat: Pedal to the Metal,adamgrey,Light deck building racing game. Basic game is...,8.0,152,26
11,366013.0,Heat: Pedal to the Metal,adamredwoods,"2 plays / 2, 3 players LIKE: Fast to play, si...",7.0,768,139
13,366013.0,Heat: Pedal to the Metal,Addiction2k,"This is just a fantastic game, and I need a fe...",9.5,153,32
16,366013.0,Heat: Pedal to the Metal,agilmor,"I played with Formula D several times, but nev...",8.0,282,52
17,366013.0,Heat: Pedal to the Metal,Ahmadneama,Fun racing game that doesn't punish players fo...,8.5,179,32


In [34]:
result.to_csv('pre_processed_comment_data_demo.csv', index=False)

Try to find words unrelated that may be blacklisted from the dataset.

### Steps to Clean the Data
#### Punctuation Removal
Then try to find relevant key words.

In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
import json
import os
import numpy as np
path_original_data=r'C:\Users\Usuario\Documents\JupyterFolder\unimi_files\IR'
df=pd.read_csv(os.path.join(path_original_data,'pre_processed_comment_data_demo.csv'), low_memory=False)

In [2]:
import string

def remove_punctuation(text):
    # Create a set of allowed characters (letters and numbers)
    allowed_chars = set(string.ascii_letters + string.digits + ' ')
    
    # Remove punctuation characters not in the allowed set
    processed_text = ''.join(char for char in text if char in allowed_chars)
    
    return processed_text

# Apply the remove_punctuation() function to the 'text' column
df['processed_comment'] = df['comment'].apply(remove_punctuation)

#### Lowering the Text

In [3]:
# Lower case all the messages
df['processed_comment'] = df['processed_comment'].str.lower()

#### Tokenization

In [4]:
# Defining function for tokenization
import re

def tokenization(text):
    tokens = re.split(' ',text)
    
    return tokens

# Applying function to the column
df['comment_tokenied']= df['processed_comment'].apply(lambda x: tokenization(x))

#### Stop Word Removal

In [5]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(400)
import nltk
#nltk.download('wordnet')

# Stop words present in the library
stopwords = nltk.corpus.stopwords.words('english')
print(stopwords[0:10])

# Defining the function to remove stopwords from tokenized text
def remove_stopwords(text):
    output= [i for i in text if i not in stopwords]
    
    return output

def remove_short_words(text):
    output= [i for i in text if len(i) > 2]
    
    return output

# Applying the function
df['comment_key_words']= df['comment_tokenied'].apply(lambda x:remove_stopwords(x))
df['comment_key_words']= df['comment_key_words'].apply(lambda x:remove_short_words(x))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]


#### Stemming

In [6]:
from nltk.stem.porter import PorterStemmer

# Defining the object for stemming
porter_stemmer = PorterStemmer()

# Defining a function for stemming
def stemming(text):
    stem_text = [porter_stemmer.stem(word) for word in text]

    return stem_text

df['comment_stemmed']= df['comment_key_words'].apply(lambda x: stemming(x))

#### Lemmatization

In [7]:
import nltk
# nltk.download('wordnet')
# nltk.download('omw-1.4')

from nltk.stem import WordNetLemmatizer

# Defining the object for Lemmatization
wordnet_lemmatizer = WordNetLemmatizer()

# Defining the function for lemmatization
def lemmatizer(text):
    lemm_text = [wordnet_lemmatizer.lemmatize(word) for word in text]
    
    return lemm_text

df['comment_lemmatized']= df['comment_key_words'].apply(lambda x:lemmatizer(x))

### Gensim tutorial

In [8]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(400)
import nltk
#nltk.download('wordnet')

stemmer = SnowballStemmer("english")

def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

# Tokenize and lemmatize
def preprocess(text):
    result=[]
    for token in gensim.utils.simple_preprocess(text) :
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 2:
            result.append(lemmatize_stemming(token))
    return result

def preprocess_verbs(text):
    text = gensim.utils.simple_preprocess(text)
    tagged_tokens = nltk.pos_tag(text)
    filtered_tokens = [token for token, pos_tag in tagged_tokens if pos_tag not in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']]
    
    result = []
    for token in filtered_tokens:
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 2:
            result.append(lemmatize_stemming(token))
    return result

df['gensim_comment']= df['comment'].apply(preprocess)
df['gensim_comment_verbs']= df['comment'].apply(preprocess_verbs)

#### Restructring the dataset
Rearrage the columns and drop the ones you find not useful.

In [9]:
# Sort columns
list(df.columns)

['id',
 'title',
 'user',
 'comment',
 'rating',
 'text_length',
 'word_count',
 'processed_comment',
 'comment_tokenied',
 'comment_key_words',
 'comment_stemmed',
 'comment_lemmatized',
 'gensim_comment',
 'gensim_comment_verbs']

In [10]:
df.sample(5)

Unnamed: 0,id,title,user,comment,rating,text_length,word_count,processed_comment,comment_tokenied,comment_key_words,comment_stemmed,comment_lemmatized,gensim_comment,gensim_comment_verbs
46248,237182.0,Root,bucklen_uk,I can see why many people love this. I don’t. ...,3.0,213,38,i can see why many people love this i dont t...,"[i, can, see, why, many, people, love, this, i...","[see, many, people, love, dont, asymmetric, me...","[see, mani, peopl, love, dont, asymmetr, mean,...","[see, many, people, love, dont, asymmetric, me...","[peopl, love, asymmetr, mean, newbi, crush, ex...","[peopl, asymmetr, mean, newbi, expert, equal, ..."
72097,230802.0,Azul,Tariff,Lovely production and fun little abstract,7.0,41,6,lovely production and fun little abstract,"[lovely, production, and, fun, little, abstract]","[lovely, production, fun, little, abstract]","[love, product, fun, littl, abstract]","[lovely, production, fun, little, abstract]","[love, product, fun, littl, abstract]","[love, product, fun, littl, abstract]"
56329,169786.0,Scythe,athiel,Moves along. Many rules are a bit tricky. Bu...,8.8,185,34,moves along many rules are a bit tricky but ...,"[moves, along, , many, rules, are, a, bit, tri...","[moves, along, many, rules, bit, tricky, say, ...","[move, along, mani, rule, bit, tricki, say, do...","[move, along, many, rule, bit, tricky, say, do...","[move, rule, bite, tricki, bother, littl, time...","[move, rule, bite, tricki, bother, littl, time..."
24783,162886.0,Spirit Island,Wildman0326,The second of three games I will play every ti...,9.9,170,32,the second of three games i will play every ti...,"[the, second, of, three, games, i, will, play,...","[second, three, games, play, every, time, aske...","[second, three, game, play, everi, time, ask, ...","[second, three, game, play, every, time, asked...","[second, game, play, time, ask, prefer, play, ...","[second, game, time, prefer, scenario, adversa..."
38790,174430.0,Gloomhaven,71gamer,"Trade local only, this isn't shippable. Also, ...",,157,27,trade local only this isnt shippable also plea...,"[trade, local, only, this, isnt, shippable, al...","[trade, local, isnt, shippable, also, please, ...","[trade, local, isnt, shippabl, also, pleas, ma...","[trade, local, isnt, shippable, also, please, ...","[trade, local, isn, shippabl, sure, trade, req...","[trade, local, isn, shippabl, sure, trade, req..."


In [11]:
df = df[['gensim_comment', 'gensim_comment_verbs', 'id', 'title', 'user', 'rating', 'comment', 'text_length', 'word_count', 
       'processed_comment', 'comment_key_words', 'comment_tokenied', 'comment_stemmed', 'comment_lemmatized']]

In [12]:
df = df[df['gensim_comment'].map(lambda d: len(d)) >= 5]
df = df.reset_index(drop=True)

df.sample(5)

Unnamed: 0,gensim_comment,gensim_comment_verbs,id,title,user,rating,comment,text_length,word_count,processed_comment,comment_key_words,comment_tokenied,comment_stemmed,comment_lemmatized
70198,"[play, gencon, home, player, obvious, great, t...","[play, gencon, home, player, obvious, great, t...",345972.0,Cat in the Box: Deluxe Edition,Stealthmutant,7.5,"After 5 plays at Gencon and home, 2 and 4 play...",226,42,after 5 plays at gencon and home 2 and 4 playe...,"[plays, gencon, home, player, obviously, great...","[after, 5, plays, at, gencon, and, home, 2, an...","[play, gencon, home, player, obvious, great, t...","[play, gencon, home, player, obviously, great,..."
66741,"[premium, sleev, raptor, print, insert, paint,...","[premium, raptor, print, insert, miniatur, int...",167355.0,Nemesis,Wardoxx,10.0,Premium Sleeved E-raptor UV Print Insert Paint...,559,110,premium sleeved eraptor uv print insert painte...,"[premium, sleeved, eraptor, print, insert, pai...","[premium, sleeved, eraptor, uv, print, insert,...","[premium, sleev, eraptor, print, insert, paint...","[premium, sleeved, eraptor, print, insert, pai..."
32518,"[love, stori, build, theme, deck, build, game,...","[stori, build, theme, deck, build, game, artwo...",205637.0,Arkham Horror: The Card Game,kydaria,9.0,"Love the story building, theme and deck buildi...",432,78,love the story building theme and deck buildin...,"[love, story, building, theme, deck, building,...","[love, the, story, building, theme, and, deck,...","[love, stori, build, theme, deck, build, game,...","[love, story, building, theme, deck, building,..."
58746,"[impress, extrem, product, qualiti, artwork, p...","[impress, extrem, product, qualiti, artwork, p...",336986.0,Flamecraft,DopeforHope,8.5,First impression - seems to be extremely well ...,153,26,first impression seems to be extremely well d...,"[first, impression, seems, extremely, well, do...","[first, impression, , seems, to, be, extremely...","[first, impress, seem, extrem, well, done, pro...","[first, impression, seems, extremely, well, do..."
25181,"[obvious, game, pleasur, look, play, card, gam...","[obvious, game, pleasur, card, game, piec, stu...",266192.0,Wingspan,BlueMissed,9.0,Obviously this game is a pleasure to look at a...,292,58,obviously this game is a pleasure to look at a...,"[obviously, game, pleasure, look, well, play, ...","[obviously, this, game, is, a, pleasure, to, l...","[obvious, game, pleasur, look, well, play, car...","[obviously, game, pleasure, look, well, play, ..."


In [13]:
average_length = df['gensim_comment'].apply(lambda x: len(x)).mean()
print(average_length)

26.55023453692593


In [14]:
df.to_csv('post_processed_comment_data_demo.csv', index=False)

In [15]:
import pandas as pd
pd.set_option('display.max_columns', None)
import json
import os
import numpy as np
path_original_data=r'C:\Users\Usuario\Documents\JupyterFolder\unimi_files\IR'
df=pd.read_csv(os.path.join(path_original_data,'post_processed_comment_data_demo.csv'), low_memory=False)

In [16]:
df.sample(10)

Unnamed: 0,gensim_comment,gensim_comment_verbs,id,title,user,rating,comment,text_length,word_count,processed_comment,comment_key_words,comment_tokenied,comment_stemmed,comment_lemmatized
20721,"['play', 'like', 'despit', 'difficulti', 'rate...","['despit', 'difficulti', 'rate', 'higher', 'pl...",162886.0,Spirit Island,phildelvec,8.0,Played this once and really liked it despite i...,103,17,played this once and really liked it despite i...,"['played', 'really', 'liked', 'despite', 'diff...","['played', 'this', 'once', 'and', 'really', 'l...","['play', 'realli', 'like', 'despit', 'difficul...","['played', 'really', 'liked', 'despite', 'diff..."
6373,"['design', 'gameplay', 'balanc', 'variant', 'g...","['gameplay', 'variant', 'gameplay', 'deckbuild...",316554.0,Dune: Imperium,Omnidude,9.3,It's a really well Designed Gameplay and well-...,581,99,its a really well designed gameplay and wellba...,"['really', 'well', 'designed', 'gameplay', 'we...","['its', 'a', 'really', 'well', 'designed', 'ga...","['realli', 'well', 'design', 'gameplay', 'well...","['really', 'well', 'designed', 'gameplay', 'we..."
12491,"['play', 'time', 'love', 'game', 'theme', 'abs...","['time', 'game', 'theme', 'absolut', 'idea', '...",167791.0,Terraforming Mars,elTigreCHino,10.0,I don't own it. I have played it three times. ...,354,69,i dont own it i have played it three times i l...,"['dont', 'played', 'three', 'times', 'love', '...","['i', 'dont', 'own', 'it', 'i', 'have', 'playe...","['dont', 'play', 'three', 'time', 'love', 'gam...","['dont', 'played', 'three', 'time', 'love', 'g..."
67920,"['enjoy', 'variabl', 'game', 'constant', 'accu...","['variabl', 'game', 'constant', 'chip', 'upgra...",227935.0,Wonderland's War,Solofunk,8.0,Really enjoyed the variability in this game. Y...,719,126,really enjoyed the variability in this game yo...,"['really', 'enjoyed', 'variability', 'game', '...","['really', 'enjoyed', 'the', 'variability', 'i...","['realli', 'enjoy', 'variabl', 'game', 'consta...","['really', 'enjoyed', 'variability', 'game', '..."
69403,"['giant', 'game', 'consid', 'giant', 'board', ...","['giant', 'game', 'giant', 'board', 'long', 't...",115746.0,War of the Ring: Second Edition,Raeez,9.2,"Giant game, considering a giant board, long se...",215,37,giant game considering a giant board long set ...,"['giant', 'game', 'considering', 'giant', 'boa...","['giant', 'game', 'considering', 'a', 'giant',...","['giant', 'game', 'consid', 'giant', 'board', ...","['giant', 'game', 'considering', 'giant', 'boa..."
49656,"['play', 'game', 'unnecessarili', 'complex', '...","['play', 'game', 'unnecessarili', 'complex', '...",169786.0,Scythe,Albarc,7.0,"After one play, I found the game unnecessarily...",123,22,after one play i found the game unnecessarily ...,"['one', 'play', 'found', 'game', 'unnecessaril...","['after', 'one', 'play', 'i', 'found', 'the', ...","['one', 'play', 'found', 'game', 'unnecessaril...","['one', 'play', 'found', 'game', 'unnecessaril..."
705,"['eta', 'june', 'dux', 'delay', 'https', 'www'...","['eta', 'june', 'dux', 'https', 'www', 'kickst...",331106.0,The Witcher: Old World,VaeVictis1918,,Eta June 2022. Dux. Delayed. https://www.kicks...,87,6,eta june 2022 dux delayed httpswwwkickstarterc...,"['eta', 'june', '2022', 'dux', 'delayed', 'htt...","['eta', 'june', '2022', 'dux', 'delayed', 'htt...","['eta', 'june', '2022', 'dux', 'delay', 'https...","['eta', 'june', '2022', 'dux', 'delayed', 'htt..."
42672,"['game', 'look', 'board', 'game', 'expans', 'a...","['game', 'look', 'board', 'game', 'expans', 'l...",237182.0,Root,nicku,9.0,This game is almost everything I look for in a...,101,21,this game is almost everything i look for in a...,"['game', 'almost', 'everything', 'look', 'boar...","['this', 'game', 'is', 'almost', 'everything',...","['game', 'almost', 'everyth', 'look', 'board',...","['game', 'almost', 'everything', 'look', 'boar..."
26976,"['pretti', 'artwork', 'interest', 'engin', 'bu...","['pretti', 'artwork', 'interest', 'engin', 'bu...",266192.0,Wingspan,Illusia,8.0,Pretty artwork. Interesting engine building ga...,222,42,pretty artwork interesting engine building gam...,"['pretty', 'artwork', 'interesting', 'engine',...","['pretty', 'artwork', 'interesting', 'engine',...","['pretti', 'artwork', 'interest', 'engin', 'bu...","['pretty', 'artwork', 'interesting', 'engine',..."
13398,"['yes', 'long', 'lot', 'downtim', 'fun', 'abl'...","['yes', 'long', 'lot', 'downtim', 'fun', 'abl'...",167791.0,Terraforming Mars,Ibaimendi,9.0,"Yes, it's long and has a lot of downtime, but ...",419,79,yes its long and has a lot of downtime but its...,"['yes', 'long', 'lot', 'downtime', 'fun', 'abl...","['yes', 'its', 'long', 'and', 'has', 'a', 'lot...","['ye', 'long', 'lot', 'downtim', 'fun', 'abl',...","['yes', 'long', 'lot', 'downtime', 'fun', 'abl..."


In [17]:
# Let's check the posts that contains found words that could help us filter

print(len(df[df.comment.str.contains('luck')]))
print(len(df[df.comment.str.contains('random')]))
print(len(df[df.comment.str.contains('boring')]))
print(len(df[df.comment.str.contains('complex')]))
print(len(df[df.comment.str.contains('complicated')]))
print(len(df[df.comment.str.contains('bookkeeping')]))

df[df.comment.str.contains('boring')].sample(5)

3097
2755
1185
3057
868
130


Unnamed: 0,gensim_comment,gensim_comment_verbs,id,title,user,rating,comment,text_length,word_count,processed_comment,comment_key_words,comment_tokenied,comment_stemmed,comment_lemmatized
28591,"['nice', 'pleasant', 'game', 'fun', 'manag', '...","['nice', 'pleasant', 'game', 'fun', 'bird', 't...",266192.0,Wingspan,Nyanapulsar,9.0,Very nice and pleasant game. It's very fun to ...,653,118,very nice and pleasant game its very fun to ma...,"['nice', 'pleasant', 'game', 'fun', 'manage', ...","['very', 'nice', 'and', 'pleasant', 'game', 'i...","['nice', 'pleasant', 'game', 'fun', 'manag', '...","['nice', 'pleasant', 'game', 'fun', 'manage', ..."
66141,"['super', 'themat', 'play', 'take', 'care', 'b...","['super', 'themat', 'care', 'bore', 'stuff', '...",167355.0,Nemesis,Peneda1,8.3,Super thematic. I played with someone who took...,506,94,super thematic i played with someone who took ...,"['super', 'thematic', 'played', 'someone', 'to...","['super', 'thematic', 'i', 'played', 'with', '...","['super', 'themat', 'play', 'someon', 'took', ...","['super', 'thematic', 'played', 'someone', 'to..."
30472,"['think', 'player', 'run', 'bite', 'long', 'wa...","['think', 'player', 'bite', 'long', 'player', ...",266192.0,Wingspan,wafflebun,3.0,I think at 5 players it ran a bit long and the...,368,67,i think at 5 players it ran a bit long and the...,"['think', 'players', 'ran', 'bit', 'long', 'wa...","['i', 'think', 'at', '5', 'players', 'it', 'ra...","['think', 'player', 'ran', 'bit', 'long', 'was...","['think', 'player', 'ran', 'bit', 'long', 'was..."
55787,"['decent', 'econom', 'game', 'nice', 'mechan',...","['decent', 'econom', 'game', 'nice', 'mechan',...",169786.0,Scythe,Valcurdra,4.0,Decent economic game with some nice mechanisms...,686,118,decent economic game with some nice mechanisms...,"['decent', 'economic', 'game', 'nice', 'mechan...","['decent', 'economic', 'game', 'with', 'some',...","['decent', 'econom', 'game', 'nice', 'mechan',...","['decent', 'economic', 'game', 'nice', 'mechan..."
54925,"['like', 'row', 'action', 'player', 'board', '...","['like', 'row', 'action', 'player', 'board', '...",169786.0,Scythe,sighlance,1.0,"I don't like that the ""bottom-row actions"" on ...",1746,279,i dont like that the bottomrow actions on the ...,"['dont', 'like', 'bottomrow', 'actions', 'play...","['i', 'dont', 'like', 'that', 'the', 'bottomro...","['dont', 'like', 'bottomrow', 'action', 'playe...","['dont', 'like', 'bottomrow', 'action', 'playe..."


In [18]:
print(len(df[df.comment.str.contains('edition')]))
print(len(df[df.comment.str.contains('version')]))
print(len(df[df.comment.str.contains('expansion')]))

1177
2136
5469
