In [2]:
import requests
from lxml import etree
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import time
import sqlite3
import seaborn as sns
import matplotlib.pyplot as plt
import re
import unicodedata
import warnings
import copy
import statsmodels.formula.api as smf

In [3]:
sns.set(rc={'figure.figsize':(9,7)})

In [4]:
import nltk
nltk.download(["names", "stopwords","averaged_perceptron_tagger",
               "punkt", "vader_lexicon", "wordnet"], quiet=True)

True

In [5]:
from pandas.core.common import SettingWithCopyWarning
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

In [11]:
df = pd.read_csv('gizmos_reviews.csv')

In [37]:
df = df[~pd.isna(df['description'])]
df['description'] = df['description'].apply(lambda x: x.lower())

In [12]:
def make_lemma(text):
  """
  Function to lemmatize and clean text
  
  From: https://towardsdatascience.com/from-dataframe-to-n-grams-e34e29df3460
  """
  word_lemma = nltk.stem.WordNetLemmatizer()
  stopwords_ = nltk.corpus.stopwords.words('english')
  stopwords_.extend(['gizmos', 'feel', 'like', 'game', 'board', 'dont',
                     'bit', 'get', 'really', 'much', 'also', 'well', 'lot'])
  text = (unicodedata.normalize('NFKD', text)
    .encode('ascii', 'ignore')
    .decode('utf-8', 'ignore')
    .lower())
  words = re.sub(r'[^\w\s]', '', text).split() 
  word_li = [word_lemma.lemmatize(word) for word in words if word not in stopwords_]

  # One last move: will replace building with builder, since there's no distinction in this case
  # This is also a very commong word in the corpus
  word_li = ['builder' if word == 'building' else word for word in word_li] 

  # For some reason, stopwords isn't removing this one. Doing so manually
  if 'game' in word_li:
    word_li.remove('game')

  return word_li

In [53]:
def dataframe_to_wordlist(df, word_col):
    all_text = ''

    for i in range(len(df)):
        new_text = df.iloc[i, word_col]
        #print(new_text)
        all_text += ' ' + new_text

    all_text = all_text.encode("ascii", errors="ignore").decode()
    for symbol in ['!', ',','.','_', '(', ')']:
        all_text = all_text.replace(symbol, ' ')
    
    all_text = all_text.lower()
    return all_text.split(' ')

In [28]:
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')

In [41]:
df['word_col'] = df['description'].apply(lambda x: make_lemma(x))

In [43]:
df.head(10)

Unnamed: 0,game_id,rating,user,description,word_col
0,246192,6.0,3davoli,for sale,[sale]
1,246192,8.0,4ndrewcol3,my favorite shorter game,"[favorite, shorter]"
2,246192,6.0,Abdul,decent but forgettable. i would have expected ...,"[decent, forgettable, would, expected, least, ..."
3,246192,10.0,AbleCompany,my pick for an engine building game with just ...,"[pick, engine, builder, right, kind, theming, ..."
4,246192,7.0,Abruptdolphin,slimbo,[slimbo]
5,246192,7.0,Achire,light game. pretty fun with all the card combo...,"[light, pretty, fun, card, combo, mechanic, ma..."
6,246192,8.0,Acoidan85,grintxitos,[grintxitos]
7,246192,7.5,adamdynris,boardgamebliss dec. 2018,"[boardgamebliss, dec, 2018]"
8,246192,8.0,adamgospod,w koszulkach,"[w, koszulkach]"
9,246192,6.0,adamredwoods,1 play / 3 players (1 child) like: engine bui...,"[1, play, 3, player, 1, child, engine, builder..."


In [49]:
backup_df = copy.deepcopy(df)

In [56]:
full_word_str = []
for x in range(len(df)):
    for word in df.iloc[x,4]:
        full_word_str.append(word)

In [60]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(min_df=2,max_features=200)
vectorizer.fit(full_word_str)
feature_li = vectorizer.get_feature_names()
len(feature_li)

200

In [61]:
bow_df = copy.deepcopy(df)

In [64]:
bow_df['bow_col'] = df['word_col'].apply(lambda x: ' '.join(x))

In [90]:
item_li = [''] * 200
for value, idx in vectorizer.vocabulary_.items():
    item_li[idx] = value

In [92]:
bow_df = pd.DataFrame(X.toarray())
bow_df.columns = item_li

In [111]:
bow_df[bow_df['ability'] > 0].head()

Unnamed: 0,2018,2019,ability,action,actually,add,almost,although,always,another,...,understand,use,want,way,weight,win,wish,work,would,yet
14,0,0,1,0,0,3,0,0,0,1,...,0,0,1,3,0,0,2,2,3,0
21,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
132,0,0,4,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
183,0,0,1,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
295,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [118]:
df.iloc[21,3]

"gismos feels like a lighter version of terraforming mars in terms of an engine builder. a card-and-marble engine building game where you can chain/domino abilities. the marbles might initially think of a kid's game, but there is a good bit of strategy to make an efficient engine work towards earning you points and not just spinning its wheels.  terraforming mars feels like it is a more mature game with more depth, but gizmos is easier to get to the table for a family. if you enjoy engine builders, gizmos is a great addition as a lightweight introductory game that will still be fun after the initial play."

In [137]:
X_train, y_train = bow_df.iloc[:700, :], df.loc[:700-1,'rating']
X_test, y_test = bow_df.iloc[700:, :], df.loc[700:,'rating']

In [138]:
len(X_test), len(y_test)

(323, 323)

In [132]:
# Random Forsts are notoriously bad with sparse data, but let's start easy
from sklearn.ensemble import RandomForestRegressor

In [156]:
rf = RandomForestRegressor(max_depth=10)
rf.fit(X_train, y_train)
preds = rf.predict(X_test)

In [157]:
pred_df = pd.DataFrame(data={'actual':y_test, 'preds':preds, 'naive': y_train.mean()})

In [159]:
from sklearn.metrics import mean_squared_error
print('model error:', mean_squared_error(pred_df['actual'], pred_df['preds']))
print('naive error:', mean_squared_error(pred_df['actual'], pred_df['naive']))

model error: 1.6413479825727044
naive error: 1.7024290100104065


Not a very large gain

In [164]:
# Random Forsts are notoriously bad with sparse data, but let's start easy
from sklearn.ensemble import GradientBoostingRegressor

In [165]:
gb = GradientBoostingRegressor()

In [166]:
gb = GradientBoostingRegressor(max_depth=10)
gb.fit(X_train, y_train)
preds = gb.predict(X_test)

In [167]:
pred_df = pd.DataFrame(data={'actual':y_test, 'preds':preds, 'naive': y_train.mean()})

In [168]:
from sklearn.metrics import mean_squared_error
print('model error:', mean_squared_error(pred_df['actual'], pred_df['preds']))
print('naive error:', mean_squared_error(pred_df['actual'], pred_df['naive']))

model error: 1.9358393795524733
naive error: 1.7024290100104065


In [170]:
from sklearn.linear_model import Lasso

In [178]:
lasso = Lasso(alpha=1)
lasso.fit(X_train, y_train)
preds = lasso.predict(X_test)

In [177]:
pred_df = pd.DataFrame(data={'actual':y_test, 'preds':preds, 'naive': y_train.mean()})
print('model error:', mean_squared_error(pred_df['actual'], pred_df['preds']))
print('naive error:', mean_squared_error(pred_df['actual'], pred_df['naive']))

model error: 1.7024290100104065
naive error: 1.7024290100104065


In [179]:
# Needs work! Probably too many useless features