To do list:

- import the modules required
- import data
- filter out the columns required
- clean the data (check for null values)
- use TDD to process the text
- split data into training and test data

- use TDD to develop models (fit and predict)
- measure accuracy of the models and decide whether to make improvements on the models
- models that will be used: Naive bayes and support vector machines

In [3]:
import pandas as pd
import numpy as np
import unittest
import string
from imblearn.over_sampling import SMOTE
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

### Import the data

In [4]:
df_for_classifer = pd.read_csv('opencritic_data_with_ratings.csv', index_col= 'Unnamed: 0')

df_for_classifer.head()

Unnamed: 0,name,developer,publisher,genre,release_date,description,critics_average_score,critic_review,date_of_review,critics_score,opencritic_rating_critics_score
0,The Outer Worlds,Obsidian Entertainment,Private Division,RPG,2019-10-25,The Outer Worlds is a new single-player first-...,82,There's a category of games I think of as Satu...,2019-10-22 00:00:00,79.0,strong
1,The Outer Worlds,Obsidian Entertainment,Private Division,RPG,2019-10-25,The Outer Worlds is a new single-player first-...,82,"With The Outer Worlds, Obsidian has found its ...",2019-10-22 00:00:00,85.0,strong
2,The Outer Worlds,Obsidian Entertainment,Private Division,RPG,2019-10-25,The Outer Worlds is a new single-player first-...,82,"A conventional, easygoing scifi RPG with sligh...",2019-10-22 00:00:00,,weak
3,The Outer Worlds,Obsidian Entertainment,Private Division,RPG,2019-10-25,The Outer Worlds is a new single-player first-...,82,"A deep, funny, and intricately designed RPG re...",2019-10-29 00:00:00,90.0,mighty
4,The Outer Worlds,Obsidian Entertainment,Private Division,RPG,2019-10-25,The Outer Worlds is a new single-player first-...,82,The Outer Worlds marks Obsidian operating at t...,2019-10-22 00:00:00,90.0,mighty


Columns required for training and predicting the models:

- critic_review
- critics_score

In [5]:
df_c = df_for_classifer.loc[:,['critic_review','opencritic_rating_critics_score']]
df_c.dropna(inplace=True)
df_c

Unnamed: 0,critic_review,opencritic_rating_critics_score
0,There's a category of games I think of as Satu...,strong
1,"With The Outer Worlds, Obsidian has found its ...",strong
2,"A conventional, easygoing scifi RPG with sligh...",weak
3,"A deep, funny, and intricately designed RPG re...",mighty
4,The Outer Worlds marks Obsidian operating at t...,mighty
...,...,...
15339,For Honor developed from a promising concept t...,strong
15340,For Honor is an incredibly competitive multipl...,strong
15341,As an arena sword fighter For Honor does an ad...,weak
15342,For Honor is an impressive fighting game with ...,mighty


In [6]:
#https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.sample.html
df_c = df_c.sample(frac=0.5, random_state=0)
df_c

Unnamed: 0,critic_review,opencritic_rating_critics_score
1335,"A stellar remake, which is not afraid to chang...",mighty
12217,Kingdom Hearts III is the game that fans have ...,strong
8010,Injustice 2 is a combination of the core gamep...,weak
1272,"Collectively speaking, this game subverts expe...",mighty
549,"""Triumph"" is a word often lazily used by revie...",weak
...,...,...
13339,Fallout 4 is not perfect. It’s plagued with mi...,weak
6760,Prey is a divisive game that many will adore w...,mighty
10657,Yakuza 0 is easily one of the highlights of th...,weak
3816,"On its own, the remake of Resident Evil 3 is a...",mighty


In [7]:
#how many rating scores for each category?
ratings_grouped = df_c.groupby('opencritic_rating_critics_score').critic_review.count()
ratings_grouped

#small number of 'fair' reviews to classify. attempt to oversample these using SMOTE
#will train model with the current data and check accuracy score

opencritic_rating_critics_score
fair       167
mighty    2378
strong    3298
weak      1820
Name: critic_review, dtype: int64

Use TDD to process the text, ready for training the model

## Process text code

In [8]:
#remove punctuation
def remove_punctuation(reviews):
    punctuation = string.punctuation
    new_list = []
    for review in reviews:
        new_string = ''
        for letter in review:
            if letter not in punctuation:
                new_string += letter
        new_list.append(new_string)
    
    return new_list #explain the difficulty in refactoring this code due to the number of components

#lemmatize
def lemmatize_review(reviews):
    new_list = []
    lem = WordNetLemmatizer()
    for review in reviews:
        #tokenize review
        tokenised = word_tokenize(review)
        lemmatized_list = []
        for word in tokenised:
            lemmatized_word = lem.lemmatize(word)
            lemmatized_list.append(lemmatized_word)
        new_string = " ".join(lemmatized_list)
        new_list.append(new_string)
    return new_list

#use tfidfVectorizer to convert words to lowercase, removes stopwords and tokenizes the data
def tfidfvectorize_reviews(review):
    stop_words = set(stopwords.words('english'))
    vectorizer = TfidfVectorizer(stop_words=stop_words)
    fit_review = vectorizer.fit_transform(review)
    matrix = fit_review.toarray()
    return matrix
    


## Unit tests

In [9]:
class TestTextFormatting(unittest.TestCase):
    
    def test_remove_punc_exists(self):
        self.assertIsNotNone(remove_punctuation)
    
    def test_punctuation_removed(self):
        res = remove_punctuation(['x'+ string.punctuation, 'y'+ string.punctuation])
        for r in res:
            self.assertNotRegex(r,'[\W]')
        
    def test_lemmatizer_exists(self):
        self.assertIsNotNone(lemmatize_review)
        
    def test_words_lemmatized(self):
        tester = ["corpora rockers", "players syllabi"]
        res = lemmatize_review(tester)
        self.assertEqual(res, ["corpus rocker", "player syllabus"])
    
    def test_vectorizer_exists(self):
        self.assertIsNotNone(tfidfvectorize_reviews)
        
    def test_words_vectorized(self):
        tester = ["This is a review",
          "This is a very good review",
          "This is a very bad review",
          "This is an awful review dont watch",
          "This review is outstanding"
          ]
        res = tfidfvectorize_reviews(tester)
        self.assertEqual(type(res), np.ndarray)
    
unittest.main(argv=['ingored', '-v'], exit=False)

test_lemmatizer_exists (__main__.TestTextFormatting) ... ok
test_punctuation_removed (__main__.TestTextFormatting) ... ok
test_remove_punc_exists (__main__.TestTextFormatting) ... ok
test_vectorizer_exists (__main__.TestTextFormatting) ... ok
test_words_lemmatized (__main__.TestTextFormatting) ... ok
test_words_vectorized (__main__.TestTextFormatting) ... ok

----------------------------------------------------------------------
Ran 6 tests in 1.165s

OK


<unittest.main.TestProgram at 0x219fbd68748>

Now that functions have been tested successfully, implement functions on the reviews...

In [10]:
text_to_process = df_c.loc[:,'critic_review']
y = df_c.loc[:,['opencritic_rating_critics_score']].reset_index(drop=True)

remove_punc = remove_punctuation(text_to_process)

lemmatize_text = lemmatize_review(remove_punc)

tfidf_vectorize_text = tfidfvectorize_reviews(lemmatize_text)

In [27]:
y.head()

Unnamed: 0,opencritic_rating_critics_score
0,mighty
1,strong
2,weak
3,mighty
4,weak


In [12]:
tfidf_vectorize_text

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [28]:
X = pd.DataFrame(data=tfidf_vectorize_text)
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14119,14120,14121,14122,14123,14124,14125,14126,14127,14128
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Don't need to split the data into training and testing sets as k-fold cross validation will be used, and the average of the scores taken.

In [14]:
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

In [15]:
#X_train

In [16]:
#y_train

In [17]:
#X_test

In [18]:
#y_test

In [19]:
#import the naive bayes model and SVM model
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from sklearn import svm

In [20]:
NB = MultinomialNB()
NB_scores = cross_val_score(NB, X, np.ravel(y), cv=10)

In [21]:
np.mean(NB_scores)

0.4832310960270424

In [22]:
#perform k-fold cross validation on critics scores using support vector regression
regr = svm.LinearSVC()
svm_scores = cross_val_score(regr, X, np.ravel(y), cv=10)

In [23]:
np.mean(svm_scores)

0.4370382726093661

In [24]:
#can't use SMOTE technique to oversample fair reviews due to lack of memory
oversample = SMOTE(k_neighbors=4)
oversampled_X, oversampled_y = oversample.fit_resample(X, y)


In [25]:
oversampled_NB_scores = cross_val_score(NB, oversampled_X, np.ravel(oversampled_y), cv=10)
np.mean(oversampled_NB_scores)

0.6637434810577343

In [26]:
oversampled_svm_scores = cross_val_score(regr, oversampled_X, np.ravel(oversampled_y), cv=10)
np.mean(oversampled_svm_scores)

0.7282555080754475

Potential downside of SMOTE: the majority classes are not considered when the synthetic samples are created for the minority classes. The overlapping of review lexicon between the classes may be more nuanced but this is not considered with SMOTE