Based on the notebook "steam-games-reviews-analysis-sentiment-analysis.ipynb"

Previous sections and Section 5.5

We build our own testing script for the model, to be reused when building the API and other downstream tooling.

It should provide an interface for DataFrames as well as for single texts.

In [1]:
import pandas as pd
import numpy as np

import re
import nltk

from pathlib import Path

# 4. Data Cleaning For Sentiment Processing

Includes cleaning null values, removing duplicate values, etc.

In [2]:
from nltk.corpus import stopwords
from nltk import WordNetLemmatizer
# Fetch the NLTK stopword corpus that stopwords.words("english") reads later on.
nltk.download('stopwords')
from nltk.stem import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/michaelcheng/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Null values Cleaning

In [3]:
# review.isnull().sum()

Since the missing values occur only in the app name feature, we simply ignore them.

### Duplicate Values Removing

This step appears in the original script.

In [4]:
# review = review.drop_duplicates(keep='first')

### We make a function to clean some basic characters

In [5]:
def clean(raw):
    """Strip hyperlinks, a few HTML entities/tags, and newlines from *raw*.

    Anchor elements collapse to the literal text ``Link.``. The remaining
    substitutions run one after another, in the order listed below, each
    operating on the output of the previous one.
    """
    # (pattern, replacement) pairs; order is significant and mirrors the
    # sequence in which the substitutions must be applied.
    substitutions = (
        ("<[a][^>]*>(.+?)</[a]>", 'Link.'),
        ('&gt;', ""),
        ('&#x27;', "'"),
        ('&quot;', '"'),
        ('&#x2F;', ' '),
        ('<p>', ' '),
        ('</i>', ''),
        ('&#62;', ''),
        ('<i>', ' '),
        ("\n", ''),
    )
    cleaned = raw
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

### We make a function to remove numbers

In [6]:
def remove_num(texts):
    """Return *texts* with every run of digit characters deleted."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', texts)

### We make a function to remove emoji

In [7]:
def deEmojify(x):
    """Delete emoji from *x*.

    Only code points inside the four ranges below are matched; any other
    character, including symbols outside these ranges, is left untouched.
    """
    emoji_ranges = (
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    )
    # One character class, repeated, so a whole run of emoji goes in one match.
    return re.sub("[" + emoji_ranges + "]+", '', x, flags=re.UNICODE)

### We make a function to unify whitespace

In [8]:
def unify_whitespaces(x):
    """Collapse each run of consecutive space characters in *x* to one space.

    Only the plain space character is affected; tabs and newlines are kept.
    """
    return re.compile(' +').sub(' ', x)

### We make a function to remove symbols

In [9]:
def remove_symbols(x):
    """Replace each run of disallowed characters in *x* with a single space.

    Allowed characters are ASCII letters, digits and ``? ! . ,``; everything
    else (including spaces themselves) is part of a run that becomes one space.
    """
    disallowed = re.compile(r"[^a-zA-Z0-9?!.,]+")
    return disallowed.sub(' ', x)

### We make a function to remove punctuation

In [10]:
def remove_punctuation(text):
    """Return *text* without the characters ``? . ; : ! " ,``.

    Apostrophes and every other character are kept as they are.
    """
    # Translation table that maps each listed character to "deleted".
    dropped = str.maketrans('', '', '?.;:!",')
    return text.translate(dropped)

### We make a function to remove stopwords

Stopwords are words that occur frequently in a language and are often grammatical in nature, such as articles (e.g., "the," "a"), prepositions (e.g., "in," "on"), conjunctions (e.g., "and," "but"), and pronouns (e.g., "he," "she"). These words are necessary for constructing sentences and conveying grammatical structure, but they often do not contribute much to the overall meaning of the text.

- noise reduction
- Improved Analysis Accuracy
- faster processing

In [11]:
# English stopwords as a set, so remove_stopword can test membership quickly.
stop=set(stopwords.words("english"))
# NOTE(review): stemmer and lemma are not referenced by any cell visible in
# this file — confirm whether they are still needed.
stemmer=PorterStemmer()
lemma=WordNetLemmatizer()

def remove_stopword(text):
    """Lower-case *text*, split it on whitespace, and drop tokens found in ``stop``.

    The surviving (lower-cased) tokens are joined back with single spaces.
    """
    kept = []
    for token in text.split():
        lowered = token.lower()
        if lowered in stop:
            continue
        kept.append(lowered)
    return " ".join(kept)

### We make function to use stemming to normalize words

Reduce each word to its stem in order to reduce the dimensionality of the vocabulary.
e.g. 

cared ----> care

university ----> univers

fairly ----> fair

easily ----> easili

singing ----> sing

sings ----> sing

sung ----> sung

singer ----> singer

sportingly ----> sport

In [12]:
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
# Shared English Snowball stemmer; created on the first stemming() call so it
# is not rebuilt for every single text.
_SNOWBALL_STEMMER = None


def stemming(text):
    """Tokenize *text* and return its Snowball-stemmed tokens joined by spaces.

    Args:
        text: The string to stem.

    Returns:
        A single string made of the stemmed tokens separated by one space.

    NOTE(review): ``nltk.word_tokenize`` relies on NLTK tokenizer data being
    installed, while the only ``nltk.download`` call visible in this file
    fetches ``stopwords`` — confirm the tokenizer data is available wherever
    this runs.
    """
    global _SNOWBALL_STEMMER
    if _SNOWBALL_STEMMER is None:
        _SNOWBALL_STEMMER = SnowballStemmer('english')
    word_tokens = nltk.word_tokenize(text)
    return ' '.join(_SNOWBALL_STEMMER.stem(word) for word in word_tokens)

### Then we combine all the cleaning functions

In [13]:
def cleaning(df,review):
    """Run the full text-cleaning sequence on column *review* of *df*, in place.

    The column is overwritten after every step and nothing is returned.
    Steps, in order: markup cleanup, emoji removal, lower-casing, digit
    removal, symbol removal, punctuation removal, stopword removal,
    whitespace unification, stemming.

    NOTE: a later cell in this file defines another ``cleaning`` that takes a
    list of strings; once that cell has run, the name ``cleaning`` refers to
    the list-based function rather than this DataFrame one.
    """
    for step in (clean, deEmojify):
        df[review] = df[review].apply(step)

    # Lower-casing goes through the vectorised string accessor.
    df[review] = df[review].str.lower()

    remaining_steps = (
        remove_num,
        remove_symbols,
        remove_punctuation,
        remove_stopword,
        unify_whitespaces,
        stemming,
    )
    for step in remaining_steps:
        df[review] = df[review].apply(step)

In [14]:
# example of applying text cleaning to a text

# cleaning(review,'review_text')

# Section 5: Creating pipeline for tokenizing and modelling

We create an end to end pipeline for training, testing and future usage of the models.

It involves Count Vectorizer, Tfidf Transformer and a ML classifier (e.g. Random Forest Classifier)

We can build another pipeline using TfidfVectorizer (a combination of Count Vectorizer and Tfidf Transformer). It is the same as applying Count Vectorizer then Tfidf Transformer.

In [15]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier

# max_features: If not None, build a vocabulary that only consider the top max_features ordered by term frequency across the corpus.
# Otherwise, all features are used.
# vect = CountVectorizer(stop_words= "english",max_features=3000)
# tfidf = TfidfTransformer()
# model = RandomForestClassifier()


# load the model to the pipeline

import pickle
from pathlib import Path

def _load_pickle(path):
    """Unpickle and return the object stored at *path*.

    The file is opened in a ``with`` block so the handle is closed as soon as
    loading finishes, even if unpickling raises.
    """
    with open(path, 'rb') as fh:
        return pickle.load(fh)


# SECURITY NOTE: unpickling can execute arbitrary code. Only load artifacts
# that were produced by the training notebook, never files from an untrusted
# source.

# the trained classifier
filename = Path("steam-games-reviews-analysis-sentiment-analysis_model_12-09-2023.sav")
loaded_model = _load_pickle(filename)

# we save the count vectorizer in section 5.5 also
filename_count_vec = Path('steam-games-reviews-analysis-sentiment-analysis_count_vectorizer_12-09-2023.pkl')
loaded_count_vec = _load_pickle(filename_count_vec)

# we save the fit tfidf (fit in pipeline2.fit())
filename_tfidf = Path('steam-games-reviews-analysis-sentiment-analysis_tfidf_12-09-2023.pkl')
loaded_tfidf = _load_pickle(filename_tfidf)

# Chain the loaded steps in the order they are applied at prediction time:
# token counts -> tf-idf weighting -> classifier.
pipeline_target = Pipeline([
    ('count_vectorizer', loaded_count_vec),
    ('tfidf', loaded_tfidf),
    ('model', loaded_model)
])

For prediction, the list of texts should first be passed through data cleaning.

Then pass the cleaned list of texts to the pipeline.

In [16]:
def cleaning(s_list:list[str]):
    """Clean raw review texts so they can be fed to the prediction pipeline.

    Each text goes through, in order: markup cleanup, emoji removal,
    lower-casing, digit removal, symbol removal, punctuation removal,
    stopword removal, whitespace unification, and stemming.

    Args:
        s_list: The raw texts to clean. A single bare string is also accepted
            and is treated as a one-element list.

    Returns:
        A new list with one cleaned string per input text, in input order.
    """
    # A bare string is itself iterable; without this guard every character
    # would be cleaned as if it were a separate document.
    if isinstance(s_list, str):
        s_list = [s_list]

    steps = (
        clean,
        deEmojify,
        str.lower,
        remove_num,
        remove_symbols,
        remove_punctuation,
        remove_stopword,
        unify_whitespaces,
        stemming,
    )

    # Run every step on one text before moving to the next, instead of
    # materialising a full intermediate list after each step.
    cleaned = []
    for text in s_list:
        for step in steps:
            text = step(text)
        cleaned.append(text)
    return cleaned

In [17]:
# the input

# Sample raw reviews used to try out the cleaning step and the loaded pipeline.
testing_list = [
    'This game is freaking good! I enjoy the graphics.',
    'GOAT !!!',
    'The only reason this game exists is for us to put shit on it.',
    'Disappointing. It feels an EA game with lots of DLC to be released… Not recommending.'
]

In [18]:
# Run the raw samples through the text cleaning before prediction.
testing_list_2 = cleaning(testing_list)
# Bare expression so the notebook displays the cleaned strings.
testing_list_2

['game freak good enjoy graphic',
 'goat',
 'reason game exist us put shit',
 'disappoint feel ea game lot dlc releas recommend']

In [19]:
# Predict one sentiment label per cleaned review (see the legend below the output).
pipeline_target.predict(testing_list_2)

array([ 1,  1, -1, -1])

1: positive

-1: negative