### Requirements:
- An existing folder `results`
- An existing folder `datasets`
- A dataset called `stray-game-14-08.csv` in directory `datasets` with at least 
  70 tweets containing a column 'text' where the tweet content is stores
- Import necessary libraries through first code block

In [1]:
import textblob
from textblob import TextBlob
from textblob.classifiers import NaiveBayesClassifier
from textblob.sentiments import NaiveBayesAnalyzer
import nltk
from nltk import pos_tag


import spacy
from spacy.tokenizer import Tokenizer
from spacytextblob.spacytextblob import SpacyTextBlob

import re
import pandas as pd

nltk.download('movie_reviews')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package movie_reviews to /nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Pre-processing: removing links and web-specific content that is not needed
def clean_data(tweets):
  simple_url_re = re.compile(r'''https:\/\/([^\s]+)''')
  twitter_url_re = re.compile(r'''https:\/\/t.co\/[A-Za-z0-9_]+''')
  username_prefix_re = re.compile(r'''(\@)\w{1,15} ?''')
  # hashtag_re = re.compile(r'''(\#)[A-Za-z0-9_]+''')

  new_tweets = []
  new_tweet = ''

  for tweet in tweets:
    tweet = re.sub(simple_url_re, "", tweet)
    tweet = re.sub(twitter_url_re, "", tweet)
    tweet = re.sub(username_prefix_re, "", tweet)
    # tweet = re.sub(hashtag_re, "", tweet)
    new_tweets.append(tweet)
  return new_tweets

In [3]:
# Create a list of all tweets' content from dataframe
# tweets_content_list = tweets_df['text'].tolist()

# When reading from .csv file, create dataframe from .csv column 'text' and create list
tweets_content_list = pd.read_csv('datasets/stray-game-14-08.csv')['text'].tolist()
tweets_content_list = clean_data(tweets_content_list)

# def custom_tokenizer(nlp):
#     return Tokenizer(nlp.vocab, url_match=simple_url_re.match)

# Import tools to help with pre-processing and classification
nlp = spacy.load("en_core_web_sm")
nlp.enable_pipe("parser")

nlp.add_pipe('spacytextblob')

# Using Spacy's English tokenizer, PoS tagger, syntactic parser and named-entity recognition
processed_tweets = []

for tweet in tweets_content_list:
  doc = nlp(tweet)
  processed_tweets.append(doc)

In [4]:
# Show some example tweets
print(processed_tweets[6])
print(processed_tweets[13])

혷~ is trying out stray. looks like a pretty game 
Honestly, stray's only good at audiovisuals. Apart from that it's not a good GAME.


In [5]:
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe('spacytextblob')

processed_tweets = []

for tweet in tweets_content_list:
  doc = nlp(tweet)
  processed_tweets.append(doc)

print(processed_tweets[13])
print(processed_tweets[13]._.blob.sentiment_assessments.assessments)

Honestly, stray's only good at audiovisuals. Apart from that it's not a good GAME.
[(['honestly'], 0.6, 0.9, None), (['only'], 0.0, 1.0, None), (['good'], 0.7, 0.6000000000000001, None), (['not', 'good'], -0.35, 0.6000000000000001, None), (['game'], -0.4, 0.4, None)]


In [6]:
# Classify polarity
def classify_polarity(polarity):
  if polarity <= -0.1:
    return 'negative'
  elif (polarity > -0.1 and polarity < 0.1):
    return 'neutral'
  elif polarity >= 0.1:
    return 'positive'

sentiment_df = pd.DataFrame()

tweet_sentiment_pairs = []
for tweet in processed_tweets:
  sentiment = classify_polarity(tweet._.blob.polarity)
  tweet_sentiment_pairs.append((tweet.text, sentiment))

  sentiment_df = sentiment_df.append(pd.DataFrame({
      'text': [tweet.text],
      'sentiment': [sentiment]
  }))
  sentiment_df = sentiment_df.reset_index(drop=True)

# sentiment_df
pd.set_option('display.max_colwidth', None) 
sentiment_df


Unnamed: 0,text,sentiment
0,"Platinum no. 27 goes to Stray 🐈 what a wonderful game all around, absolutely adored it ♥️ kept the speedrunning trophy for last 😄",positive
1,Now you can play Stray from the cat's purr-spective:,neutral
2,"stream schedule for this week (Aug 15th to 21st) \n\nmore chill this week~ catching up with chat and more singing streams! Starting a new game this week ""stray"" as well! &lt;3 \n\nHope to see you there! \n\n⊹ \n⊹ Schedule by : \n",positive
3,"First Game: Alphabet Park Adventure (V-Smile) or Petz Catz 2 (Wii)\nLast Game: Stray\nBest Game: Stray, GTA V, and Red Dead Redemption 2\nWorst Game: Fortnite (not my cup of tea)",negative
4,* honestly i recall seeing this game in development before stray came out but i agree,neutral
...,...,...
95,Anybody else’s #stray game just like crash on them at some point???,negative
96,I love playing Stray such a relaxin game,neutral
97,"I loved playing Stray too. I frankly think that their BlueTwelve Studio team did a great job developing it for PS4, PS5, and PC. And I will treasure it forever. It deserves a lot of replays. I love it. It's a great video game.",positive
98,Stray\n\nINCREDIBLE worldbuilding and imagery my jaw dropped like 5 times also it was really sad and i cried and also cute cat is cute i love him so much this game is great 10/10 recommend,positive


In [7]:
sentiment_df.to_csv('results/stray-game-sentiments.csv')

In [8]:
print(processed_tweets[13]._.blob.polarity)
print(processed_tweets[13]._.blob.sentiment_assessments.assessments)

0.10999999999999996
[(['honestly'], 0.6, 0.9, None), (['only'], 0.0, 1.0, None), (['good'], 0.7, 0.6000000000000001, None), (['not', 'good'], -0.35, 0.6000000000000001, None), (['game'], -0.4, 0.4, None)]


### Get a better look at the used components of the spaCy pipeline

In [9]:
nlp.component_names
nlp.components

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x7fb3f391bc20>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x7fb3f0b4a4a0>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x7fb3f10b6820>),
 ('senter', <spacy.pipeline.senter.SentenceRecognizer at 0x7fb3f0b5b900>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x7fb3f103c340>),
 ('lemmatizer',
  <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x7fb3f10460c0>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x7fb3f0f2ec40>),
 ('spacytextblob',
  <spacytextblob.spacytextblob.SpacyTextBlob at 0x7fb3f301ad30>)]

In [10]:
print(processed_tweets[13])
print(processed_tweets[13]._.blob.sentiment)

print(processed_tweets[61])
print(processed_tweets[61]._.blob.sentiment)

Honestly, stray's only good at audiovisuals. Apart from that it's not a good GAME.
Sentiment(polarity=0.10999999999999996, subjectivity=0.7)
Stray has been a great experience so far! Really enjoying the game and looking forward to seeing the end of story. 
Sentiment(polarity=0.25625, subjectivity=0.6875)


### Visualise computed dependency between words in 1 tweet

In [11]:
from spacy import displacy
displacy.render(processed_tweets[13], style="dep")

In [12]:
# Create n-grams from processed tweets
for tweet in processed_tweets[0:2]:
  print(tweet._.blob.ngrams())

[WordList(['Platinum', 'no', '27']), WordList(['no', '27', 'goes']), WordList(['27', 'goes', 'to']), WordList(['goes', 'to', 'Stray']), WordList(['to', 'Stray', '🐈']), WordList(['Stray', '🐈', 'what']), WordList(['🐈', 'what', 'a']), WordList(['what', 'a', 'wonderful']), WordList(['a', 'wonderful', 'game']), WordList(['wonderful', 'game', 'all']), WordList(['game', 'all', 'around']), WordList(['all', 'around', 'absolutely']), WordList(['around', 'absolutely', 'adored']), WordList(['absolutely', 'adored', 'it']), WordList(['adored', 'it', '♥️']), WordList(['it', '♥️', 'kept']), WordList(['♥️', 'kept', 'the']), WordList(['kept', 'the', 'speedrunning']), WordList(['the', 'speedrunning', 'trophy']), WordList(['speedrunning', 'trophy', 'for']), WordList(['trophy', 'for', 'last']), WordList(['for', 'last', '😄'])]
[WordList(['Now', 'you', 'can']), WordList(['you', 'can', 'play']), WordList(['can', 'play', 'Stray']), WordList(['play', 'Stray', 'from']), WordList(['Stray', 'from', 'the']), WordLi

# Supervised Sentiment Analysis with TextBlob

We define a function that cleans our dataset's 'text' column.

In [14]:
tweets_content_list = pd.read_csv('datasets/stray-game-14-08.csv')['text'].tolist()

# Pre-processing: removing links and web-specific content that is not needed
def clean_data(tweets):
  simple_url_re = re.compile(r'''https:\/\/([^\s]+)''')
  twitter_url_re = re.compile(r'''https:\/\/t.co\/[A-Za-z0-9_]+''')
  username_prefix_re = re.compile(r'''(\@)\w{1,15} ?''')
  # hashtag_re = re.compile(r'''(\#)[A-Za-z0-9_]+''')

  new_tweets = []
  new_tweet = ''

  for tweet in tweets:
    tweet = re.sub(simple_url_re, "", tweet)
    tweet = re.sub(twitter_url_re, "", tweet)
    tweet = re.sub(username_prefix_re, "", tweet)
    # tweet = re.sub(hashtag_re, "", tweet)
    new_tweets.append(tweet)
  return new_tweets

tweets_content_list = clean_data(tweets_content_list)

## Training our own Naive Bayes classifier

In [15]:
processed_tweets = []
naivebayes_analyzer = NaiveBayesAnalyzer() # Trained on movie reviews

# Our own trained classifier
training_data = [
    ('This game is amazing!', 'pos'),
    ('Stray is cute', 'pos'),
    ('It\'s the game of the year!', 'pos'),
    ('I love cats', 'pos'),
    ('It\'s not a good game.', 'neg'),
    ('I don\'t like cats', 'neg'),
    ('It\'s really not that good', 'neg'),
]

naivebayes_classifier = NaiveBayesClassifier(training_data)

# Test our classifier
def test_classifier(tweet):
  sentiment = naivebayes_classifier.classify(tweet)
  probability_distribution = naivebayes_classifier.prob_classify(tweet)

  positive_probability = probability_distribution.prob('pos')
  negative_probability = probability_distribution.prob('neg')
  if (positive_probability > 0.4 and negative_probability > 0.4):
    sentiment = 'neutral'

  return sentiment

tweets = [
    "Honestly, stray's only good at audiovisuals. Apart from that it's not a good GAME.",
    "Platinum no. 27 goes to Stray 🐈 what a wonderful game all around, absolutely adored it ♥️ kept the speedrunning trophy for last 😄",
    "Now you can play Stray from the cat's purr-spective:",
    "stray is a very difficult game because the moment the kitty shows any sign of being distressed or hurt i feel like i deserve to be thrown under a moving vehicle"]

for tweet in tweets:
    print(f'Tweet: {tweet} - Sentiment: {test_classifier(tweet)}')

Tweet: Honestly, stray's only good at audiovisuals. Apart from that it's not a good GAME. - Sentiment: neg
Tweet: Platinum no. 27 goes to Stray 🐈 what a wonderful game all around, absolutely adored it ♥️ kept the speedrunning trophy for last 😄 - Sentiment: pos
Tweet: Now you can play Stray from the cat's purr-spective: - Sentiment: pos
Tweet: stray is a very difficult game because the moment the kitty shows any sign of being distressed or hurt i feel like i deserve to be thrown under a moving vehicle - Sentiment: pos


## Using TextBlob's pre-trained Naive Bayes classifier

In [16]:
tweets = ["Honestly, stray's only good at audiovisuals. Apart from that it's not a good GAME.",
          "Platinum no. 27 goes to Stray 🐈 what a wonderful game all around, absolutely adored it ♥️ kept the speedrunning trophy for last 😄",
          "Now you can play Stray from the cat's purr-spective:",
          "stray is a very difficult game because the moment the kitty shows any sign of being distressed or hurt i feel like i deserve to be thrown under a moving vehicle"]

# Create TextBlob objects from every tweet and save them in list
def classify_tweets(tweets):
  textblobs = []
  for tweet in tweets:
    blob = TextBlob(tweet, analyzer=naivebayes_analyzer)
    textblobs.append(blob)
    
  return textblobs

processed_tweets = []
naivebayes_analyzer = NaiveBayesAnalyzer() # Trained on movie reviews

# Classify with neutral sentiment and create dataframe
textb_sentiment_df = pd.DataFrame()

textblobs = []
for tweet in tweets_content_list:
  blob = TextBlob(tweet, analyzer=naivebayes_analyzer)
  textblobs.append(blob)
  sentiment = blob.sentiment.classification

  # Add 'neutral' label to classification
  if (blob.sentiment.p_pos > 0.4 and blob.sentiment.p_neg > 0.4):
    sentiment = 'neutral'
  elif blob.sentiment.classification == 'pos':
    sentiment = 'positive'
  elif blob.sentiment.classification == 'neg':
    sentiment = 'negative'
  
  # Add to dataframe
  textb_sentiment_df = textb_sentiment_df.append(pd.DataFrame({
      'text': [tweet],
      'sentiment': [sentiment]
  }))

  textb_sentiment_df = textb_sentiment_df.reset_index(drop=True)

pd.set_option('display.max_colwidth', None)
textb_sentiment_df.head()
  

Unnamed: 0,text,sentiment
0,"Platinum no. 27 goes to Stray 🐈 what a wonderful game all around, absolutely adored it ♥️ kept the speedrunning trophy for last 😄",positive
1,Now you can play Stray from the cat's purr-spective:,neutral
2,"stream schedule for this week (Aug 15th to 21st) \n\nmore chill this week~ catching up with chat and more singing streams! Starting a new game this week ""stray"" as well! &lt;3 \n\nHope to see you there! \n\n⊹ \n⊹ Schedule by : \n",positive
3,"First Game: Alphabet Park Adventure (V-Smile) or Petz Catz 2 (Wii)\nLast Game: Stray\nBest Game: Stray, GTA V, and Red Dead Redemption 2\nWorst Game: Fortnite (not my cup of tea)",negative
4,* honestly i recall seeing this game in development before stray came out but i agree,neutral


## Using TextBlob's pre-trained Naive Bayes classifier wihtout 'neutral' sentiment

In [None]:
tweets = ["Honestly, stray's only good at audiovisuals. Apart from that it's not a good GAME.",
          "Platinum no. 27 goes to Stray 🐈 what a wonderful game all around, absolutely adored it ♥️ kept the speedrunning trophy for last 😄",
          "Now you can play Stray from the cat's purr-spective:",
          "stray is a very difficult game because the moment the kitty shows any sign of being distressed or hurt i feel like i deserve to be thrown under a moving vehicle"]

# Create TextBlob objects from every tweet and save them in list
def classify_tweets(tweets):
  textblobs = []
  for tweet in tweets:
    blob = TextBlob(tweet, analyzer=naivebayes_analyzer)
    textblobs.append(blob)

  return textblobs

processed_tweets = []
naivebayes_analyzer = NaiveBayesAnalyzer() # Trained on movie reviews

# Classify with neutral sentiment and create dataframe
textb_sentiment_df = pd.DataFrame()

textblobs = []
for tweet in tweets_content_list:
  blob = TextBlob(tweet, analyzer=naivebayes_analyzer)
  textblobs.append(blob)
  sentiment = blob.sentiment.classification

  # Pure binary classification
  if blob.sentiment.classification == 'pos':
    sentiment = 'positive'
  elif blob.sentiment.classification == 'neg':
    sentiment = 'negative'
  
  # Add to dataframe
  textb_sentiment_df = textb_sentiment_df.append(pd.DataFrame({
      'text': [tweet],
      'sentiment': [sentiment]
  }))

  textb_sentiment_df = textb_sentiment_df.reset_index(drop=True)

pd.set_option('display.max_colwidth', None)
textb_sentiment_df.head()
  

In [17]:
textb_sentiment_df.to_csv('results/stray-game-sentiments-textblob2-no-neutral.csv')

# Aspect-based Sentiment Analysis

In [18]:
processed_tweets = []
naivebayes_analyzer = NaiveBayesAnalyzer() # Trained on movie reviews

def classify_tweets(tweets):
  textblobs = []
  for tweet in tweets:
    blob = TextBlob(tweet, analyzer=naivebayes_analyzer)
    textblobs.append(blob)

  return textblobs

processed_tweets = classify_tweets(tweets_content_list)

blob = processed_tweets[13]

print(blob)

nouns = []
for word, tag in blob.tags:
  if tag == 'NN':
    print(tag)
    print(word)
    nouns.append(word.lemmatize())

Honestly, stray's only good at audiovisuals. Apart from that it's not a good GAME.
NN
stray


In [19]:
processed_tweets = []
naivebayes_analyzer = NaiveBayesAnalyzer() # Trained on movie reviews

# Classify tweets with neutral sentiment
textblobs = []
tweets_sents_pairs = []
for tweet in tweets_content_list:
  blob = TextBlob(tweet, analyzer=naivebayes_analyzer)
  textblobs.append(blob)
  sentiment = blob.sentiment.classification

  # Pure binary classification
  if blob.sentiment.classification == 'pos':
    sentiment = 'positive'
  elif blob.sentiment.classification == 'neg':
    sentiment = 'negative'
  
  # Add to list of tweet-sentiment pairs
  tweets_sents_pairs.append((tweet, sentiment))

def get_general_sentiment_score(tweets_sents):
  sentimentScore = 0
  for pair in tweets_sents:
    if pair[1] == 'positive':
      sentimentScore += 1
    elif pair[1] == 'negative':
      sentimentScore -= 1
      # Ignore 'neutral' tweets

  return (sentimentScore/(len(tweets_sents)-1))

def classify_general_sentiment(score):
  if (score > -0.4 and score < 0.4):
    sentiment = 'neutral'
  elif score < -0.4:
    sentiment = 'negative'
  elif score > 0.4:
    sentiment = 'positive'
  
  return sentiment

avg_sentiment_score = get_general_sentiment_score(tweets_sents_pairs)
print(avg_sentiment_score)
general_sentiment = classify_general_sentiment(avg_sentiment_score)
print(general_sentiment)

0.20202020202020202
neutral


### Remark
Notice that the general sentiment would have been 'positive' had we not included a way to work with neutral sentiments, or reduced our tresholds from '0.4' to '0.2'.

# Another small example to illustrate a trained Naive Bayes classifier

In [20]:
training_data = [
    ("This food is delicious!", 'pos'),
    ("OMG! I ❤️ this food", 'pos'),
    ("The creamy texture of the dessert.. WOW! Amazing.", 'pos'),
    ("It's not bad at all.", 'pos'),
    ("Pasta is my favorite!", 'pos'),
    ("What a waste of ingredients...", 'neg'),
    ("I don't really like the texture of pasta.", 'neg'),
    ("It is not my cup of tea. ", 'neg'),
]

naivebayes_classifier = NaiveBayesClassifier(training_data)

# Test our classifier
def test_classifier(tweet):
  sentiment = naivebayes_classifier.classify(tweet)
  probability_distribution = naivebayes_classifier.prob_classify(tweet)

  positive_probability = probability_distribution.prob('pos')
  negative_probability = probability_distribution.prob('neg')
  if (positive_probability > 0.4 and negative_probability > 0.4):
    sentiment = 'neutral'

  return sentiment


test_sentences = [
    "Italian food is not my cup of tea",
    "The food wasn't bad, but not amazing either.",
    "I love the desserts here!",
    "I'm not a big fan of the fries. They're so dry!",
    "I ❤️ pasta",    
]

print("Our Naive Bayes classifier")
for sentence in test_sentences:
  sentiment = test_classifier(sentence)
  print(f'Sentence: {sentence} - Sentiment: {sentiment}')

Our Naive Bayes classifier
Sentence: Italian food is not my cup of tea - Sentiment: neg
Sentence: The food wasn't bad, but not amazing either. - Sentiment: pos
Sentence: I love the desserts here! - Sentiment: pos
Sentence: I'm not a big fan of the fries. They're so dry! - Sentiment: neg
Sentence: I ❤️ pasta - Sentiment: pos
