<a href="https://colab.research.google.com/github/MarioAvolio/Amazon-Fine-Foods-reviews-Transformers-Text-Classification/blob/main/Amazon_Fine_Food_Review_Aspect_Level_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Aspect-Level Sentiment Analysis


**Mario Avolio: 880995 - https://marioavolio.netlify.app/**

Credits: 
- https://www.oreilly.com/library/view/practical-natural-language/9781492054047/

Dataset:
- https://snap.stanford.edu/data/web-FineFoods.html



In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt # plotting
import numpy as np

In [2]:
# Mount Google Drive inside the Colab runtime at /content/drive so that the
# project paths used below resolve. force_remount=True asks Colab to mount
# again even if a mount from an earlier run is still present.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


# Constants and Methods

In [3]:
# Project folder on the mounted Google Drive (note the trailing slash).
PATH_PROJ = "/content/drive/MyDrive/data-proj/"
# Disabled fallback to the shared copy of the folder, kept for reference:
# if not os.path.exists(PATH_PROJ):
#   PATH_PROJ = "/content/drive/MyDrive/shared/data-proj/"

# CSV holding the preprocessed reviews; PATH_PROJ already ends with "/".
PATH_DATASET = f"{PATH_PROJ}preprocessed.csv"


# Data 

In [4]:
# Load the preprocessed reviews, then show the rows whose `text` is missing
# as a sanity check (the recorded output lists no such rows).
df = pd.read_csv(PATH_DATASET)
df.loc[df["text"].isna()]

Unnamed: 0,text,score


In [5]:
def convert_to_list(row):
  """Turn a comma-separated string of tokens into a list of tokens.

  Args:
    row: One value of the ``text`` column. Normally a string such as
      ``"bought,several,vitality"``. A list is accepted as well, so that
      re-running the conversion on an already converted column is harmless.

  Returns:
    list: The tokens of the review. For a value that is neither a string nor
    a list (e.g. NaN) the value is printed for inspection and an empty list
    is returned.
  """
  if isinstance(row, str):
    return row.split(",")
  if isinstance(row, list):
    # Already converted (the cell was executed twice): keep the tokens
    # instead of failing and wiping the column.
    return list(row)
  # Missing or malformed value: report it, and return an empty document rather
  # than None so code that iterates over the documents keeps working.
  print(row)
  return []

# Replace every comma-separated review string by its list of tokens and
# display the converted frame.
df["text"] = df["text"].apply(convert_to_list)
df

Unnamed: 0,text,score
0,"[bought, several, vitality, canned, dog, food,...",5.0
1,"[product, arrived, labeled, jumbo, salted, pea...",1.0
2,"[this, confection, around, centuries, light, p...",4.0
3,"[if, looking, secret, ingredient, robitussin, ...",2.0
4,"[great, taffy, great, price, there, wide, asso...",5.0
...,...,...
35165,"[once, tasted, hazelnut, coffee, hooked, now, ...",5.0
35166,"[has, maxwell, house, quit, making, coffee, ca...",5.0
35167,"[nutty, smooth, subtle, wonderful, aroma, love...",5.0
35168,"[price, right, taste, good, we, buying, harmon...",5.0


# Topic Modeling: LDA

In [6]:
from gensim.models import LdaModel
from gensim.corpora import Dictionary
from pprint import pprint

In [7]:

# Location of the CSV the documents were read from.
data_path = PATH_DATASET

# One token list per review, in DataFrame row order.
summaries = df["text"].tolist()

# Peek at the first three documents.
summaries[:3]


[['bought',
  'several',
  'vitality',
  'canned',
  'dog',
  'food',
  'products',
  'found',
  'good',
  'quality',
  'product',
  'looks',
  'like',
  'stew',
  'processed',
  'meat',
  'smells',
  'better',
  'my',
  'labrador',
  'finicky',
  'appreciates',
  'product',
  'better'],
 ['product',
  'arrived',
  'labeled',
  'jumbo',
  'salted',
  'peanuts',
  'peanuts',
  'actually',
  'small',
  'sized',
  'unsalted',
  'not',
  'sure',
  'error',
  'vendor',
  'intended',
  'represent',
  'product',
  'jumbo'],
 ['this',
  'confection',
  'around',
  'centuries',
  'light',
  'pillowy',
  'citrus',
  'gelatin',
  'nuts',
  'case',
  'filberts',
  'cut',
  'tiny',
  'squares',
  'liberally',
  'coated',
  'powdered',
  'sugar',
  'tiny',
  'mouthful',
  'heaven',
  'not',
  'chewy',
  'flavorful',
  'highly',
  'recommend',
  'yummy',
  'treat',
  'if',
  'familiar',
  'story',
  'lewis',
  'lion',
  'witch',
  'wardrobe',
  'treat',
  'seduces',
  'edmund',
  'selling',
  'brothe

In [8]:

# Build the gensim vocabulary (token <-> integer id) from the tokenised
# reviews and report how many distinct tokens it contains.
dictionary = Dictionary(documents=summaries)
len(dictionary)

36128

In [9]:

# Prune the vocabulary in place: drop tokens that occur in fewer than 10
# reviews or in more than half of all reviews.
# https://radimrehurek.com/gensim/corpora/dictionary.html#gensim.corpora.dictionary.Dictionary.filter_extremes
dictionary.filter_extremes(no_below=10, no_above=0.5)

# Bag-of-words encoding: every review becomes a list of (token_id, count) pairs.
corpus = list(map(dictionary.doc2bow, summaries))
corpus[:3]

[[(0, 2),
  (1, 1),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 1),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 1),
  (14, 2),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 1)],
 [(14, 2),
  (20, 1),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 2),
  (25, 1),
  (26, 1),
  (27, 2),
  (28, 1),
  (29, 1),
  (30, 1),
  (31, 1),
  (32, 1),
  (33, 1)],
 [(26, 1),
  (34, 1),
  (35, 1),
  (36, 1),
  (37, 1),
  (38, 1),
  (39, 1),
  (40, 1),
  (41, 1),
  (42, 1),
  (43, 1),
  (44, 1),
  (45, 1),
  (46, 1),
  (47, 1),
  (48, 1),
  (49, 1),
  (50, 1),
  (51, 1),
  (52, 1),
  (53, 1),
  (54, 1),
  (55, 1),
  (56, 1),
  (57, 1),
  (58, 1),
  (59, 2),
  (60, 2),
  (61, 1)]]

In [10]:

# Make an index-to-word dictionary.
# Looking up one id first is only there to "load" the dictionary, so that
# `id2token` is filled before it is read.
temp = dictionary[0]
print(temp)
id2word = dictionary.id2token
id2word

better


{0: 'better',
 1: 'bought',
 2: 'canned',
 3: 'dog',
 4: 'finicky',
 5: 'food',
 6: 'found',
 7: 'good',
 8: 'labrador',
 9: 'like',
 10: 'looks',
 11: 'meat',
 12: 'my',
 13: 'processed',
 14: 'product',
 15: 'products',
 16: 'quality',
 17: 'several',
 18: 'smells',
 19: 'stew',
 20: 'actually',
 21: 'arrived',
 22: 'error',
 23: 'intended',
 24: 'jumbo',
 25: 'labeled',
 26: 'not',
 27: 'peanuts',
 28: 'salted',
 29: 'sized',
 30: 'small',
 31: 'sure',
 32: 'unsalted',
 33: 'vendor',
 34: 'around',
 35: 'brother',
 36: 'case',
 37: 'chewy',
 38: 'citrus',
 39: 'coated',
 40: 'confection',
 41: 'cut',
 42: 'familiar',
 43: 'flavorful',
 44: 'gelatin',
 45: 'heaven',
 46: 'highly',
 47: 'if',
 48: 'light',
 49: 'mouthful',
 50: 'nuts',
 51: 'powdered',
 52: 'recommend',
 53: 'selling',
 54: 'sisters',
 55: 'squares',
 56: 'story',
 57: 'sugar',
 58: 'this',
 59: 'tiny',
 60: 'treat',
 61: 'yummy',
 62: 'addition',
 63: 'beer',
 64: 'believe',
 65: 'cherry',
 66: 'extract',
 67: 'flavo

In [11]:
# Train the topic model.
N_TOPICS = 5
# LDA training is randomised; a fixed seed keeps the topics (and their
# numbering) stable across "Restart & Run All".
RANDOM_SEED = 42
model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    iterations=10000,
    num_topics=N_TOPICS,
    random_state=RANDOM_SEED,
)

# https://radimrehurek.com/gensim/models/ldamodel.html#gensim.models.ldamodel.LdaModel.top_topics
# Each element of the result is a pair (topic representation, coherence score).
# As the printed output shows, the topic representation is a list of
# (probability, word) pairs, i.e. the overall shape is
#   list of (list of (float, str), float)
# The order of this list is not the topic-id order used by print_topic().
top_topics = list(model.top_topics(corpus))
pprint(top_topics)

[([(0.024155362, 'coffee'),
   (0.016531885, 'amazon'),
   (0.012753052, 'product'),
   (0.011742533, 'price'),
   (0.0102750845, 'one'),
   (0.008797492, 'this'),
   (0.008689013, 'great'),
   (0.008449803, 'order'),
   (0.008124542, 'find'),
   (0.007953288, 'good'),
   (0.0075424463, 'buy'),
   (0.007027551, 'box'),
   (0.0066313003, 'time'),
   (0.0064987084, 'ordered'),
   (0.0061199404, 'store'),
   (0.00600764, 'bought'),
   (0.005882356, 'get'),
   (0.0057246806, 'would'),
   (0.005271704, 'shipping'),
   (0.0051970812, 'like')],
  -1.9981970976003314),
 ([(0.030315304, 'tea'),
   (0.020161303, 'like'),
   (0.018586857, 'taste'),
   (0.015810516, 'flavor'),
   (0.00991046, 'good'),
   (0.009166032, 'this'),
   (0.008825216, 'drink'),
   (0.008037442, 'sugar'),
   (0.0064589786, 'one'),
   (0.006189325, 'really'),
   (0.0060301432, 'would'),
   (0.006007487, 'great'),
   (0.005393595, 'coffee'),
   (0.0053526764, 'water'),
   (0.005223829, 'sweet'),
   (0.0050678905, 'tastes'),


In [12]:
# Print the five highest-weighted words of every topic, then a separator line.
for topic_id in range(N_TOPICS):
    topic_summary = model.print_topic(topic_id, topn=5)
    print("Topic #%s:" % topic_id, topic_summary)
print("=" * 20)

Topic #0: 0.030*"tea" + 0.020*"like" + 0.019*"taste" + 0.016*"flavor" + 0.010*"good"
Topic #1: 0.016*"good" + 0.015*"great" + 0.014*"like" + 0.012*"taste" + 0.012*"chips"
Topic #2: 0.024*"coffee" + 0.017*"amazon" + 0.013*"product" + 0.012*"price" + 0.010*"one"
Topic #3: 0.019*"dog" + 0.015*"my" + 0.012*"dogs" + 0.009*"loves" + 0.009*"food"
Topic #4: 0.021*"food" + 0.009*"like" + 0.008*"product" + 0.008*"one" + 0.007*"good"
