# spaCy Example

In [1]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
import torch.optim as optim
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
import gensim.downloader as api
from tqdm import tqdm
from src.data_processing.process_labels import *
from src.data_processing.process_reviews import *

## Data processing

### Import data

In [2]:
# Load the raw reviews CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer='data/raw_reviews/reviews_v1.csv')

In [3]:
# Pull the review text and the two label columns out of the DataFrame
reviews = df["text"]
food_labels = df["food"]
service_labels = df["service"]

### Import spaCy and load the English pipeline

In [4]:
# Import spaCy and load the small English pipeline by its package name
import spacy
nlp = spacy.load(name='en_core_web_sm')

2023-09-08 02:00:28.100603: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [5]:
# Pick the review labelled 0 as a worked example and display it
sentence = reviews.loc[0]
sentence

'Food was very good, service quick and pleasant.  The place was airy and clean.  I would highly recommend this place. Great atmosphere'

In [6]:
# Run the loaded pipeline on the example sentence
doc = nlp(sentence)
# Display the class of the object the pipeline returned
type(doc)

spacy.tokens.doc.Doc

In [7]:
# Display the processed document (it renders as the original text)
doc

Food was very good, service quick and pleasant.  The place was airy and clean.  I would highly recommend this place. Great atmosphere

In [8]:
# Look at a single element of the document: print it and its Python type
index = 8
type_line = f"\nType: {type(doc[index])}"
print(doc[index], type_line)

pleasant 
Type: <class 'spacy.tokens.token.Token'>


In [9]:
# Print every token that is not a stop word, one per line
for token in doc:
    if token.is_stop:
        # stop words are skipped entirely
        continue
    print(token)

Food
good
,
service
quick
pleasant
.
 
place
airy
clean
.
 
highly
recommend
place
.
Great
atmosphere


In [10]:
# Build a list of the tokens that are not stop words, then display it
trimmed_sentence = list(filter(lambda tok: not tok.is_stop, doc))
trimmed_sentence

[Food,
 good,
 ,,
 service,
 quick,
 pleasant,
 .,
  ,
 place,
 airy,
 clean,
 .,
  ,
 highly,
 recommend,
 place,
 .,
 Great,
 atmosphere]

In [11]:
# Build a list of the tokens that are not punctuation, then display it.
# Stop words and whitespace tokens are still kept at this stage.
trimmed_sentence2 = [
    tok
    for tok in doc
    if not tok.is_punct
]
trimmed_sentence2

[Food,
 was,
 very,
 good,
 service,
 quick,
 and,
 pleasant,
  ,
 The,
 place,
 was,
 airy,
 and,
 clean,
  ,
 I,
 would,
 highly,
 recommend,
 this,
 place,
 Great,
 atmosphere]

In [12]:
# Keep only tokens that are neither punctuation, stop words nor whitespace.
# The result is a list of token objects (not strings).
clean_sentence = [
    tok
    for tok in doc
    if not (tok.is_punct or tok.is_stop or tok.is_space)
]
clean_sentence

[Food,
 good,
 service,
 quick,
 pleasant,
 place,
 airy,
 clean,
 highly,
 recommend,
 place,
 Great,
 atmosphere]

In [13]:
# Collect the text of every token that is not punctuation, a stop word or
# whitespace; the result is a list of str.
clean_review = [
    tok.text
    for tok in doc
    if not tok.is_punct
    if not tok.is_stop
    if not tok.is_space
]
clean_review

['Food',
 'good',
 'service',
 'quick',
 'pleasant',
 'place',
 'airy',
 'clean',
 'highly',
 'recommend',
 'place',
 'Great',
 'atmosphere']

In [14]:
# Join the token strings back into a single space-separated string
" ".join(clean_review)

'Food good service quick pleasant place airy clean highly recommend place Great atmosphere'

### Test that the preprocessing function works

In [15]:
from src.data_processing.preprocess_reviews import remove_stop_punc

In [17]:
# Apply the imported remove_stop_punc helper to the first 100 reviews,
# wrapping the iteration in a tqdm progress bar.
reviews_list = [
    remove_stop_punc(review, model=nlp)
    for review in tqdm(reviews[:100], desc="Preprocessing Reviews")
]

Preprocessing Reviews: 100%|██████████| 100/100 [00:01<00:00, 53.96it/s]


In [18]:
# Show the first preprocessed review
reviews_list[0]

'Food good service quick pleasant place airy clean highly recommend place Great atmosphere'

In [19]:
# Show the corresponding raw review for comparison
reviews[0]

'Food was very good, service quick and pleasant.  The place was airy and clean.  I would highly recommend this place. Great atmosphere'

In [21]:
# Wrap the list of preprocessed review strings in a pandas Series
FINAL = pd.Series(data=reviews_list)

In [23]:
# Display the Series of preprocessed reviews
FINAL

0     Food good service quick pleasant place airy cl...
1     place special occasions birthdays anniversarie...
2     Amazing Service waiter Tyler great helping pic...
3     know place good reviews opened pizza favorites...
4     Service bad food good arrived 11:30 pm sat out...
                            ...                        
95    Absolute Cocktails food die Staff service frie...
96    indoor wearing tasty interesting food combinat...
97    frustrating substitute toppings want single sl...
98    Great lunch burgers sandwiches terrific good s...
99    negative stars customer service girlfriend exp...
Length: 100, dtype: object

In [24]:
# Build and display a small Series from two hand-written example strings
pd.Series(["The food here sucks", "dont waste money"])

0    The food here sucks
1       dont waste money
dtype: object