# spaCy Example

In [2]:
import gensim.downloader as api
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import spacy
import torch
import torch.nn as nn
import torch.optim as optim
from gensim.models import KeyedVectors
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm

from src.data_processing.process_labels import *
from src.data_processing.process_reviews import *

## Data processing

### Import data

In [3]:
# Load the raw reviews CSV into a DataFrame. The path is relative, so the
# notebook must be run from the project root. The `text`, `food` and `service`
# columns are the ones used in the following cell.
df = pd.read_csv('data/raw_reviews/reviews_v1.csv')

In [4]:
# Pull the review text and the two label columns out of the frame.
# Bracket indexing is used so the lookup is unambiguously a column access.
reviews = df["text"]
food_labels = df["food"]
service_labels = df["service"]

### Load the spaCy English pipeline

In [16]:
# Load spaCy's small English pipeline. `spacy` itself is imported together
# with the other dependencies in the imports cell at the top of the notebook,
# so that a fresh "Restart & Run All" surfaces a missing package immediately.
nlp = spacy.load('en_core_web_sm')

In [17]:
# Use the first review as a worked example. `.iloc[0]` selects by position, so
# this keeps working even if the Series index is not the default 0..n-1 labels
# (for example after filtering, shuffling or re-indexing the frame), whereas
# `reviews[0]` would look up the *label* 0.
sentence = reviews.iloc[0]
sentence

'Food was very good, service quick and pleasant.  The place was airy and clean.  I would highly recommend this place. Great atmosphere'

In [20]:
# Run the loaded pipeline on the example review and check what kind of object
# comes back (a spaCy Doc, as the output below shows).
doc = nlp(sentence)
type(doc)

spacy.tokens.doc.Doc

In [19]:
# Displaying the Doc renders the original review text.
doc

Food was very good, service quick and pleasant.  The place was airy and clean.  I would highly recommend this place. Great atmosphere

In [28]:
# Inspect a single token: print it, then the Python type of the token object.
index = 8
selected = doc[index]
print(selected, f"\nType: {type(selected)}")

pleasant 
Type: <class 'spacy.tokens.token.Token'>


In [29]:
# Walk through the Doc and print only the tokens that are not flagged as
# stop words; stop words are skipped.
for token in doc:
    if token.is_stop:
        continue
    print(token)

Food
good
,
service
quick
pleasant
.
 
place
airy
clean
.
 
highly
recommend
place
.
Great
atmosphere


In [33]:
# Same stop-word filtering as above, but collected into a list of tokens
# instead of being printed one by one.
trimmed_sentence = list(filter(lambda tok: not tok.is_stop, doc))
trimmed_sentence

[Food,
 good,
 ,,
 service,
 quick,
 pleasant,
 .,
  ,
 place,
 airy,
 clean,
 .,
  ,
 highly,
 recommend,
 place,
 .,
 Great,
 atmosphere]

In [41]:
# Keep every token that is not punctuation (stop words are left in here).
trimmed_sentence2 = list(filter(lambda tok: not tok.is_punct, doc))
trimmed_sentence2

[Food,
 was,
 very,
 good,
 service,
 quick,
 and,
 pleasant,
  ,
 The,
 place,
 was,
 airy,
 and,
 clean,
  ,
 I,
 would,
 highly,
 recommend,
 this,
 place,
 Great,
 atmosphere]

In [42]:
# Drop a token if it is punctuation or a stop word; keep everything else.
# NOTE: whitespace tokens (the double spaces after the full stops in this
# review) are neither punctuation nor stop words, so they remain in the
# result. Also filter on `token.is_space` if they are unwanted.
clean_sentence = [tok for tok in doc if not (tok.is_punct or tok.is_stop)]
clean_sentence

[Food,
 good,
 service,
 quick,
 pleasant,
  ,
 place,
 airy,
 clean,
  ,
 highly,
 recommend,
 place,
 Great,
 atmosphere]