<a href="https://colab.research.google.com/github/Homaoa/NLP-Using-spaCy-for-a-Hotel-Reviews-Data-Set/blob/main/NLP_with_spaCy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# First, I install spaCy and download en_core_web_sm small size

!pip install spacy
!python -m spacy download en_core_web_sm

2023-11-14 19:02:46.534434: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-14 19:02:46.534508: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-14 19:02:46.534542: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-11-14 19:02:46.543874: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Collecting en-core-web-sm==3.6.0
  Downloading htt

In [2]:
import pandas as pd

In [3]:
# I use the hotel reviews data set that I used in my other NLP project (NLP-for-a-Hotel-Reviews-Data-Set), using NLTK.

url = "https://raw.githubusercontent.com/Homaoa/NLP-for-a-Hotel-Reviews-Data-Set/main/tripadvisor_hotel_reviews.csv"
df = pd.read_csv(url)
df.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


In [4]:
# I do not need the rating column in analyzing the text without getting prediction now, so I remove it

df = df.drop('Rating' , axis = 1)
df.head()

Unnamed: 0,Review
0,nice hotel expensive parking got good deal sta...
1,ok nothing special charge diamond member hilto...
2,nice rooms not 4* experience hotel monaco seat...
3,"unique, great stay, wonderful time hotel monac..."
4,"great stay great stay, went seahawk game aweso..."


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20491 entries, 0 to 20490
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  20491 non-null  object
dtypes: object(1)
memory usage: 160.2+ KB


In [6]:
# The data frame is very big, it consists of 20491 reviews. I drop some rows to make the data frame smaller

df = df.drop(df.index[5001:20491])

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5001 entries, 0 to 5000
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  5001 non-null   object
dtypes: object(1)
memory usage: 39.2+ KB


In [8]:
df.tail()

Unnamed: 0,Review
4996,great location reasonable price husband stayed...
4997,"worth try wife stayed hotel maxim recently, de..."
4998,"perfect location, clean safe reasonably priced..."
4999,great location accomodating perfect hotel stay...
5000,"gem family run hotel, hotel casci wonderful ho..."


In [9]:
# There are 5000 reviews now to analyze

In [10]:
# Now I can import spaCy and define nlp

import spacy
nlp = spacy.load("en_core_web_sm")

In [11]:
# Then I use the reviews to build one mega string to analyze using cat method in pandas

all_text = df.Review.str.cat(sep = " ")

In [12]:
# I also increase the nlp.max_length

nlp.max_length = 3819790

In [13]:
# Next step is to build a spaCy document

doc = nlp(all_text , disable = ["ner"])

In [14]:
# It is possible to analyze the reviews now, for example word frequency analysis

from collections import Counter

words = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
word_freq = Counter(words)
print(word_freq)
word_freq.most_common(20)



[('hotel', 11945),
 ('room', 11662),
 ('stay', 7077),
 ('great', 5349),
 ('good', 5205),
 ('  ', 5000),
 ('staff', 3896),
 ('night', 3616),
 ('nice', 3506),
 ('day', 3492),
 ('time', 3329),
 ('beach', 3329),
 ('service', 3082),
 ('place', 2832),
 ('restaurant', 2812),
 ('food', 2625),
 ('like', 2539),
 ('resort', 2515),
 ('location', 2474),
 ('clean', 2454)]

In [15]:
# We can see the 20 most common words that have been used in the 5000 reviews data set
# I used lemma_ attribute to use the root of the words
# We can see what people talk about the most. Some of them was predictble, like hotel, stay, etc
# But they also for example talked about service, food, and how clean it is too a lot

In [16]:
# Next, pattern matching can be done on the data. For example checking the most repeated adjective and noun combinations

from spacy.matcher import Matcher

matcher = Matcher(nlp.vocab)
pattern = [{'POS':'ADJ'} , {'POS':'NOUN'}]
matcher.add('ADJ_PHRASE', [pattern])
matches = matcher(doc , as_spans = True)

phrases = []
for span in matches:
  phrases.append(span.text.lower())
  phrases_freq = Counter(phrases)

phrases_freq.most_common(20)


[('great location', 452),
 ('great hotel', 301),
 ('great time', 258),
 ('great place', 233),
 ('royal service', 170),
 ('nice hotel', 153),
 ('continental breakfast', 148),
 ('french quarter', 147),
 ('good location', 135),
 ('friendly staff', 131),
 ('good value', 127),
 ('bottled water', 121),
 ('great value', 102),
 ('short walk', 101),
 ('great service', 95),
 ('best hotel', 91),
 ('great view', 90),
 ('nice touch', 88),
 ('hot water', 88),
 ('good food', 86)]

In [17]:
# We can see the most common adjectives and nouns. For example "great location" is mentioned more than the rest combinations

In [18]:
# Another analysis that I can do is to look for words that most frequently come near the word "bad"

from spacy.matcher import PhraseMatcher

matcher = PhraseMatcher(nlp.vocab, attr = 'LOWER') # Everything must be lowered
pattern = [nlp.make_doc('bad')]
matcher.add('bad', pattern)
matches = matcher(doc)

bad_colloc = []
for match_id, start, end in matches:
    span = doc[start-10 : end+10] # It grabs the 10 tokens before and after
    bad_colloc.extend([token.lemma_.lower() for token in span if not token.is_stop and not token.is_punct])
Counter(bad_colloc).most_common(20)

[('bad', 882),
 ('hotel', 291),
 ('room', 215),
 ('stay', 175),
 ('  ', 173),
 ('good', 173),
 ('review', 147),
 ('food', 142),
 ('great', 109),
 ('place', 100),
 ('night', 98),
 ('people', 95),
 ('service', 94),
 ('time', 93),
 ('resort', 85),
 ('day', 84),
 ('nice', 84),
 ('restaurant', 80),
 ('beach', 72),
 ('staff', 64)]

In [19]:
# We can tell what poeple mention the most when they use word "bad"