# Exploring data

In [1]:
import pandas as pd

In [2]:
# Location of the TripAdvisor reviews CSV (the path looks like a Colab-mounted
# Google Drive). Kept in one named constant so the notebook can be pointed at a
# different copy of the file without touching the loading code.
DATA_PATH = '/content/drive/MyDrive/Colab Notebooks/Python for NLP (Udemy)/Projects/1. Hotel-Reviews/tripadvisor_hotel_reviews.csv'

data = pd.read_csv(DATA_PATH)

In [5]:
data.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


In [6]:
data.shape

(20491, 2)

In [9]:
data.isnull().sum()

Review    0
Rating    0
dtype: int64

In [11]:
# Collect reviews that carry no visible text. `not review.strip()` is true for
# whitespace-only strings and also for the empty string, which `str.isspace()`
# alone would miss ("".isspace() is False).
blanks = [review for review in data['Review'] if not review.strip()]

In [12]:
blanks

[]

In [14]:
# Count reviews per star rating. sort_values() orders the result by the counts
# (ascending), not by the rating value itself.
data['Rating'].value_counts().sort_values()

1    1421
2    1793
3    2184
4    6039
5    9054
Name: Rating, dtype: int64

In [15]:
def rating(rating):
  """Map a numeric star rating to a binary sentiment label.

  Returns "Positive" when 3 < rating <= 5, "Negative" when 0 < rating <= 3,
  and None for any value outside the (0, 5] range.
  """
  # Anything outside (0, 5] has no label.
  if not 0 < rating <= 5:
    return None
  return "Positive" if rating > 3 else "Negative"

In [16]:
# Label every row by passing its star rating through rating().
data['p/n'] = data['Rating'].map(rating)

In [17]:
data.head()

Unnamed: 0,Review,Rating,p/n
0,nice hotel expensive parking got good deal sta...,4,Positive
1,ok nothing special charge diamond member hilto...,2,Negative
2,nice rooms not 4* experience hotel monaco seat...,3,Negative
3,"unique, great stay, wonderful time hotel monac...",5,Positive
4,"great stay great stay, went seahawk game aweso...",5,Positive


In [18]:
data['p/n'].value_counts()

Positive    15093
Negative     5398
Name: p/n, dtype: int64

# Sentiment Analysis

In [19]:
import nltk
# Download the 'vader_lexicon' NLTK resource; presumably this is the data file
# the VADER SentimentIntensityAnalyzer created later relies on.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [20]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sentanalyze = SentimentIntensityAnalyzer()

In [21]:
# Score every review with the analyzer; each cell of 'scores' holds the dict
# returned by polarity_scores (keys 'neg', 'neu', 'pos', 'compound').
data['scores'] = data['Review'].apply(sentanalyze.polarity_scores)

In [22]:
data.head()

Unnamed: 0,Review,Rating,p/n,scores
0,nice hotel expensive parking got good deal sta...,4,Positive,"{'neg': 0.072, 'neu': 0.643, 'pos': 0.285, 'co..."
1,ok nothing special charge diamond member hilto...,2,Negative,"{'neg': 0.11, 'neu': 0.701, 'pos': 0.189, 'com..."
2,nice rooms not 4* experience hotel monaco seat...,3,Negative,"{'neg': 0.081, 'neu': 0.7, 'pos': 0.219, 'comp..."
3,"unique, great stay, wonderful time hotel monac...",5,Positive,"{'neg': 0.06, 'neu': 0.555, 'pos': 0.385, 'com..."
4,"great stay great stay, went seahawk game aweso...",5,Positive,"{'neg': 0.135, 'neu': 0.643, 'pos': 0.221, 'co..."


In [23]:
# Extract the 'compound' entry of each scores dict into its own column.
data['compound'] = data['scores'].map(lambda vader_scores: vader_scores['compound'])

In [24]:
data.head()

Unnamed: 0,Review,Rating,p/n,scores,compound
0,nice hotel expensive parking got good deal sta...,4,Positive,"{'neg': 0.072, 'neu': 0.643, 'pos': 0.285, 'co...",0.9747
1,ok nothing special charge diamond member hilto...,2,Negative,"{'neg': 0.11, 'neu': 0.701, 'pos': 0.189, 'com...",0.9787
2,nice rooms not 4* experience hotel monaco seat...,3,Negative,"{'neg': 0.081, 'neu': 0.7, 'pos': 0.219, 'comp...",0.9889
3,"unique, great stay, wonderful time hotel monac...",5,Positive,"{'neg': 0.06, 'neu': 0.555, 'pos': 0.385, 'com...",0.9912
4,"great stay great stay, went seahawk game aweso...",5,Positive,"{'neg': 0.135, 'neu': 0.643, 'pos': 0.221, 'co...",0.9797


In [25]:
# Binarise the compound score: zero and above becomes "Positive", anything
# below zero becomes "Negative".
# NOTE(review): a score of exactly 0 is labelled Positive here; VADER usage
# guides commonly treat a small band around 0 as neutral — confirm the cut-off.
data['compound_pn'] = (data['compound'] >= 0).map({True: "Positive", False: "Negative"})

In [26]:
data.head()

Unnamed: 0,Review,Rating,p/n,scores,compound,compound_pn
0,nice hotel expensive parking got good deal sta...,4,Positive,"{'neg': 0.072, 'neu': 0.643, 'pos': 0.285, 'co...",0.9747,Positive
1,ok nothing special charge diamond member hilto...,2,Negative,"{'neg': 0.11, 'neu': 0.701, 'pos': 0.189, 'com...",0.9787,Positive
2,nice rooms not 4* experience hotel monaco seat...,3,Negative,"{'neg': 0.081, 'neu': 0.7, 'pos': 0.219, 'comp...",0.9889,Positive
3,"unique, great stay, wonderful time hotel monac...",5,Positive,"{'neg': 0.06, 'neu': 0.555, 'pos': 0.385, 'com...",0.9912,Positive
4,"great stay great stay, went seahawk game aweso...",5,Positive,"{'neg': 0.135, 'neu': 0.643, 'pos': 0.221, 'co...",0.9797,Positive


In [27]:
# Free-text sample (unrelated to the hotel data) for trying out the analyzer.
eg = (
  'This laptop is good in performance. '
  'I personally recommend this one to other people as well.'
)

In [28]:
sentanalyze.polarity_scores(eg)

{'neg': 0.0, 'neu': 0.615, 'pos': 0.385, 'compound': 0.7579}

# Classification of reviews

In [29]:
from sklearn.model_selection import train_test_split

In [30]:
x = data['Review']
y = data['p/n']

In [31]:
# Hold out 30% of the reviews for evaluation. A fixed random_state makes the
# shuffle, and therefore every metric computed from this split, repeatable
# across kernel restarts.
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.3,random_state=42)

In [33]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

In [34]:
# Two-stage text classifier: TF-IDF vectorisation followed by a linear SVM,
# bundled so raw review strings can be passed straight to fit/predict.
textclf = Pipeline(steps=[
  ('tfidf', TfidfVectorizer()),
  ('clf', LinearSVC()),
])

In [35]:
textclf.fit(x_train,y_train)

In [36]:
predictions = textclf.predict(x_test)

In [38]:
from sklearn.metrics import accuracy_score
# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy is
# symmetric, so the number is unchanged, but following the convention avoids a
# silent error if this call is later swapped for an asymmetric metric such as
# precision or recall.
print(accuracy_score(y_test,predictions))

0.8970396877033181


In [42]:
# One-element list: predict() is given a list of documents (here a single review).
newreview = ['Hotel was good, we liked it!']

In [43]:
textclf.predict(newreview)

array(['Positive'], dtype=object)