In [1]:
import pandas as pd
import re

In [2]:
df = pd.read_csv("tripadvisor_hotel_reviews.csv")

In [3]:
df.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


In [4]:
df.shape

(20491, 2)

In [5]:
df.dtypes

Review    object
Rating     int64
dtype: object

In [6]:
df.isnull().any()

Review    False
Rating    False
dtype: bool

In [7]:
blanks = []
for index in df["Review"]:
    if index.isspace() == True:
        blanks.append(index)
        

In [8]:
blanks

[]

In [9]:
df["Rating"].value_counts().sort_values()

1    1421
2    1793
3    2184
4    6039
5    9054
Name: Rating, dtype: int64

In [10]:
def rating(rating):
    if rating > 3 and rating <= 5:
        return "Positive"
    if rating >0 and rating <=3:
        return "Negative"

In [11]:
df["Rating_posorneg"] = df["Rating"].apply(rating)

In [12]:
df.head()

Unnamed: 0,Review,Rating,Rating_posorneg
0,nice hotel expensive parking got good deal sta...,4,Positive
1,ok nothing special charge diamond member hilto...,2,Negative
2,nice rooms not 4* experience hotel monaco seat...,3,Negative
3,"unique, great stay, wonderful time hotel monac...",5,Positive
4,"great stay great stay, went seahawk game aweso...",5,Positive


In [13]:
df["Rating_posorneg"].value_counts()

Positive    15093
Negative     5398
Name: Rating_posorneg, dtype: int64

# Sentiment Analysis

In [14]:
import nltk
nltk.download("vader_lexicon")

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\mitul\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [15]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sentiment = SentimentIntensityAnalyzer()

In [16]:
df["scores"] = df["Review"].apply(lambda review: sentiment.polarity_scores(review))

In [17]:
df.head()

Unnamed: 0,Review,Rating,Rating_posorneg,scores
0,nice hotel expensive parking got good deal sta...,4,Positive,"{'neg': 0.072, 'neu': 0.643, 'pos': 0.285, 'co..."
1,ok nothing special charge diamond member hilto...,2,Negative,"{'neg': 0.11, 'neu': 0.701, 'pos': 0.189, 'com..."
2,nice rooms not 4* experience hotel monaco seat...,3,Negative,"{'neg': 0.081, 'neu': 0.7, 'pos': 0.219, 'comp..."
3,"unique, great stay, wonderful time hotel monac...",5,Positive,"{'neg': 0.06, 'neu': 0.555, 'pos': 0.385, 'com..."
4,"great stay great stay, went seahawk game aweso...",5,Positive,"{'neg': 0.135, 'neu': 0.643, 'pos': 0.221, 'co..."


In [18]:
df["compound"] = df["scores"].apply(lambda f:f["compound"])

In [20]:
df["Positive"] = df["scores"].apply(lambda f:f["pos"])

In [21]:
df["Negative"] = df["scores"].apply(lambda f:f["neg"])

In [22]:
df.head()

Unnamed: 0,Review,Rating,Rating_posorneg,scores,compound,Positive,Negative
0,nice hotel expensive parking got good deal sta...,4,Positive,"{'neg': 0.072, 'neu': 0.643, 'pos': 0.285, 'co...",0.9747,0.285,0.072
1,ok nothing special charge diamond member hilto...,2,Negative,"{'neg': 0.11, 'neu': 0.701, 'pos': 0.189, 'com...",0.9787,0.189,0.11
2,nice rooms not 4* experience hotel monaco seat...,3,Negative,"{'neg': 0.081, 'neu': 0.7, 'pos': 0.219, 'comp...",0.9889,0.219,0.081
3,"unique, great stay, wonderful time hotel monac...",5,Positive,"{'neg': 0.06, 'neu': 0.555, 'pos': 0.385, 'com...",0.9912,0.385,0.06
4,"great stay great stay, went seahawk game aweso...",5,Positive,"{'neg': 0.135, 'neu': 0.643, 'pos': 0.221, 'co...",0.9797,0.221,0.135


In [23]:
df.drop("scores",  axis = 1, inplace = True)

In [24]:
df["compound_posorneg"] = df["compound"].apply(lambda compound: "Positive" if compound >= 0 else "Negative")

In [25]:
df.head()

Unnamed: 0,Review,Rating,Rating_posorneg,compound,Positive,Negative,compound_posorneg
0,nice hotel expensive parking got good deal sta...,4,Positive,0.9747,0.285,0.072,Positive
1,ok nothing special charge diamond member hilto...,2,Negative,0.9787,0.189,0.11,Positive
2,nice rooms not 4* experience hotel monaco seat...,3,Negative,0.9889,0.219,0.081,Positive
3,"unique, great stay, wonderful time hotel monac...",5,Positive,0.9912,0.385,0.06,Positive
4,"great stay great stay, went seahawk game aweso...",5,Positive,0.9797,0.221,0.135,Positive


In [26]:
example = "Hotel Was excellent and i have enjoyed it"
sentiment.polarity_scores(example)

{'neg': 0.0, 'neu': 0.417, 'pos': 0.583, 'compound': 0.7906}

# Classification on Reviews

In [27]:
from sklearn.model_selection  import train_test_split

In [28]:
X = df["Review"]
y = df["Rating_posorneg"]

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state = 10) 

In [30]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC 

In [31]:
textclassifier = Pipeline([("tfidf", TfidfVectorizer()), ("clf", LinearSVC(C = 10,))])

In [32]:
textclassifier.fit(X_train, y_train)

In [33]:
preds = textclassifier.predict(X_test)

In [34]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test,preds))

0.8770331815224464


In [39]:
newreview = [("Hotel is ok ok, i will give the rating 1")]
textclassifier.predict(newreview)

array(['Negative'], dtype=object)