In [15]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt


In [16]:
import nltk
# word_tokenize needs the Punkt sentence models. Recent NLTK releases load them
# from the 'punkt_tab' package instead of the pickled 'punkt' one, so fetch both
# to keep the notebook runnable on old and new NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.corpus import stopwords
# English stop-word list used in the "Removing Stop Words" step.
nltk.download('stopwords')
from nltk.tokenize import word_tokenize


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Importing dataset

In [17]:
# Load the raw TripAdvisor hotel reviews.
# NOTE(review): the absolute path presumably requires Google Drive to be mounted
# at /content/drive (Colab) - confirm before running elsewhere.
dataset = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/Dataset/tripadvisor_hotel_reviews.csv'
)

In [18]:
dataset.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


We’ll detect three basic sentiments in the review text: positive, neutral and negative.

## Converting Rating column

In [19]:
def convert(x):
  """Map a star rating to a sentiment class.

  Returns 0 for ratings 1-2, 1 for rating 3 and 2 for ratings 4-5.
  Any other value falls through and yields None.
  """
  if x in (1, 2):
    return 0
  if x == 3:
    return 1
  if x in (4, 5):
    return 2
  return None

In [20]:
convert(5)

2

In [21]:
# Preview only: the mapped Series is displayed but not assigned back, so
# dataset.Rating still holds the original 1-5 star ratings after this cell.
dataset.Rating.apply(convert)

0        2
1        0
2        1
3        2
4        2
        ..
20486    2
20487    2
20488    0
20489    0
20490    0
Name: Rating, Length: 20491, dtype: int64

# Check for missing values:

In [22]:
dataset.isna().sum()

Review    0
Rating    0
dtype: int64

# Text Cleaning

### 1) Lowercase all review text:


In [23]:
# Normalise case so the later letter-only regex ([^a-z]) keeps every word.
dataset["Review"] = dataset["Review"].str.lower()

In [24]:
dataset

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5
...,...,...
20486,"best kept secret 3rd time staying charm, not 5...",5
20487,great location price view hotel great quick pl...,4
20488,"ok just looks nice modern outside, desk staff ...",2
20489,hotel theft ruined vacation hotel opened sept ...,1


### 2) Remove links:

In [25]:
# Strip URLs before punctuation is removed.
# - The pattern needs the ':' after the scheme and a '\S+' quantifier so the
#   whole link is replaced; without them no real URL is ever matched.
# - The result is assigned back instead of calling replace(..., inplace=True) on
#   dataset.Review: that chained in-place call is not guaranteed to update the
#   DataFrame (it does not under pandas Copy-on-Write).
dataset["Review"] = dataset["Review"].replace(r"https?://\S+", " ", regex=True)

### 3) Remove non-alphabetic characters:

In [26]:
# Replace every character that is not a lowercase ASCII letter with a space.
# Assign the result back rather than using inplace=True on dataset.Review: the
# chained in-place call is not guaranteed to update the DataFrame (it does not
# under pandas Copy-on-Write).
dataset["Review"] = dataset["Review"].replace(r'[^a-z]', ' ', regex=True)

### 4) Remove extra spaces:

In [27]:
# Collapse runs of spaces into a single space.
# Assign the result back rather than using inplace=True on dataset.Review: the
# chained in-place call is not guaranteed to update the DataFrame (it does not
# under pandas Copy-on-Write).
dataset["Review"] = dataset["Review"].replace(r' +', ' ', regex=True)

In [28]:
dataset

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not experience hotel monaco seattle...,3
3,unique great stay wonderful time hotel monaco ...,5
4,great stay great stay went seahawk game awesom...,5
...,...,...
20486,best kept secret rd time staying charm not sta...,5
20487,great location price view hotel great quick pl...,4
20488,ok just looks nice modern outside desk staff n...,2
20489,hotel theft ruined vacation hotel opened sept ...,1


# Saving dataset

In [29]:
# Persist the current state of the frame. With pandas' default settings the row
# index is written as the first, unnamed column of the CSV.
dataset.to_csv(
    '/content/drive/MyDrive/Colab Notebooks/Dataset/Cleaned_data.csv'
)

# Tokenization


In [30]:
def tokenize(x):
  """Split the string ``x`` into word tokens using ``nltk.word_tokenize``."""
  tokens = nltk.word_tokenize(x)
  return tokens

In [31]:
# Each review string becomes a list of tokens.
dataset["Review"] = dataset["Review"].apply(tokenize)

In [32]:
dataset

Unnamed: 0,Review,Rating
0,"[nice, hotel, expensive, parking, got, good, d...",4
1,"[ok, nothing, special, charge, diamond, member...",2
2,"[nice, rooms, not, experience, hotel, monaco, ...",3
3,"[unique, great, stay, wonderful, time, hotel, ...",5
4,"[great, stay, great, stay, went, seahawk, game...",5
...,...,...
20486,"[best, kept, secret, rd, time, staying, charm,...",5
20487,"[great, location, price, view, hotel, great, q...",4
20488,"[ok, just, looks, nice, modern, outside, desk,...",2
20489,"[hotel, theft, ruined, vacation, hotel, opened...",1


## Removing Stop Words

In [33]:
# Keep the stop words in a set: remove_stop_word tests every token of every
# review for membership, and a set makes each lookup constant-time instead of a
# linear scan of the list. Membership results are unchanged.
# NOTE(review): the English list includes negations such as 'not', which are
# therefore stripped from the reviews - confirm that is acceptable for
# sentiment classification.
stop_words = set(nltk.corpus.stopwords.words('english'))

In [34]:
def remove_stop_word(x):
  """Return the tokens of ``x`` that are not in ``stop_words``, order preserved."""
  kept = []
  for token in x:
    if token in stop_words:
      continue
    kept.append(token)
  return kept

In [35]:
# Filter the stop words out of every tokenised review.
dataset["Review"] = dataset["Review"].apply(remove_stop_word)

In [36]:
dataset

Unnamed: 0,Review,Rating
0,"[nice, hotel, expensive, parking, got, good, d...",4
1,"[ok, nothing, special, charge, diamond, member...",2
2,"[nice, rooms, experience, hotel, monaco, seatt...",3
3,"[unique, great, stay, wonderful, time, hotel, ...",5
4,"[great, stay, great, stay, went, seahawk, game...",5
...,...,...
20486,"[best, kept, secret, rd, time, staying, charm,...",5
20487,"[great, location, price, view, hotel, great, q...",4
20488,"[ok, looks, nice, modern, outside, desk, staff...",2
20489,"[hotel, theft, ruined, vacation, hotel, opened...",1


# Stemming

In [37]:
from nltk.stem.porter import PorterStemmer

In [38]:
# Single shared Porter stemmer instance, used by stemming().
stemmer = PorterStemmer()

In [39]:
def stemming(x):
  """Stem each token in ``x`` with ``stemmer`` and join the stems with single spaces."""
  return ' '.join(map(stemmer.stem, x))

In [40]:
# Token lists are turned back into one space-separated string of stems per review.
dataset["Review"] = dataset["Review"].apply(stemming)

In [41]:
dataset

Unnamed: 0,Review,Rating
0,nice hotel expens park got good deal stay hote...,4
1,ok noth special charg diamond member hilton de...,2
2,nice room experi hotel monaco seattl good hote...,3
3,uniqu great stay wonder time hotel monaco loca...,5
4,great stay great stay went seahawk game awesom...,5
...,...,...
20486,best kept secret rd time stay charm star ca n ...,5
20487,great locat price view hotel great quick place...,4
20488,ok look nice modern outsid desk staff n partic...,2
20489,hotel theft ruin vacat hotel open sept guest w...,1


In [42]:
# Persist the current state of the frame. With pandas' default settings the row
# index is written as the first, unnamed column of the CSV.
dataset.to_csv(
    '/content/drive/MyDrive/Colab Notebooks/Dataset/Stemmed_data.csv'
)