In [1]:
import pandas as pd
import re

In [3]:
# Load the hotel-review CSV into a DataFrame. The path is relative, so the file
# must sit in the notebook's working directory.
data = pd.read_csv('tripadvisor_hotel_reviews.csv')
data  # last expression: displayed as the cell's output

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5
...,...,...
20486,"best kept secret 3rd time staying charm, not 5...",5
20487,great location price view hotel great quick pl...,4
20488,"ok just looks nice modern outside, desk staff ...",2
20489,hotel theft ruined vacation hotel opened sept ...,1


In [4]:
# Number of reviews (one per row); the bare name below shows it as cell output.
length = data['Review'].size
length

20491

### Detecting 3 basic sentiments:

In [5]:
def sentiment(rating):
    """Collapse a 1-5 rating into one of three sentiment classes.

    Returns:
        0 when ``rating`` is 1 or 2,
        1 when ``rating`` is 3,
        2 when ``rating`` is 4 or 5,
        None for any other value (no branch matches).
    """
    if rating in (1, 2):
        return 0
    if rating == 3:
        return 1
    if rating in (4, 5):
        return 2
    # Anything outside 1-5 falls through, exactly like an if/elif chain with no else.
    return None
    
# Overwrite the Rating column with the class returned by sentiment().
# NOTE: not idempotent. Running this cell a second time feeds the recoded
# values back through sentiment() (1 -> 0, 2 -> 0, 0 -> None), so reload the
# CSV before re-running it.
data.Rating = data['Rating'].apply(sentiment) 
data

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,2
1,ok nothing special charge diamond member hilto...,0
2,nice rooms not 4* experience hotel monaco seat...,1
3,"unique, great stay, wonderful time hotel monac...",2
4,"great stay great stay, went seahawk game aweso...",2
...,...,...
20486,"best kept secret 3rd time staying charm, not 5...",2
20487,great location price view hotel great quick pl...,2
20488,"ok just looks nice modern outside, desk staff ...",0
20489,hotel theft ruined vacation hotel opened sept ...,0


### Dealing with missing values:

In [6]:
data.info()

# Each column's non-null count equals the total number of entries, so the
# DataFrame contains no null or NaN values.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20491 entries, 0 to 20490
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  20491 non-null  object
 1   Rating  20491 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 320.3+ KB


### Clean the text to achieve the following:

In [7]:
# Sample review (row label 204) as loaded, before any cleaning step is applied:

data.loc[204, 'Review']

'posh hotel husband stayed hilton seattle september 2003 night really liked, stayed queen room bed, bed linens wall paper furniture bathroom really nice, nice view, centrally located, reserved room hotwire.com room 100,  '

#### 1) Lowercase all the text:

In [8]:
# Compiled once up front instead of on every loop iteration.
uppercase = re.compile(r'([A-Z])')

# Report the row label and the match object for every capital letter found.
# Iterating the Series directly avoids relying on a separately stored row count
# and on the index being the default 0..n-1 range.
for i, review in data['Review'].items():
    for match in uppercase.finditer(review):
        print(i)
        print(match)

# Actually perform the lowercasing this step promises, so the step also holds
# for input that does contain capitals.
data['Review'] = data['Review'].str.lower()

# Nothing is printed for this dataset: it contains no capital letters, so the
# lowercasing above leaves it unchanged.

#### 2) Remove all the hyperlinks if present in the text.

In [9]:
# Pattern: a run of letters/digits, a dot, a run of letters, then one
# whitespace character (e.g. 'hotwire.com '). Compiled once, not once per row.
# NOTE(review): trailing whitespace is required, so a link at the very end of a
# review is not matched, and any 'word.word ' sequence (such as a missing space
# after a full stop) is removed as well -- confirm this is acceptable.
hyperlink = re.compile(r'([a-zA-Z0-9]+\.[a-zA-Z]+\s)')

# Substitute across the whole column in one call instead of reading and
# writing each cell through .loc inside a Python loop.
data['Review'] = data['Review'].str.replace(hyperlink, '', regex=True)

#### 3) Substitute every non-alphabetical character (anything other than a-z) with ' ' (a single space).

In [10]:
# Matches any single character that is not an ASCII letter (digits,
# punctuation and whitespace included). Compiled once, not once per row.
alpSpace = re.compile(r'([^a-zA-Z])')

# Each such character becomes one space. Done column-wide in a single call
# instead of a per-row .loc read/write loop.
data['Review'] = data['Review'].str.replace(alpSpace, ' ', regex=True)

#### 4) Substitute multiple spaces with a single space.

In [11]:
# Matches every run of one or more whitespace characters. Compiled once, not
# once per row.
oneSpace = re.compile(r'(\s+)')

# Collapse each run to a single space, column-wide in one call. A run at the
# start or end of a review is reduced to one space, not removed.
data['Review'] = data['Review'].str.replace(oneSpace, ' ', regex=True)

In [12]:
# The sample review at row label 204 after the cleaning steps have run:

data.loc[204, 'Review']

'posh hotel husband stayed hilton seattle september night really liked stayed queen room bed bed linens wall paper furniture bathroom really nice nice view centrally located reserved room room '

In [13]:
import nltk
import string

In [14]:
# The standard library's string of ASCII punctuation characters.
punctuations = string.punctuation
# punctuations  (uncomment to inspect the characters)

In [15]:
# English stop-word list taken from NLTK's stopwords corpus.
# NOTE(review): presumably needs the 'stopwords' corpus to have been downloaded
# (nltk.download) on a fresh environment -- confirm.
stop_words = nltk.corpus.stopwords.words('english')
# stop_words  (uncomment to inspect the list)

In [16]:
# Single Porter stemmer instance, created once at module level.
ps = nltk.PorterStemmer()

In [17]:
# Matches either a digit run followed by one whitespace character, or one
# whitespace character followed by a digit run.
# NOTE: replacing matches with '' also drops that whitespace, so a token that
# starts with digits is glued to the previous word ('room 3rd' -> 'roomrd').
no_nums = re.compile(r'(\d+\s|\s\d+)')

In [18]:
def NLP_preprocessing(text):
    """Normalise one review: strip punctuation, drop stop words, stem, drop digits.

    Args:
        text: the review as a single string.

    Returns:
        A string of the surviving stemmed tokens joined by single spaces,
        with no leading or trailing whitespace.

    Relies on the module-level ``punctuations``, ``stop_words`` and ``ps``.
    """
    # Remove every punctuation character in one pass instead of a Python-level
    # character filter followed by a join.
    no_punctuation = text.translate(str.maketrans('', '', punctuations))

    tokens = nltk.word_tokenize(no_punctuation)

    # Set membership is O(1); scanning the stop-word list per token is not.
    stop_set = set(stop_words)
    kept = [token for token in tokens if token not in stop_set]

    stemmed = [ps.stem(token) for token in kept]

    # Strip digits per token rather than with a whitespace-consuming regex over
    # the joined string: that approach also removes the space in front of a
    # token that starts with digits and merges it into the previous word
    # ('room 3rd' -> 'roomrd'). Tokens left empty (pure numbers) are dropped.
    digit_free = (re.sub(r'\d+', '', token) for token in stemmed)
    return " ".join(token for token in digit_free if token)

In [19]:
# Replace every review with its preprocessed form (overwrites the column in
# place, so re-running this cell processes the already-stemmed text again).
data["Review"] = data.Review.apply(NLP_preprocessing)

In [20]:
# Display the frame after preprocessing.
data

Unnamed: 0,Review,Rating
0,nice hotel expens park got good deal stay hote...,2
1,ok noth special charg diamond member hilton de...,0
2,nice room experi hotel monaco seattl good hote...,1
3,uniqu great stay wonder time hotel monaco loca...,2
4,great stay great stay went seahawk game awesom...,2
...,...,...
20486,best kept secret rd time stay charm star ca n ...,2
20487,great locat price view hotel great quick place...,2
20488,ok look nice modern outsid desk staff n partic...,0
20489,hotel theft ruin vacat hotel open sept guest w...,0
