In [3]:
import numpy as np   
import pandas as pd  
  
# Import dataset 
dataset = pd.read_csv('Restaurant_Reviews.tsv', delimiter = '\t')  

In [8]:
dataset.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [18]:
# Text Cleaning/Preprocessing stage
# library to clean data 
import re  
  
# Natural Language Tool Kit 
import nltk  
  
nltk.download('stopwords') 
nltk.download('punkt')

# to remove stopword 
from nltk.corpus import stopwords 
  
# for Stemming propose  
from nltk.stem.porter import PorterStemmer 

from tqdm import tqdm

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Matth\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Matth\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


In [34]:
# Initialize empty array 
# to append clean text  
corpus = []  
for i in tqdm(range(0, len(dataset))):
    
    # column : "Review", ith row
    review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
    
    # convert all cases to lower case and split
    review = review.lower()
    review = review.split()

    # creating PorterStemmer object to 
    # take main stem of each word (for %ing, )
    ps = PorterStemmer()  
    
    # loop for stemming each word 
    # in string array at ith row if it's not a stop word    
    review = [ps.stem(word) for word in review 
                if not word in set(stopwords.words('english'))]    
    
    # rejoin all string array elements 
    # to create back into a string 
    review = ' '.join(review)   
    corpus.append(review)

100%|█████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:06<00:00, 152.57it/s]


In [35]:
from nltk.tokenize import sent_tokenize, word_tokenize

# These imports tokenizes the corpus either by word or sentence

sen_text = ""
word_text = ""
for sentence in tqdm(corpus):
    sen_text += sentence + ". "
    word_text += sentence + " "
corpus = sent_tokenize(sen_text)

100%|██████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 544431.98it/s]


In [36]:
# Creating the Bag of Words model 
from sklearn.feature_extraction.text import CountVectorizer 
  
# To extract max 1500 feature. 
# "max_features" is attribute to 
# experiment with to get better results 
cv = CountVectorizer(max_features = 1500)  
  
# X contains corpus (dependent variable) 
X = cv.fit_transform(corpus).toarray()  
  
# y contains answers if review 
# is positive or negative 
y = dataset.iloc[:, 1].values  


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [23]:
from nltk.tag import StanfordNERTagger

In [24]:
st = StanfordNERTagger(f'C:/Users/Matth/Research/stanford-ner-2014-06-16/classifiers/english.all.3class.distsim.crf.ser.gz',
                      "C:/Users/Matth/Research/stanford-ner-2014-06-16/stanford-ner-3.4.jar")

In [26]:
import os
java_path = "C:/Program Files/Java/jre1.8.0_221/bin/java.exe"
os.environ['JAVAHOME'] = java_path

st.tag('Rami Eid is studying at Stony Brook University in NY'.split())

[('Rami', 'PERSON'),
 ('Eid', 'PERSON'),
 ('is', 'O'),
 ('studying', 'O'),
 ('at', 'O'),
 ('Stony', 'ORGANIZATION'),
 ('Brook', 'ORGANIZATION'),
 ('University', 'ORGANIZATION'),
 ('in', 'O'),
 ('NY', 'O')]