<h1 style="text-align:center;">Natural Language Processing</h1>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Read only the first two CSV columns (the review text and its label).
data = pd.read_csv(
    "Restaurant_Reviews.csv",
    usecols=range(2),
)
data.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


### Removing non-letter characters, lowercasing, and splitting the sentence into words:

In [2]:
import re

In [3]:
# Replace every non-letter character with a space, lowercase the text,
# and split it on whitespace into a list of words.
comment = re.sub('[^a-zA-Z]', ' ', data['Review'][0]).lower().split()
comment

['wow', 'loved', 'this', 'place']

### Downloading and importing stopwords:

In [4]:
import nltk

# Fetch the stopword corpus into the local nltk_data directory.
# download() returns a success flag, not the corpus, so the result is not
# bound to `stopwords`; that name is reserved for the nltk.corpus import.
# The trailing semicolon keeps the flag out of the cell output.
nltk.download('stopwords');

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mrtke\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
from nltk.corpus import stopwords

### Getting the stems of words and removing stopwords:

In [6]:
from nltk.stem.porter import PorterStemmer
# Single stemmer instance, reused by the preprocessing cells that follow.
ps = PorterStemmer()

In [7]:
# Build the stopword lookup once rather than once per word, then keep the
# stem of every word that is not a stopword.
english_stopwords = set(stopwords.words('english'))
comment = [ps.stem(word) for word in comment if word not in english_stopwords]

*This creates a list containing the stems of the words that are not stopwords.*

In [8]:
# Show the stems that remain after stopword filtering.
comment

['wow', 'love', 'place']

### Joining the list of words back into a single sentence to be analyzed:

In [9]:
# Rejoin the stems with single spaces so the review is one string again.
comment = ' '.join(comment)
comment

'wow love place'

*This is the format we want, and we can now apply this process to the entire dataset.*

### Preprocessing the dataframe:

In [10]:
# Preview the reviews before the whole column is preprocessed.
data.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [11]:
# Preprocess every review: keep letters only, lowercase, split into words,
# drop English stopwords, stem what is left, and rejoin into one string.

# Build the stopword set once; rebuilding it for every word of every review
# repeats the same work for no benefit.
english_stopwords = set(stopwords.words('english'))

# NOTE(review): the stopword list includes negations, so "Crust is not good."
# becomes "crust good". Confirm that losing negation is acceptable here.
comments = []
# Iterate the column directly so the loop does not depend on the frame
# having a default 0..n-1 index.
for review in data['Review']:
    tokens = re.sub('[^a-zA-Z]', ' ', review).lower().split()
    stems = [ps.stem(word) for word in tokens if word not in english_stopwords]
    comments.append(' '.join(stems))

# Overwrite the raw text with the cleaned text used for feature extraction.
data['Review'] = comments

In [12]:
# Preview the reviews after cleaning, stopword removal and stemming.
data.head()

Unnamed: 0,Review,Liked
0,wow love place,1
1,crust good,0
2,tasti textur nasti,0
3,stop late may bank holiday rick steve recommen...,1
4,select menu great price,1


### Feature Extraction

#### Bag of Words

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer; max_features caps the vocabulary at 1000 terms.
cv = CountVectorizer(max_features=1000)

In [14]:
# Dense matrix of term counts, one row per review.
# NOTE(review): the vocabulary is fitted on all reviews before the
# train/test split, so test-set words influence the feature space.
X = cv.fit_transform(data['Review']).toarray()

# Target labels, taken from the second column of the frame.
Y = data.iloc[:, 1].values

### Machine Learning

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
# Hold out 33% of the reviews for evaluation; the fixed seed makes the
# split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.33,
    random_state=0,
)

In [17]:
from sklearn.naive_bayes import GaussianNB
# Gaussian Naive Bayes classifier, constructed with its default settings.
gnb = GaussianNB()

In [19]:
# Fit the classifier on the training split, then predict labels for the
# held-out reviews (fit() returns the fitted estimator itself).
y_pred = gnb.fit(x_train, y_train).predict(x_test)

In [20]:
from sklearn.metrics import confusion_matrix

# Rows are the true labels and columns are the predicted labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[ 87,  72],
       [ 26, 145]], dtype=int64)