## Types of NLP:
### 1. IF/ELSE RULES(CHATBOT)
### 2. Audio Frequency components analysis (speech recognition)
### 3. Bag of words model (classification)
### 4. CNN for text recognition (classification)
### 5. Seq2Seq (many applications)

# Importing the libraries

In [115]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing dataset

In [116]:
# The file is tab-separated, so the default ',' delimiter is overridden with
# '\t'. quoting=3 (the value of csv.QUOTE_NONE) makes the parser treat double
# quotes as ordinary characters, which avoids processing errors on reviews
# that contain them.
dataset = pd.read_csv(
    'Restaurant_Reviews.tsv',
    delimiter='\t',
    quoting=3,
)

# Cleaning the text

In [117]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
# Stemming keeps only the root of each word, so different inflections of the
# same word are counted as one feature.
from nltk.stem.porter import PorterStemmer

# Loop-invariant setup: the stemmer and the stop-word collection are the same
# for every review, so they are built once instead of once per row (and the
# set once per word).
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
# 'not' is kept out of the stop words; presumably because negation changes
# the sentiment of a review.
all_stopwords.remove('not')
stopword_set = set(all_stopwords)

corpus = []  # cleaned reviews, one string per row of the dataset
# Iterate over every review rather than a hard-coded range(0, 1000), so the
# corpus always has exactly as many entries as the dataset has rows.
for raw_review in dataset['Review']:
    # Replace every character that is not a letter (punctuation, digits, ...)
    # with a space.
    review = re.sub('[^a-zA-Z]', ' ', raw_review)
    review = review.lower()
    # Split the review into its individual words.
    review = review.split()
    # Drop stop words (they give no hint of sentiment) and stem the rest.
    review = [ps.stem(word) for word in review if word not in stopword_set]
    # Re-assemble the stemmed words into one space-separated string.
    review = ' '.join(review)
    corpus.append(review)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\XiaoluZhu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Creating bag of words model
## Tokenization

In [127]:
from sklearn.feature_extraction.text import CountVectorizer

# Keep only the 1500 most frequent words as features.
cv = CountVectorizer(max_features=1500)
# Fit the vectorizer on the cleaned corpus; every kept word becomes a column.
word_counts = cv.fit_transform(corpus)
# The feature matrix has to be a 2-D array.
X = word_counts.toarray()
# The labels are the last column of the dataset.
y = dataset.iloc[:, -1].values

# Splitting the dataset into training and testing set

In [130]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state is fixed so the split is
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Training the Naive Bayes model on the training set

In [132]:
from sklearn.naive_bayes import GaussianNB 
# Create a Gaussian Naive Bayes classifier with its default settings.
classifier = GaussianNB()
# Fit the classifier on the training split. The call is left as the cell's
# final expression, so the notebook displays the fitted estimator.
classifier.fit(X_train, y_train)

GaussianNB()

# Predicting the test set results

In [139]:
# Predict a label for every row of the test split.
y_pred = classifier.predict(X_test)
# Show predictions and true labels side by side: the first column is the
# predicted label, the second column is the actual label.
side_by_side = np.column_stack((y_pred, y_test))
print(side_by_side)

[[1 0]
 [1 0]
 [1 0]
 [0 0]
 [0 0]
 [1 0]
 [1 1]
 [1 0]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 1]
 [1 1]
 [1 0]
 [1 0]
 [0 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 0]
 [0 0]
 [1 0]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 0]
 [1 0]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [1 0]
 [1 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 0]
 [1 1]
 [0 1]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 0]
 [1 1]
 [1 0]
 [1 1]
 [1 1]
 [1 0]
 [0 1]
 [1 1]
 [1 1]
 [1 0]
 [0 1]
 [1 0]
 [1 1]
 [1 1]
 [0 0]
 [0 1]
 [0 1]
 [1 1]
 [0 0]
 [1 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [1 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [0 1]
 [1 1]
 [1 1]

# Evaluating your performance

In [138]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Confusion matrix of the true test labels against the predicted labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
# Left as the cell's final expression so the notebook displays the accuracy.
accuracy_score(y_true=y_test, y_pred=y_pred)

[[55 42]
 [12 91]]


0.73

# Predicting whether a single review is positive or negative 

In [145]:
# Classify one hand-written review with the already-fitted vectorizer (cv)
# and classifier. The text goes through the same cleaning steps as the
# training corpus: strip non-letters, lowercase, split, drop stop words
# (except 'not'), stem, re-join.
new_review = 'I dislike this restaurant so much'
new_review = re.sub('[^a-zA-Z]', ' ', new_review)
new_review = new_review.lower()
new_review = new_review.split()
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')
# Build the lookup set once instead of once per word inside the comprehension.
stopword_set = set(all_stopwords)
new_review = [ps.stem(word) for word in new_review if word not in stopword_set]
new_review = ' '.join(new_review)
new_corpus = [new_review]
# transform (not fit_transform): reuse the vocabulary cv has already been
# fitted with, so the columns match the ones the classifier was trained on.
new_X_test = cv.transform(new_corpus).toarray()
new_y_pred = classifier.predict(new_X_test)
print(new_y_pred)

[1]
