# Natural Language Processing

## Importing the libraries

In [0]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [0]:
dataset = pd.read_csv('Restaurant_Reviews.tsv' , delimiter ="\t" , quoting=3)

## Cleaning the texts

In [3]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stopwrds = stopwords.words('english')
 
to_be_removed =['not' ,'no' ,'nor' ,"wasn't" ,"wouldn't","weren't","doesn't" ,"didn't" ,"haven't", "isn't","mustn't","won't"]
for w in to_be_removed:
  stopwrds.remove(w)



from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
corpus =[]

for i in range(1007):
  review = dataset.iloc[i , 0]
  review = re.sub('[^a-zA-Z]' , " " ,review)
  review = review.lower()
  review = review.split()

  review =[ps.stem(x) for x in review if not x in set(stopwrds)]
  review =" ".join(review)
  corpus.append(review)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## Creating the Bag of Words model

In [0]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=1500)#len(x[0])
x = cv.fit_transform(corpus).toarray()
y = dataset.iloc[: , -1].values


## Splitting the dataset into the Training set and Test set

In [0]:
from sklearn.model_selection import train_test_split
x_train ,x_test , y_train , y_test = train_test_split(x,y,test_size =0.2)#random_state=0

## **1) Naive** **Bayes**

### Training the Naive Bayes model on the Training set

In [6]:
from sklearn.naive_bayes import GaussianNB
naive = GaussianNB()
naive.fit(x_train , y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

## predicting result

In [0]:
y_pred_naive = naive.predict(x_test)
#print(np.concatenate(( y_test.reshape(len(y_test),1) , y_pred_naive.reshape(len(y_pred_naive) ,1)  ),1))

## Confusion matrix and accuracy

In [8]:
from sklearn.metrics import accuracy_score ,confusion_matrix
print(accuracy_score(y_test,y_pred_naive))
print(confusion_matrix(y_test ,y_pred_naive))

0.7277227722772277
[[59 47]
 [ 8 88]]


## **2) Logistic** **regression**

##Training the Logistic regression model on the Training set

In [9]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(x_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

## predicting result

In [0]:
y_pred_lr = lr.predict(x_test)

## Confusion matrix and accuracy

In [11]:
from sklearn.metrics import accuracy_score ,confusion_matrix
print(accuracy_score(y_test,y_pred_lr))
print(confusion_matrix(y_test ,y_pred_lr))

0.7821782178217822
[[85 21]
 [23 73]]


##**3)Support Vector Machine**

##Training the Logistic regression model on the Training set

In [12]:
from sklearn.svm import SVC
svmClassifier = SVC(kernel = 'linear')
svmClassifier.fit(x_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

##Predicting result

In [0]:
y_pred_svm = svmClassifier.predict(x_test)

##Confusion Martrix and accuracy

In [14]:
from sklearn.metrics import accuracy_score ,confusion_matrix
print(accuracy_score(y_test,y_pred_svm))
print(confusion_matrix(y_test ,y_pred_svm))

0.7920792079207921
[[84 22]
 [20 76]]


## saving the model

In [15]:
'''from sklearn.externals import joblib
joblib.dump(svmClassifier ,'svm_model')'''

"from sklearn.externals import joblib\njoblib.dump(svmClassifier ,'svm_model')"

##**4)Random Forest**

##Training the Random forest model on the Training set

In [16]:
from sklearn.ensemble import RandomForestClassifier
randomforestClassifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
randomforestClassifier.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

##Predicting result

In [0]:
y_pred_randomF = randomforestClassifier.predict(x_test)

##Confusion Martrix and accuracy

In [18]:
from sklearn.metrics import accuracy_score ,confusion_matrix
print(accuracy_score(y_test,y_pred_randomF))
print(confusion_matrix(y_test ,y_pred_randomF))

0.7277227722772277
[[93 13]
 [42 54]]


##**5)Kernel SVM**

##Training the kernel SVM model on the Training set

In [19]:
from sklearn.svm import SVC
kernelsvmClassifier = SVC(kernel = 'rbf', random_state = 0)
kernelsvmClassifier.fit(x_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=0, shrinking=True, tol=0.001,
    verbose=False)

##Predicting result

In [0]:
y_pred_kernelSVM =kernelsvmClassifier.predict(x_test) 

##Confusion Martrix and accuracy

In [21]:
from sklearn.metrics import accuracy_score ,confusion_matrix
print(accuracy_score(y_test,y_pred_kernelSVM))
print(confusion_matrix(y_test ,y_pred_kernelSVM))

0.7673267326732673
[[89 17]
 [30 66]]


##**Making new prediction with SVM**

In [24]:
from nltk.stem.porter import PorterStemmer
ps2 = PorterStemmer()
text = input('Enter new review :')
new_review = re.sub('[^a-zA-Z]' ," " ,text)
new_review = new_review.lower()
new_review = new_review.split()
new_review = [ps2.stem(x) for x in new_review if not x in set(stopwrds)]
new_review = " ".join(new_review)
#print(new_review)
new_corpus =[new_review]

corpus2 =cv.transform(new_corpus).toarray()

#my = new_corpus[-1].reshape(1,-1)
if svmClassifier.predict(corpus2)==0:
  print('svm:negtive')
else:
  print('svm:positive')

if lr.predict(corpus2)==0:
  print('lr:negtive')
else:
  print('lr:positive')

if kernelsvmClassifier.predict(corpus2)==0:
  print('kernelsvmClassifier:negtive')
else:
  print('kernelsvmClassifier:positive')

if randomforestClassifier.predict(corpus2)==0:
  print('randomforestClassifier:negtive')
else:
  print('randomforestClassifier:positive')
if naive.predict(corpus2)==0:
  print('naive:negtive')
else:
  print('naive:positive')


Enter new review :Service is slow :-/
svm:negtive
lr:negtive
kernelsvmClassifier:negtive
randomforestClassifier:negtive
naive:negtive
