# Natural Language Processing

## Importing the dataset

In [1]:
# Load the reviews from the tab-delimited file. quote='' switches off quote
# handling, so quotation marks and apostrophes inside a review are kept as
# literal text instead of being treated as field delimiters.
dataset_original = read.delim(
  file='Restaurant_Reviews.tsv',
  quote='',
  stringsAsFactors=FALSE
)

# Preview the first rows of the raw data.
head(dataset_original, n=6L)

Review,Liked
Wow... Loved this place.,1
Crust is not good.,0
Not tasty and the texture was just nasty.,0
Stopped by during the late May bank holiday off Rick Steve recommendation and loved it.,1
The selection on the menu was great and so were the prices.,1
Now I am getting angry and I want my damn pho.,0


## Cleaning the texts

In [3]:
# install.packages('tm')
# install.packages('SnowballC')

In [4]:
library(tm)
library(SnowballC)

# Build a corpus holding one document per review text.
corpus = VCorpus(VectorSource(dataset_original[['Review']]))

# Normalise the text. Each step consumes the output of the previous one, so
# the order below is significant.
# 1. Lower-case everything (tolower is wrapped so documents stay tm documents).
corpus = tm_map(corpus, content_transformer(tolower))
# 2. Remove digits.
corpus = tm_map(corpus, removeNumbers)
# 3. Remove punctuation.
corpus = tm_map(corpus, removePunctuation)
# 4. Remove English stopwords.
# NOTE(review): punctuation is already gone at this point, so stopwords that
# contain an apostrophe presumably no longer match (a term such as "youd" shows
# up in the resulting matrix) -- confirm whether that is intended.
corpus = tm_map(corpus, removeWords, stopwords(kind='en'))
# 5. Reduce every word to its stem.
corpus = tm_map(corpus, stemDocument)
# 6. Collapse the runs of whitespace left behind by the removals above.
corpus = tm_map(corpus, stripWhitespace)

"package 'tm' was built under R version 3.6.3"Loading required package: NLP
"package 'SnowballC' was built under R version 3.6.3"

## Creating the Bag of Words model

In [6]:
# Count term occurrences per review, keeping only terms whose sparsity is
# below 0.999 so that the rarest terms are dropped from the matrix.
dtm = removeSparseTerms(DocumentTermMatrix(corpus), sparse=0.999)

# Convert to a dense data frame (one row per review, one column per term) and
# append the label column taken from the raw data.
dataset = as.data.frame(as.matrix(dtm))
dataset[['Liked']] = dataset_original[['Liked']]

In [7]:
# Preview the first rows of the bag-of-words data frame (term counts + Liked).
head(dataset, n=6L)

absolut,acknowledg,actual,ago,almost,also,although,alway,amaz,ambianc,...,wow,wrap,wrong,year,yet,youd,your,yummi,zero,Liked
0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Encoding the target feature as factor

In [8]:
# Recode the 0/1 label as a two-level factor; randomForest treats a factor
# response as a classification target.
dataset[['Liked']] = factor(dataset[['Liked']], levels=c(0, 1))

## Splitting the dataset into the Training set and Test set

In [9]:
library(caTools)
set.seed(123)  # fixed seed so the random split is reproducible

# Logical mask over the rows: TRUE marks a training row (SplitRatio=0.8),
# FALSE a test row. sample.split splits on the label vector it is given.
split = sample.split(dataset$Liked, SplitRatio=0.8)

# Index the rows with the mask directly instead of calling subset().
# subset() evaluates its condition inside the data frame first, and the
# columns here are vocabulary terms, so a term column that happened to be
# named "split" would silently shadow the mask.
training_set = dataset[split, , drop=FALSE]
test_set = dataset[!split, , drop=FALSE]

"package 'caTools' was built under R version 3.6.3"

## Fitting Random Forest Classification to the Training set

In [10]:
library(randomForest)

# Fit a 10-tree random forest on the term counts.
# The predictors are every column except the label, selected by name. A
# positional index such as -692 only excludes the label when the vocabulary
# has exactly 691 terms; with any other vocabulary size the index no longer
# points at Liked, and the label would stay among the predictors.
classifier = randomForest(x=training_set[names(training_set) != 'Liked'],
                          y=training_set[['Liked']],
                          ntree=10)

randomForest 4.6-14
Type rfNews() to see new features/changes/bug fixes.


## Predicting the Test set results

In [12]:
# Predict labels for the test reviews. The label column is excluded by name
# rather than by a hard-coded position, so this keeps working when the number
# of term columns changes.
y_pred = predict(classifier, newdata=test_set[names(test_set) != 'Liked'])

## Making the Confusion Matrix

In [13]:
# Cross-tabulate actual labels (rows) against predicted labels (columns).
# The actual labels are taken from the Liked column by name instead of a
# hard-coded column position, which depends on the vocabulary size.
cm = table(test_set[['Liked']], y_pred)
cm

   y_pred
     0  1
  0 78 22
  1 23 77