# NLP Notebook

#### *Author: Kunyu He*
#### *University of Chicago, CAPP'20*

In [109]:
import pandas as pd
import re
import nltk

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

### Load Data

In [110]:
# Tab-separated file; quoting=3 is csv.QUOTE_NONE, so quote characters in the
# review text are kept as ordinary characters.
comments = pd.read_csv(
    "Restaurant_Reviews.tsv",
    sep="\t",
    quoting=3,
)
comments.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


### Data Cleaning

In [111]:
# Count missing entries per column.
comments.isna().sum()

Review    0
Liked     0
dtype: int64

No values are missing.

### Feature Engineering

In [112]:
# Built once when this cell runs instead of once per word inside clean_text.
_STOPWORDS = frozenset(stopwords.words('english'))
_STEMMER = PorterStemmer()


def clean_text(row):
    """Normalise one review and record it in the global ``corpus`` list.

    The text is lower-cased, every character other than a-z is replaced by a
    space, English stop words are dropped, and each remaining word is reduced
    to its Porter stem.

    Parameters
    ----------
    row : str
        Raw review text.

    Returns
    -------
    str
        The cleaned review: the surviving stems joined by single spaces.

    Notes
    -----
    As a side effect the cleaned review is appended to the module-level list
    ``corpus``, which must exist before this function is called.
    """
    letters_only = re.sub('[^a-z]', ' ', row.lower())
    stems = [_STEMMER.stem(word) for word in letters_only.split()
             if word not in _STOPWORDS]
    cleaned = " ".join(stems)
    # Kept so that callers which read their results from `corpus` still work.
    corpus.append(cleaned)
    return cleaned

In [113]:
# Start from an empty list so re-running this cell does not duplicate entries;
# clean_text appends one cleaned review to `corpus` per call.
corpus = list()
null = comments["Review"].map(clean_text)

In [114]:
# Bag-of-words counts, with the vocabulary capped at 1500 terms.
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(raw_documents=corpus).toarray()  # dense array for the models below
X.shape

(1000, 1500)

In [115]:
# Target labels come from the Liked column.
y = comments["Liked"].values
y.shape

(1000,)

### Training and Test Set Split

In [116]:
# Hold out 20% of the reviews for testing; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

### Model Training

In [117]:
# fit() returns the estimator itself, so construction and training can be chained.
nbc = GaussianNB().fit(X_train, y_train)
nbc

GaussianNB(priors=None, var_smoothing=1e-09)

In [118]:
# fit() returns the estimator itself, so construction and training can be chained.
dtc = DecisionTreeClassifier(
    criterion="gini",
    random_state=123,
).fit(X_train, y_train)
dtc

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=123,
            splitter='best')

In [119]:
# 1000 trees with out-of-bag scoring enabled; fit() returns the estimator itself.
rfc = RandomForestClassifier(
    n_estimators=1000,
    criterion="gini",
    oob_score=True,
    random_state=123,
).fit(X_train, y_train)
rfc

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=None,
            oob_score=True, random_state=123, verbose=0, warm_start=False)

### Model Evaluation

#### Naive Bayes

In [101]:
# Rows are the true labels, columns the predicted labels.
confusion_matrix(y_true=y_test, y_pred=nbc.predict(X_test))

array([[46, 49],
       [16, 89]], dtype=int64)

Naive Bayes produces many false positives: 49 of the 95 negative comments are classified as positive.

#### Decision Tree

In [107]:
# Rows are the true labels, columns the predicted labels.
confusion_matrix(y_true=y_test, y_pred=dtc.predict(X_test))

array([[67, 28],
       [34, 71]], dtype=int64)

Decision Tree produces fewer false positives than Naive Bayes (28 vs. 49), but more false negatives (34 vs. 16): it classifies more genuinely positive comments as negative.

#### Random Forest

In [120]:
# Rows are the true labels, columns the predicted labels.
confusion_matrix(y_true=y_test, y_pred=rfc.predict(X_test))

array([[76, 19],
       [32, 73]], dtype=int64)

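Like Decision Tree, Random Forest keeps false positives low; in fact it has the fewest of the three models (19, vs. 28 for Decision Tree and 49 for Naive Bayes). It is also slightly better than Decision Tree at avoiding false negatives (32 vs. 34).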