# Appendix: Import needed modules 

In [1]:
import re, collections
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
from spacy.en import English ##Note you'll need to install Spacy and download its dependencies
parser = English()
import string

In [2]:
# A custom stoplist
STOPLIST = set(stopwords.words('english') + ["n't", "'s", "'m", "ca"] + list(ENGLISH_STOP_WORDS))
# List of symbols we don't care about
SYMBOLS = " ".join(string.punctuation).split(" ") + ["-----", "---", "...", "“", "”", "'ve"]

In [3]:
# A custom function to clean the text before sending it into the vectorizer
def cleanText(text):
    # get rid of newlines
    text = text.strip().replace("\n", " ").replace("\r", " ")
    
    # replace twitter @mentions
    mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
    text = mentionFinder.sub("@MENTION", text)
    text = re.sub('[^a-zA-Z ]','',text)
    # replace HTML symbols
    text = text.replace("&amp;", "and").replace("&gt;", ">").replace("&lt;", "<")
    
    # lowercase
    text = text.lower()

    return text

# A custom function to tokenize the text using spaCy
# and convert to lemmas
def tokenizeText(sample):

    # get the tokens using spaCy
    tokens = parser(str(TextBlob(sample).correct()))

    # lemmatize
    lemmas = []
    for tok in tokens:
        lemmas.append(tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_)
    tokens = lemmas

    # stoplist the tokens
    tokens = [tok for tok in tokens if tok not in STOPLIST]

    # stoplist symbols
    tokens = [tok for tok in tokens if tok not in SYMBOLS]

    # remove large strings of whitespace
    while "" in tokens:
        tokens.remove("")
    while " " in tokens:
        tokens.remove(" ")
    while "\n" in tokens:
        tokens.remove("\n")
    while "\n\n" in tokens:
        tokens.remove("\n\n")

    return tokens

In [102]:
import numpy as np
import nltk
from nltk.stem import WordNetLemmatizer
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from nltk.corpus import stopwords

stops = set(stopwords.words('english'))

import pandas as pd
df = pd.read_csv('C:/Users/kennd/Downloads/testtest.csv',encoding = "ISO-8859-1")

# Download data and munge munge

In [104]:
df2 = df[['sentence','category']]

In [105]:
df2.sentence = df2.sentence.apply(cleanText)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


# Feature Generation

In [106]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = CountVectorizer(ngram_range=(1,3),min_df=3, tokenizer=tokenizeText)
tfvectorizer = TfidfVectorizer(ngram_range=(1,3),min_df = 3,tokenizer=tokenizeText,sublinear_tf=True)

In [None]:
## Gets the count of each word in each sentence
countfeature = vectorizer.fit_transform(df2.sentence)
tffeature = tfvectorizer.fit_transform(df2.sentence)

In [None]:
from sklearn.decomposition import TruncatedSVD
lsa_count = TruncatedSVD(n_components=50,n_iter=100).fit_transform(countfeature)
lsa_tf = TruncatedSVD(n_components=50,n_iter=100).fit_transform(tffeature)

In [None]:
## Transform into word counts into pandas data frame
featuredf_count = pd.DataFrame(countfeature.A, columns=vectorizer.get_feature_names())
featuredf_tf = pd.DataFrame(tffeature.A, columns=tfvectorizer.get_feature_names())
lsa_count_df = pd.DataFrame(lsa_count)
lsa_tf_df = pd.DataFrame(lsa_tf)

In [None]:
df3_count = pd.concat((df2,featuredf_count),axis=1)
df3_tf = pd.concat((df2,featuredf_tf),axis=1)
df3_lsa_count = pd.concat((df2,lsa_count_df),axis=1)
df3_lsa_tf = pd.concat((df2,lsa_tf_df),axis=1)

In [None]:
print(df3_count.info())
print(df3_tf.info())
print(df3_lsa_count.info())
print(df3_lsa_tf.info())

In [None]:
import pickle
df3_count.to_pickle("df3_count")
df3_tf.to_pickle("df3_tf")
df3_lsa_count.to_pickle("df3_lsa_count")
df3_lsa_tf.to_pickle("df3_lsa_tf")

# Building the model and in-training-set accuracy

In [20]:
from sklearn.cross_validation import train_test_split 

In [43]:
##Split into train and test at 75/25
train, test = train_test_split(df3_count,test_size = 0.25)

In [44]:
##Split X & Y
X_train = train.iloc[:,2:]
Y_train = train.iloc[:,1]
X_test = test.iloc[:,2:]
Y_test = test.iloc[:,1]

In [45]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

##KNN
from sklearn.neighbors import KNeighborsClassifier
for neigh in range(1,10):
    knn = KNeighborsClassifier(n_neighbors=neigh).fit(X_train,Y_train)
    knn_Y_pred = knn.predict(X_test)
    print(neigh,"KNN Accuracy: ",np.mean(knn_Y_pred == np.array(Y_test)))
    print(classification_report(Y_test,knn_Y_pred))

## NaiveBayes
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB().fit(X_train,Y_train)
nb_Y_pred = nb.predict(X_test)
print("NB Accuracy: ",np.mean(nb_Y_pred == np.array(Y_test)))
print(classification_report(Y_test,nb_Y_pred))

## Linear SVC
from sklearn.svm import SVC
svcl = SVC(kernel='linear').fit(X_train,Y_train)
svcl_Y_pred = svcl.predict(X_test)
print("SVC Linear Accuracy: ",np.mean(svcl_Y_pred == np.array(Y_test)))
print(classification_report(Y_test,svcl_Y_pred))

## RBF SVC
from sklearn.svm import SVC
svcrbf = SVC(kernel='rbf', gamma=3).fit(X_train,Y_train)
svcrbf_Y_pred = svcrbf.predict(X_test)
print("SVC RBF Accuracy: ",np.mean(svcrbf_Y_pred == np.array(Y_test)))
print(classification_report(Y_test,svcrbf_Y_pred))

## Logistic
from sklearn.linear_model import LogisticRegression
log = LogisticRegression(multi_class='multinomial',solver='newton-cg').fit(X_train,Y_train)
log_Y_pred = log.predict(X_test)
print("Logistc Accuracy: ",np.mean(log_Y_pred == np.array(Y_test)))
print(classification_report(Y_test,log_Y_pred))

## Decision Tree
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier().fit(X_train,Y_train)
dt_Y_pred = dt.predict(X_test)
print("Decision Tree Accuracy: ",np.mean(dt_Y_pred == np.array(Y_test)))
print(classification_report(Y_test,dt_Y_pred))

## Random Forests
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier().fit(X_train,Y_train)
rf_Y_pred = rf.predict(X_test)
print("Random Forests Accuracy: ",np.mean(rf_Y_pred == np.array(Y_test)))
print(classification_report(Y_test,rf_Y_pred))

1 KNN Accuracy:  0.463157894737
             precision    recall  f1-score   support

   ambiance       0.17      0.18      0.17        11
       food       0.70      0.55      0.62        47
     others       0.42      0.38      0.40        26
    service       0.27      0.55      0.36        11

avg / total       0.51      0.46      0.48        95

2 KNN Accuracy:  0.4
             precision    recall  f1-score   support

   ambiance       0.12      0.27      0.17        11
       food       0.61      0.49      0.54        47
     others       0.33      0.38      0.36        26
    service       1.00      0.18      0.31        11

avg / total       0.52      0.40      0.42        95

3 KNN Accuracy:  0.526315789474
             precision    recall  f1-score   support

   ambiance       0.19      0.36      0.25        11
       food       0.65      0.79      0.71        47
     others       0.47      0.27      0.34        26
    service       1.00      0.18      0.31        11

avg / 

  'precision', 'predicted', average, warn_for)


# Neural Network

In [None]:
# food = 1
# service = 2
# ambiance = 3
# others = 4

def integerize(x):
    if x == 'food':
        return 1
    elif x == 'service':
        return 2
    elif x == 'ambiance':
        return 3
    else:
        return 4

In [52]:
df4 = df3_tf.copy()

In [53]:
df4.category = df3_count.category.apply(integerize)

In [64]:
train_nn, test_nn = train_test_split(df4,test_size = 0.25)
X_train_nn = train_nn.iloc[:,2:].reset_index(drop=True)
Y_train_nn = train_nn.iloc[:,1].reset_index(drop=True)
X_test_nn = test_nn.iloc[:,2:].reset_index(drop=True)
Y_test_nn = test_nn.iloc[:,1].reset_index(drop=True)

In [96]:
from sknn.mlp import Classifier, Layer

nn = Classifier(
    layers=[
        Layer("Tanh", units=200),
        Layer("Softmax")],
    learning_rate=0.001,
    n_iter=50,
    loss_type='mcc')
nn.fit(df4.iloc[:,2:], df4.iloc[:,1])

Classifier(batch_size=1, callback=None, debug=False, dropout_rate=None,
      f_stable=0.001,
      hidden0=<sknn.nn.Layer `Tanh`: units=200, frozen=False, name='hidden0'>,
      layers=[<sknn.nn.Layer `Tanh`: units=200, frozen=False, name='hidden0'>, <sknn.nn.Layer `Softmax`: units=4, frozen=False, name='output'>],
      learning_momentum=0.9, learning_rate=0.001, learning_rule='sgd',
      loss_type='mcc', n_iter=50, n_stable=10, normalize=None,
      output=<sknn.nn.Layer `Softmax`: units=4, frozen=False, name='output'>,
      parameters=None, random_state=None, regularize=None, valid_set=None,

In [97]:
y_pred = nn.predict(df4.iloc[:,2:])

[(378, 4)]


In [99]:
print("NN: ",np.mean(y_pred == np.array(df4.iloc[:,1])))
print(classification_report(df4.iloc[:,1],y_pred))

NN:  0.551671285798


  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          1       0.61      1.00      0.76       217
          2       1.00      0.17      0.30        46
          3       0.00      0.00      0.00        39
          4       0.85      0.14      0.25        76

avg / total       0.64      0.62      0.52       378



# Vectorizing New Text

In [42]:
new_text = df2.sentence

In [57]:
vocab_list = tfvectorizer.get_feature_names()

In [58]:
vocab_dict = dict.fromkeys(vocab_list)

In [59]:
tfvectorizer2 = TfidfVectorizer(ngram_range=(1,3),min_df = 3,tokenizer=tokenizeText,sublinear_tf=True,vocabulary=vocab_dict)

In [60]:
len(vocab_list)

208

In [68]:
check = tfvectorizer.transform(new_text[0:100])

In [69]:
check

<100x208 sparse matrix of type '<class 'numpy.float64'>'
	with 286 stored elements in Compressed Sparse Row format>