In [56]:
#Importing libraries 
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score


In [57]:
# Load the tab-separated review file.
# quoting=3 makes the parser treat quote characters as ordinary text.
dataset = pd.read_csv('Restaurant_Reviews.tsv', sep='\t', quoting=3)

# Keep at most the first 1001 rows: column 0 goes to x, column 1 to y.
# NOTE(review): presumably column 0 is the review text and column 1 the label — confirm against the file.
first_rows = dataset.iloc[:1001]
x = first_rows.iloc[:, 0].values
y = first_rows.iloc[:, 1].values

# reviews: list reserved for individual reviews (not filled in this cell)
# corpus:  list that will hold every cleaned review
reviews, corpus = [], []


In [58]:
# Preprocessing
#
# Turns every raw entry of x into a cleaned, stemmed, stop-word-free string
# and collects the results in corpus (one string per review, same order as x).

# Build the stop-word set once instead of once per word: the original rebuilt
# it inside the comprehension, repeating the same work for every token.
# stopwords.words() raises LookupError when the NLTK corpus is not installed,
# so fetch it on demand to keep "Restart & Run All" working on a fresh machine.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))
# NOTE(review): the English stop-word list contains negations such as "not",
# which carry sentiment — consider keeping them; behaviour is unchanged here.

# One stemmer instance is enough; it does not need to be re-created per review.
ps = PorterStemmer()

# Start from an empty list so that re-running this cell does not append a
# second copy of every review to an already populated corpus.
corpus = []

for raw_review in x:
    # Replace every character that is not an ASCII letter with a space
    # (this removes digits, punctuation and other special characters).
    review = re.sub('[^a-zA-Z]', ' ', str(raw_review))
    # Lower-case the text and split it into words on whitespace.
    review = review.lower().split()
    # Drop stop words, then reduce each remaining word to its stem.
    review = [ps.stem(word) for word in review if word not in stop_words]
    # Join the processed words back into a single review string.
    review = ' '.join(review)
    corpus.append(review)




In [59]:
# Bag-of-words features: count the 1500 most frequent terms of the corpus and
# turn the result into a dense array.
# NOTE(review): the vocabulary is fitted on the whole corpus before the split
# below, so the test reviews influence feature selection — consider fitting on
# the training split only.
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(corpus).toarray()

# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=0
)

# Instantiate each classifier and fit it on the training split in one step
# (fit() returns the fitted estimator itself).
lg_classifier = LogisticRegression(random_state=0).fit(X_train, y_train)
knn_classifier = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2).fit(X_train, y_train)
svm_linear_classifier = SVC(kernel='linear', random_state=0).fit(X_train, y_train)
svm_nonlinear_classifier = SVC(kernel='rbf', random_state=0).fit(X_train, y_train)
nb_classifier = GaussianNB().fit(X_train, y_train)
dt_classifier = DecisionTreeClassifier(criterion='entropy', random_state=0).fit(X_train, y_train)
rf_classifier = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0).fit(X_train, y_train)

# Predict the held-out samples with every fitted classifier, in the same order.
(
    lg_y_pred,
    knn_y_pred,
    svm_linear_y_pred,
    svm_nonlinear_y_pred,
    nb_y_pred,
    dt_y_pred,
    rf_y_pred,
) = (
    clf.predict(X_test)
    for clf in (
        lg_classifier,
        knn_classifier,
        svm_linear_classifier,
        svm_nonlinear_classifier,
        nb_classifier,
        dt_classifier,
        rf_classifier,
    )
)

# Accuracy of each classifier on the test split.
(
    lg_accuracy,
    knn_accuracy,
    svm_linear_accuracy,
    svm_nonlinear_accuracy,
    nb_accuracy,
    dt_accuracy,
    rf_accuracy,
) = (
    accuracy_score(y_test, y_pred)
    for y_pred in (
        lg_y_pred,
        knn_y_pred,
        svm_linear_y_pred,
        svm_nonlinear_y_pred,
        nb_y_pred,
        dt_y_pred,
        rf_y_pred,
    )
)

# Show the bag-of-words matrix as the cell output.
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [60]:
# Summarise the accuracy of every classifier in one table.
classifiers = ['Logistic Regression', 'k-NN', 'SVM (Linear)', 'SVM (Non-Linear)', 'Naive Bayes', 'Decision Tree', 'Random Forest']

# Short row labels, keyed by the full classifier name.
abbreviations = {
    'Logistic Regression': 'LG',
    'k-NN': 'KNN',
    'SVM (Linear)': 'SVM-L',
    'SVM (Non-Linear)': 'SVM-NL',
    'Naive Bayes': 'NB',
    'Decision Tree': 'DT',
    'Random Forest': 'RF',
}

# Accuracies are listed in the same order as the names in classifiers.
data = {
    'Classifier': classifiers,
    'Accuracy': [
        lg_accuracy,
        knn_accuracy,
        svm_linear_accuracy,
        svm_nonlinear_accuracy,
        nb_accuracy,
        dt_accuracy,
        rf_accuracy,
    ],
}

# Index by full name, label the index 'Classifier', then swap in the short labels.
df = (
    pd.DataFrame(data, index=classifiers)
    .rename_axis('Classifier')
    .rename(index=abbreviations)
)
print(df)

                     Classifier  Accuracy
Classifier                               
LG          Logistic Regression     0.710
KNN                        k-NN     0.585
SVM-L              SVM (Linear)     0.720
SVM-NL         SVM (Non-Linear)     0.735
NB                  Naive Bayes     0.730
DT                Decision Tree     0.710
RF                Random Forest     0.720


In [61]:
# TF-IDF features: weight every term of the corpus and turn the result into a
# dense array.
# NOTE(review): the vectorizer is fitted on the whole corpus before the split
# below, so the test reviews influence the term weights — consider fitting on
# the training split only.
vectorizer = TfidfVectorizer()
X_tfid = vectorizer.fit_transform(corpus).toarray()

# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_tfid, y, test_size=0.20, random_state=0
)

# Instantiate each classifier and fit it on the training split in one step
# (fit() returns the fitted estimator itself).
lg_classifier = LogisticRegression(random_state=0).fit(X_train, y_train)
knn_classifier = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2).fit(X_train, y_train)
svm_linear_classifier = SVC(kernel='linear', random_state=0).fit(X_train, y_train)
svm_nonlinear_classifier = SVC(kernel='rbf', random_state=0).fit(X_train, y_train)
nb_classifier = GaussianNB().fit(X_train, y_train)
dt_classifier = DecisionTreeClassifier(criterion='entropy', random_state=0).fit(X_train, y_train)
rf_classifier = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0).fit(X_train, y_train)

# Predict the held-out samples with every fitted classifier, in the same order.
(
    lg_y_pred,
    knn_y_pred,
    svm_linear_y_pred,
    svm_nonlinear_y_pred,
    nb_y_pred,
    dt_y_pred,
    rf_y_pred,
) = (
    clf.predict(X_test)
    for clf in (
        lg_classifier,
        knn_classifier,
        svm_linear_classifier,
        svm_nonlinear_classifier,
        nb_classifier,
        dt_classifier,
        rf_classifier,
    )
)

# Accuracy of each classifier on the test split.
# The random forest score is stored as rf_accuracy1 (not rf_accuracy), so
# rf_accuracy is left untouched by this cell.
(
    lg_accuracy,
    knn_accuracy,
    svm_linear_accuracy,
    svm_nonlinear_accuracy,
    nb_accuracy,
    dt_accuracy,
    rf_accuracy1,
) = (
    accuracy_score(y_test, y_pred)
    for y_pred in (
        lg_y_pred,
        knn_y_pred,
        svm_linear_y_pred,
        svm_nonlinear_y_pred,
        nb_y_pred,
        dt_y_pred,
        rf_y_pred,
    )
)


In [62]:
# Summarise the accuracy of every classifier in one table.
classifiers = ['Logistic Regression', 'k-NN', 'SVM (Linear)', 'SVM (Non-Linear)', 'Naive Bayes', 'Decision Tree', 'Random Forest']

# Short row labels, keyed by the full classifier name.
abbreviations = {
    'Logistic Regression': 'LG',
    'k-NN': 'KNN',
    'SVM (Linear)': 'SVM-L',
    'SVM (Non-Linear)': 'SVM-NL',
    'Naive Bayes': 'NB',
    'Decision Tree': 'DT',
    'Random Forest': 'RF',
}

# Accuracies are listed in the same order as the names in classifiers;
# the random forest value is read from rf_accuracy1.
data = {
    'Classifier': classifiers,
    'Accuracy': [
        lg_accuracy,
        knn_accuracy,
        svm_linear_accuracy,
        svm_nonlinear_accuracy,
        nb_accuracy,
        dt_accuracy,
        rf_accuracy1,
    ],
}

# Index by full name, label the index 'Classifier', then swap in the short labels.
df = (
    pd.DataFrame(data, index=classifiers)
    .rename_axis('Classifier')
    .rename(index=abbreviations)
)
print(df)

                     Classifier  Accuracy
Classifier                               
LG          Logistic Regression     0.755
KNN                        k-NN     0.680
SVM-L              SVM (Linear)     0.730
SVM-NL         SVM (Non-Linear)     0.755
NB                  Naive Bayes     0.720
DT                Decision Tree     0.695
RF                Random Forest     0.730
