In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

#importing libraries
from sklearn.feature_extraction.text import TfidfVectorizer
import re
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

In [2]:
file = pd.read_csv("train.txt", sep="	", header=None, names = ["Label","Review"])
df = file[['Review','Label']].copy()
df.head()

Unnamed: 0,Review,Label
0,The sheraton was a wonderful hotel! When me an...,TRUTHFULPOSITIVE
1,We stayed at the Omni between Christmas and Ne...,TRUTHFULPOSITIVE
2,I was REALLY looking forward to a nice relaxin...,DECEPTIVENEGATIVE
3,"First let me say, I try not to be too critical...",TRUTHFULNEGATIVE
4,The Ambassador East Hotel is a terrible place ...,DECEPTIVENEGATIVE


In [3]:
review_list = df["Review"].values.astype('U').tolist()
varietal_list = df["Label"].tolist()

print(review_list)
print(varietal_list)

['TRUTHFULPOSITIVE', 'TRUTHFULPOSITIVE', 'DECEPTIVENEGATIVE', 'TRUTHFULNEGATIVE', 'DECEPTIVENEGATIVE', 'DECEPTIVENEGATIVE', 'TRUTHFULNEGATIVE', 'DECEPTIVEPOSITIVE', 'DECEPTIVENEGATIVE', 'TRUTHFULPOSITIVE', 'DECEPTIVENEGATIVE', 'DECEPTIVEPOSITIVE', 'TRUTHFULPOSITIVE', 'TRUTHFULPOSITIVE', 'DECEPTIVEPOSITIVE', 'TRUTHFULNEGATIVE', 'DECEPTIVENEGATIVE', 'TRUTHFULNEGATIVE', 'TRUTHFULNEGATIVE', 'DECEPTIVENEGATIVE', 'DECEPTIVEPOSITIVE', 'TRUTHFULPOSITIVE', 'DECEPTIVEPOSITIVE', 'TRUTHFULPOSITIVE', 'DECEPTIVEPOSITIVE', 'TRUTHFULPOSITIVE', 'DECEPTIVENEGATIVE', 'DECEPTIVEPOSITIVE', 'TRUTHFULNEGATIVE', 'TRUTHFULNEGATIVE', 'TRUTHFULNEGATIVE', 'TRUTHFULPOSITIVE', 'TRUTHFULPOSITIVE', 'TRUTHFULPOSITIVE', 'TRUTHFULNEGATIVE', 'TRUTHFULNEGATIVE', 'TRUTHFULNEGATIVE', 'TRUTHFULNEGATIVE', 'TRUTHFULNEGATIVE', 'DECEPTIVEPOSITIVE', 'TRUTHFULNEGATIVE', 'TRUTHFULPOSITIVE', 'DECEPTIVEPOSITIVE', 'DECEPTIVEPOSITIVE', 'TRUTHFULNEGATIVE', 'DECEPTIVENEGATIVE', 'TRUTHFULPOSITIVE', 'TRUTHFULPOSITIVE', 'TRUTHFULPOSITIVE', 

In [4]:
lemmatizer = WordNetLemmatizer()

In [5]:
#text pre processing
#nltk.download("wordnet")
reviews = []
for i in range(0, len(review_list)):
    review = re.sub('[^a-zA-Z]', ' ', review_list[i])
    review = review.lower()
    review = review.split()
    review = [lemmatizer.lemmatize(word) for word in review]
    review = ' '.join(review)
    reviews.append(review)

print(reviews)



In [6]:
#tf idf
tf_idf = TfidfVectorizer(use_idf=True)
#applying tf idf to training data
X_train_tf = tf_idf.fit_transform(reviews)

TS = 0.1
X_train, X_test, y_train, y_test = train_test_split(X_train_tf, varietal_list, test_size=TS, shuffle = True)

In [7]:
#Support Vector
clf = SVC(kernel ="linear").fit(X_train, y_train)

In [8]:
y_score = clf.predict(X_test)

n_right = 0
for i in range(len(y_score)):
    if y_score[i] == y_test[i]:
        n_right += 1

print("Accuracy: %.2f%%" % ((n_right/float(len(y_test)) * 100)))

Accuracy: 84.29%


In [9]:
df_aux = pd.DataFrame({"Prediction":y_score, "Real": y_test, "Review":review_list[round((1-TS)*len(review_list)):]})
df_filtered = df_aux[df_aux['Prediction'] != df_aux["Real"]]

print("wrong rows:",len(df_filtered))
df_filtered.head()

wrong rows: 66


Unnamed: 0,Prediction,Real,Review
8,TRUTHFULPOSITIVE,DECEPTIVEPOSITIVE,My husband and I stayed at the Conrad Chicago ...
11,TRUTHFULNEGATIVE,DECEPTIVENEGATIVE,Although the Architecture of the hotel is quai...
22,DECEPTIVEPOSITIVE,TRUTHFULPOSITIVE,"I have been 4 times in Chicago, and I have sta..."
38,DECEPTIVENEGATIVE,TRUTHFULNEGATIVE,"Beautiful hotel, but doesn't have a lot to off..."
68,TRUTHFULNEGATIVE,DECEPTIVENEGATIVE,I was very disappointed with this hotel. I hav...


In [10]:
df_filtered.to_csv(path_or_buf="wrong_predictions.csv", index = False, sep = "\t")