In [27]:
#IMPORTS
import numpy as np
import pandas as pd
import os
import re
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from IPython.display import display
from sklearn import naive_bayes, linear_model, tree, ensemble
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import recall_score, precision_score, accuracy_score, f1_score
from collections import OrderedDict

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\franc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [184]:
def load_reviews(path, fold_nrs, label):
    columns = ['Raw', 'Processed', 'Label']
    df = pd.DataFrame()
    for i in fold_nrs:
        fold = "fold"+str(i)
        p = path + fold
        for file in os.listdir(p): # for each .txt file in this fold's folder
            if file.endswith(".txt"):
                f = open(os.path.join(p, file), "r")
                review = f.read()
                # remove whitespaces, numbers, punctuation, & make lowercase
                processed = process_string(review)
                new_row = pd.DataFrame([[review, processed, label]])
                df = df.append(new_row)
    df.columns = columns
    return df

def make_train_test_set():
    # Focusing only on the negative reviews
    path_dec = "./op_spam_v1.4/negative_polarity/deceptive_from_MTurk/"
    path_true = "./op_spam_v1.4/negative_polarity/truthful_from_Web/"

    # Label = 1 if it is a truthful (negative) review, =0 if it is a deceptive (negative) review

    #loading training set:
    train_dec = load_reviews(path_dec, np.arange(4)+1, 0) # folds 1-4 form the training set
    train_true = load_reviews(path_true, np.arange(4)+1, 1)
    train = pd.concat([train_dec, train_true])
    train = train.reset_index(drop=True)

    #loading the test set:
    test_dec = load_reviews(path_dec, [5], 0)  # test set for deceptive reviews
    test_true = load_reviews(path_true, [5], 1)
    test = pd.concat([test_dec, test_true])
    test = test.reset_index(drop=True)

    return [train,test]

def process_string(s):
    s = s.strip()    # remove whitespaces
    s = s.lower() # to lowercase
    s = re.sub(r'\d+', '', s) # remove numbers
    s = s.translate(str.maketrans("","", string.punctuation)) # remove punctuation
    return s


##########################
###Process files to CSV###
##########################

train, test = make_train_test_set()
train.to_csv("./train.csv", header = ['Raw', 'Processed', 'Label'], index=False)
test.to_csv("./test.csv", header = ['Raw', 'Processed', 'Label'], index=False)


In [185]:
train = pd.read_csv("./train.csv")
test = pd.read_csv("test.csv")

print(f"\n Shape of training set: {train.shape}")
print(train.head())
print(f"\n Shape of test set: {test.shape}")
print(test.head())

train.head()


 Shape of training set: (640, 3)
                                                 Raw  \
0  We stayed at the Schicago Hilton for 4 days an...   
1  Hotel is located 1/2 mile from the train stati...   
2  I made my reservation at the Hilton Chicago be...   
3  When most people think Hilton, they think luxu...   
4  My husband and I recently stayed stayed at the...   

                                           Processed  Label  
0  we stayed at the schicago hilton for  days and...      0  
1  hotel is located  mile from the train station ...      0  
2  i made my reservation at the hilton chicago be...      0  
3  when most people think hilton they think luxur...      0  
4  my husband and i recently stayed stayed at the...      0  

 Shape of test set: (160, 3)
                                                 Raw  \
0  I recently stayed at the Hotel Allegro Chicago...   
1  I recently stayed at the Hotel Allegro in Chic...   
2  I recently visited Chicago. I stayed at the Ho...   
3  

Unnamed: 0,Raw,Processed,Label
0,We stayed at the Schicago Hilton for 4 days an...,we stayed at the schicago hilton for days and...,0
1,Hotel is located 1/2 mile from the train stati...,hotel is located mile from the train station ...,0
2,I made my reservation at the Hilton Chicago be...,i made my reservation at the hilton chicago be...,0
3,"When most people think Hilton, they think luxu...",when most people think hilton they think luxur...,0
4,My husband and I recently stayed stayed at the...,my husband and i recently stayed stayed at the...,0


In [186]:
def make_xy_train_test(train, test, bigram = False, min_df = False):
    stpw = stopwords.words('english')

    if bigram: # use training data to make vectorizer (vocabulary)
        vectorizer = TfidfVectorizer(stop_words = stpw, ngram_range=(1,2),min_df = min_df)
    else:
        vectorizer = TfidfVectorizer(stop_words = stpw,min_df = min_df)
    print(f'Words must appear at least in {min_df*100}% of the reviews to become feature')
    vec = vectorizer.fit_transform(train["Processed"])
    features = pd.DataFrame(vec.toarray(),columns=vectorizer.get_feature_names())
    
    test_vec = vectorizer.transform(test["Processed"])
    test_features = pd.DataFrame(test_vec.toarray(),columns=vectorizer.get_feature_names())

    train_merged = pd.merge(train, features, left_index = True, right_index = True).sample(frac=1) #merge data and shuffle
    test_merged = pd.merge(test, test_features, left_index = True, right_index = True).sample(frac=1) #merge data and shuffle
    
    print(f'The model contains {features.shape[1]} features')

    # print("features shape", features.shape)
    # print("test features shape", test_features.shape)
    # print("test shape", test.shape)
    # print("test merged", test_merged.shape)
    return [train_merged.iloc[:,3:], train_merged["Label"], test_merged.iloc[:,3:], test_merged["Label"]] #return [x_train, y_train,

def make_countvec_xy_train_test(train, test, bigram = False, binary = False,min_df = False):
    stpw = stopwords.words('english')

    if bigram: # use training data to make vectorizer (vocabulary)
        vectorizer = CountVectorizer(stop_words = stpw, binary=binary, ngram_range=(1,2), min_df = min_df)
    else:
        vectorizer = CountVectorizer(stop_words = stpw, binary = binary, min_df = min_df)
    print(f'Words must appear at least in {min_df*100}% of the reviews to become feature')

    vec = vectorizer.fit_transform(train["Processed"])
    features = pd.DataFrame(vec.toarray(),columns=vectorizer.get_feature_names())
    
    test_vec = vectorizer.transform(test["Processed"])
    test_features = pd.DataFrame(test_vec.toarray(),columns=vectorizer.get_feature_names())

    train_merged = pd.merge(train, features, left_index = True, right_index = True).sample(frac=1) #merge data and shuffle
    test_merged = pd.merge(test, test_features, left_index = True, right_index = True).sample(frac=1) #merge data and shuffle
        
    print(f'The model contains {features.shape[1]} features')
        
    # print("features shape", features.shape)
    # print("test features shape", test_features.shape)
    # print("test shape", test.shape)
    # print("test merged", test_merged.shape)
    return [train_merged.iloc[:,3:], train_merged["Label"], test_merged.iloc[:,3:], test_merged["Label"]] #return [x_train, y_train, x_test,y_test]

train_x, train_y, test_x, test_y = make_xy_train_test(train, test, min_df=0.01)
#train_x, train_y, test_x, test_y = make_countvec_xy_train_test(train, test)

#train_x, train_y = make_xy(train)
#test_x, test_y = make_xy(test)

bitrain_x, bitrain_y, bitest_x, bitest_y = make_xy_train_test(train, test, bigram=True,min_df=0.05)
#bitrain_x, bitrain_y, bitest_x, bitest_y = make_countvec_xy_train_test(train, test, bigram=True)

#bitrain_x, bitrain_y = make_xy(train, bigram = True)
#bitest_x, bitest_y = make_xy(test, bigram = True)


display(train_x.head())
display(train_y.head())
display(test_x.head())
display(test_y.head())

display(bitrain_x.head())
display(bitrain_y.head())
display(bitest_x.head())
display(bitest_y.head())

Words must appear at least in 1.0% of the reviews to become feature
The model contains 1255 features
Words must appear at least in 5.0% of the reviews to become feature
The model contains 337 features


Unnamed: 0,able,absolutely,ac,access,accommodate,accommodations,account,across,acted,actual,...,write,wrong,year,years,yelling,yes,yet,youd,young,youre
99,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
330,0.0,0.0,0.0,0.256244,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
526,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
404,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.159819,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


99     0
375    1
330    1
526    1
404    1
Name: Label, dtype: int64

Unnamed: 0,able,absolutely,ac,access,accommodate,accommodations,account,across,acted,actual,...,write,wrong,year,years,yelling,yes,yet,youd,young,youre
141,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
121,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
129,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.145127,0.0,0.0,0.0,0.0,0.0,0.0,0.0


141    1
39     0
121    1
28     0
129    1
Name: Label, dtype: int64

Unnamed: 0,able,actually,air,almost,already,also,although,amenities,another,anyone,...,without,wont,work,working,worst,worth,would,would recommend,wouldnt,wrong
51,0.0,0.0,0.0,0.0,0.0,0.161768,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
564,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.101549,0.0,0.0,0.0
573,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.160973,0.0,0.0,0.0
529,0.0,0.0,0.0,0.0,0.0,0.152142,0.0,0.150668,0.055138,0.0,...,0.0,0.0,0.059199,0.0,0.0,0.071217,0.174739,0.0,0.075849,0.0
141,0.0,0.0,0.0,0.0,0.0,0.14307,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.200912,0.0,0.0,0.0,0.0


51     0
564    1
573    1
529    1
141    0
Name: Label, dtype: int64

Unnamed: 0,able,actually,air,almost,already,also,although,amenities,another,anyone,...,without,wont,work,working,worst,worth,would,would recommend,wouldnt,wrong
156,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.237018,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
67,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.147387,...,0.158264,0.0,0.0,0.0,0.0,0.0,0.215713,0.0,0.0,0.0
118,0.0,0.0,0.0,0.0,0.0,0.0,0.208943,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
137,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.213416,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
158,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.130425,...,0.0,0.0,0.215567,0.0,0.0,0.0,0.0,0.0,0.0,0.0


156    1
67     0
118    1
137    1
158    1
Name: Label, dtype: int64

In [187]:
class Classifier(object):
    """ Generic classifier object. """
    def __init__(self):
        self.name = "Classifier"
        self.esimator = None
    
    def evaluate(self, X_test, y_test):
        y_pred = self.estimator.predict(X_test)
        recall = recall_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        return recall, precision, accuracy, f1

class NaiveBayes(Classifier):
    """ Hyper-parameter tuning for the multinominal naive bayes classifier. """
    def __init__(self):
        self.name = "Naive Bayes"
        self.estimator = naive_bayes.MultinomialNB()

    def train(self, X_train, y_train):
        # TODO: Some feature selection needs to go here
        self.estimator.fit(X_train, y_train)
        print(f"{self.name} trained with 150 features")

In [188]:
class LogRegClassifier(Classifier):
    """ Hyper-parameter tuning for the logistic regression classifier. """
    def __init__(self):
        self.name = "Logistic Regression"
        self.estimator = linear_model.LogisticRegressionCV(cv=4, max_iter=1000, Cs=[0.001, 0.01, 0.1, 1, 10, 100, 1000])

    def train(self, X_train, y_train):
        print("training...", end='\r')
        self.estimator.fit(X_train, y_train)
        print(f"{self.name} trained with lambda: {self.estimator.C_}")

In [189]:
class TreeClassifier(Classifier):
    """ Hyper-parameter tuning for the decision tree classifier. """
    def __init__(self):
        self.name = "Decision Tree"
        self.estimator = tree.DecisionTreeClassifier()
    
    def train(self, X_train, y_train):
        print("set aplhas...", end='\r')
        path = self.estimator.cost_complexity_pruning_path(X_train, y_train)
        ccp_alphas = path.ccp_alphas
        parameters = {'ccp_alpha': ccp_alphas}
        clf = GridSearchCV(self.estimator, parameters, cv=4)

        print("training...", end='\r')
        clf.fit(X_train, y_train)
        self.estimator = clf.best_estimator_
        print(f"{self.name} trained with aplha: {self.estimator.ccp_alpha}")

In [190]:
class RandForestClassifier(Classifier):
    """ Hyper-parameter tuning for the random forest classifier. """
    def __init__(self, min_trees=20, max_trees=160):
        self.name = "Random Forest"
        self.estimator = ensemble.RandomForestClassifier(oob_score=True)
        self.max_features_list = ["auto", "sqrt", "log2"]
        self.n_trees = range(min_trees, max_trees, 10)

    def train(self, X_train, y_train):
        error_rates = OrderedDict((label, []) for label in self.max_features_list)
        min_oob_error = [None, 0, 100]
        for label in self.max_features_list:
            for n in self.n_trees:
                print(f"tuning... {label}, {n}", end='\r')
                self.estimator.set_params(n_estimators=n,max_features=label)
                self.estimator.fit(X_train, y_train)
                oob_error = 1 - self.estimator.oob_score_
                error_rates[label].append((n, oob_error))
                if oob_error < min_oob_error[2]:
                    min_oob_error = [label, n, oob_error]

        print("training...", end='\r')
        self.estimator.set_params(n_estimators=min_oob_error[1],max_features=min_oob_error[0])
        self.estimator.fit(X_train, y_train)
        print(f"{self.name} trained with hyper-parameters: {self.estimator.n_estimators}, {self.estimator.max_features}")

In [191]:
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score, accuracy_score,f1_score
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif

In [204]:
def run_models(bigram = False, vec_type = "tfidf", min_df = False):
    train = pd.read_csv("./train.csv")
    test = pd.read_csv("test.csv")
    

    if vec_type == "tfidf":
        train_x, train_y, test_x, test_y = make_xy_train_test(train, test, bigram, min_df =  min_df)
    elif vec_type == "countvec":
        train_x, train_y, test_x, test_y = make_countvec_xy_train_test(train, test, bigram,min_df =  min_df)
    elif vec_type == "binary":
        train_x, train_y, test_x, test_y = make_countvec_xy_train_test(train, test, bigram, binary=True,min_df =  min_df)
    
    data = {}
    classifiers = [NaiveBayes(), LogRegClassifier(), TreeClassifier(), RandForestClassifier()]
    
    #feature selection for naiveBayes
    for clf in classifiers:
        if clf == NaiveBayes(): #select n most important features and use them for train the classifier
            
            #choose the number of features
            #n_feat=np.shape(train_x)[1]
            n_feat = 150
            
            print("Extracting %d best features by a chi-squared test" %n_feat)
            ch2 = SelectKBest(mutual_info_regression, k=n_feat)
            train_x = ch2.fit_transform(train_x, train_y)
            test_x = ch2.transform(test_x)
            
        clf.train(train_x, train_y)
        data[clf.name] = clf.evaluate(test_x, test_y)
    df = pd.DataFrame(data, \
        columns=[clf.name for clf in classifiers], \
        index=["recall", "precision", "accuracy", "f1"])
    print(f'{"bigram" if bigram else "unigram"} | {vec_type}:')
    print(df)

run_models(vec_type="tfidf",bigram=False, min_df = 0.01)        
run_models(vec_type="countvec", bigram=False, min_df = 0.01)
run_models(vec_type="binary", bigram=False, min_df = 0.01)

Words must appear at least in 1.0% of the reviews to become feature
The model contains 1255 features
Naive Bayes trained
Logistic Regression trained with lambda: [10.]
Decision Tree trained with aplha: 0.016480530741049437
tuning... auto, 30

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])


Random Forest trained with hyper-parameters: 130, sqrt
unigram | tfidf:
           Naive Bayes  Logistic Regression  Decision Tree  Random Forest
recall        0.837500             0.875000       0.725000       0.900000
precision     0.893333             0.823529       0.630435       0.818182
accuracy      0.868750             0.843750       0.650000       0.850000
f1            0.864516             0.848485       0.674419       0.857143
Words must appear at least in 1.0% of the reviews to become feature
The model contains 1255 features
Naive Bayes trained
Logistic Regression trained with lambda: [10.]
Decision Tree trained with aplha: 0.010713206998389069
Random Forest trained with hyper-parameters: 120, sqrt
unigram | countvec:
           Naive Bayes  Logistic Regression  Decision Tree  Random Forest
recall          0.8625             0.837500       0.725000       0.825000
precision       0.8625             0.797619       0.637363       0.795181
accuracy        0.8625             0.8

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])


Random Forest trained with hyper-parameters: 140, sqrt
unigram | binary:
           Naive Bayes  Logistic Regression  Decision Tree  Random Forest
recall            0.85             0.887500       0.650000       0.762500
precision         0.85             0.755319       0.641975       0.824324
accuracy          0.85             0.800000       0.643750       0.800000
f1                0.85             0.816092       0.645963       0.792208


In [205]:
def best_feat(k,bigram = False, vec_type = "tfidf", min_df = False):
    train = pd.read_csv("./train.csv")
    test = pd.read_csv("test.csv")
    

    if vec_type == "tfidf":
        train_x, train_y, test_x, test_y = make_xy_train_test(train, test, bigram, min_df =  min_df)
    elif vec_type == "countvec":
        train_x, train_y, test_x, test_y = make_countvec_xy_train_test(train, test, bigram,min_df =  min_df)
    elif vec_type == "binary":
        train_x, train_y, test_x, test_y = make_countvec_xy_train_test(train, test, bigram, binary=True,min_df =  min_df)
       
    feature_names = train_x.columns

    #Extracting k best features by a a chi-squared test

    print("Extracting %d best features by a chi-squared test" %k)
    ch2 = SelectKBest(chi2, k=k)
    ch2.fit(train_x, train_y)
    feature_names_chi2 = [feature_names[i] for i in ch2.get_support(indices=True)]
    
    for col in feature_names_chi2: 
        print(col) 
      
    
    #Extracting k best features by logistic regression coefficients:
    
    # define the model
    model = linear_model.LogisticRegressionCV(cv=4, max_iter=1000, Cs=[0.001, 0.01, 0.1, 1, 10, 100, 1000])
    # fit the model
    model.fit(train_x, train_y)
    # get importance
    importance = model.coef_[0]
    index = np.argsort(importance)
    n=np.shape(index)[0]
    # summarize feature importance
    print("Extracting %d best features by logistic regression coefficients:" %k)
    print("%d best features pointing towards fake reviews:" %k)
    for i in range(0,9):
        print('Feature: %s, Score: %.5f' %(feature_names[index[i]], importance[index[i]]))
        
    print("%d best features pointing towards true reviews:" %k)

    for i in range(0,9):
        print('Feature: %s, Score: %.5f' %(feature_names[index[n-i-1]], importance[index[n-i-1]]))

best_feat(10,bigram = False, vec_type = "tfidf", min_df = 0.01 )

Words must appear at least in 1.0% of the reviews to become feature
The model contains 1255 features
Extracting 10 best features by a chi-squared test
chicago
decided
elevators
great
location
luxury
millennium
priceline
recently
smell
Extracting 10 best features by logistic regression coefficients:
10 best features pointing towards fake reviews:
Feature: chicago, Score: -0.00582
Feature: smell, Score: -0.00260
Feature: finally, Score: -0.00251
Feature: luxury, Score: -0.00250
Feature: hotel, Score: -0.00243
Feature: recently, Score: -0.00238
Feature: room, Score: -0.00236
Feature: experience, Score: -0.00229
Feature: seemed, Score: -0.00229
10 best features pointing towards true reviews:
Feature: location, Score: 0.00323
Feature: great, Score: 0.00319
Feature: floor, Score: 0.00227
Feature: star, Score: 0.00216
Feature: elevators, Score: 0.00202
Feature: construction, Score: 0.00179
Feature: open, Score: 0.00176
Feature: elevator, Score: 0.00174
Feature: th, Score: 0.00163
