# SETUP

In [14]:
#IMPORTS
import numpy as np
import pandas as pd
import os
import re
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from IPython.display import display
from sklearn import naive_bayes, linear_model, tree, ensemble
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import recall_score, precision_score, accuracy_score, f1_score
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif, mutual_info_regression
from collections import OrderedDict

[nltk_data] Downloading package stopwords to /Users/jsiu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
def load_reviews(path, fold_nrs, label):
    columns = ['Raw', 'Processed', 'Label']
    df = pd.DataFrame()
    for i in fold_nrs:
        fold = "fold"+str(i)
        p = path + fold
        for file in os.listdir(p): # for each .txt file in this fold's folder
            if file.endswith(".txt"):
                f = open(os.path.join(p, file), "r")
                review = f.read()
                # remove whitespaces, numbers, punctuation, & make lowercase
                processed = process_string(review)
                new_row = pd.DataFrame([[review, processed, label]])
                df = df.append(new_row)
    df.columns = columns
    return df

def make_train_test_set():
    # Focusing only on the negative reviews
    path_dec = "./op_spam_v1.4/negative_polarity/deceptive_from_MTurk/"
    path_true = "./op_spam_v1.4/negative_polarity/truthful_from_Web/"

    # Label = 1 if it is a truthful (negative) review, =0 if it is a deceptive (negative) review

    #loading training set:
    train_dec = load_reviews(path_dec, np.arange(4)+1, 0) # folds 1-4 form the training set
    train_true = load_reviews(path_true, np.arange(4)+1, 1)
    train = pd.concat([train_dec, train_true])
    train = train.reset_index(drop=True)

    #testchangeDELTETHIS
    #loading the test set:
    test_dec = load_reviews(path_dec, [5], 0)  # test set for deceptive reviews
    test_true = load_reviews(path_true, [5], 1)
    test = pd.concat([test_dec, test_true])
    test = test.reset_index(drop=True)

    return [train,test]

def process_string(s):
    s = s.strip()    # remove whitespaces
    s = s.lower() # to lowercase
    s = re.sub(r'\d+', '', s) # remove numbers
    s = s.translate(str.maketrans("","", string.punctuation)) # remove punctuation
    return s


##########################
###Process files to CSV###
##########################

# train, test = make_train_test_set()
# train.to_csv("./train.csv", header = ['Raw', 'Processed', 'Label'], index=False)
# test.to_csv("./test.csv", header = ['Raw', 'Processed', 'Label'], index=False)



In [16]:
train = pd.read_csv("./train.csv")
test = pd.read_csv("test.csv")

print(f"\n Shape of training set: {train.shape}")
print(train.head())
print(f"\n Shape of test set: {test.shape}")
print(test.head())

train.head()


 Shape of training set: (640, 3)
                                                 Raw  \
0  My husband and I stayed at the Sofitel Chicago...   
1  Staying at the Sofitel was one of the less ple...   
2  I stayed at Sofitel with my husband for a week...   
3  I stayed at the Sofitel Chicago Water Tower ho...   
4  After arriving at the Sofitel Chicago Water To...   

                                           Processed  Label  
0  my husband and i stayed at the sofitel chicago...      0  
1  staying at the sofitel was one of the less ple...      0  
2  i stayed at sofitel with my husband for a week...      0  
3  i stayed at the sofitel chicago water tower ho...      0  
4  after arriving at the sofitel chicago water to...      0  

 Shape of test set: (160, 3)
                                                 Raw  \
0  I stayed at the InterContinental in Chicago fo...   
1  I have to agree that the InterContinental Chic...   
2  Upon entering my hotel room at The Palmer Hous...   
3  

Unnamed: 0,Raw,Processed,Label
0,My husband and I stayed at the Sofitel Chicago...,my husband and i stayed at the sofitel chicago...,0
1,Staying at the Sofitel was one of the less ple...,staying at the sofitel was one of the less ple...,0
2,I stayed at Sofitel with my husband for a week...,i stayed at sofitel with my husband for a week...,0
3,I stayed at the Sofitel Chicago Water Tower ho...,i stayed at the sofitel chicago water tower ho...,0
4,After arriving at the Sofitel Chicago Water To...,after arriving at the sofitel chicago water to...,0


In [124]:
def make_xy_train_test(train, test, bigram = False, min_df = False):
    stpw = stopwords.words('english')

    if bigram: # use training data to make vectorizer (vocabulary)
        vectorizer = TfidfVectorizer(stop_words = stpw, ngram_range=(1,2), min_df=min_df)
    else:
        vectorizer = TfidfVectorizer(stop_words = stpw, min_df=min_df)
        
    vec = vectorizer.fit_transform(train["Processed"])
    features = pd.DataFrame(vec.toarray(),columns=vectorizer.get_feature_names())
    
    test_vec = vectorizer.transform(test["Processed"])
    test_features = pd.DataFrame(test_vec.toarray(),columns=vectorizer.get_feature_names())

    train_merged = pd.merge(train, features, left_index = True, right_index = True)#.sample(frac=1) #merge data and shuffle
    test_merged = pd.merge(test, test_features, left_index = True, right_index = True)#.sample(frac=1) #merge data and shuffle

    return [train_merged.iloc[:,3:], train_merged["Label"], test_merged.iloc[:,3:], test_merged["Label"]] #return [x_train, y_train,

def make_countvec_xy_train_test(train, test, bigram = False, binary = False, min_df = False):
    stpw = stopwords.words('english')

    if bigram: # use training data to make vectorizer (vocabulary)
        vectorizer = CountVectorizer(stop_words = stpw, binary=binary, ngram_range=(1,2), min_df=min_df)
    else:
        vectorizer = CountVectorizer(stop_words = stpw, binary = binary, min_df=min_df)

    vec = vectorizer.fit_transform(train["Processed"])
    features = pd.DataFrame(vec.toarray(),columns=vectorizer.get_feature_names())
    
    test_vec = vectorizer.transform(test["Processed"])
    test_features = pd.DataFrame(test_vec.toarray(),columns=vectorizer.get_feature_names())

    train_merged = pd.merge(train, features, left_index = True, right_index = True)#.sample(frac=1) #merge data and shuffle
    test_merged = pd.merge(test, test_features, left_index = True, right_index = True)#.sample(frac=1) #merge data and shuffle

    # print("features shape", features.shape)
    # print("test features shape", test_features.shape)
    # print("test shape", test.shape)
    # print("test merged", test_merged.shape)
    return [train_merged.iloc[:,3:], train_merged["Label"], test_merged.iloc[:,3:], test_merged["Label"]] #return [x_train, y_train,

#train_x, train_y, test_x, test_y = make_countvec_xy_train_test(train, test)
train_x, train_y, test_x, test_y = make_countvec_xy_train_test(train, test, bigram=False)

bitrain_x, bitrain_y, bitest_x, bitest_y = make_countvec_xy_train_test(train, test, bigram=True)

display(train_x.head())
display(train_y.head())
display(test_x.head())
display(test_y.head())

display(bitrain_x.head())
display(bitrain_y.head())
display(bitest_x.head())
display(bitest_y.head())


Unnamed: 0,aaa,aaahed,aback,abassador,ability,able,abound,abrupt,absence,absent,...,youre,youth,youve,yrs,yuck,yummy,yunan,yup,zone,zoo
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


0    0
1    0
2    0
3    0
4    0
Name: Label, dtype: int64

Unnamed: 0,aaa,aaahed,aback,abassador,ability,able,abound,abrupt,absence,absent,...,youre,youth,youve,yrs,yuck,yummy,yunan,yup,zone,zoo
0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


0    0
1    0
2    0
3    0
4    0
Name: Label, dtype: int64

Unnamed: 0,aaa,aaa one,aaahed,aaahed picture,aback,aback phone,abassador,abassador east,abassador future,ability,...,yunan,yunan didnt,yup,yup even,yup got,zone,zone computer,zone would,zoo,zoo second
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


0    0
1    0
2    0
3    0
4    0
Name: Label, dtype: int64

Unnamed: 0,aaa,aaa one,aaahed,aaahed picture,aback,aback phone,abassador,abassador east,abassador future,ability,...,yunan,yunan didnt,yup,yup even,yup got,zone,zone computer,zone would,zoo,zoo second
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


0    0
1    0
2    0
3    0
4    0
Name: Label, dtype: int64

# Analysis
Use cross-validation or (for random forests) out-of-bag evaluation to select the values of the hyper-parameters of the algorithms on the training set.

## Multinomial naive Bayes (generative linear classifier)
For naive Bayes, the performance might be improved by applying some form of feature selection (in addition to removing the sparse terms).

In [18]:
class Classifier(object):
    """ Generic classifier object. """
    def __init__(self):
        self.name = "Classifier"
        self.esimator = None
    
    def evaluate(self, X_test, y_test):
        if self.name == "Naive Bayes":
            X_test = self.ch2.transform(X_test)
        
        y_pred = self.estimator.predict(X_test)
        recall = recall_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        return y_pred, (recall, precision, accuracy, f1)

class NaiveBayes(Classifier):
    """ Hyper-parameter tuning for the multinominal naive bayes classifier. """
    def __init__(self):
        self.name = "Naive Bayes"
        self.estimator = naive_bayes.MultinomialNB()
        self.n_feat = 150

    def train(self, X_train, y_train):
        n_feats = [140, 150, 160, 1000, 1255]
        scores = []
        for n_feat in n_feats:
            print(f"Extracting {n_feat} best features by a chi-squared test", end='\r')
            ch2 = SelectKBest(mutual_info_regression, k=n_feat)
            train_x = ch2.fit_transform(X_train, y_train)
            scores.append(self.cross_validate(train_x, y_train))
        
        self.n_feat = n_feats[scores.index(max(scores))]
        self.ch2 = SelectKBest(mutual_info_regression, k=self.n_feat)
        X_train = self.ch2.fit_transform(X_train, y_train)
        self.estimator.fit(X_train, y_train)
        print(f"{self.name} trained with {self.n_feat} features")

    def cross_validate(self, X_train, y_train):
        accuracy_scores = []
        kf = KFold(n_splits=4)
        for train_index, test_index in kf.split(X_train):
            train_x, test_x = X_train[train_index], X_train[test_index]
            train_y, test_y = y_train[train_index], y_train[test_index]
            self.estimator.fit(train_x, train_y)
            accuracy_scores.append(self.estimator.score(test_x, test_y))

        return np.mean(accuracy_scores)


## Regularized logistic regression (discriminative linear classifier)

In [19]:
class LogRegClassifier(Classifier):
    """ Hyper-parameter tuning for the logistic regression classifier. """
    def __init__(self):
        self.name = "Logistic Regression"
        self.estimator = linear_model.LogisticRegressionCV(cv=4, max_iter=1000, Cs=[0.001, 0.01, 0.1, 1, 10, 100, 1000])

    def train(self, X_train, y_train):
        print("training...", end='\r')
        self.estimator.fit(X_train, y_train)
        print(f"{self.name} trained with lambda: {self.estimator.C_}")


## Classification trees, (flexible classifier)


In [20]:
class TreeClassifier(Classifier):
    """ Hyper-parameter tuning for the decision tree classifier. """
    def __init__(self):
        self.name = "Decision Tree"
        self.estimator = tree.DecisionTreeClassifier()
    
    def train(self, X_train, y_train):
        print("set aplhas...", end='\r')
        path = self.estimator.cost_complexity_pruning_path(X_train, y_train)
        ccp_alphas = path.ccp_alphas
        parameters = {'ccp_alpha': ccp_alphas}
        clf = GridSearchCV(self.estimator, parameters, cv=4)

        print("training...", end='\r')
        clf.fit(X_train, y_train)
        self.estimator = clf.best_estimator_
        print(f"{self.name} trained with aplha: {self.estimator.ccp_alpha}")


## Random forests (flexible classifier)

In [21]:
class RandForestClassifier(Classifier):
    """ Hyper-parameter tuning for the random forest classifier. """
    def __init__(self, min_trees=20, max_trees=160):
        self.name = "Random Forest"
        self.estimator = ensemble.RandomForestClassifier(oob_score=True)
        self.max_features_list = ["auto", "sqrt", "log2"]
        self.n_trees = range(min_trees, max_trees, 10)

    def train(self, X_train, y_train):
        error_rates = OrderedDict((label, []) for label in self.max_features_list)
        min_oob_error = [None, 0, 100]
        for label in self.max_features_list:
            for n in self.n_trees:
                print(f"tuning... {label}, {n}", end='\r')
                self.estimator.set_params(n_estimators=n,max_features=label)
                self.estimator.fit(X_train, y_train)
                oob_error = 1 - self.estimator.oob_score_
                error_rates[label].append((n, oob_error))
                if oob_error < min_oob_error[2]:
                    min_oob_error = [label, n, oob_error]

        print("training...", end='\r')
        self.estimator.set_params(n_estimators=min_oob_error[1],max_features=min_oob_error[0])
        self.estimator.fit(X_train, y_train)
        print(f"{self.name} trained with hyper-parameters: {self.estimator.n_estimators}, {self.estimator.max_features}")


## Model accuracy comparison analysis
Comparisons of the accuracy of different models should be supported by a statistical test. For the comparison of the other quality measures (precision, recall, F1 score), a statistical test is not required.

In [122]:
def run_models(bigram = False, vec_type = "tfidf", min_df = False, compare_bigrams = False):
    train = pd.read_csv("./train.csv")
    test = pd.read_csv("test.csv")
    

    if vec_type == "tfidf":
        train_x, train_y, test_x, test_y = make_xy_train_test(train, test, bigram, min_df=min_df)
    elif vec_type == "countvec":
        train_x, train_y, test_x, test_y = make_countvec_xy_train_test(train, test, bigram, min_df=min_df)
    elif vec_type == "binary":
        train_x, train_y, test_x, test_y = make_countvec_xy_train_test(train, test, bigram, binary=True, min_df=min_df)
    
    data = {}
    results = {}
    results["test_y"] = test_y

    classifiers = [NaiveBayes(), LogRegClassifier(), TreeClassifier(), RandForestClassifier()]
    for clf in classifiers:
        print(f"training {clf.name}", end='\r')
        clf.train(train_x, train_y)
        results[clf.name], data[clf.name] = clf.evaluate(test_x, test_y)

    df = pd.DataFrame(data, \
        columns=[clf.name for clf in classifiers], \
        index=["recall", "precision", "accuracy", "f1"])
    print(f'{"bigram" if bigram else "unigram"} | {vec_type}:')
    print(df)
    return pd.DataFrame.from_dict(results)

unitfidf = run_models(bigram=False, min_df=0.01)
bitfidf = run_models(bigram=True, min_df=0.01)

unicv = run_models(vec_type="countvec", bigram=False, min_df=0.01)
bicv = run_models(vec_type="countvec", bigram=True, min_df=0.01)

unibin = run_models(vec_type="binary", bigram=False, min_df=0.01)
bibin = run_models(vec_type="binary", bigram=True, min_df=0.01)

Naive Bayes trained with 1255 features
Logistic Regression trained with lambda: [10.]
Decision Tree trained with aplha: 0.019087865218129996
Random Forest trained with hyper-parameters: 140, log2
unigram | tfidf:
           Naive Bayes  Logistic Regression  Decision Tree  Random Forest
recall        0.837500             0.875000       0.725000       0.800000
precision     0.893333             0.823529       0.630435       0.864865
accuracy      0.868750             0.843750       0.650000       0.837500
f1            0.864516             0.848485       0.674419       0.831169
Naive Bayes trained with 1255 features
Logistic Regression trained with lambda: [10.]
Decision Tree trained with aplha: 0.017254483115367286
Random Forest trained with hyper-parameters: 130, auto
bigram | tfidf:
           Naive Bayes  Logistic Regression  Decision Tree  Random Forest
recall        0.900000             0.937500       0.662500       0.812500
precision     0.847059             0.797872       0.62352

In [138]:
from statsmodels.stats.contingency_tables import mcnemar

def contingency(y_true, pred1, pred2):
    cM = np.zeros((2,2))
    for i in range(len(y_true)):
        if y_true[i] == pred1[i] and y_true[i] == pred2[i]:
            cM[1,1] += 1
        if y_true[i] != pred1[i] and y_true[i] == pred2[i]:
            cM[0,1] += 1
        if y_true[i] != pred1[i] and y_true[i] != pred2[i]:
            cM[0,0] += 1
        if y_true[i] == pred1[i] and y_true[i] != pred2[i]:
            cM[1,0] += 1
    return cM


def calculateMcNemar(data = None, results = None, test_y = None):
    if results.empty:
        results = data.loc[:, data.columns != "test_y"]
    if test_y.empty:
        test_y = data["test_y"]

    McNemarScores = {}
    for i in range(0, len(results.columns)):
        for j in range(i + 1, len(results.columns)):
            con = contingency(test_y, results.iloc[:,i], results.iloc[:,j])
            mscore = mcnemar(con, exact = False)
            McNemarScores[f"{results.columns[i]} - {results.columns[j]}"] = mscore
            print(f"{results.columns[i]} - {results.columns[j]} \n {con} \n statistic={mscore.statistic}, p-value={mscore.pvalue}")




In [120]:
#McNemar scores of each classification algorith compared with each other

print("\n uni tfidf")
calculateMcNemar(unitfidf)
print("\n bi tfidf")
calculateMcNemar(bitfidf)
print("\n uni countvec")
calculateMcNemar(unicv)

#bigram features
print("\n bi countvec")
calculateMcNemar(bicv)
print("\n uni binary")
calculateMcNemar(unibin)
print("\n bi binary")
calculateMcNemar(bibin)


 uni tfidf
Naive Bayes - Logistic Regression 
 [[ 13.  13.]
 [  8. 126.]] 
 statistic=0.7619047619047619, p-value=0.38273308888522595
Naive Bayes - Decision Tree 
 [[11. 15.]
 [46. 88.]] 
 statistic=14.754098360655737, p-value=0.00012248100770987952
Naive Bayes - Random Forest 
 [[ 15.  11.]
 [ 15. 119.]] 
 statistic=0.34615384615384615, p-value=0.5562984612747348
Logistic Regression - Decision Tree 
 [[10. 11.]
 [47. 92.]] 
 statistic=21.120689655172413, p-value=4.312468453366182e-06
Logistic Regression - Random Forest 
 [[ 16.   5.]
 [ 14. 125.]] 
 statistic=3.3684210526315788, p-value=0.0664574200169312
Decision Tree - Random Forest 
 [[18. 39.]
 [12. 91.]] 
 statistic=13.254901960784315, p-value=0.0002718680028822981

 bi tfidf
Naive Bayes - Logistic Regression 
 [[ 12.  10.]
 [ 10. 128.]] 
 statistic=0.05, p-value=0.8230632737581214
Naive Bayes - Decision Tree 
 [[11. 11.]
 [48. 90.]] 
 statistic=21.966101694915253, p-value=2.7750883948169247e-06
Naive Bayes - Random Forest 
 [[ 

In [152]:
#comparing unigram vs bigram results of same models

def compareBigramScore(test_y, model, unigram, bigram):
    df = pd.DataFrame()
    df["unigram"] = unigram
    df["bigram"] = bigram
    print(f"{model} unigram vs bigram")
    print(df.head())
    calculateMcNemar(results = df, test_y = test_y)


print("TFIDF Unigram vs Bigram")
testmodel = "Naive Bayes"
compareBigramScore(unitfidf["test_y"], testmodel, unitfidf[testmodel], bitfidf[testmodel])
testmodel = "Logistic Regression"
compareBigramScore(unitfidf["test_y"], testmodel, unitfidf[testmodel], bitfidf[testmodel])
testmodel = "Decision Tree"
compareBigramScore(unitfidf["test_y"], testmodel, unitfidf[testmodel], bitfidf[testmodel])
testmodel = "Random Forest"
compareBigramScore(unitfidf["test_y"], testmodel, unitfidf[testmodel], bitfidf[testmodel])

print("--------------------------  \n\n CountVector Unigram vs Bigram")
testmodel = "Naive Bayes"
compareBigramScore(unicv["test_y"], testmodel, unicv[testmodel], bicv[testmodel])
testmodel = "Logistic Regression"
compareBigramScore(unicv["test_y"], testmodel, unicv[testmodel], bicv[testmodel])
testmodel = "Decision Tree"
compareBigramScore(unicv["test_y"], testmodel, unicv[testmodel], bicv[testmodel])
testmodel = "Random Forest"
compareBigramScore(unicv["test_y"], testmodel, unicv[testmodel], bicv[testmodel])

print("--------------------------  \n\n Binary Unigram vs Bigram")
testmodel = "Naive Bayes"
compareBigramScore(unibin["test_y"], testmodel, unibin[testmodel], bibin[testmodel])
testmodel = "Logistic Regression"
compareBigramScore(unibin["test_y"], testmodel, unibin[testmodel], bibin[testmodel])
testmodel = "Decision Tree"
compareBigramScore(unibin["test_y"], testmodel, unibin[testmodel], bibin[testmodel])
testmodel = "Random Forest"
compareBigramScore(unibin["test_y"], testmodel, unibin[testmodel], bibin[testmodel])

    

TFIDF Unigram vs Bigram
Naive Bayes unigram vs bigram
   unigram  bigram
0        1       1
1        0       0
2        0       0
3        0       0
4        0       0
unigram - bigram 
 [[ 13.   8.]
 [  8. 131.]] 
 statistic=0.0625, p-value=0.8025873486341526
Logistic Regression unigram vs bigram
   unigram  bigram
0        1       1
1        0       0
2        0       0
3        0       0
4        0       0
unigram - bigram 
 [[ 19.   6.]
 [  5. 130.]] 
 statistic=0.0, p-value=1.0
Decision Tree unigram vs bigram
   unigram  bigram
0        0       0
1        0       0
2        0       1
3        0       0
4        0       0
unigram - bigram 
 [[50.  6.]
 [ 9. 95.]] 
 statistic=0.26666666666666666, p-value=0.6055766163353462
Random Forest unigram vs bigram
   unigram  bigram
0        0       0
1        0       0
2        0       0
3        0       0
4        0       0
unigram - bigram 
 [[ 15.  11.]
 [ 19. 115.]] 
 statistic=1.6333333333333333, p-value=0.20124262095772028
------------