# SETUP

In [1]:
#IMPORTS
import numpy as np
import pandas as pd
import os
import re
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from IPython.display import display
from sklearn import naive_bayes, linear_model, tree, ensemble
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import recall_score, precision_score, accuracy_score, f1_score
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif, mutual_info_regression
from collections import OrderedDict

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Koen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def load_reviews(path, fold_nrs, label):
    """Load every ``.txt`` review found in the given folds into a DataFrame.

    Parameters
    ----------
    path : str
        Directory prefix that contains the ``fold<N>`` sub-folders. The fold
        name is appended by plain string concatenation, so ``path`` is
        expected to end with a path separator.
    fold_nrs : iterable of int
        Fold numbers to read (``fold1``, ``fold2``, ...).
    label : int
        Class label assigned to every review that is loaded.

    Returns
    -------
    pandas.DataFrame
        One row per review with the columns ``Raw`` (text as read from disk),
        ``Processed`` (output of ``process_string``) and ``Label``. When no
        ``.txt`` file is found an empty frame with those columns is returned.
        Rows carry a fresh 0..n-1 index.
    """
    columns = ['Raw', 'Processed', 'Label']
    # Collect plain rows and build the frame once: appending to a DataFrame
    # inside the loop copies it every time and DataFrame.append no longer
    # exists in pandas 2.
    rows = []
    for i in fold_nrs:
        p = path + "fold" + str(i)
        for file in os.listdir(p): # for each .txt file in this fold's folder
            if file.endswith(".txt"):
                # The context manager guarantees the file handle is closed.
                with open(os.path.join(p, file), "r") as f:
                    review = f.read()
                # remove whitespaces, numbers, punctuation, & make lowercase
                processed = process_string(review)
                rows.append([review, processed, label])
    return pd.DataFrame(rows, columns=columns)

def make_train_test_set():
    """Assemble the train and test frames from the negative-polarity reviews.

    Deceptive reviews get label 0 and truthful reviews get label 1. Folds 1-4
    form the training set and fold 5 forms the test set.

    Returns
    -------
    list
        ``[train, test]``, two DataFrames as produced by ``load_reviews``,
        each with a fresh 0..n-1 index.
    """
    # (directory, label) pairs; deceptive is loaded before truthful.
    sources = (
        ("./op_spam_v1.4/negative_polarity/deceptive_from_MTurk/", 0),
        ("./op_spam_v1.4/negative_polarity/truthful_from_Web/", 1),
    )

    def _load_split(folds):
        # Load both classes for the requested folds and stack them.
        parts = [load_reviews(directory, folds, lbl) for directory, lbl in sources]
        return pd.concat(parts).reset_index(drop=True)

    train = _load_split(np.arange(4)+1)  # folds 1-4
    test = _load_split([5])              # fold 5

    return [train, test]

def process_string(s):
    """Normalise a review: trim, lowercase, drop digits and punctuation.

    Only leading and trailing whitespace is trimmed; whitespace inside the
    text (including gaps left behind by removed digits or punctuation) is
    kept as is.
    """
    punctuation_table = str.maketrans("", "", string.punctuation)
    lowered = s.strip().lower()
    without_digits = re.sub(r'\d+', '', lowered)
    return without_digits.translate(punctuation_table)


##########################
###Process files to CSV###
##########################

# train, test = make_train_test_set()
# train.to_csv("./train.csv", header = ['Raw', 'Processed', 'Label'], index=False)
# test.to_csv("./test.csv", header = ['Raw', 'Processed', 'Label'], index=False)



In [3]:
# Load the train/test splits from CSV (presumably the files written by the
# commented-out export block in the previous cell -- verify they are current).
train = pd.read_csv("./train.csv")
test = pd.read_csv("test.csv")

# Sanity check: report the shape and the first rows of each split.
print(f"\n Shape of training set: {train.shape}")
print(train.head())
print(f"\n Shape of test set: {test.shape}")
print(test.head())

# Last expression of the cell, so the notebook renders it as the cell output.
train.head()


 Shape of training set: (640, 3)
                                                 Raw  \
0  We stayed at the Schicago Hilton for 4 days an...   
1  Hotel is located 1/2 mile from the train stati...   
2  I made my reservation at the Hilton Chicago be...   
3  When most people think Hilton, they think luxu...   
4  My husband and I recently stayed stayed at the...   

                                           Processed  Label  
0  we stayed at the schicago hilton for  days and...      0  
1  hotel is located  mile from the train station ...      0  
2  i made my reservation at the hilton chicago be...      0  
3  when most people think hilton they think luxur...      0  
4  my husband and i recently stayed stayed at the...      0  

 Shape of test set: (160, 3)
                                                 Raw  \
0  I recently stayed at the Hotel Allegro Chicago...   
1  I recently stayed at the Hotel Allegro in Chic...   
2  I recently visited Chicago. I stayed at the Ho...   
3  

Unnamed: 0,Raw,Processed,Label
0,We stayed at the Schicago Hilton for 4 days an...,we stayed at the schicago hilton for days and...,0
1,Hotel is located 1/2 mile from the train stati...,hotel is located mile from the train station ...,0
2,I made my reservation at the Hilton Chicago be...,i made my reservation at the hilton chicago be...,0
3,"When most people think Hilton, they think luxu...",when most people think hilton they think luxur...,0
4,My husband and I recently stayed stayed at the...,my husband and i recently stayed stayed at the...,0


In [13]:
def make_xy_train_test(train, test, bigram = False, min_df = False):
    """Build shuffled tf-idf feature matrices and label vectors.

    The vocabulary (and idf weights) are fitted on ``train["Processed"]``
    only and then applied to ``test["Processed"]``. English stop words are
    removed.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames with at least the columns ``Processed`` (text) and ``Label``.
    bigram : bool
        When true, use unigrams and bigrams; otherwise unigrams only.
    min_df : int, float or False
        Minimum document frequency passed to the vectorizer. A falsy value
        (the default) keeps every term.

    Returns
    -------
    list
        ``[x_train, y_train, x_test, y_test]``. Rows of each split are
        shuffled (unseeded) and x/y of a split share the same index.
    """
    stpw = stopwords.words('english')

    # The vectorizer needs an integer >= 1 or a float proportion. The legacy
    # default False is the integer 0, which recent scikit-learn releases
    # reject; 1 keeps every term and is therefore equivalent.
    if not min_df:
        min_df = 1

    # use training data to make vectorizer (vocabulary)
    ngram_range = (1, 2) if bigram else (1, 1)
    vectorizer = TfidfVectorizer(stop_words = stpw, ngram_range=ngram_range, min_df=min_df)

    vec = vectorizer.fit_transform(train["Processed"])
    # get_feature_names() was removed in scikit-learn 1.2; fall back to it
    # only on old installations.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = list(vectorizer.get_feature_names_out())
    else:
        feature_names = list(vectorizer.get_feature_names())

    # Reuse the input index so the index-based merge lines up even when the
    # caller's frame does not carry a default 0..n-1 index.
    features = pd.DataFrame(vec.toarray(), columns=feature_names, index=train.index)

    test_vec = vectorizer.transform(test["Processed"])
    test_features = pd.DataFrame(test_vec.toarray(), columns=feature_names, index=test.index)

    train_merged = pd.merge(train, features, left_index = True, right_index = True).sample(frac=1) #merge data and shuffle
    test_merged = pd.merge(test, test_features, left_index = True, right_index = True).sample(frac=1) #merge data and shuffle

    # Feature columns start right after the original columns of each frame.
    n_train_cols = train.shape[1]
    n_test_cols = test.shape[1]
    # return [x_train, y_train, x_test, y_test]
    return [train_merged.iloc[:, n_train_cols:], train_merged["Label"],
            test_merged.iloc[:, n_test_cols:], test_merged["Label"]]

def make_countvec_xy_train_test(train, test, bigram = False, binary = False, min_df = False):
    """Build shuffled term-count feature matrices and label vectors.

    The vocabulary is fitted on ``train["Processed"]`` only and then applied
    to ``test["Processed"]``. English stop words are removed.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames with at least the columns ``Processed`` (text) and ``Label``.
    bigram : bool
        When true, use unigrams and bigrams; otherwise unigrams only.
    binary : bool
        Forwarded to ``CountVectorizer(binary=...)``.
    min_df : int, float or False
        Minimum document frequency passed to the vectorizer. A falsy value
        (the default) keeps every term.

    Returns
    -------
    list
        ``[x_train, y_train, x_test, y_test]``. Rows of each split are
        shuffled (unseeded) and x/y of a split share the same index.
    """
    stpw = stopwords.words('english')

    # The vectorizer needs an integer >= 1 or a float proportion. The legacy
    # default False is the integer 0, which recent scikit-learn releases
    # reject; 1 keeps every term and is therefore equivalent.
    if not min_df:
        min_df = 1

    # use training data to make vectorizer (vocabulary)
    ngram_range = (1, 2) if bigram else (1, 1)
    vectorizer = CountVectorizer(stop_words = stpw, binary=binary, ngram_range=ngram_range, min_df=min_df)

    vec = vectorizer.fit_transform(train["Processed"])
    # get_feature_names() was removed in scikit-learn 1.2; fall back to it
    # only on old installations.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = list(vectorizer.get_feature_names_out())
    else:
        feature_names = list(vectorizer.get_feature_names())

    # Reuse the input index so the index-based merge lines up even when the
    # caller's frame does not carry a default 0..n-1 index.
    features = pd.DataFrame(vec.toarray(), columns=feature_names, index=train.index)

    test_vec = vectorizer.transform(test["Processed"])
    test_features = pd.DataFrame(test_vec.toarray(), columns=feature_names, index=test.index)

    train_merged = pd.merge(train, features, left_index = True, right_index = True).sample(frac=1) #merge data and shuffle
    test_merged = pd.merge(test, test_features, left_index = True, right_index = True).sample(frac=1) #merge data and shuffle

    # Feature columns start right after the original columns of each frame.
    n_train_cols = train.shape[1]
    n_test_cols = test.shape[1]
    # return [x_train, y_train, x_test, y_test]
    return [train_merged.iloc[:, n_train_cols:], train_merged["Label"],
            test_merged.iloc[:, n_test_cols:], test_merged["Label"]]

#train_x, train_y, test_x, test_y = make_countvec_xy_train_test(train, test)
# Unigram count features for the train/test splits loaded above.
train_x, train_y, test_x, test_y = make_countvec_xy_train_test(train, test, bigram=False)

# Same splits with unigram + bigram count features.
bitrain_x, bitrain_y, bitest_x, bitest_y = make_countvec_xy_train_test(train, test, bigram=True)

# Preview the unigram matrices and labels (rows come back shuffled).
display(train_x.head())
display(train_y.head())
display(test_x.head())
display(test_y.head())

# Preview the bigram matrices and labels.
display(bitrain_x.head())
display(bitrain_y.head())
display(bitest_x.head())
display(bitest_y.head())


Unnamed: 0,aaa,aaahed,aback,abassador,ability,able,abound,abrupt,absence,absent,...,youre,youth,youve,yrs,yuck,yummy,yunan,yup,zone,zoo
569,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
324,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
638,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
265,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
557,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


569    1
324    1
638    1
265    0
557    1
Name: Label, dtype: int64

Unnamed: 0,aaa,aaahed,aback,abassador,ability,able,abound,abrupt,absence,absent,...,youre,youth,youve,yrs,yuck,yummy,yunan,yup,zone,zoo
34,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
125,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
62,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
72,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


34     0
125    1
62     0
72     0
12     0
Name: Label, dtype: int64

Unnamed: 0,aaa,aaa one,aaahed,aaahed picture,aback,aback phone,abassador,abassador east,abassador future,ability,...,yunan,yunan didnt,yup,yup even,yup got,zone,zone computer,zone would,zoo,zoo second
473,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
220,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
409,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
242,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
139,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


473    1
220    0
409    1
242    0
139    0
Name: Label, dtype: int64

Unnamed: 0,aaa,aaa one,aaahed,aaahed picture,aback,aback phone,abassador,abassador east,abassador future,ability,...,yunan,yunan didnt,yup,yup even,yup got,zone,zone computer,zone would,zoo,zoo second
130,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
142,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
156,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
21,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
73,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


130    1
142    1
156    1
21     0
73     0
Name: Label, dtype: int64

# Analysis
Use cross-validation or (for random forests) out-of-bag evaluation to select the values of the hyper-parameters of the algorithms on the training set.

## Multinomial naive Bayes (generative linear classifier)
For naive Bayes, the performance might be improved by applying some form of feature selection (in addition to removing the sparse terms).

In [11]:
class Classifier(object):
    """ Generic classifier object.

    Subclasses set ``name`` and ``estimator`` and provide a ``train`` method.
    A subclass that performs feature selection stores the fitted selector in
    ``self.ch2`` so that ``evaluate`` can apply it to the test data.
    """
    def __init__(self):
        self.name = "Classifier"
        # Was misspelled ``esimator``, so the attribute read by evaluate()
        # never existed on a plain Classifier.
        self.estimator = None

    def evaluate(self, X_test, y_test):
        """ Score the fitted estimator on a held-out set.

        Returns the tuple ``(recall, precision, accuracy, f1)``.
        """
        # Apply the same feature selection used during training, if any.
        # Keyed on the attribute rather than on a hard-coded subclass name.
        selector = getattr(self, "ch2", None)
        if selector is not None:
            X_test = selector.transform(X_test)

        y_pred = self.estimator.predict(X_test)
        recall = recall_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        return recall, precision, accuracy, f1

class NaiveBayes(Classifier):
    """ Hyper-parameter tuning for the multinomial naive Bayes classifier.

    The tuned hyper-parameter is the number of features kept by a
    mutual-information based ``SelectKBest``; the fitted selector is stored in
    ``self.ch2`` and the chosen size in ``self.n_feat``.
    """
    def __init__(self):
        self.name = "Naive Bayes"
        self.estimator = naive_bayes.MultinomialNB()
        self.n_feat = 150
        self.ch2 = None  # fitted feature selector, set by train()

    def train(self, X_train, y_train):
        """ Pick the best feature count by 4-fold CV, then fit on all data. """
        # SelectKBest cannot keep more features than exist (e.g. after a
        # strict min_df), so clamp the candidates and drop duplicates.
        n_available = X_train.shape[1]
        n_feats = []
        for candidate in [140, 150, 160, 1000, 1255]:
            candidate = min(candidate, n_available)
            if candidate not in n_feats:
                n_feats.append(candidate)

        scores = []
        for n_feat in n_feats:
            print(f"Extracting {n_feat} best features by mutual information", end='\r')
            # The target is a class label, so the classification variant of
            # mutual information is the appropriate score function.
            ch2 = SelectKBest(mutual_info_classif, k=n_feat)
            train_x = ch2.fit_transform(X_train, y_train)
            scores.append(self.cross_validate(train_x, y_train))

        # First candidate with the highest mean CV accuracy wins.
        self.n_feat = n_feats[scores.index(max(scores))]
        self.ch2 = SelectKBest(mutual_info_classif, k=self.n_feat)
        X_train = self.ch2.fit_transform(X_train, y_train)
        self.estimator.fit(X_train, y_train)
        print(f"{self.name} trained with {self.n_feat} features")

    def cross_validate(self, X_train, y_train):
        """ Return the mean accuracy of ``self.estimator`` over 4 folds. """
        # KFold yields positional indices. Indexing a pandas Series with them
        # is label based, which silently pairs rows with the wrong labels
        # when the index has been shuffled, so work on plain arrays.
        X = X_train.to_numpy() if isinstance(X_train, pd.DataFrame) else X_train
        y = np.asarray(y_train)

        accuracy_scores = []
        kf = KFold(n_splits=4)
        for train_index, test_index in kf.split(X):
            train_x, test_x = X[train_index], X[test_index]
            train_y, test_y = y[train_index], y[test_index]
            self.estimator.fit(train_x, train_y)
            accuracy_scores.append(self.estimator.score(test_x, test_y))

        return np.mean(accuracy_scores)


## Regularized logistic regression (discriminative linear classifier)

In [6]:
class LogRegClassifier(Classifier):
    """ Logistic regression whose regularisation is tuned by built-in 4-fold CV. """
    def __init__(self):
        self.name = "Logistic Regression"
        # Candidate C values searched by LogisticRegressionCV.
        candidate_cs = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
        self.estimator = linear_model.LogisticRegressionCV(
            Cs=candidate_cs,
            cv=4,
            max_iter=1000,
        )

    def train(self, X_train, y_train):
        """ Fit the estimator and report the selected ``C_``. """
        print("training...", end='\r')
        self.estimator.fit(X_train, y_train)
        # NOTE: the value printed is scikit-learn's C_ (inverse regularisation
        # strength), although the message labels it "lambda".
        summary = f"{self.name} trained with lambda: {self.estimator.C_}"
        print(summary)


## Classification trees (flexible classifier)


In [7]:
class TreeClassifier(Classifier):
    """ Hyper-parameter tuning for the decision tree classifier.

    The cost-complexity pruning parameter ``ccp_alpha`` is chosen by a 4-fold
    grid search over the alphas of the pruning path of the training data.
    """
    def __init__(self):
        self.name = "Decision Tree"
        self.estimator = tree.DecisionTreeClassifier()

    def train(self, X_train, y_train):
        """ Grid-search ``ccp_alpha`` and keep the best refitted tree. """
        print("set aplhas...", end='\r')
        path = self.estimator.cost_complexity_pruning_path(X_train, y_train)
        # Floating-point error can produce tiny negative alphas, which the
        # estimator rejects; clip them to 0 and drop duplicates so the same
        # candidate is not searched twice.
        ccp_alphas = np.unique(np.clip(path.ccp_alphas, 0, None))
        parameters = {'ccp_alpha': ccp_alphas}
        clf = GridSearchCV(self.estimator, parameters, cv=4)

        print("training...", end='\r')
        clf.fit(X_train, y_train)
        self.estimator = clf.best_estimator_
        print(f"{self.name} trained with aplha: {self.estimator.ccp_alpha}")


## Random forests (flexible classifier)

In [8]:
class RandForestClassifier(Classifier):
    """ Hyper-parameter tuning for the random forest classifier.

    ``n_estimators`` and ``max_features`` are chosen by minimising the
    out-of-bag error on the training data.
    """
    def __init__(self, min_trees=20, max_trees=160):
        self.name = "Random Forest"
        self.estimator = ensemble.RandomForestClassifier(oob_score=True)
        # "auto" was dropped from the candidates: for classifiers it was an
        # alias of "sqrt" and it is no longer accepted by scikit-learn >= 1.3.
        self.max_features_list = ["sqrt", "log2"]
        self.n_trees = range(min_trees, max_trees, 10)
        # Filled by train(): {max_features: [(n_trees, oob_error), ...]}
        self.error_rates = None

    def train(self, X_train, y_train):
        """ Search the grid by OOB error, then refit with the best setting.

        Raises ``ValueError`` when the search grid is empty.
        """
        error_rates = OrderedDict((label, []) for label in self.max_features_list)
        best_label, best_n, best_error = None, None, float("inf")
        for label in self.max_features_list:
            for n in self.n_trees:
                print(f"tuning... {label}, {n}", end='\r')
                self.estimator.set_params(n_estimators=n,max_features=label)
                self.estimator.fit(X_train, y_train)
                oob_error = 1 - self.estimator.oob_score_
                error_rates[label].append((n, oob_error))
                # Strict comparison: the first configuration wins ties.
                if oob_error < best_error:
                    best_label, best_n, best_error = label, n, oob_error

        # Keep the curve so it can be inspected or plotted afterwards.
        self.error_rates = error_rates

        if best_n is None:
            raise ValueError("empty search grid: check min_trees/max_trees and max_features_list")

        print("training...", end='\r')
        self.estimator.set_params(n_estimators=best_n,max_features=best_label)
        self.estimator.fit(X_train, y_train)
        print(f"{self.name} trained with hyper-parameters: {self.estimator.n_estimators}, {self.estimator.max_features}")


## Model accuracy comparison analysis
Comparisons of the accuracy of different models should be supported by a statistical test. For the comparison of the other quality measures (precision, recall, F1 score), a statistical test is not required.

In [18]:
def run_models(bigram = False, vec_type = "tfidf", min_df = False):
    """Train and evaluate all classifiers on one feature representation.

    Parameters
    ----------
    bigram : bool
        Use unigrams and bigrams instead of unigrams only.
    vec_type : {"tfidf", "countvec", "binary"}
        Feature representation: tf-idf weights, raw counts or binary counts.
    min_df : int, float or False
        Minimum document frequency forwarded to the vectorizer.

    Raises
    ------
    ValueError
        If ``vec_type`` is not one of the supported values.

    Reads ``./train.csv`` and ``test.csv`` and prints a table with recall,
    precision, accuracy and F1 per classifier. Nothing is returned.
    """
    # Validate before doing any work; previously an unknown value only
    # surfaced later as an unbound-variable error.
    valid_types = ("tfidf", "countvec", "binary")
    if vec_type not in valid_types:
        raise ValueError(f"vec_type must be one of {valid_types}, got {vec_type!r}")

    train = pd.read_csv("./train.csv")
    test = pd.read_csv("test.csv")

    if vec_type == "tfidf":
        train_x, train_y, test_x, test_y = make_xy_train_test(train, test, bigram, min_df=min_df)
    elif vec_type == "countvec":
        train_x, train_y, test_x, test_y = make_countvec_xy_train_test(train, test, bigram, min_df=min_df)
    else:  # "binary"
        train_x, train_y, test_x, test_y = make_countvec_xy_train_test(train, test, bigram, binary=True, min_df=min_df)

    data = {}
    classifiers = [NaiveBayes(), LogRegClassifier(), TreeClassifier(), RandForestClassifier()]
    for clf in classifiers:
        print(f"training {clf.name}", end='\r')
        clf.train(train_x, train_y)
        # evaluate() returns (recall, precision, accuracy, f1), matching the
        # row index of the table built below.
        data[clf.name] = clf.evaluate(test_x, test_y)
    df = pd.DataFrame(
        data,
        columns=[clf.name for clf in classifiers],
        index=["recall", "precision", "accuracy", "f1"],
    )
    print(f'{"bigram" if bigram else "unigram"} | {vec_type}:')
    print(df)

# Compare the classifiers on unigram+bigram features with min_df=0.01,
# once per vectorizer type (tf-idf is the default vec_type).
run_models(bigram=True, min_df=0.01)
run_models(vec_type="countvec", bigram=True, min_df=0.01)
run_models(vec_type="binary", bigram=True, min_df=0.01)



KeyboardInterrupt: 