# SETUP

In [1]:
#IMPORTS
import numpy as np
import pandas as pd
import os
import re
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
    

In [2]:
def load_reviews(path, fold_nrs, label):
    """Load every ``.txt`` review from the given folds into a DataFrame.

    Args:
        path: Directory prefix that contains the ``fold<N>`` sub-folders.
            The fold name is appended by plain string concatenation, so the
            prefix must already end with a path separator.
        fold_nrs: Iterable of fold numbers to read.
        label: Value stored in the ``Label`` column of every returned row.

    Returns:
        A DataFrame with the columns ``Raw`` (file contents as read),
        ``Processed`` (result of ``process_string``) and ``Label``, one row
        per review, indexed 0..n-1. When no ``.txt`` file is found the frame
        is empty but still has all three columns.
    """
    columns = ['Raw', 'Processed', 'Label']
    rows = []
    for i in fold_nrs:
        p = path + "fold" + str(i)
        # Sorted so the row order does not depend on the OS directory order.
        for file in sorted(os.listdir(p)):
            if not file.endswith(".txt"):
                continue
            # The context manager closes the handle even if reading fails.
            with open(os.path.join(p, file), "r") as f:
                review = f.read()
            # Strip outer whitespace, lowercase, drop digits and punctuation.
            rows.append([review, process_string(review), label])
    # Build the frame once: DataFrame.append no longer exists in pandas 2.x
    # and appending row by row copies the whole frame on every iteration.
    return pd.DataFrame(rows, columns=columns)

def make_train_test_set():
    """Build the train and test DataFrames from the negative-polarity reviews.

    Folds 1-4 form the training set and fold 5 the test set. Deceptive
    reviews get label 0 and truthful reviews label 1; within each set the
    deceptive rows come first.

    Returns:
        A two-element list ``[train, test]`` of DataFrames, each with a fresh
        0..n-1 index.
    """
    # Only the negative reviews are used.
    deceptive_dir = "./op_spam_v1.4/negative_polarity/deceptive_from_MTurk/"
    truthful_dir = "./op_spam_v1.4/negative_polarity/truthful_from_Web/"

    def _load_split(folds):
        # Deceptive (0) first, then truthful (1), stacked and re-indexed.
        parts = [
            load_reviews(deceptive_dir, folds, 0),
            load_reviews(truthful_dir, folds, 1),
        ]
        return pd.concat(parts).reset_index(drop=True)

    training_frame = _load_split(np.arange(1, 5))
    test_frame = _load_split([5])
    return [training_frame, test_frame]

def process_string(s):
    """Normalise a review text.

    Leading/trailing whitespace is stripped, the text is lowercased, every
    run of digits is deleted and all ``string.punctuation`` characters are
    removed. Interior whitespace is left untouched.
    """
    punctuation_table = str.maketrans("", "", string.punctuation)
    trimmed_lower = s.strip().lower()
    without_digits = re.sub(r'\d+', '', trimmed_lower)
    return without_digits.translate(punctuation_table)


##########################
###Process files to CSV###
##########################

# train, test = make_train_test_set()
# train.to_csv("./train.csv", header = ['Raw', 'Processed', 'Label'], index=False)
# test.to_csv("./test.csv", header = ['Raw', 'Processed', 'Label'], index=False)



In [3]:
# Load the train/test splits from CSV files in the working directory
# (presumably the files written by the commented-out export step).
train = pd.read_csv("./train.csv")
test = pd.read_csv("test.csv")

# Sanity check: print the shape and the first rows of each split.
print(f"\n Shape of training set: {train.shape}")
print(train.head())
print(f"\n Shape of test set: {test.shape}")
print(test.head())

# Trailing expression: shown by the notebook as the cell's output value.
train.head()


 Shape of training set: (640, 3)
                                                 Raw  ... Label
0  My husband and I stayed at the Sofitel Chicago...  ...     0
1  Staying at the Sofitel was one of the less ple...  ...     0
2  I stayed at Sofitel with my husband for a week...  ...     0
3  I stayed at the Sofitel Chicago Water Tower ho...  ...     0
4  After arriving at the Sofitel Chicago Water To...  ...     0

[5 rows x 3 columns]

 Shape of test set: (160, 3)
                                                 Raw  ... Label
0  I stayed at the InterContinental in Chicago fo...  ...     0
1  I have to agree that the InterContinental Chic...  ...     0
2  Upon entering my hotel room at The Palmer Hous...  ...     0
3  We will not be back to this hotel. There are s...  ...     0
4  The Intercontinental Chicago Magnificent Mile ...  ...     0

[5 rows x 3 columns]


Unnamed: 0,Raw,Processed,Label
0,My husband and I stayed at the Sofitel Chicago...,my husband and i stayed at the sofitel chicago...,0
1,Staying at the Sofitel was one of the less ple...,staying at the sofitel was one of the less ple...,0
2,I stayed at Sofitel with my husband for a week...,i stayed at sofitel with my husband for a week...,0
3,I stayed at the Sofitel Chicago Water Tower ho...,i stayed at the sofitel chicago water tower ho...,0
4,After arriving at the Sofitel Chicago Water To...,after arriving at the sofitel chicago water to...,0


In [9]:
def Tfidf_word(data, stopwords = None):
    """Compute unigram TF-IDF features for a collection of documents.

    A new ``TfidfVectorizer`` is fitted on ``data`` on every call, so the
    resulting columns depend on the vocabulary of that input alone.

    Args:
        data: Iterable of document strings.
        stopwords: Passed through as the vectorizer's ``stop_words``
            (``None`` keeps every token).

    Returns:
        A dense DataFrame with one row per document and one column per
        vocabulary term.
    """
    vectorizer = TfidfVectorizer(stop_words = stopwords)
    vec = vectorizer.fit_transform(data)
    # get_feature_names() was removed in scikit-learn 1.2; use its
    # replacement when present and fall back on older installations.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()
    return pd.DataFrame(vec.toarray(), columns=feature_names)

def Tfidf_bigram(data, stopwords = None):
    """Compute bigram-only TF-IDF features for a collection of documents.

    A new ``TfidfVectorizer`` with ``ngram_range=(2, 2)`` is fitted on
    ``data`` on every call, so the resulting columns depend on the bigrams
    of that input alone.

    Args:
        data: Iterable of document strings.
        stopwords: Passed through as the vectorizer's ``stop_words``
            (``None`` keeps every token).

    Returns:
        A dense DataFrame with one row per document and one column per
        bigram in the fitted vocabulary.
    """
    vectorizer = TfidfVectorizer(stop_words = stopwords, ngram_range=(2,2))
    vec = vectorizer.fit_transform(data)
    # get_feature_names() was removed in scikit-learn 1.2; use its
    # replacement when present and fall back on older installations.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()
    return pd.DataFrame(vec.toarray(), columns=feature_names)

def make_xy(data, bigram = False, random_state = None):
    """Turn a review DataFrame into shuffled TF-IDF features and labels.

    NOTE(review): the TF-IDF helper fits a new vectorizer on ``data`` each
    time, so calling this separately for a training and a test frame yields
    feature matrices with different columns. Confirm this is intended before
    training on one result and predicting on the other.

    Args:
        data: DataFrame with at least the columns ``Processed`` (text) and
            ``Label``.
        bigram: Use bigram features instead of single-word features.
        random_state: Optional seed forwarded to ``DataFrame.sample`` to make
            the shuffle reproducible. ``None`` keeps the original unseeded
            behaviour.

    Returns:
        A tuple ``(x, y)``: the feature DataFrame and the ``Label`` Series,
        shuffled with the same permutation and sharing the same index.
    """
    english_stopwords = stopwords.words('english')
    if bigram:
        features = Tfidf_bigram(data["Processed"], stopwords = english_stopwords)
    else:
        features = Tfidf_word(data["Processed"], stopwords = english_stopwords)

    # Feature rows follow the row order of `data`, so pair them by position.
    # Joining on the index silently drops or mispairs rows when `data` does
    # not carry a default 0..n-1 index, and slicing off a fixed number of
    # leading columns leaks any extra metadata column into the features.
    features = features.reset_index(drop=True)
    x = features.sample(frac=1, random_state=random_state)  # shuffle
    y = data["Label"].iloc[x.index.to_numpy()]
    x.index = y.index  # both outputs carry the labels of `data`
    return x, y

# Single-word TF-IDF features (x) and labels (y) for the train and test sets.
# NOTE(review): each make_xy call fits its own vectorizer, so the train and
# test feature columns are not guaranteed to match - confirm before fitting
# a model on one and evaluating on the other.
train_x, train_y = make_xy(train)
test_x, test_y = make_xy(test)

# Bigram TF-IDF features and labels for the same two sets.
bitrain_x, bitrain_y = make_xy(train, bigram = True)
bitest_x, bitest_y = make_xy(test, bigram = True)



# Analysis
Use cross-validation or (for random forests) out-of-bag evaluation to select the values of the hyper-parameters of the algorithms on the training set.

## Multinomial naive Bayes (generative linear classifier)
For naive Bayes, the performance might be improved by applying some form of feature selection (in addition to removing the sparse terms).

## Regularized logistic regression (discriminative linear classifier)

## Classification trees (flexible classifier)


## Random forests (flexible classifier)

## Model accuracy comparison analysis
Comparisons of the accuracy of different models should be supported by a statistical test. For the comparison of the other quality measures (precision, recall, F1 score), a statistical test is not required.