# SETUP

In [5]:
#IMPORTS
import numpy as np
import pandas as pd
import os
import re
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from IPython.display import display
from sklearn import naive_bayes, linear_model, tree, ensemble
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import recall_score, precision_score, accuracy_score, f1_score
from collections import OrderedDict

In [6]:
def load_reviews(path, fold_nrs, label):
    """Load every ``.txt`` review found in the requested fold folders.

    Parameters
    ----------
    path : str
        Directory prefix that is concatenated directly with ``"fold<i>"``,
        so it must end with a path separator.
    fold_nrs : iterable of int
        Fold numbers to read (folder names are ``fold1``, ``fold2``, ...).
    label : int
        Class label stored in the ``Label`` column of every loaded row.

    Returns
    -------
    pandas.DataFrame
        One row per review with the columns ``Raw`` (text as read),
        ``Processed`` (output of ``process_string``) and ``Label``.
        An empty frame with these columns is returned when no review is found.
    """
    columns = ['Raw', 'Processed', 'Label']
    rows = []
    for i in fold_nrs:
        fold_dir = path + "fold" + str(i)
        # os.listdir order is arbitrary; sort so the row order is reproducible.
        for file in sorted(os.listdir(fold_dir)):
            if not file.endswith(".txt"):
                continue
            # Context manager guarantees the handle is closed after reading.
            with open(os.path.join(fold_dir, file), "r") as f:
                review = f.read()
            # remove whitespaces, numbers, punctuation, & make lowercase
            rows.append([review, process_string(review), label])
    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # re-copies the whole frame on every call.
    return pd.DataFrame(rows, columns=columns)

def make_train_test_set():
    """Build the train and test frames from the negative-polarity reviews.

    Folds 1-4 form the training set and fold 5 the test set. Deceptive
    reviews get label 0 and truthful reviews get label 1. Within each set
    the deceptive rows come first, and the index is reset to 0..n-1.

    Returns
    -------
    list
        ``[train, test]`` as two DataFrames produced by ``load_reviews``.
    """
    # Focusing only on the negative reviews
    deceptive_dir = "./op_spam_v1.4/negative_polarity/deceptive_from_MTurk/"
    truthful_dir = "./op_spam_v1.4/negative_polarity/truthful_from_Web/"

    def _load_both_classes(folds):
        # Label = 0 for deceptive reviews, 1 for truthful reviews.
        deceptive = load_reviews(deceptive_dir, folds, 0)
        truthful = load_reviews(truthful_dir, folds, 1)
        return pd.concat([deceptive, truthful]).reset_index(drop=True)

    training_frame = _load_both_classes(np.arange(4) + 1)  # folds 1-4
    testing_frame = _load_both_classes([5])                # fold 5

    return [training_frame, testing_frame]

def process_string(s):
    """Normalise a review: trim, lowercase, drop digits, drop punctuation.

    Only leading/trailing whitespace is removed; whitespace inside the text
    (including gaps left behind by removed digits or punctuation) is kept.
    """
    drop_punctuation = str.maketrans("", "", string.punctuation)
    lowered = s.strip().lower()
    digitless = re.sub(r'\d+', '', lowered)
    return digitless.translate(drop_punctuation)


##########################
###Process files to CSV###
##########################

# train, test = make_train_test_set()
# train.to_csv("./train.csv", header = ['Raw', 'Processed', 'Label'], index=False)
# test.to_csv("./test.csv", header = ['Raw', 'Processed', 'Label'], index=False)



In [7]:
# Load the train/test splits from CSV files in the working directory.
# NOTE(review): presumably these are the files written by the commented-out
# export step that calls make_train_test_set(); confirm they are up to date.
train = pd.read_csv("./train.csv")
test = pd.read_csv("test.csv")

# Quick sanity check: print the shape and first rows of each split.
print(f"\n Shape of training set: {train.shape}")
print(train.head())
print(f"\n Shape of test set: {test.shape}")
print(test.head())

# Last expression of the cell: rendered as the cell's rich output.
train.head()


 Shape of training set: (640, 3)
                                                 Raw  ... Label
0  My husband and I stayed at the Sofitel Chicago...  ...     0
1  Staying at the Sofitel was one of the less ple...  ...     0
2  I stayed at Sofitel with my husband for a week...  ...     0
3  I stayed at the Sofitel Chicago Water Tower ho...  ...     0
4  After arriving at the Sofitel Chicago Water To...  ...     0

[5 rows x 3 columns]

 Shape of test set: (160, 3)
                                                 Raw  ... Label
0  I stayed at the InterContinental in Chicago fo...  ...     0
1  I have to agree that the InterContinental Chic...  ...     0
2  Upon entering my hotel room at The Palmer Hous...  ...     0
3  We will not be back to this hotel. There are s...  ...     0
4  The Intercontinental Chicago Magnificent Mile ...  ...     0

[5 rows x 3 columns]


Unnamed: 0,Raw,Processed,Label
0,My husband and I stayed at the Sofitel Chicago...,my husband and i stayed at the sofitel chicago...,0
1,Staying at the Sofitel was one of the less ple...,staying at the sofitel was one of the less ple...,0
2,I stayed at Sofitel with my husband for a week...,i stayed at sofitel with my husband for a week...,0
3,I stayed at the Sofitel Chicago Water Tower ho...,i stayed at the sofitel chicago water tower ho...,0
4,After arriving at the Sofitel Chicago Water To...,after arriving at the sofitel chicago water to...,0


In [8]:
# def Tfidf_word(data, stopwords = None):
#     vectorizer = TfidfVectorizer(stop_words = stopwords)
#     vec = vectorizer.fit_transform(data)
#     return pd.DataFrame(vec.toarray(),columns=vectorizer.get_feature_names())

# def Tfidf_bigram(data, stopwords = None):
#     vectorizer = TfidfVectorizer(stop_words = stopwords, ngram_range=(2,2))
#     vec = vectorizer.fit_transform(data)
#     return pd.DataFrame(vec.toarray(),columns=vectorizer.get_feature_names())

# def make_xy(data, bigram = False):
#     if bigram:
#         features = Tfidf_bigram(data["Processed"], stopwords = stopwords.words('english'))    
#     else:
#         features = Tfidf_word(data["Processed"], stopwords = stopwords.words('english'))
#     merged = pd.merge(data, features, left_index = True, right_index = True).sample(frac=1) #merge data and shuffle
#     return merged.iloc[:,3:], merged["Label"] #return x and y

# def make_xy_train_test(train, test, bigram = False):
#     if bigram: # use training data to make vectorizer (vocabulary)
#         features = Tfidf_bigram(train["Processed"], stopwords = stopwords.words('english'))    
#     else:
    #     features = Tfidf_word(train["Processed"], stopwords = stopwords.words('english'))
    # train_merged = pd.merge(train, features, left_index = True, right_index = True).sample(frac=1) #merge data and shuffle
    # test_merged = pd.merge(test, features, left_index = True, right_index = True).sample(frac=1) #merge data and shuffle
#     print("features shape", features.shape)
#     print("test shape", test.shape)
#     print("test merged", test_merged.shape)
#     return [train_merged.iloc[:,3:], train_merged["Label"], test_merged.iloc[:,3:], test_merged["Label"]] #return [x_train, y_train,

def _feature_names(vectorizer):
    """Return the fitted vocabulary as column names on old and new scikit-learn."""
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
    # fall back to it only when the replacement does not exist.
    if hasattr(vectorizer, "get_feature_names_out"):
        return vectorizer.get_feature_names_out()
    return vectorizer.get_feature_names()


def _shuffled_xy(data, matrix, names, random_state = None):
    """Turn a document-term matrix into a shuffled (features, labels) pair."""
    # The default RangeIndex of `features` is the row position in `data`, so
    # rows are paired by position and never dropped, whatever index `data` has.
    features = pd.DataFrame(matrix.toarray(), columns=names)
    x = features.sample(frac=1, random_state=random_state)
    y = data["Label"].iloc[x.index.to_numpy()]
    x.index = y.index  # keep the original row labels on both outputs
    return x, y


def _vectorize_train_test(vectorizer, train, test, random_state = None):
    """Fit `vectorizer` on the training text and transform both splits with it."""
    # The vocabulary is learned from the training data only, so the test set
    # cannot introduce new feature columns.
    train_matrix = vectorizer.fit_transform(train["Processed"])
    test_matrix = vectorizer.transform(test["Processed"])
    names = _feature_names(vectorizer)

    x_train, y_train = _shuffled_xy(train, train_matrix, names, random_state)
    x_test, y_test = _shuffled_xy(test, test_matrix, names, random_state)
    return [x_train, y_train, x_test, y_test]


def make_xy_train_test(train, test, bigram = False, random_state = None):
    """Build shuffled TF-IDF feature matrices and label vectors.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames with a ``Processed`` text column and a ``Label`` column.
    bigram : bool
        Use bigrams only (``ngram_range=(2, 2)``) instead of unigrams.
    random_state : int or None
        Seed forwarded to the row shuffle; ``None`` keeps it unseeded.

    Returns
    -------
    list
        ``[x_train, y_train, x_test, y_test]``.
    """
    stpw = stopwords.words('english')
    ngram_range = (2, 2) if bigram else (1, 1)
    vectorizer = TfidfVectorizer(stop_words = stpw, ngram_range=ngram_range)
    return _vectorize_train_test(vectorizer, train, test, random_state)


def make_countvec_xy_train_test(train, test, bigram = False, binary = False, random_state = None):
    """Build shuffled term-count feature matrices and label vectors.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames with a ``Processed`` text column and a ``Label`` column.
    bigram : bool
        Use bigrams only (``ngram_range=(2, 2)``) instead of unigrams.
    binary : bool
        Forwarded to ``CountVectorizer(binary=...)``.
    random_state : int or None
        Seed forwarded to the row shuffle; ``None`` keeps it unseeded.

    Returns
    -------
    list
        ``[x_train, y_train, x_test, y_test]``.
    """
    stpw = stopwords.words('english')
    ngram_range = (2, 2) if bigram else (1, 1)
    vectorizer = CountVectorizer(stop_words = stpw, binary = binary, ngram_range=ngram_range)
    return _vectorize_train_test(vectorizer, train, test, random_state)

# Build unigram and bigram count features for a visual sanity check.
# The TF-IDF variants are kept below as commented-out alternatives.
# NOTE(review): the make_xy(...) lines refer to a helper that only exists in
# commented-out form in this cell; they cannot be re-enabled as-is.
# train_x, train_y, test_x, test_y = make_xy_train_test(train, test)
train_x, train_y, test_x, test_y = make_countvec_xy_train_test(train, test)

#train_x, train_y = make_xy(train)
#test_x, test_y = make_xy(test)

# bitrain_x, bitrain_y, bitest_x, bitest_y = make_xy_train_test(train, test, bigram=True)
bitrain_x, bitrain_y, bitest_x, bitest_y = make_countvec_xy_train_test(train, test, bigram=True)

#bitrain_x, bitrain_y = make_xy(train, bigram = True)
#bitest_x, bitest_y = make_xy(test, bigram = True)


# Show the first rows of every feature matrix / label vector (rows are shuffled).
display(train_x.head())
display(train_y.head())
display(test_x.head())
display(test_y.head())

display(bitrain_x.head())
display(bitrain_y.head())
display(bitest_x.head())
display(bitest_y.head())


Unnamed: 0,aaa,aaahed,aback,abassador,ability,able,abound,abrupt,absence,absent,absolute,absolutely,absolutly,absurd,abundant,abutting,abysmal,abysmally,ac,accent,accept,acceptable,accepted,accepting,access,accessible,accidentally,accomadations,accomedations,accomidations,accommodate,accommodated,accommodating,accommodations,accomodate,accomodated,accomodating,accomodatingly,accomodations,accomondations,...,yappy,yawn,yeah,year,yearly,yearold,years,yearsthat,yell,yelled,yelling,yellow,yellowy,yelp,yep,yes,yesterday,yesthe,yet,yielded,yoga,yogurt,york,yorkers,yorks,youd,youidiot,youll,young,younger,youre,youth,youve,yrs,yuck,yummy,yunan,yup,zone,zoo
403,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
460,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
85,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
233,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
268,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


403    1
460    1
85     0
233    0
268    0
Name: Label, dtype: int64

Unnamed: 0,aaa,aaahed,aback,abassador,ability,able,abound,abrupt,absence,absent,absolute,absolutely,absolutly,absurd,abundant,abutting,abysmal,abysmally,ac,accent,accept,acceptable,accepted,accepting,access,accessible,accidentally,accomadations,accomedations,accomidations,accommodate,accommodated,accommodating,accommodations,accomodate,accomodated,accomodating,accomodatingly,accomodations,accomondations,...,yappy,yawn,yeah,year,yearly,yearold,years,yearsthat,yell,yelled,yelling,yellow,yellowy,yelp,yep,yes,yesterday,yesthe,yet,yielded,yoga,yogurt,york,yorkers,yorks,youd,youidiot,youll,young,younger,youre,youth,youve,yrs,yuck,yummy,yunan,yup,zone,zoo
51,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
20,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
109,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
153,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


51     0
20     0
15     0
109    1
153    1
Name: Label, dtype: int64

Unnamed: 0,aaa one,aaahed picture,aback phone,abassador east,abassador future,ability fully,ability play,ability supply,able change,able check,able compete,able complete,able create,able dinner,able disconnect,able enjoy,able find,able fix,able gain,able get,able give,able hear,able help,able leave,able look,able make,able moved,able pull,able reach,able relax,able report,able rest,able see,able sit,able successfully,able tell,able use,able walk,able watch,abound gripe,...,younger crowd,younger reserved,younger sister,youre already,youre also,youre check,youre expecting,youre first,youre floors,youre getting,youre going,youre hallway,youre hoping,youre hospitality,youre large,youre looking,youre massage,youre motel,youre paying,youre physically,youre really,youre right,youre sht,youre something,youre standing,youre staying,youre traveler,youre traveling,youth hotel,youve expecting,yrs old,yuck im,yuck next,yummy conclusion,yunan didnt,yup even,yup got,zone computer,zone would,zoo second
368,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
198,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
421,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
44,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
632,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


368    1
198    0
421    1
44     0
632    1
Name: Label, dtype: int64

Unnamed: 0,aaa one,aaahed picture,aback phone,abassador east,abassador future,ability fully,ability play,ability supply,able change,able check,able compete,able complete,able create,able dinner,able disconnect,able enjoy,able find,able fix,able gain,able get,able give,able hear,able help,able leave,able look,able make,able moved,able pull,able reach,able relax,able report,able rest,able see,able sit,able successfully,able tell,able use,able walk,able watch,abound gripe,...,younger crowd,younger reserved,younger sister,youre already,youre also,youre check,youre expecting,youre first,youre floors,youre getting,youre going,youre hallway,youre hoping,youre hospitality,youre large,youre looking,youre massage,youre motel,youre paying,youre physically,youre really,youre right,youre sht,youre something,youre standing,youre staying,youre traveler,youre traveling,youth hotel,youve expecting,yrs old,yuck im,yuck next,yummy conclusion,yunan didnt,yup even,yup got,zone computer,zone would,zoo second
29,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
20,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
148,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
125,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


29     0
20     0
1      0
148    1
125    1
Name: Label, dtype: int64

# Analysis
Use cross-validation or (for random forests) out-of-bag evaluation to select the values of the hyper-parameters of the algorithms on the training set.

## Multinomial naive Bayes (generative linear classifier)
For naive Bayes, the performance might be improved by applying some form of feature selection (in addition to removing the sparse terms).

In [9]:
class Classifier(object):
    """ Generic classifier object.

    Subclasses set ``self.name`` (used in printed reports) and
    ``self.estimator`` (an object exposing ``predict``), and implement
    ``train``.
    """
    def __init__(self):
        self.name = "Classifier"
        # Was misspelled "esimator", which left `estimator` undefined here.
        self.estimator = None

    def train(self, X_train, y_train):
        """ Fit ``self.estimator``; must be provided by subclasses. """
        raise NotImplementedError("subclasses must implement train()")
    
    def evaluate(self, X_test, y_test):
        """ Score the fitted estimator on a held-out set.

        Returns the tuple ``(recall, precision, accuracy, f1)`` computed from
        the estimator's predictions for ``X_test`` against ``y_test``.
        """
        y_pred = self.estimator.predict(X_test)
        recall = recall_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        return recall, precision, accuracy, f1

class NaiveBayes(Classifier):
    """ Multinomial naive Bayes classifier, fitted with its default settings. """
    def __init__(self):
        self.estimator = naive_bayes.MultinomialNB()
        self.name = "Naive Bayes"

    def train(self, X_train, y_train):
        """ Fit the estimator on the training data and report completion. """
        # TODO: Some feature selection needs to go here
        self.estimator.fit(X_train, y_train)
        status = f"{self.name} trained"
        print(status)


## Regularized logistic regression (discriminative linear classifier)

In [10]:
class LogRegClassifier(Classifier):
    """ Logistic regression whose regularization is chosen by cross-validation.

    Parameters
    ----------
    Cs : sequence of float
        Candidate values of ``C`` passed to ``LogisticRegressionCV``.
    cv : int
        Number of cross-validation folds.
    max_iter : int
        Iteration limit passed to ``LogisticRegressionCV``.
    """
    def __init__(self, Cs=(0.001, 0.01, 0.1, 1, 10, 100, 1000), cv=4, max_iter=1000):
        self.name = "Logistic Regression"
        self.estimator = linear_model.LogisticRegressionCV(cv=cv, max_iter=max_iter, Cs=list(Cs))

    def train(self, X_train, y_train):
        """ Fit the estimator and report the selected regularization. """
        print("training...", end='\r')
        self.estimator.fit(X_train, y_train)
        # C_ is the INVERSE of the regularization strength, so printing it as
        # "lambda" was misleading; report C and the implied lambda = 1/C.
        selected_c = self.estimator.C_
        print(f"{self.name} trained with C: {selected_c} (lambda = 1/C: {1 / selected_c})")


## Classification trees (flexible classifier)


In [11]:
class TreeClassifier(Classifier):
    """ Decision tree whose cost-complexity pruning strength is chosen by CV. """
    def __init__(self):
        self.name = "Decision Tree"
        self.estimator = tree.DecisionTreeClassifier()
    
    def train(self, X_train, y_train):
        """ Grid-search ``ccp_alpha`` over the pruning path and keep the best tree. """
        print("set alphas...", end='\r')
        path = self.estimator.cost_complexity_pruning_path(X_train, y_train)
        # Rounding error can make path values slightly negative, which the
        # tree rejects as ccp_alpha; clip to zero and drop duplicates so each
        # candidate is tried once.
        ccp_alphas = np.unique(np.clip(path.ccp_alphas, 0, None))
        parameters = {'ccp_alpha': ccp_alphas}
        clf = GridSearchCV(self.estimator, parameters, cv=4)

        print("training...", end='\r')
        clf.fit(X_train, y_train)
        # Replace the untuned tree with the refitted best candidate.
        self.estimator = clf.best_estimator_
        print(f"{self.name} trained with alpha: {self.estimator.ccp_alpha}")


## Random forests (flexible classifier)

In [12]:
class RandForestClassifier(Classifier):
    """ Random forest tuned on out-of-bag (OOB) error.

    Parameters
    ----------
    min_trees, max_trees : int
        The forest sizes tried are ``range(min_trees, max_trees, 10)``;
        ``max_trees`` itself is excluded.
    """
    def __init__(self, min_trees=20, max_trees=160):
        self.name = "Random Forest"
        self.estimator = ensemble.RandomForestClassifier(oob_score=True)
        # "auto" is omitted: for classifiers it was a synonym of "sqrt" (so it
        # only repeated the same search) and scikit-learn 1.3 rejects it.
        self.max_features_list = ["sqrt", "log2"]
        self.n_trees = range(min_trees, max_trees, 10)

    def train(self, X_train, y_train):
        """ Pick (max_features, n_estimators) with the lowest OOB error and refit.

        The per-setting ``(n_estimators, oob_error)`` pairs are kept on
        ``self.oob_error_rates_`` for later inspection.

        Raises
        ------
        ValueError
            If the search grid is empty.
        """
        error_rates = OrderedDict((label, []) for label in self.max_features_list)
        best = None  # (max_features, n_estimators, oob_error)
        for label in self.max_features_list:
            for n in self.n_trees:
                print(f"tuning... {label}, {n}", end='\r')
                self.estimator.set_params(n_estimators=n,max_features=label)
                self.estimator.fit(X_train, y_train)
                oob_error = 1 - self.estimator.oob_score_
                error_rates[label].append((n, oob_error))
                # Strict "<" keeps the first setting on ties.
                if best is None or oob_error < best[2]:
                    best = (label, n, oob_error)

        if best is None:
            raise ValueError(
                "empty hyper-parameter grid: check min_trees/max_trees and max_features_list")
        self.oob_error_rates_ = error_rates

        print("training...", end='\r')
        self.estimator.set_params(n_estimators=best[1],max_features=best[0])
        self.estimator.fit(X_train, y_train)
        print(f"{self.name} trained with hyper-parameters: {self.estimator.n_estimators}, {self.estimator.max_features}")


## Model accuracy comparison analysis
Comparisons of the accuracy of different models should be supported by a statistical test. For the comparison of the other quality measures (precision, recall, F1 score), a statistical test is not required.

In [13]:
def run_models(bigram = False, vec_type = "tfidf"):
    """Train and evaluate all four classifiers for one feature representation.

    Parameters
    ----------
    bigram : bool
        Use bigram features instead of unigram features.
    vec_type : str
        ``"tfidf"``, ``"countvec"`` (raw counts) or ``"binary"``
        (count vectorizer with ``binary=True``).

    Raises
    ------
    ValueError
        If ``vec_type`` is not one of the supported values.

    Prints the training messages of each classifier followed by a table of
    recall, precision, accuracy and F1 on the test set. Returns ``None``.
    """
    # Validate up front: an unknown value used to surface only later, as an
    # unbound-variable error after the data had already been loaded.
    valid_vec_types = ("tfidf", "countvec", "binary")
    if vec_type not in valid_vec_types:
        raise ValueError(
            f"Unknown vec_type {vec_type!r}; expected one of {valid_vec_types}")

    train = pd.read_csv("./train.csv")
    test = pd.read_csv("test.csv")

    if vec_type == "tfidf":
        train_x, train_y, test_x, test_y = make_xy_train_test(train, test, bigram)
    elif vec_type == "countvec":
        train_x, train_y, test_x, test_y = make_countvec_xy_train_test(train, test, bigram)
    else:  # "binary"
        train_x, train_y, test_x, test_y = make_countvec_xy_train_test(train, test, bigram, binary=True)

    data = {}
    classifiers = [NaiveBayes(), LogRegClassifier(), TreeClassifier(), RandForestClassifier()]
    for clf in classifiers:
        clf.train(train_x, train_y)
        # evaluate() returns (recall, precision, accuracy, f1), matching the
        # row labels of the table below.
        data[clf.name] = clf.evaluate(test_x, test_y)
    df = pd.DataFrame(
        data,
        columns=[clf.name for clf in classifiers],
        index=["recall", "precision", "accuracy", "f1"])
    print(f'{"bigram" if bigram else "unigram"} | {vec_type}:')
    print(df)

# Compare the three feature representations (TF-IDF, raw counts, binary
# counts) on bigram features; each call trains and scores all classifiers.
# NOTE(review): only bigram runs are executed here; pass bigram=False for unigrams.
run_models(bigram=True)
run_models(vec_type="countvec", bigram=True)
run_models(vec_type="binary", bigram=True)

Naive Bayes trained
Logistic Regression trained with lambda: [1000.]
Decision Tree trained with aplha: 0.002604166666666667
tuning... auto, 20

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])


Random Forest trained with hyper-parameters: 80, sqrt
bigram | tfidf:
           Naive Bayes  Logistic Regression  Decision Tree  Random Forest
recall        0.600000             0.712500       0.750000       0.912500
precision     0.857143             0.750000       0.645161       0.603306
accuracy      0.750000             0.737500       0.668750       0.656250
f1            0.705882             0.730769       0.693642       0.726368
Naive Bayes trained
Logistic Regression trained with lambda: [100.]
Decision Tree trained with aplha: 0.0024999999999999996
tuning... sqrt, 200

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])


Random Forest trained with hyper-parameters: 150, log2
bigram | countvec:
           Naive Bayes  Logistic Regression  Decision Tree  Random Forest
recall        0.675000             0.800000       0.850000       0.900000
precision     0.830769             0.653061       0.591304       0.637168
accuracy      0.768750             0.687500       0.631250       0.693750
f1            0.744828             0.719101       0.697436       0.746114
Naive Bayes trained
Logistic Regression trained with lambda: [0.1]
Decision Tree trained with aplha: 0.002430555555555555
Random Forest trained with hyper-parameters: 30, sqrt
bigram | binary:
           Naive Bayes  Logistic Regression  Decision Tree  Random Forest
recall        0.662500             0.787500       0.850000       0.975000
precision     0.841270             0.677419       0.607143       0.573529
accuracy      0.768750             0.706250       0.650000       0.625000
f1            0.741259             0.728324       0.708333       0.