In [1]:
import logging
import os
import re
from datetime import datetime

import numpy as np

# Notebook-wide logger and RNG seed.
logger = logging.getLogger(__name__)
np.random.seed(2019)

# One log file per kernel start, named after the current timestamp.
log_path = datetime.now().strftime('./logs/%Y-%m-%d-%H-%M-%S.log')
# basicConfig(filename=...) opens the file immediately and raises
# FileNotFoundError when the target directory is missing, so create it first.
os.makedirs(os.path.dirname(log_path), exist_ok=True)
logging.basicConfig(filename=log_path, level=logging.INFO)

# Echo log records to the cell output as well. Guard against adding a second
# console handler when this cell is re-run (which would duplicate every line).
# FileHandler subclasses StreamHandler, hence the exact type comparison.
root_logger = logging.getLogger()
if not any(type(h) is logging.StreamHandler for h in root_logger.handlers):
    root_logger.addHandler(logging.StreamHandler())

In [2]:
import os

import numpy as np

def load_data(dp):
    """Read every file in a directory, ordered by numeric file name.

    Parameters
    ----------
    dp : str
        Directory path. A trailing path separator is optional.

    Returns
    -------
    list of str
        Full text of each file, sorted by the integer value of the file name
        without its extension (so ``2.txt`` comes before ``10.txt``).

    Raises
    ------
    ValueError
        If a file name (minus extension) cannot be parsed as an integer.
    """
    def numeric_stem(name):
        # Strip whatever extension the file has instead of assuming 4 chars.
        return int(os.path.splitext(name)[0])

    texts = []
    for name in sorted(os.listdir(dp), key=numeric_stem):
        # os.path.join works whether or not `dp` ends with a separator.
        with open(os.path.join(dp, name), 'r') as f:
            texts.append(f.read())
    return texts

logger.info('[{t}] Start'.format(t=datetime.now()))

# The builtin `str` is used as dtype: `np.str` was only an alias for it and
# was removed in NumPy 1.24, where it raises AttributeError.
x_tr_pos = np.array(load_data('./dataset/train/pos/'), dtype=str)
x_tr_neg = np.array(load_data('./dataset/train/neg/'), dtype=str)

# Training texts are stacked positives-first; labels follow the same order
# (1.0 for every positive document, 0.0 for every negative one).
x_tr = np.concatenate((x_tr_pos, x_tr_neg), axis=0)
y_tr = np.concatenate((np.ones_like(x_tr_pos, dtype=np.float64), np.zeros_like(x_tr_neg, dtype=np.float64)), axis=0)
x_ts = np.array(load_data('./dataset/test/'), dtype=str)

# The per-class arrays are no longer needed once concatenated.
del x_tr_pos
del x_tr_neg

logger.info('[{t}] Finish'.format(t=datetime.now()))

[2019-02-18 11:03:35.583870] Start
[2019-02-18 11:03:48.168251] Finish


In [3]:
import string

from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize

def preprocessor(s):
    """Normalise a raw document string before tokenisation.

    Steps, in order:
      1. lower-case the text;
      2. replace each ``<br /><br />`` pair with a single space;
      3. turn ``-``, ``/`` and every character in ``string.whitespace`` into a
         plain space;
      4. drop all remaining ``string.punctuation`` and ``string.digits``;
      5. drop every character that is not in ``string.printable``.

    Parameters
    ----------
    s : str
        Raw document text.

    Returns
    -------
    str
        The cleaned text.
    """
    s = s.lower().replace('<br /><br />', ' ')

    # Single translation table: deletions first, then the separators are
    # overridden so that '-' and '/' become spaces instead of being deleted.
    # The original whitespace loop discarded the result of str.replace, so
    # tabs/newlines were never actually normalised; the table fixes that.
    table = {ord(c): None for c in string.punctuation + string.digits}
    table.update({ord(c): ' ' for c in '-/' + string.whitespace})
    s = s.translate(table)

    printable = set(string.printable)
    return ''.join(c for c in s if c in printable)


def tokenizer(s):
    """Tokenise a document, drop English stop words, then lemmatise and stem.

    Parameters
    ----------
    s : str
        Document text (normally already passed through ``preprocessor``).

    Returns
    -------
    list of str
        Tokens from ``word_tokenize`` that are not English stop words, each
        run through ``WordNetLemmatizer.lemmatize`` and then
        ``SnowballStemmer.stem``.
    """
    wl = WordNetLemmatizer()
    st = SnowballStemmer('english', ignore_stopwords=True)
    # Fetch the stop-word list once per call and keep it in a set; the
    # original evaluated stopwords.words('english') again for every token.
    stop_words = set(stopwords.words('english'))
    return [
        st.stem(wl.lemmatize(tok))
        for tok in word_tokenize(s, 'english')
        if tok not in stop_words
    ]

# Token = a run of word characters, or one single punctuation character.
# The punctuation is escaped before being placed inside the character class:
# unescaped, the `\]` sequence in string.punctuation is read as an escaped ']'
# and a literal backslash is never matched as a token.
token_pattern = r'\w+|[%s]' % re.escape(string.punctuation)

In [None]:
import numpy as np

# Optional fixed vocabulary: second column of the info-gain CSV, if present.
# When the file is absent `vocabulary` stays None.
ig_path = './files/info_gain.csv'
vocabulary = None
if os.path.exists(ig_path):
    # Builtin `str` replaces the `np.str` alias, which was removed in NumPy 1.24.
    # NOTE(review): rows are split on every ',' — a feature name that itself
    # contains a comma would not parse into two columns; confirm the CSV
    # never contains such names.
    vocabulary = np.genfromtxt(ig_path, delimiter=',', dtype=str)[:, 1].squeeze()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# TF-IDF style vectorizer driven by the custom preprocessor/tokenizer above.
# With binary=True and use_idf=False the IDF re-weighting is switched off.
tf_idf = TfidfVectorizer(
    preprocessor=preprocessor,
    tokenizer=tokenizer,
    vocabulary=vocabulary,  # None unless the info-gain CSV was found
    ngram_range=(1, 3),     # unigrams up to trigrams
    binary=True,
    use_idf=False,
    smooth_idf=True,
)

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

# Binary bag-of-n-grams (1 to 3 tokens) using the custom token pattern.
cnt = CountVectorizer(
    binary=True,
    ngram_range=(1, 3),
    token_pattern=token_pattern,
)

In [5]:
import numpy as np

logger.info('[{t}] Start'.format(t=datetime.now()))

# Fit the vocabulary on the training texts only, then reuse it for the test
# texts. Both names are rebound from raw text arrays to the vectorized result.
x_tr = cnt.fit_transform(x_tr)
x_ts = cnt.transform(x_ts)

# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`; prefer the new accessor and fall back to the old
# one on versions that do not have it. Builtin `str` replaces the `np.str`
# alias, which was removed in NumPy 1.24.
fn = np.array((getattr(cnt, 'get_feature_names_out', None) or cnt.get_feature_names)(), dtype=str)

logger.info('[{t}] Finish'.format(t=datetime.now()))

[2019-02-18 11:03:54.227965] Start
[2019-02-18 11:05:34.005563] Finish


In [6]:
# Show the (documents, features) shape of the train and test matrices, one per line.
print(x_tr.shape, x_ts.shape, sep='\n')

(25000, 4996192)
(25000, 4996192)


In [7]:
from sklearn.model_selection import cross_validate, KFold

def k_fold_cross_validation(k, cl, x_tr, y_tr):
    """Run shuffled k-fold cross-validation and log the F1 scores.

    Parameters
    ----------
    k : int
        Number of folds.
    cl : estimator
        Classifier passed to ``cross_validate``.
    x_tr, y_tr
        Training features and labels.

    Returns
    -------
    None
        Results are only written to the log: the full ``cross_validate``
        result dict, followed by the mean test F1 score over the ``k`` folds.
    """
    # NOTE(review): no random_state is given, so the shuffle presumably
    # depends on the global NumPy RNG state — confirm if exact fold
    # reproducibility matters.
    cv = KFold(n_splits=k, shuffle=True)
    cv_s = cross_validate(cl, x_tr, y_tr, cv=cv, scoring='f1', return_train_score=True, verbose=1, n_jobs=-1)
    logger.info('KFold Cross Validation Scores: {cv_s}'.format(cv_s=cv_s))
    # The value is the mean F1 *score* (higher is better); the previous label
    # called it an "Error".
    logger.info('Mean CV F1 Score: {s}'.format(s=sum(cv_s['test_score']) / k))

In [8]:
def predict_test(cl, x_tr, y_tr, x_ts, fn):
    """Fit a classifier and write its test predictions to ``./results/<fn>.csv``.

    Parameters
    ----------
    cl : estimator
        Object exposing ``fit(x, y)`` and ``predict(x)``.
    x_tr, y_tr
        Training features and labels passed to ``cl.fit``.
    x_ts
        Test features passed to ``cl.predict``.
    fn : str
        Output file name without directory or ``.csv`` extension.

    Returns
    -------
    None
        The CSV has the header ``Id,Category`` followed by one
        ``<row index>,<int(prediction)>`` line per test sample.
    """
    # Create the output directory up front so a missing folder is not
    # discovered only after the (potentially long) fit has finished.
    os.makedirs('./results', exist_ok=True)

    cl.fit(x_tr, y_tr)
    # Predict before opening the file so a failing predict() cannot leave a
    # header-only CSV behind.
    y_ts = cl.predict(x_ts)

    with open('./results/{fn}.csv'.format(fn=fn), 'w') as f:
        f.write('Id,Category\n')
        for i, y_i in enumerate(y_ts):
            f.write('{i},{y_i}\n'.format(i=i, y_i=int(y_i)))
    return None

In [None]:
from sklearn.base import clone
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes on densified copies of the feature matrices.
# NOTE(review): at the (25000, 4996192) shape printed earlier, each
# `.toarray()` call would need on the order of a terabyte for 8-byte
# entries — confirm this cell is only run with a reduced vocabulary.
cl_gnb = GaussianNB()
k_fold_cross_validation(k=4, cl=clone(cl_gnb), x_tr=x_tr.toarray(), y_tr=y_tr)
predict_test(cl=cl_gnb, x_tr=x_tr.toarray(), y_tr=y_tr, x_ts=x_ts.toarray(), fn='gaussian_naive_bayes')

In [None]:
from sklearn.base import clone
from sklearn.naive_bayes import BernoulliNB

# Bernoulli naive Bayes on the binary n-gram features.
# The sparse matrices are passed as-is: BernoulliNB accepts sparse input, and
# densifying with `.toarray()` at the (25000, 4996192) shape printed earlier
# would exhaust memory for no benefit.
cl_bnb = BernoulliNB()
# Cross-validate an unfitted clone so `cl_bnb` itself is only fitted once,
# inside predict_test.
k_fold_cross_validation(4, clone(cl_bnb), x_tr, y_tr)
predict_test(cl_bnb, x_tr, y_tr, x_ts, 'bernoulli_naive_bayes')

In [None]:
from sklearn.base import clone
from sklearn.linear_model import LogisticRegression

# Logistic regression (L-BFGS solver, C=1): cross-validate a clone, then fit
# the original on all training data and write the test predictions.
cl_lr = LogisticRegression(C=1, solver='lbfgs', n_jobs=-1, verbose=1)
k_fold_cross_validation(k=4, cl=clone(cl_lr), x_tr=x_tr, y_tr=y_tr)
predict_test(cl=cl_lr, x_tr=x_tr, y_tr=y_tr, x_ts=x_ts, fn='logistic_regression')

In [None]:
from sklearn.base import clone
from sklearn.svm import LinearSVC

# Linear SVM (C=0.75): cross-validate a clone, then fit the original on all
# training data and write the test predictions.
cl_ls = LinearSVC(verbose=1, C=0.75)
k_fold_cross_validation(k=4, cl=clone(cl_ls), x_tr=x_tr, y_tr=y_tr)
predict_test(cl=cl_ls, x_tr=x_tr, y_tr=y_tr, x_ts=x_ts, fn='linear_svc')

In [None]:
from sklearn.base import clone
from sklearn.neighbors import KNeighborsClassifier

# 11-nearest-neighbours classifier: cross-validate a clone, then fit the
# original on all training data and write the test predictions.
kc = KNeighborsClassifier(n_neighbors=11, n_jobs=-1)
k_fold_cross_validation(k=4, cl=clone(kc), x_tr=x_tr, y_tr=y_tr)
predict_test(cl=kc, x_tr=x_tr, y_tr=y_tr, x_ts=x_ts, fn='knn_classifier')

In [9]:
from sklearn.base import clone
from nbsvm import NBSVM

# NBSVM with its default settings: cross-validate a clone, then fit the
# original on all training data and write the test predictions.
cl_nb_svm = NBSVM()
k_fold_cross_validation(k=4, cl=clone(cl_nb_svm), x_tr=x_tr, y_tr=y_tr)
predict_test(cl=cl_nb_svm, x_tr=x_tr, y_tr=y_tr, x_ts=x_ts, fn='nb_svm')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   4 | elapsed:  4.6min remaining:  4.6min
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  4.7min finished
KFold Cross Validation Scores: {'fit_time': array([277.31083679, 277.61475086, 266.52206779, 271.23024178]), 'score_time': array([0.10443807, 0.11288118, 0.24634409, 0.37811899]), 'test_score': array([0.92029216, 0.91406625, 0.91956591, 0.92249047]), 'train_score': array([1., 1., 1., 1.])}
Mean CV Error: 0.919103697061775


In [None]:
# NOTE: Only run this to regenerate the info gain csv

# from sklearn.feature_selection import mutual_info_classif


# logger.info('[{t}] Start'.format(t=datetime.now()))

# mi = mutual_info_classif(x_tr, y_tr)
# with open(ig_path, 'w') as f:
#     for mi_i, fn_i in sorted(zip(mi, fn), key=lambda x: -x[0]):
#         f.write('{mi_i},{fn_i}\n'.format(mi_i=mi_i, fn_i=fn_i))

# logger.info('[{t}] Finish'.format(t=datetime.now()))