In [1]:
import logging
from datetime import datetime

import numpy as np

logger = logging.getLogger(__name__)
np.random.seed(2019)

log_path = datetime.now().strftime('./logs/%Y-%m-%d-%H-%M-%S.log')
logging.basicConfig(filename=log_path, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler())

In [2]:
import os

import numpy as np


def load_data(dp):
    x = []
    for fn in sorted(os.listdir(dp), key=lambda y: int(y[:-4])):
        with open('{dp}{fn}'.format(dp=dp, fn=fn), 'r') as f:
            x.append(f.read())
    return x

logger.info('[{t}] Start'.format(t=datetime.now()))

x_tr_pos = np.array(load_data('./dataset/train/pos/'), dtype=np.str)
x_tr_neg = np.array(load_data('./dataset/train/neg/'), dtype=np.str)
x_tr = np.concatenate((x_tr_pos, x_tr_neg), axis=0)
y_tr = np.concatenate((np.ones_like(x_tr_pos, dtype=np.float64), np.zeros_like(x_tr_neg, dtype=np.float64)), axis=0)
x_ts = np.array(load_data('./dataset/test/'), dtype=np.str)

del x_tr_pos
del x_tr_neg

logger.info('[{t}] Finish'.format(t=datetime.now()))

[2019-02-15 10:00:39.045885] Start
[2019-02-15 10:00:51.840640] Finish


In [3]:
import string

from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer


def preprocessor(s):
    s = s.lower()
    s = s.replace('<br /><br />', ' ')
    s = s.replace('-', ' ')
    s = s.replace('/', ' ')
    for ws in string.whitespace:
        s.replace(ws, ' ')
    s = s.translate(s.maketrans('', '', string.punctuation))
    s = s.translate(s.maketrans('', '', string.digits))
    s = ''.join(filter(lambda x: x in string.printable, s))
    return s


def tokenizer(s):
    wl = WordNetLemmatizer()
    st = SnowballStemmer('english', ignore_stopwords=True)
    ts = word_tokenize(s, 'english')
    ts = list(filter(lambda x: x not in stopwords.words('english'), ts))
    ts = list(map(lambda x: wl.lemmatize(x), ts))
    ts = list(map(lambda x: st.stem(x), ts))
    return ts

In [4]:
ig_path = './files/info_gain.csv'
vocabulary = None
if os.path.exists(ig_path):
    vocabulary = np.genfromtxt(ig_path, delimiter=',', dtype=np.str)[:, 1].squeeze()[:10000]

In [5]:
logger.info('[{t}] Start'.format(t=datetime.now()))

tf_idf = TfidfVectorizer(preprocessor=preprocessor,
                         tokenizer=tokenizer,
                         ngram_range=(1, 3),
                         vocabulary=vocabulary,
                         binary=False,
                         use_idf=True,
                         smooth_idf=True)
x_tr = tf_idf.fit_transform(x_tr)
x_ts = tf_idf.transform(x_ts)
fn = np.array(tf_idf.get_feature_names(), dtype=np.str)

logger.info('[{t}] Finish'.format(t=datetime.now()))

[2019-02-15 10:01:56.824954] Start
[2019-02-15 10:46:03.877464] Finish


In [6]:
print(x_tr.shape)
print(x_ts.shape)

(25000, 10000)
(25000, 10000)


In [38]:
from sklearn.model_selection import cross_validate, KFold

def k_fold_cross_validation(k, cl, x_tr, y_tr):
    cv = KFold(n_splits=k, shuffle=True)
    cv_s = cross_validate(cl, x_tr, y_tr, cv=cv, scoring='f1', return_train_score=True, verbose=1, n_jobs=-1)
    logger.info('KFold Cross Validation Scores: {cv_s}'.format(cv_s=cv_s, indent=4))
    return cv_s

In [40]:
def predict_test(cl, x_tr, y_tr, x_ts, fn):
    cl.fit(x_tr, y_tr)
    with open('./results/{fn}.csv'.format(fn=fn), 'w') as f:
        f.write('Id,Category\n')
        for i, y_i in enumerate(cl.predict(x_ts)):
            f.write('{i},{y_i}\n'.format(i=i, y_i=int(y_i)))
    return None

In [75]:
from sklearn.base import clone
from sklearn.naive_bayes import GaussianNB

cl = GaussianNB()
ss = k_fold_cross_validation(4, clone(cl), x_tr.toarray(), y_tr)
predict_test(clone(cl), x_tr.toarray(), y_tr, x_ts.toarray(), 'gaussian_naive_bayes')
print(sum(ss['test_score']) / 4)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   4 | elapsed:  4.6min remaining:  4.6min
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  4.6min finished
KFold Cross Validation Scores: {'fit_time': array([115.14102101, 129.2283709 , 129.31780195, 129.3186481 ]), 'score_time': array([ 4.13920498, 14.288203  , 14.2076571 , 14.29917502]), 'test_score': array([0.83143259, 0.84015224, 0.82800318, 0.83338613]), 'train_score': array([0.89296797, 0.891862  , 0.89334318, 0.89461235])}


0.8332435332181538


In [76]:
from sklearn.naive_bayes import BernoulliNB

cl = BernoulliNB()
ss = k_fold_cross_validation(4, clone(cl), x_tr.toarray(), y_tr)
predict_test(clone(cl), x_tr.toarray(), y_tr, x_ts.toarray(), 'bernoulli_naive_bayes')
print(sum(ss['test_score']) / 4)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   4 | elapsed:  1.9min remaining:  1.9min
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  1.9min finished
KFold Cross Validation Scores: {'fit_time': array([71.78296924, 72.02037883, 72.51355004, 71.86622286]), 'score_time': array([1.91542864, 3.89467216, 4.20052791, 4.02848697]), 'test_score': array([0.860799  , 0.86700284, 0.86404548, 0.86380098]), 'train_score': array([0.88330794, 0.88048986, 0.8807561 , 0.88122605])}


0.8639120724956857


In [54]:
from sklearn.linear_model import LogisticRegression

cl = LogisticRegression(solver='lbfgs', verbose=1, n_jobs=-1, C=1.5)
ss = k_fold_cross_validation(4, clone(cl), x_tr, y_tr)
predict_test(clone(cl), x_tr, y_tr, x_ts, 'logistic_regression')
print(sum(ss['test_score']) / 4)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   4 | elapsed:    1.4s remaining:    1.4s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    1.5s finished
KFold Cross Validation Scores: {'fit_time': array([1.42756796, 1.20593405, 1.3495729 , 1.32562089]), 'score_time': array([0.00543499, 0.00838923, 0.00443697, 0.00358105]), 'test_score': array([0.88928227, 0.88130285, 0.88825215, 0.89872234]), 'train_score': array([0.93598907, 0.93716889, 0.93579047, 0.93263035])}
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


0.8893899020604852


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.6s finished


In [74]:
from sklearn.svm import LinearSVC

cl = LinearSVC(C=0.2, verbose=1)
ss = k_fold_cross_validation(4, clone(cl), x_tr, y_tr)
predict_test(clone(cl), x_tr, y_tr, x_ts, 'linear_svc')
print(sum(ss['test_score']) / 4)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   4 | elapsed:    0.5s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    0.5s finished
KFold Cross Validation Scores: {'fit_time': array([0.37445402, 0.38459802, 0.371243  , 0.37494302]), 'score_time': array([0.00579715, 0.00595498, 0.00578403, 0.0034399 ]), 'test_score': array([0.89269442, 0.88917361, 0.89112333, 0.8884694 ]), 'train_score': array([0.94287076, 0.94187701, 0.94535984, 0.94552013])}


[LibLinear]0.8903651909519483


In [None]:
# NOTE: Only run this to regenerate the info gain csv

# from sklearn.feature_selection import mutual_info_classif


# logger.info('[{t}] Start'.format(t=datetime.now()))

# mi = mutual_info_classif(x_tr, y_tr)
# with open(ig_path, 'w') as f:
#     for mi_i, fn_i in sorted(zip(mi, fn), key=lambda x: -x[0]):
#         f.write('{mi_i},{fn_i}\n'.format(mi_i=mi_i, fn_i=fn_i))

# logger.info('[{t}] Finish'.format(t=datetime.now()))