In [1]:
import logging
import os
import re
from datetime import datetime

import numpy as np

# Module-level logger used by the cross-validation code further down.
logger = logging.getLogger(__name__)
# Seed NumPy's global RNG (the KFold(shuffle=True) splitters below are created
# without a random_state of their own).
np.random.seed(2019)

# One timestamped log file per run. The directory is created first because
# logging.basicConfig() raises FileNotFoundError when it does not exist.
log_path = datetime.now().strftime('./logs/%Y-%m-%d-%H-%M-%S.log')
os.makedirs(os.path.dirname(log_path), exist_ok=True)
logging.basicConfig(filename=log_path, level=logging.INFO)

# Also echo log records to the notebook output. Guarded so that re-running
# this cell does not attach a second StreamHandler (which would print every
# message twice). FileHandler subclasses StreamHandler, hence the exact type
# comparison instead of isinstance().
_root_logger = logging.getLogger()
if not any(type(h) is logging.StreamHandler for h in _root_logger.handlers):
    _root_logger.addHandler(logging.StreamHandler())

In [2]:
import os

import numpy as np

def load_data(dp):
    """Read every file in directory ``dp`` and return the file contents.

    Files are visited in ascending numeric order of their base name
    (``2.txt`` before ``10.txt``), so each file name minus its extension
    must parse as an integer.

    Args:
        dp: Path of the directory to read; a trailing separator is optional.

    Returns:
        A list of strings, one per file, in numeric file-name order.

    Raises:
        ValueError: If a file name without its extension is not an integer.
    """
    def numeric_id(fn):
        # Strip whatever extension the file has instead of assuming 4 chars.
        return int(os.path.splitext(fn)[0])

    x = []
    for fn in sorted(os.listdir(dp), key=numeric_id):
        # os.path.join works with or without a trailing separator on dp,
        # unlike plain string concatenation of directory and file name.
        with open(os.path.join(dp, fn), 'r') as f:
            x.append(f.read())
    return x

# np.str was only an alias of the builtin str; it was deprecated in NumPy 1.20
# and removed in NumPy 1.24, so the builtin is used directly.
x_tr_pos = np.array(load_data('./dataset/train/pos/'), dtype=str)
x_tr_neg = np.array(load_data('./dataset/train/neg/'), dtype=str)
# Training texts: positive examples first, then negative examples.
x_tr = np.concatenate((x_tr_pos, x_tr_neg), axis=0)
# Matching labels: 1.0 for every positive text, 0.0 for every negative text.
y_tr = np.concatenate((np.ones_like(x_tr_pos, dtype=np.float64), np.zeros_like(x_tr_neg, dtype=np.float64)), axis=0)
x_ts = np.array(load_data('./dataset/test/'), dtype=str)

# The per-class arrays are not needed once they have been concatenated.
del x_tr_pos
del x_tr_neg

In [3]:
import string

from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize

def preprocessor(s):
    """Normalise a raw text string before tokenisation.

    Steps, in order: lower-case; replace ``<br /><br />``, ``-`` and ``/``
    with a space; replace every whitespace character with a plain space;
    delete ASCII punctuation and digits; drop every character that is not in
    ``string.printable``.

    Args:
        s: The text to clean.

    Returns:
        The cleaned text.
    """
    s = s.lower()
    # The HTML break must be handled before '/' is replaced below.
    s = s.replace('<br /><br />', ' ')
    s = s.replace('-', ' ')
    s = s.replace('/', ' ')
    for ws in string.whitespace:
        # str.replace returns a new string; the result has to be assigned,
        # otherwise tabs and newlines are left untouched.
        s = s.replace(ws, ' ')
    # Remove punctuation and digits in a single pass.
    s = s.translate(str.maketrans('', '', string.punctuation + string.digits))
    s = ''.join(ch for ch in s if ch in string.printable)
    return s

def tokenizer(s):
    """Split ``s`` into tokens, drop English stop words, lemmatise and stem.

    Args:
        s: The text to tokenise.

    Returns:
        A list with one entry per surviving token: the Snowball stem of the
        WordNet lemma of that token.
    """
    wl = WordNetLemmatizer()
    st = SnowballStemmer('english', ignore_stopwords=True)
    # Load the stop-word list once per call instead of once per token, and
    # keep it in a set for constant-time membership tests.
    stop_words = set(stopwords.words('english'))
    return [st.stem(wl.lemmatize(tok))
            for tok in word_tokenize(s, 'english')
            if tok not in stop_words]

# Token regex handed to the vectorizers: a run of word characters, or one
# single punctuation character. re.escape() is needed because
# string.punctuation contains characters that are special inside a character
# class (\ ] ^ -); left unescaped, the backslash merely escaped the ']' that
# follows it and was itself never matched.
token_pattern = r'\w+|[%s]' % re.escape(string.punctuation)

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

def tf_idf_transform(x_tr, x_ts):
    """Convert training and test texts into TF-IDF feature matrices.

    A ``TfidfVectorizer`` using the notebook-level ``token_pattern`` and
    1- to 3-grams is fitted on ``x_tr`` and then applied to ``x_ts``.

    Returns:
        A ``(train_features, test_features)`` tuple.
    """
    vectorizer = TfidfVectorizer(ngram_range=(1, 3), token_pattern=token_pattern)
    train_features = vectorizer.fit_transform(x_tr)
    test_features = vectorizer.transform(x_ts)
    return train_features, test_features

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

def count_transform(x_tr, x_ts):
    """Convert training and test texts into binary n-gram indicator matrices.

    A ``CountVectorizer`` with ``binary=True``, the notebook-level
    ``token_pattern`` and 1- to 3-grams is fitted on ``x_tr`` and then
    applied to ``x_ts``.

    Returns:
        A ``(train_features, test_features)`` tuple.
    """
    vectorizer = CountVectorizer(binary=True,
                                 ngram_range=(1, 3),
                                 token_pattern=token_pattern)
    train_features = vectorizer.fit_transform(x_tr)
    test_features = vectorizer.transform(x_ts)
    return train_features, test_features

In [6]:
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score, KFold

def k_fold_cross_validation(k, cl, x_tr, y_tr):
    """Log the F1 score of ``cl`` on each of ``k`` shuffled folds and the mean.

    Args:
        k: Number of folds.
        cl: Estimator exposing ``fit`` and ``predict``; it is refitted on
            every fold.
        x_tr: Array of raw training texts, indexable by an index array.
        y_tr: Array of labels aligned with ``x_tr``.

    Returns:
        None. Scores are only written through ``logger``.
    """
    fold_scores = []
    splitter = KFold(n_splits=k, shuffle=True)
    for fold_no, (fit_idx, val_idx) in enumerate(splitter.split(x_tr)):
        # Vectorise per fold: the vectorizer is fitted on this fold's
        # training part and only applied to its held-out part.
        fit_features, val_features = count_transform(x_tr[fit_idx], x_tr[val_idx])

        cl.fit(fit_features, y_tr[fit_idx])
        score = f1_score(y_tr[val_idx], cl.predict(val_features))
        fold_scores.append(score)

        logger.info('KFold {} CV Score: {}'.format(fold_no, score))
    logger.info('KFold Mean CV Score: {}'.format(sum(fold_scores) / k))

In [7]:
def predict_test(cl, x_tr, y_tr, x_ts, fn):
    """Fit ``cl`` on the full training set and write its test predictions.

    Texts are vectorised with ``count_transform``. Predictions for ``x_ts``
    are written to ``./results/<fn>.csv`` under the header ``Id,Category``,
    one ``row_index,int(label)`` line per test example.

    Args:
        cl: Estimator exposing ``fit`` and ``predict``.
        x_tr: Raw training texts.
        y_tr: Training labels.
        x_ts: Raw test texts.
        fn: Base name (without extension) of the CSV file to write.

    Returns:
        ``(test_predictions, train_predictions)``, where the second item is
        the fitted estimator's predictions on its own training data.
    """
    tr_features, ts_features = count_transform(x_tr, x_ts)
    cl.fit(tr_features, y_tr)
    out_path = './results/{fn}.csv'.format(fn=fn)
    with open(out_path, 'w') as out:
        out.write('Id,Category\n')
        prd = cl.predict(ts_features)
        out.writelines('{i},{y_i}\n'.format(i=row, y_i=int(label))
                       for row, label in enumerate(prd))
    return prd, cl.predict(tr_features)

In [8]:
from sklearn.base import clone
from sklearn.linear_model import LogisticRegression

# Logistic regression: cross-validate a fresh clone, then fit the original
# estimator on all training data and write its test predictions.
cl_lr = LogisticRegression(n_jobs=-1, solver='lbfgs')
k_fold_cross_validation(k=5, cl=clone(cl_lr), x_tr=x_tr, y_tr=y_tr)
prd_ts_lr, prd_tr_lr = predict_test(
    cl=cl_lr, x_tr=x_tr, y_tr=y_tr, x_ts=x_ts, fn='logistic_regression')

KFold 0 CV Score: 0.8950715421303658
KFold 1 CV Score: 0.8957403651115617
KFold 2 CV Score: 0.9057560975609756
KFold 3 CV Score: 0.9042298483639266
KFold 4 CV Score: 0.8960601861017621
KFold Mean CV Score: 0.8993716078537183


In [9]:
from sklearn.base import clone
from sklearn.svm import LinearSVC

# Linear SVM: cross-validate a fresh clone, then fit the original estimator
# on all training data and write its test predictions.
cl_ls = LinearSVC()
k_fold_cross_validation(k=5, cl=clone(cl_ls), x_tr=x_tr, y_tr=y_tr)
prd_ts_ls, prd_tr_ls = predict_test(
    cl=cl_ls, x_tr=x_tr, y_tr=y_tr, x_ts=x_ts, fn='linear_svc')

KFold 0 CV Score: 0.8979179300586214
KFold 1 CV Score: 0.8982248520710059
KFold 2 CV Score: 0.9068029885961464
KFold 3 CV Score: 0.8955462352706212
KFold 4 CV Score: 0.8904438224710116
KFold Mean CV Score: 0.8977871656934813


In [10]:
from sklearn.base import clone
from nbsvm import NBSVM

# NBSVM: cross-validate a fresh clone, then fit the original estimator on
# all training data and write its test predictions.
cl_nb_svm = NBSVM()
k_fold_cross_validation(k=5, cl=clone(cl_nb_svm), x_tr=x_tr, y_tr=y_tr)
prd_ts_nb_svm, prd_tr_nb_svm = predict_test(
    cl=cl_nb_svm, x_tr=x_tr, y_tr=y_tr, x_ts=x_ts, fn='nb_svm')

KFold 0 CV Score: 0.9239086904285142
KFold 1 CV Score: 0.9231993633107839
KFold 2 CV Score: 0.9208949260886935
KFold 3 CV Score: 0.9150456530369193
KFold 4 CV Score: 0.9196179864703541
KFold Mean CV Score: 0.920533323867053


In [11]:
from sklearn.naive_bayes import BernoulliNB

# Bernoulli naive Bayes: cross-validate a fresh clone, then fit the original
# estimator on all training data and write its test predictions.
cl_bnb = BernoulliNB()
k_fold_cross_validation(5, clone(cl_bnb), x_tr, y_tr)
# Pass cl_bnb here (not cl_nb_svm) so that bernoulli_nb.csv and the
# prd_*_bnb arrays really contain the Bernoulli NB predictions.
prd_ts_bnb, prd_tr_bnb = predict_test(cl_bnb, x_tr, y_tr, x_ts, 'bernoulli_nb')

KFold 0 CV Score: 0.8547968885047538
KFold 1 CV Score: 0.8905139991642289
KFold 2 CV Score: 0.8729351969504447
KFold 3 CV Score: 0.8771345272803
KFold 4 CV Score: 0.8791932496398437
KFold Mean CV Score: 0.8749147723079143


In [17]:
# NOTE: The CV logic of this stacking is flawed. The training columns are
# predictions that each base model made on the very data it was fitted on
# (second return value of predict_test), so cross-validation scores of a
# model trained on them can be expected to be inflated.

# One column per base model: logistic regression, linear SVC, Bernoulli NB.
x_stk_tr = np.concatenate(
    [p.reshape((-1, 1)) for p in (prd_tr_lr, prd_tr_ls, prd_tr_bnb)], axis=1)
x_stk_ts = np.concatenate(
    [p.reshape((-1, 1)) for p in (prd_ts_lr, prd_ts_ls, prd_ts_bnb)], axis=1)

In [18]:
# Show the shapes of the stacked train and test matrices, one per line.
print(x_stk_tr.shape, x_stk_ts.shape, sep='\n')

(25000, 3)
(25000, 3)


In [19]:
# 5-fold cross-validation of the stacking meta-classifier on the base-model
# predictions; logs the F1 score of every fold and then their mean.
cl_stk_lr = LogisticRegression(n_jobs=-1, solver='lbfgs')
cv_s = []
stk_folds = KFold(n_splits=5, shuffle=True).split(x_stk_tr)
for i, (tr_idx, cv_idx) in enumerate(stk_folds):
    cl_stk_lr.fit(x_stk_tr[tr_idx], y_tr[tr_idx])
    fold_f1 = f1_score(y_tr[cv_idx], cl_stk_lr.predict(x_stk_tr[cv_idx]))
    cv_s.append(fold_f1)
    logger.info('KFold {} CV Score: {}'.format(i, fold_f1))
logger.info('KFold Mean CV Score: {}'.format(sum(cv_s) / 5))

KFold 0 CV Score: 1.0
KFold 1 CV Score: 1.0
KFold 2 CV Score: 1.0
KFold 3 CV Score: 1.0
KFold 4 CV Score: 1.0
KFold Mean CV Score: 1.0


In [20]:
# Refit the meta-classifier on all stacked training rows, then write its
# predictions for the stacked test rows as "Id,Category" CSV lines.
cl_stk_lr.fit(x_stk_tr, y_tr)
with open('./results/stacking.csv', 'w') as f:
    f.write('Id,Category\n')
    prd = cl_stk_lr.predict(x_stk_ts)
    f.writelines('{i},{y_i}\n'.format(i=i, y_i=int(y_i))
                 for i, y_i in enumerate(prd))

In [16]:
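# NOTE: Run this cell only when the info-gain CSV needs to be (re)generated.
# It is kept commented out and looks stale: `fn` is not defined at notebook
# level and `x_tr` holds raw text rather than a feature matrix -- TODO confirm.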

# from sklearn.feature_selection import mutual_info_classif

# ig_path = './files/info_gain.csv'
# mi = mutual_info_classif(x_tr, y_tr)
# with open(ig_path, 'w') as f:
#     for mi_i, fn_i in sorted(zip(mi, fn), key=lambda x: -x[0]):
#         f.write('{mi_i},{fn_i}\n'.format(mi_i=mi_i, fn_i=fn_i))