In [1]:
#import os
#os.environ['CUDA_VISIBLE_DEVICES'] = ''

import numpy as np
from sklearn.linear_model import LogisticRegression
from utils.dataset_helper import load_imdb
from utils.utils import vectorize_keywords_docs

In [2]:
# Load the IMDB train/test corpora and their labels through the project helper.
# The dataset location is relative to the notebook's working directory.
X_train_corpus, y_train, X_test_corpus, y_test = load_imdb('dataset/aclImdb/')

In [4]:
# Vectorize both corpora with the project helper. With return_cv=True a third
# value `cv` is returned as well (presumably the fitted vectorizer — its
# feature names are read further down to map coefficients back to words).
X_train, X_test, cv = vectorize_keywords_docs(X_train_corpus, X_test_corpus, return_cv=True)

In [5]:
# Keep only the 'docs' entry of each vectorized result for model training.
# NOTE(review): this rebinds X_train / X_test in place, so the cell is not
# safely re-runnable on its own — re-run the vectorization cell first.
X_train = X_train['docs']
X_test = X_test['docs']

In [5]:
# Fit a logistic-regression classifier on the vectorized training documents.
# Its coefficients are used below to rank vocabulary entries.
clf = LogisticRegression(solver='lbfgs', max_iter=500)
clf.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [6]:
# Coefficient of every vocabulary entry (first and only row of coef_ is used).
weights = clf.coef_[0]
# Map column index -> token. scikit-learn removed `get_feature_names()` in
# release 1.2 in favour of `get_feature_names_out()`; prefer the new method
# when the vectorizer provides it and fall back to the old one otherwise, so
# the cell runs on both old and current installations.
# (assumes `cv` is a scikit-learn vectorizer — TODO confirm against utils.utils)
if hasattr(cv, 'get_feature_names_out'):
    words = cv.get_feature_names_out()
else:
    words = cv.get_feature_names()
# Feature indices ordered from the smallest to the largest coefficient.
indices = np.argsort(weights)

In [7]:
# Positive Top 300: the 300 features with the largest coefficients, strongest first
top_positive = indices[::-1][:300]
pos_kw = [words[idx] for idx in top_positive]
for idx in top_positive:
    print(f'{words[idx]:<20} {weights[idx]:>15}')

7/10                 4.336600803854769
8/10                 2.4144803343466053
refreshing           2.149615732689694
hooked               1.9160959423876192
superbly             1.7894879379334536
noir                 1.7296280028657474
haunting             1.7237652408592257
appreciated          1.6910950362008819
rare                 1.6036769678259306
incredible           1.5987104059769555
9/10                 1.5451496998137006
10/10                1.5358242170640997
captures             1.5202431318983916
7                    1.5105610563975609
expressions          1.4887995876571851
wonderfully          1.4858698721963524
existed              1.4539629189775287
worlds               1.4277266219290567
prince               1.4256438134203013
steals               1.4038803996103169
gem                  1.3759687759165928
hong                 1.3732693826174451
definite             1.3631659379077696
flawless             1.3621792061142197
batman               1.3618029295862344
tr

In [8]:
# Negative Top 300: the 300 features with the smallest coefficients, most negative first
top_negative = indices[:300]
neg_kw = [words[idx] for idx in top_negative]
for idx in top_negative:
    print(f'{words[idx]:<20} {weights[idx]:>15}')

4/10                 -4.012366115209716
3/10                 -3.251198633312059
2/10                 -2.9386533087272246
1/10                 -2.8169183564608957
waste                -2.2803912241003204
disappointment       -2.2438871650831156
poorly               -2.1651501966323674
worst                -2.145820482480448
unwatchable          -2.066573374019713
uninteresting        -1.9493112878368544
stinker              -1.9386152854525236
unfunny              -1.9374091554878277
obnoxious            -1.9084168113030375
boredom              -1.8802390169997014
forgettable          -1.850572322951797
laughable            -1.8045332272183732
mildly               -1.7829370896084318
cardboard            -1.776961515005667
pointless            -1.7490694226446606
stupidity            -1.7148412661292454
wooden               -1.7138650664490243
awful                -1.6733317618380896
uninspired           -1.5845466813782914
lousy                -1.548588199595469
dull                 -1

In [9]:
# Persist the selected keywords one per line: the positive list first,
# followed by the negative list.
with open('sentiment-imdb-300.txt', 'w') as out:
    out.writelines(word + '\n' for word in pos_kw)
    out.writelines(word + '\n' for word in neg_kw)

## Consistency

In [65]:
# Imports and configuration for the keyword-consistency check.
import os
import numpy as np
import pandas as pd
# Expose no CUDA devices to this process.
os.environ['CUDA_VISIBLE_DEVICES'] = ''

# Directory holding the exported keyword lists.
# NOTE(review): absolute, machine-specific path — must be adjusted to run elsewhere.
KEYWORD_DIR = '/home/anneke/Documents/ann-mitchell-text-classification/data'

In [66]:
# Inspect which keyword files are available in KEYWORD_DIR.
os.listdir(KEYWORD_DIR)

['amazon-video-unigrams-more.txt',
 'textclassification_meta.md',
 'sentiment-imdb-100.txt',
 '.ipynb_checkpoints',
 'amazon-video-unigrams.txt',
 'sentiment-imdb-200.txt',
 'sentiment-imdb-300.txt',
 'scientific-papers-meta.md',
 'imdb-unigrams.txt',
 'imdb-keywords-mitchell',
 'ecom-unigrams.txt']

In [67]:
# Pair each keyword-list size (100, 200, 300) with the file it was exported to.
keylist = [(size, f'sentiment-imdb-{size}.txt') for size in (100, 200, 300)]

In [68]:
# Display the (size, filename) pairs built above.
keylist

[(100, 'sentiment-imdb-100.txt'),
 (200, 'sentiment-imdb-200.txt'),
 (300, 'sentiment-imdb-300.txt')]

In [69]:
# For every (size, filename) pair, read the file, strip each line, drop
# duplicates, and store the sorted unique keywords under the size as a string.
imdb_keyword = {}

for size, filename in keylist:
    with open(os.path.join(KEYWORD_DIR, filename), 'r') as handle:
        unique_words = {row.strip() for row in handle}
    imdb_keyword[str(size)] = sorted(unique_words)

In [70]:
# Report how many unique keywords each list contains.
for size, _filename in keylist:
    print(len(imdb_keyword[str(size)]))

68
107
150


In [71]:
# Keywords shared by the 200-list and the 300-list.
intersect = set(imdb_keyword['200']).intersection(imdb_keyword['300'])

In [72]:
# Number of keywords common to the 200- and 300-lists.
len(intersect)

107

## Processing all keywords for IMDB (positive and negative)

In [4]:
import json
import os

# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
KEYWORD_DIR = "/home/anneke/Documents/ann-mitchell-text-classification/data/imdb-keywords"

filelist = os.listdir(KEYWORD_DIR)

# One bucket per keyword-list size; each bucket maps the polarity token taken
# from the file name (the summary below expects 'pos' and 'neg') to its keywords.
imdb_keyword = {}
imdb_keyword['100'] = {}
imdb_keyword['200'] = {}
imdb_keyword['300'] = {}

# Not referenced anywhere in the visible notebook; kept in case later cells use it.
keys = {}

for file in filelist:
    path = os.path.join(KEYWORD_DIR, file)
    # File names are expected to end in "-<polarity>-<size>.<ext>".
    file_token = os.path.splitext(file)[0].split('-')

    # Skip everything that is not a keyword list for a known size. This covers
    # sub-directories (e.g. .ipynb_checkpoints) and, importantly, the
    # 'imdb_keywords.json' file this very cell writes into KEYWORD_DIR —
    # without the guard a second run of the cell raises KeyError on it.
    if (not os.path.isfile(path)
            or len(file_token) < 2
            or file_token[-1] not in imdb_keyword):
        continue

    with open(path, 'r') as text:
        # One keyword per line, surrounding whitespace removed. A dedicated
        # name is used so the earlier `keylist` global is not overwritten.
        file_keywords = [line.strip() for line in text]

    imdb_keyword[file_token[-1]][file_token[-2]] = file_keywords

# Per-size counts; raises KeyError if a 'pos' or 'neg' list is missing for a size.
for i in range(100, 301, 100):
    bucket = imdb_keyword[str(i)]
    total_pos = len(bucket['pos'])
    total_neg = len(bucket['neg'])
    bucket['summary'] = {
        'total_pos': total_pos,
        'total_neg': total_neg,
        'total': total_pos + total_neg,
    }

for i in range(100, 301, 100):
    print(imdb_keyword[str(i)]['summary'])

# save to JSON file
with open(os.path.join(KEYWORD_DIR, 'imdb_keywords.json'), 'w') as json_file:
    json.dump(imdb_keyword, json_file, indent=4)

{'total_pos': 32, 'total_neg': 41, 'total': 73}
{'total_pos': 59, 'total_neg': 66, 'total': 125}
{'total_pos': 84, 'total_neg': 86, 'total': 170}


In [5]:
# Collect the per-size summary counts into column-oriented lists
# (one entry per keyword-list size) ready to be turned into a table.
sizes = list(range(100, 301, 100))
summaries = [imdb_keyword[str(size)]['summary'] for size in sizes]

imdb_dict = {
    'length': sizes,
    'pos': [summary['total_pos'] for summary in summaries],
    'neg': [summary['total_neg'] for summary in summaries],
    'total': [summary['total'] for summary in summaries],
}

In [7]:
import pandas as pd

# Tabulate the summary counts: one row per keyword-list size.
imdb_df = pd.DataFrame(imdb_dict)
imdb_df

Unnamed: 0,length,pos,neg,total
0,100,32,41,73
1,200,59,66,125
2,300,84,86,170


## Test the KeywordBank

In [3]:
from utils import utils, dataset_helper
from utils.utils import show_explanations
from train import InterpretableCautiousText, train, test
from KeywordBank import KeywordBank

import json

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


In [4]:
# Location of the merged keyword JSON file.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
IMDB_KEYWORD_PATH = '/home/anneke/Documents/ann-mitchell-text-classification/data/imdb-keywords/imdb_keywords.json'

In [5]:
# Read the merged keyword JSON. The context manager closes the file handle
# as soon as parsing finishes instead of leaving it open until garbage
# collection (the original passed a bare open() straight to json.load).
with open(IMDB_KEYWORD_PATH, 'r') as keyword_file:
    keyword = json.load(keyword_file)

In [6]:
# Wrap the loaded keyword dictionary in the project's KeywordBank.
imdbKeyword = KeywordBank(keyword=keyword)
# Presumably populates `imdbKeyword.connotation`, which is inspected in the
# next cell — confirm against the KeywordBank implementation.
imdbKeyword.assign_connotation()

In [11]:
# List the keywords held in the bank's connotation mapping.
list(imdbKeyword.connotation.keys())

['4/10',
 '3/10',
 '2/10',
 '1/10',
 'waste',
 'disappointment',
 'poorly',
 'worst',
 'unwatchable',
 'uninteresting',
 'stinker',
 'unfunny',
 'obnoxious',
 'boredom',
 'forgettable',
 'laughable',
 'mildly',
 'cardboard',
 'pointless',
 'stupidity',
 'wooden',
 'awful',
 'uninspired',
 'lousy',
 'dull',
 'endless',
 'lacks',
 'incoherent',
 'mst3k',
 'dire',
 'appalling',
 'trite',
 'disappointing',
 'alright',
 'avoid',
 'fails',
 'boring',
 'mess',
 'junk',
 'skip',
 'remotely',
 '7/10',
 '8/10',
 'refreshing',
 'hooked',
 'superbly',
 'haunting',
 'appreciated',
 'rare',
 'incredible',
 '9/10',
 '10/10',
 'captures',
 '7',
 'expressions',
 'wonderfully',
 'steals',
 'gem',
 'definite',
 'flawless',
 'underrated',
 'funniest',
 'excellent',
 'balance',
 'smooth',
 'perfect',
 'surprisingly',
 '8',
 'outstanding',
 'enjoyable',
 'builds',
 'executed',
 'superb']