In [2]:
import os
import logging
import numpy as np
import glob
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from utils.dataset_helper import load_ag_news

# Ask for US Central time through the TZ environment variable.
# NOTE(review): Python only guarantees that a changed TZ is honoured after
# time.tzset() (Unix only) -- confirm the log timestamps are really Central time.
os.environ.update(TZ='America/Chicago')

# Root logger at INFO so that INFO-level progress records are not filtered out.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

In [3]:
# Dataset directory (DBpedia CSV, judging by the path); passed to load_ag_news() in the next cell.
PATH = './dataset/dbpedia_csv/'

In [4]:
# DBpedia and AG News share the same format, so the AG News loader is reused here.
x_train, x_test, y_train, y_test = load_ag_news(
    PATH,
    shuffle=True,
    lower=True,
    tokenize=True,
)

2019-06-23 16:16:27,472 INFO Shuffled.
2019-06-23 16:16:28,383 INFO Lowered.
2019-06-23 16:22:44,912 INFO Tokenized.


In [5]:
# Report how many documents each split contains.
n_train, n_test = len(x_train), len(x_test)
print('Total training : {}'.format(n_train))
print('Total testing : {}'.format(n_test))

Total training : 560000
Total testing : 70000


In [8]:
# Join the tokens of the second training document back into one string for a quick look.
' '.join(x_train[1])

"s-motor was the class designation given by the new york central to its alco-ge built s-1 s-2 s-2a and s-3 electric locomotives . the s-motors hold the distinction of being the world 's first mass-produced main line electric locomotives with the prototype # 6000 being constructed in 1904. the s-motors would serve alone until the more powerful t-motors began to arrive in 1913 eventually displacing them from main line passenger duties ."

In [10]:
# List the distinct label values that occur in the training split.
np.unique(y_train)

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

## Class, headline, content
Click a linked class name to jump to its Logistic Regression result.

1 [Company](#company-1) <br>
2 [EducationalInstitution](#edu-2) <br>
3 [Artist](#artist-3) <br>
4 [Athlete](#athelete-4) <br>
5 OfficeHolder <br>
6 MeanOfTransportation <br>
7 Building <br>
8 NaturalPlace <br>
9 Village <br>
10 Animal <br>
11 Plant <br>
12 Album <br>
13 Film <br>
14 WrittenWork <br>
<br> <br>
The load function returns the labels as integers {1, 2, ..., 14}. <br>
We need a binary (one-vs-rest) target vector for each class, i.e. one column of the one-hot encoding.

In [14]:
def _one_vs_rest(labels, positive):
    """Return a list holding 1 where a label equals `positive` and 0 elsewhere."""
    return [1 if label == positive else 0 for label in labels]


# Binary training targets for classes 1-4
# (1 = Company, 2 = EducationalInstitution, 3 = Artist, 4 = Athlete).
y_1_company = _one_vs_rest(y_train, 1)
y_2_edu = _one_vs_rest(y_train, 2)
y_3_artist = _one_vs_rest(y_train, 3)
y_4_athlete = _one_vs_rest(y_train, 4)

In [15]:
from utils import utils
# `import sklearn` on its own does not guarantee that the `linear_model`
# submodule is loaded. Importing the submodule explicitly still binds the name
# `sklearn`, so the `sklearn.linear_model.LogisticRegression()` calls in later
# cells keep working and no longer depend on another module having imported it.
import sklearn.linear_model

In [16]:
# Vectorize both splits with the project helper. Later cells index X_train / X_test
# with the key 'docs' and call cv.get_feature_names() to map columns to words.
# NOTE(review): `cv` is presumably the fitted vectorizer -- confirm in utils.
X_train, X_test, cv = utils.vectorize_keywords_docs(x_train, x_test, return_cv=True)

In [17]:
# Shapes of the 'docs' matrices for the train and test splits.
X_train['docs'].shape, X_test['docs'].shape

((560000, 12299), (70000, 12299))

In [18]:
# Drop the tokenized text; the cells below only use the vectorized X_train / X_test.
# NOTE: after this, re-running the vectorization cell requires reloading the data first.
del x_train, x_test

<a id='company-1'></a>
# 1 Company VS Rest


In [20]:
# One-vs-rest logistic regression: Company (label 1) against every other class.
clf = sklearn.linear_model.LogisticRegression()
clf.fit(X_train['docs'], y_1_company)

# The target is binary (0/1), so the first row of coef_ holds the per-feature weights.
weight = clf.coef_[0]

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() when it exists and fall back on older releases.
# (assumes `cv` is a scikit-learn vectorizer -- confirm in utils)
if hasattr(cv, 'get_feature_names_out'):
    words = cv.get_feature_names_out()
else:
    words = cv.get_feature_names()

# Feature indices ordered from the largest to the smallest weight.
indices = np.argsort(weight)[::-1]



In [22]:
# Show the 100 features with the largest weights, rounded to 3 decimals.
print('-' * 10, '100 KEYWORD ASSOCIATED WITH [COMPANY] CATEGORY')
for idx in indices[:100]:
    term, coef = words[idx], np.around(weight[idx], 3)
    print('{} \t\t {}'.format(term, coef))

---------- 100 KEYWORD ASSOCIATED WITH [COMPANY] CATEGORY
winery 		 4.598
airline 		 4.23
brewery 		 3.878
manufacturer 		 3.257
label 		 3.186
publisher 		 2.897
retailer 		 2.839
company 		 2.834
llp 		 2.757
brand 		 2.75
operator 		 2.734
manufactures 		 2.596
distributor 		 2.567
firm 		 2.566
developer 		 2.467
device 		 2.456
bank 		 2.409
soap 		 2.39
transit 		 2.356
supplier 		 2.354
founded 		 2.352
chain 		 2.333
auction 		 2.183
headquartered 		 2.118
banco 		 2.113
foundry 		 2.088
imprint 		 2.078
press 		 2.07
provider 		 2.052
mma 		 2.044
bikes 		 1.926
beers 		 1.92
store 		 1.917
gaisha 		 1.897
agency 		 1.89
société 		 1.884
builder 		 1.852
publish 		 1.852
shipyard 		 1.826
users 		 1.814
publishers 		 1.805
appliances 		 1.797
lindsay 		 1.785
bus 		 1.783
shipbuilders 		 1.782
produces 		 1.735
clothing 		 1.694
shipyards 		 1.689
pulp 		 1.688
enterprise 		 1.669
regulator 		 1.667
patented 		 1.663
airlines 		 1.663
routes 		 1.66
vineyard 		 1.656
formed 		

<a id='edu-2'></a>
# 2 Educational Institution VS Rest

In [23]:
# One-vs-rest logistic regression: EducationalInstitution (label 2) against every other class.
clf = sklearn.linear_model.LogisticRegression()
clf.fit(X_train['docs'], y_2_edu)

# The target is binary (0/1), so the first row of coef_ holds the per-feature weights.
weight = clf.coef_[0]

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() when it exists and fall back on older releases.
# (assumes `cv` is a scikit-learn vectorizer -- confirm in utils)
if hasattr(cv, 'get_feature_names_out'):
    words = cv.get_feature_names_out()
else:
    words = cv.get_feature_names()

# Feature indices ordered from the largest to the smallest weight.
indices = np.argsort(weight)[::-1]

In [24]:
# Show the 100 features with the largest weights, rounded to 3 decimals.
print('-' * 10, '100 KEYWORD ASSOCIATED WITH [EDUCATIONAL INSTITUTION ] CATEGORY')
for idx in indices[:100]:
    term, coef = words[idx], np.around(weight[idx], 3)
    print('{} \t\t {}'.format(term, coef))

---------- 100 KEYWORD ASSOCIATED WITH [EDUCATIONAL INSTITUTION ] CATEGORY
school 		 4.499
college 		 3.285
university 		 3.195
universidad 		 2.994
coeducational 		 2.774
pupils 		 2.74
dàxué 		 2.572
daigaku 		 2.561
seminary 		 2.468
gymnasium 		 2.376
academy 		 2.308
vidyalaya 		 2.24
faculties 		 2.227
aged 		 2.189
colleges 		 2.156
unified 		 2.147
universidade 		 2.076
polytechnic 		 2.032
campuses 		 2.021
therapy 		 1.974
université 		 1.916
universität 		 1.913
liberal 		 1.904
institute 		 1.897
école 		 1.87
curriculum 		 1.826
public 		 1.808
университет 		 1.751
secondary 		 1.75
degrees 		 1.747
conservatory 		 1.735
missionary 		 1.724
narayan 		 1.708
khyber 		 1.685
honourable 		 1.634
sekolah 		 1.613
constituent 		 1.611
boys 		 1.596
headteacher 		 1.582
private 		 1.579
schools 		 1.577
nigeria 		 1.572
syria 		 1.572
collège 		 1.526
qualification 		 1.519
established 		 1.514
high 		 1.511
educates 		 1.482
sailors 		 1.474
tian 		 1.47
inception 		 1.416
educ

<a id='artist-3'></a>
# 3 Artist VS Rest

In [25]:
# One-vs-rest logistic regression: Artist (label 3) against every other class.
clf = sklearn.linear_model.LogisticRegression()
clf.fit(X_train['docs'], y_3_artist)

# The target is binary (0/1), so the first row of coef_ holds the per-feature weights.
weight = clf.coef_[0]

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() when it exists and fall back on older releases.
# (assumes `cv` is a scikit-learn vectorizer -- confirm in utils)
if hasattr(cv, 'get_feature_names_out'):
    words = cv.get_feature_names_out()
else:
    words = cv.get_feature_names()

# Feature indices ordered from the largest to the smallest weight.
indices = np.argsort(weight)[::-1]



In [26]:
# Show the 100 features with the largest weights, rounded to 3 decimals.
print('-' * 10, '100 KEYWORD ASSOCIATED WITH [ARTIST] CATEGORY')
for idx in indices[:100]:
    term, coef = words[idx], np.around(weight[idx], 3)
    print('{} \t\t {}'.format(term, coef))

---------- 100 KEYWORD ASSOCIATED WITH [ARTIST] CATEGORY
painter 		 4.638
photographer 		 3.631
sculptor 		 3.409
rapper 		 3.338
novelist 		 3.284
singer/songwriter 		 3.22
writer 		 3.168
singer 		 3.059
engraver 		 2.92
composer 		 2.876
author 		 2.871
poet 		 2.868
cartoonist 		 2.867
musician 		 2.866
actress 		 2.825
designer 		 2.806
pornographic 		 2.78
artist 		 2.773
actor 		 2.638
screenwriter 		 2.636
fiddle 		 2.445
idol 		 2.436
drummer 		 2.275
vocalist 		 2.26
pianist 		 2.24
violinist 		 2.222
dj 		 2.208
fl 		 2.193
illustrator 		 2.181
biographer 		 2.176
organist 		 2.107
playwright 		 2.065
born 		 2.052
trumpet 		 2.038
she 		 1.999
dramatist 		 1.966
theorist 		 1.939
patron 		 1.928
paraguay 		 1.917
conductor 		 1.903
performer 		 1.888
comedian 		 1.884
he 		 1.863
soloist 		 1.862
guitarist 		 1.847
maintained 		 1.847
violin 		 1.797
writes 		 1.791
collector 		 1.788
astronomical 		 1.786
better 		 1.778
signed 		 1.774
laureate 		 1.762
translator 		 1.74

<a id='athelete-4'></a>
# 4 Athlete VS Rest

In [27]:
# One-vs-rest logistic regression: Athlete (label 4) against every other class.
clf = sklearn.linear_model.LogisticRegression()
clf.fit(X_train['docs'], y_4_athlete)

# The target is binary (0/1), so the first row of coef_ holds the per-feature weights.
weight = clf.coef_[0]

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() when it exists and fall back on older releases.
# (assumes `cv` is a scikit-learn vectorizer -- confirm in utils)
if hasattr(cv, 'get_feature_names_out'):
    words = cv.get_feature_names_out()
else:
    words = cv.get_feature_names()

# Feature indices ordered from the largest to the smallest weight.
indices = np.argsort(weight)[::-1]



In [28]:
# Show the 100 features with the largest weights, rounded to 3 decimals.
print('-' * 10, '100 KEYWORD ASSOCIATED WITH [ATHLETE] CATEGORY')
for idx in indices[:100]:
    term, coef = words[idx], np.around(weight[idx], 3)
    print('{} \t\t {}'.format(term, coef))

---------- 100 KEYWORD ASSOCIATED WITH [ATHLETE] CATEGORY
cricketer 		 5.593
footballer 		 5.457
wrestler 		 4.859
martial 		 4.623
driver 		 4.127
boxer 		 3.887
swimmer 		 3.254
golfer 		 3.247
player 		 3.239
skater 		 3.235
cyclist 		 3.011
rower 		 2.914
midfielder 		 2.896
ski 		 2.889
racer 		 2.864
skier 		 2.815
gymnast 		 2.781
curler 		 2.742
jockey 		 2.714
chess 		 2.662
thrower 		 2.654
olympics 		 2.65
pitcher 		 2.603
football 		 2.583
athlete 		 2.492
defender 		 2.429
rugby 		 2.404
striker 		 2.391
professional 		 2.36
sprinter 		 2.283
judoka 		 2.256
competitor 		 2.237
fencer 		 2.209
tennis 		 2.154
goalkeeper 		 2.127
runner 		 2.123
paralympic 		 2.086
middleweight 		 2.053
rider 		 2.04
grandmaster 		 2.017
jumper 		 1.982
medalist 		 1.96
goaltender 		 1.958
instructor 		 1.953
champion 		 1.934
outfielder 		 1.873
weightlifter 		 1.865
plays 		 1.8
coach 		 1.792
league 		 1.764
heavyweight 		 1.724
team 		 1.721
chi 		 1.688
played 		 1.687
winger 		 1.685
