In [2]:
import os
import logging
import numpy as np
import glob
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from utils.dataset_helper import load_ag_news

# Point the TZ environment variable at America/Chicago.
# NOTE(review): time.tzset() is never called here, so whether the running
# process picks up the new zone may depend on the platform — confirm.
os.environ.update({'TZ': 'America/Chicago'})

# Root logger, emitting INFO and above.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

In [3]:
# Directory holding the DBpedia CSV files, relative to the notebook.
PATH = "./dataset/dbpedia_csv/"

In [4]:
# DBpedia and AG News share the same file format, so the AG News loader is reused.
# The loader is asked to shuffle, lower-case and tokenize the documents.
x_train, x_test, y_train, y_test = load_ag_news(
    PATH,
    shuffle=True,
    lower=True,
    tokenize=True,
)

2019-06-23 16:16:27,472 INFO Shuffled.
2019-06-23 16:16:28,383 INFO Lowered.
2019-06-23 16:22:44,912 INFO Tokenized.


In [5]:
# Report how many documents each split contains.
n_train, n_test = len(x_train), len(x_test)
print('Total training : {}'.format(n_train))
print('Total testing : {}'.format(n_test))

Total training : 560000
Total testing : 70000


In [52]:
# Count the training documents carrying each label 1..14.
for class_id in range(1, 15):
    class_size = np.sum(y_train == class_id)
    print('class {} : {}'.format(class_id, class_size))

class 1 : 40000
class 2 : 40000
class 3 : 40000
class 4 : 40000
class 5 : 40000
class 6 : 40000
class 7 : 40000
class 8 : 40000
class 9 : 40000
class 10 : 40000
class 11 : 40000
class 12 : 40000
class 13 : 40000
class 14 : 40000


In [8]:
# Peek at one tokenized training document, re-joined into a readable string.
sample_doc = x_train[1]
' '.join(sample_doc)

"s-motor was the class designation given by the new york central to its alco-ge built s-1 s-2 s-2a and s-3 electric locomotives . the s-motors hold the distinction of being the world 's first mass-produced main line electric locomotives with the prototype # 6000 being constructed in 1904. the s-motors would serve alone until the more powerful t-motors began to arrive in 1913 eventually displacing them from main line passenger duties ."

In [10]:
# Sanity check: list the distinct label values present in y_train.
np.unique(y_train)

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

## Class, headline, content
Click a class name to jump to its logistic regression result.

1 [Company](#company-1) <br>
2 [EducationalInstitution](#edu-2) <br>
3 [Artist](#artist-3) <br>
4 [Athlete](#athelete-4) <br>
5 [OfficeHolder](#office-5) <br>
6 [MeanOfTransportation](#transport-6) <br>
7 [Building](#building-7) <br>
8 [NaturalPlace](#natural-8) <br>
9 [Village](#village-9) <br>
10 [Animal](#animal-10) <br>
11 [Plant](#plant-11) <br>
12 [Album](#album-12) <br>
13 [Film](#film-13) <br>
14 [WrittenWork](#written-14) <br>
<br> <br>
The load function loads the labels as integers {1, 2, 3, ..., 14}. <br>
We need one binary (one-hot style) label vector per class: 1 for documents of that class, 0 otherwise.

In [30]:
def _one_vs_rest(labels, positive):
    """Return a list with 1 where an entry of ``labels`` equals ``positive`` and 0 elsewhere."""
    return [1 if label == positive else 0 for label in labels]


# One binary target list per class, in the same order as y_train.
y_1_company = _one_vs_rest(y_train, 1)
y_2_edu = _one_vs_rest(y_train, 2)
y_3_artist = _one_vs_rest(y_train, 3)
y_4_athlete = _one_vs_rest(y_train, 4)
y_5_office = _one_vs_rest(y_train, 5)
y_6_transport = _one_vs_rest(y_train, 6)
y_7_building = _one_vs_rest(y_train, 7)
y_8_natural = _one_vs_rest(y_train, 8)
y_9_village = _one_vs_rest(y_train, 9)
y_10_animal = _one_vs_rest(y_train, 10)
y_11_plant = _one_vs_rest(y_train, 11)
y_12_album = _one_vs_rest(y_train, 12)
y_13_film = _one_vs_rest(y_train, 13)
y_14_written = _one_vs_rest(y_train, 14)

In [15]:
from utils import utils
import sklearn
# Importing a package does not load its submodules. The cells below call
# sklearn.linear_model.LogisticRegression, so load that submodule explicitly
# rather than relying on some other import having pulled it in.
import sklearn.linear_model

In [16]:
# Vectorize both splits. With return_cv=True the helper also returns `cv`,
# presumably the vectorizer object (its feature names are read further down).
X_train, X_test, cv = utils.vectorize_keywords_docs(
    x_train,
    x_test,
    return_cv=True,
)

In [17]:
# Shapes of the 'docs' matrices for the training and test splits.
tuple(split['docs'].shape for split in (X_train, X_test))

((560000, 12299), (70000, 12299))

In [18]:
# The raw token lists are not used by the cells that follow in this notebook;
# drop the names (presumably to free memory — confirm).
del x_train
del x_test

<a id='company-1'></a>
# 1 Company vs rest


In [20]:
# Class 1 (Company) vs. rest: fit a binary logistic regression on the document features.
clf = sklearn.linear_model.LogisticRegression()
clf.fit(X_train['docs'], y_1_company)

# Weight vector of the fitted binary model, one entry per feature column.
weight = clf.coef_[0]

# Vocabulary lookup for the feature columns. get_feature_names() was removed in
# scikit-learn 1.2, so use get_feature_names_out() when the vectorizer provides it
# and keep the old call as a fallback for older releases.
# NOTE(review): assumes `cv` is a scikit-learn vectorizer — confirm against utils.
if hasattr(cv, 'get_feature_names_out'):
    words = cv.get_feature_names_out()
else:
    words = cv.get_feature_names()

# Feature indices sorted by descending weight.
indices = np.argsort(weight)[::-1]



In [22]:
# List the 100 features with the largest weights for the Company model.
banner = '100 KEYWORD ASSOCIATED WITH [COMPANY] CATEGORY'
print('-'*10, banner)
for idx in indices[:100]:
    token, coef = words[idx], np.around(weight[idx], 3)
    print('{} \t\t {}'.format(token, coef))

---------- 100 KEYWORD ASSOCIATED WITH [COMPANY] CATEGORY
winery 		 4.598
airline 		 4.23
brewery 		 3.878
manufacturer 		 3.257
label 		 3.186
publisher 		 2.897
retailer 		 2.839
company 		 2.834
llp 		 2.757
brand 		 2.75
operator 		 2.734
manufactures 		 2.596
distributor 		 2.567
firm 		 2.566
developer 		 2.467
device 		 2.456
bank 		 2.409
soap 		 2.39
transit 		 2.356
supplier 		 2.354
founded 		 2.352
chain 		 2.333
auction 		 2.183
headquartered 		 2.118
banco 		 2.113
foundry 		 2.088
imprint 		 2.078
press 		 2.07
provider 		 2.052
mma 		 2.044
bikes 		 1.926
beers 		 1.92
store 		 1.917
gaisha 		 1.897
agency 		 1.89
société 		 1.884
builder 		 1.852
publish 		 1.852
shipyard 		 1.826
users 		 1.814
publishers 		 1.805
appliances 		 1.797
lindsay 		 1.785
bus 		 1.783
shipbuilders 		 1.782
produces 		 1.735
clothing 		 1.694
shipyards 		 1.689
pulp 		 1.688
enterprise 		 1.669
regulator 		 1.667
patented 		 1.663
airlines 		 1.663
routes 		 1.66
vineyard 		 1.656
formed 		

<a id='edu-2'></a>
# 2 Educational Institution VS Rest

In [23]:
# Class 2 (EducationalInstitution) vs. rest: fit a binary logistic regression.
clf = sklearn.linear_model.LogisticRegression()
clf.fit(X_train['docs'], y_2_edu)

# Weight vector of the fitted binary model, one entry per feature column.
weight = clf.coef_[0]

# Vocabulary lookup for the feature columns. get_feature_names() was removed in
# scikit-learn 1.2, so use get_feature_names_out() when the vectorizer provides it
# and keep the old call as a fallback for older releases.
# NOTE(review): assumes `cv` is a scikit-learn vectorizer — confirm against utils.
if hasattr(cv, 'get_feature_names_out'):
    words = cv.get_feature_names_out()
else:
    words = cv.get_feature_names()

# Feature indices sorted by descending weight.
indices = np.argsort(weight)[::-1]

In [24]:
# List the 100 features with the largest weights for the EducationalInstitution model.
banner = '100 KEYWORD ASSOCIATED WITH [EDUCATIONAL INSTITUTION ] CATEGORY'
print('-'*10, banner)
for idx in indices[:100]:
    token, coef = words[idx], np.around(weight[idx], 3)
    print('{} \t\t {}'.format(token, coef))

---------- 100 KEYWORD ASSOCIATED WITH [EDUCATIONAL INSTITUTION ] CATEGORY
school 		 4.499
college 		 3.285
university 		 3.195
universidad 		 2.994
coeducational 		 2.774
pupils 		 2.74
dàxué 		 2.572
daigaku 		 2.561
seminary 		 2.468
gymnasium 		 2.376
academy 		 2.308
vidyalaya 		 2.24
faculties 		 2.227
aged 		 2.189
colleges 		 2.156
unified 		 2.147
universidade 		 2.076
polytechnic 		 2.032
campuses 		 2.021
therapy 		 1.974
université 		 1.916
universität 		 1.913
liberal 		 1.904
institute 		 1.897
école 		 1.87
curriculum 		 1.826
public 		 1.808
университет 		 1.751
secondary 		 1.75
degrees 		 1.747
conservatory 		 1.735
missionary 		 1.724
narayan 		 1.708
khyber 		 1.685
honourable 		 1.634
sekolah 		 1.613
constituent 		 1.611
boys 		 1.596
headteacher 		 1.582
private 		 1.579
schools 		 1.577
nigeria 		 1.572
syria 		 1.572
collège 		 1.526
qualification 		 1.519
established 		 1.514
high 		 1.511
educates 		 1.482
sailors 		 1.474
tian 		 1.47
inception 		 1.416
educ

<a id='artist-3'></a>
# 3 Artist VS Rest

In [25]:
# Class 3 (Artist) vs. rest: fit a binary logistic regression on the document features.
clf = sklearn.linear_model.LogisticRegression()
clf.fit(X_train['docs'], y_3_artist)

# Weight vector of the fitted binary model, one entry per feature column.
weight = clf.coef_[0]

# Vocabulary lookup for the feature columns. get_feature_names() was removed in
# scikit-learn 1.2, so use get_feature_names_out() when the vectorizer provides it
# and keep the old call as a fallback for older releases.
# NOTE(review): assumes `cv` is a scikit-learn vectorizer — confirm against utils.
if hasattr(cv, 'get_feature_names_out'):
    words = cv.get_feature_names_out()
else:
    words = cv.get_feature_names()

# Feature indices sorted by descending weight.
indices = np.argsort(weight)[::-1]



In [26]:
# List the 100 features with the largest weights for the Artist model.
banner = '100 KEYWORD ASSOCIATED WITH [ARTIST] CATEGORY'
print('-'*10, banner)
for idx in indices[:100]:
    token, coef = words[idx], np.around(weight[idx], 3)
    print('{} \t\t {}'.format(token, coef))

---------- 100 KEYWORD ASSOCIATED WITH [ARTIST] CATEGORY
painter 		 4.638
photographer 		 3.631
sculptor 		 3.409
rapper 		 3.338
novelist 		 3.284
singer/songwriter 		 3.22
writer 		 3.168
singer 		 3.059
engraver 		 2.92
composer 		 2.876
author 		 2.871
poet 		 2.868
cartoonist 		 2.867
musician 		 2.866
actress 		 2.825
designer 		 2.806
pornographic 		 2.78
artist 		 2.773
actor 		 2.638
screenwriter 		 2.636
fiddle 		 2.445
idol 		 2.436
drummer 		 2.275
vocalist 		 2.26
pianist 		 2.24
violinist 		 2.222
dj 		 2.208
fl 		 2.193
illustrator 		 2.181
biographer 		 2.176
organist 		 2.107
playwright 		 2.065
born 		 2.052
trumpet 		 2.038
she 		 1.999
dramatist 		 1.966
theorist 		 1.939
patron 		 1.928
paraguay 		 1.917
conductor 		 1.903
performer 		 1.888
comedian 		 1.884
he 		 1.863
soloist 		 1.862
guitarist 		 1.847
maintained 		 1.847
violin 		 1.797
writes 		 1.791
collector 		 1.788
astronomical 		 1.786
better 		 1.778
signed 		 1.774
laureate 		 1.762
translator 		 1.74

<a id='athelete-4'></a>
# 4 Athlete VS Rest

In [27]:
# Class 4 (Athlete) vs. rest: fit a binary logistic regression on the document features.
clf = sklearn.linear_model.LogisticRegression()
clf.fit(X_train['docs'], y_4_athlete)

# Weight vector of the fitted binary model, one entry per feature column.
weight = clf.coef_[0]

# Vocabulary lookup for the feature columns. get_feature_names() was removed in
# scikit-learn 1.2, so use get_feature_names_out() when the vectorizer provides it
# and keep the old call as a fallback for older releases.
# NOTE(review): assumes `cv` is a scikit-learn vectorizer — confirm against utils.
if hasattr(cv, 'get_feature_names_out'):
    words = cv.get_feature_names_out()
else:
    words = cv.get_feature_names()

# Feature indices sorted by descending weight.
indices = np.argsort(weight)[::-1]



In [28]:
# List the 100 features with the largest weights for the Athlete model.
banner = '100 KEYWORD ASSOCIATED WITH [ATHLETE] CATEGORY'
print('-'*10, banner)
for idx in indices[:100]:
    token, coef = words[idx], np.around(weight[idx], 3)
    print('{} \t\t {}'.format(token, coef))

---------- 100 KEYWORD ASSOCIATED WITH [ATHLETE] CATEGORY
cricketer 		 5.593
footballer 		 5.457
wrestler 		 4.859
martial 		 4.623
driver 		 4.127
boxer 		 3.887
swimmer 		 3.254
golfer 		 3.247
player 		 3.239
skater 		 3.235
cyclist 		 3.011
rower 		 2.914
midfielder 		 2.896
ski 		 2.889
racer 		 2.864
skier 		 2.815
gymnast 		 2.781
curler 		 2.742
jockey 		 2.714
chess 		 2.662
thrower 		 2.654
olympics 		 2.65
pitcher 		 2.603
football 		 2.583
athlete 		 2.492
defender 		 2.429
rugby 		 2.404
striker 		 2.391
professional 		 2.36
sprinter 		 2.283
judoka 		 2.256
competitor 		 2.237
fencer 		 2.209
tennis 		 2.154
goalkeeper 		 2.127
runner 		 2.123
paralympic 		 2.086
middleweight 		 2.053
rider 		 2.04
grandmaster 		 2.017
jumper 		 1.982
medalist 		 1.96
goaltender 		 1.958
instructor 		 1.953
champion 		 1.934
outfielder 		 1.873
weightlifter 		 1.865
plays 		 1.8
coach 		 1.792
league 		 1.764
heavyweight 		 1.724
team 		 1.721
chi 		 1.688
played 		 1.687
winger 		 1.685


<a id='office-5'></a>
# 5 OfficeHolder VS Rest

In [31]:
# Class 5 (OfficeHolder) vs. rest: fit a binary logistic regression on the document features.
clf = sklearn.linear_model.LogisticRegression()
clf.fit(X_train['docs'], y_5_office)

# Weight vector of the fitted binary model, one entry per feature column.
weight = clf.coef_[0]

# Vocabulary lookup for the feature columns. get_feature_names() was removed in
# scikit-learn 1.2, so use get_feature_names_out() when the vectorizer provides it
# and keep the old call as a fallback for older releases.
# NOTE(review): assumes `cv` is a scikit-learn vectorizer — confirm against utils.
if hasattr(cv, 'get_feature_names_out'):
    words = cv.get_feature_names_out()
else:
    words = cv.get_feature_names()

# Feature indices sorted by descending weight.
indices = np.argsort(weight)[::-1]



In [32]:
# List the 100 features with the largest weights for the current model.
# The banner names class 5 (OfficeHolder), matching the y_5_office labels
# fitted in the preceding cell.
print('-'*10,'100 KEYWORD ASSOCIATED WITH [OFFICE HOLDER] CATEGORY')
for i in indices[:100]:
    print('{} \t\t {}'.format(words[i], np.around(weight[i], 3)))

---------- 100 KEYWORD ASSOCIATED WITH [ATHLETE] CATEGORY
politician 		 4.608
mayor 		 4.185
representatives 		 3.527
senator 		 3.237
statesman 		 3.233
daimyo 		 3.227
diplomat 		 3.099
commissioner 		 2.773
judge 		 2.737
economist 		 2.693
governor 		 2.591
administrator 		 2.556
constituency 		 2.523
minister 		 2.471
servant 		 2.437
ambassador 		 2.434
representative 		 2.218
legislator 		 2.149
representing 		 2.088
banker 		 2.048
ff 		 2.023
sheriff 		 2.021
merchant 		 2.019
senate 		 2.0
businessman 		 1.997
republican 		 1.996
pc 		 1.99
colony 		 1.965
democratic 		 1.913
chancellor 		 1.913
president 		 1.878
leader 		 1.869
deputy 		 1.853
commander 		 1.84
parliamentarian 		 1.826
delegates 		 1.797
ao 		 1.774
column 		 1.741
gujarat 		 1.736
malta 		 1.733
commons 		 1.724
parliament 		 1.705
chairman 		 1.674
philanthropist 		 1.672
entitled 		 1.655
jurist 		 1.641
ministers 		 1.636
commercials 		 1.632
nobleman 		 1.62
councillor 		 1.609
poetic 		 1.582
alan 		 

<a id='transport-6'></a>
# 6 MeanOfTransportation VS Rest

In [33]:
# Class 6 (MeanOfTransportation) vs. rest: fit a binary logistic regression.
clf = sklearn.linear_model.LogisticRegression()
clf.fit(X_train['docs'], y_6_transport)

# Weight vector of the fitted binary model, one entry per feature column.
weight = clf.coef_[0]

# Vocabulary lookup for the feature columns. get_feature_names() was removed in
# scikit-learn 1.2, so use get_feature_names_out() when the vectorizer provides it
# and keep the old call as a fallback for older releases.
# NOTE(review): assumes `cv` is a scikit-learn vectorizer — confirm against utils.
if hasattr(cv, 'get_feature_names_out'):
    words = cv.get_feature_names_out()
else:
    words = cv.get_feature_names()

# Feature indices sorted by descending weight.
indices = np.argsort(weight)[::-1]



In [35]:
# List the 100 features with the largest weights for the current model.
# The banner names class 6 (MeanOfTransportation), matching the y_6_transport
# labels fitted in the preceding cell.
print('-'*10,'100 KEYWORD ASSOCIATED WITH [MeanOfTransportation] CATEGORY')
for i in indices[:100]:
    print('{} \t\t {}'.format(words[i], np.around(weight[i], 3)))

---------- 100 KEYWORD ASSOCIATED WITH [Office] CATEGORY
locomotive 		 4.66
ship 		 3.856
vessel 		 3.851
uav 		 3.411
steamboat 		 3.336
class 		 3.023
ss 		 2.901
car 		 2.807
wrecked 		 2.706
schooner 		 2.659
monoplane 		 2.601
steamer 		 2.559
mv 		 2.536
sank 		 2.536
biplane 		 2.512
boat 		 2.478
navy 		 2.467
glider 		 2.444
yacht 		 2.396
hms 		 2.34
variants 		 2.338
tanker 		 2.328
tugboat 		 2.316
ms 		 2.282
frigate 		 2.218
uss 		 2.215
motorcycle 		 2.208
homebuilt 		 2.196
ferry 		 2.192
produced 		 2.189
vehicle 		 2.143
clipper 		 2.139
generations 		 2.086
aircraft 		 2.049
developed 		 2.038
introduced 		 2.024
airliner 		 1.983
built 		 1.98
helicopter 		 1.956
she 		 1.938
wreck 		 1.93
masted 		 1.908
launched 		 1.87
floating 		 1.868
suv 		 1.85
jet 		 1.843
variant 		 1.839
submarine 		 1.837
steamship 		 1.832
cutter 		 1.81
drilling 		 1.798
speed 		 1.796
icebreaker 		 1.776
configuration 		 1.744
bomber 		 1.735
fighter 		 1.715
barque 		 1.709
flown 		 1

<a id='building-7'></a>
# 7 Building VS Rest

In [36]:
# Class 7 (Building) vs. rest: fit a binary logistic regression on the document features.
clf = sklearn.linear_model.LogisticRegression()
clf.fit(X_train['docs'], y_7_building)

# Weight vector of the fitted binary model, one entry per feature column.
weight = clf.coef_[0]

# Vocabulary lookup for the feature columns. get_feature_names() was removed in
# scikit-learn 1.2, so use get_feature_names_out() when the vectorizer provides it
# and keep the old call as a fallback for older releases.
# NOTE(review): assumes `cv` is a scikit-learn vectorizer — confirm against utils.
if hasattr(cv, 'get_feature_names_out'):
    words = cv.get_feature_names_out()
else:
    words = cv.get_feature_names()

# Feature indices sorted by descending weight.
indices = np.argsort(weight)[::-1]



In [37]:
# List the 100 features with the largest weights for the Building model.
banner = '100 KEYWORD ASSOCIATED WITH [Building] CATEGORY'
print('-'*10, banner)
for idx in indices[:100]:
    token, coef = words[idx], np.around(weight[idx], 3)
    print('{} \t\t {}'.format(token, coef))

---------- 100 KEYWORD ASSOCIATED WITH [Building] CATEGORY
mosque 		 4.516
château 		 4.472
synagogue 		 4.376
museum 		 4.233
hospital 		 4.165
priory 		 4.081
castle 		 3.64
prison 		 3.634
skyscraper 		 3.232
mall 		 3.126
historic 		 3.055
restaurant 		 2.848
shopping 		 2.805
hotel 		 2.782
zen 		 2.737
michelin 		 2.702
church 		 2.692
beam 		 2.684
buddhist 		 2.68
residence 		 2.633
cemetery 		 2.428
nhs 		 2.413
monastery 		 2.403
palace 		 2.373
patients 		 2.296
courthouse 		 2.292
beds 		 2.275
truss 		 2.268
museums 		 2.251
floors 		 2.233
jail 		 2.227
villa 		 2.201
mast 		 2.141
house 		 2.1
basilica 		 2.074
depot 		 2.073
tower 		 2.068
deanery 		 2.054
kirke 		 2.051
correctional 		 2.035
rise 		 2.014
ji 		 1.991
arch 		 1.958
gallery 		 1.915
exhibits 		 1.901
rustic 		 1.9
congregation 		 1.889
exhibitions 		 1.885
ruined 		 1.872
building 		 1.87
monument 		 1.801
cathedral 		 1.801
plantation 		 1.789
completed 		 1.772
venue 		 1.756
hm 		 1.756
pub 		 1.753
m

<a id='natural-8'></a>
# 8 NaturalPlace VS Rest

In [38]:
# Class 8 (NaturalPlace) vs. rest: fit a binary logistic regression on the document features.
clf = sklearn.linear_model.LogisticRegression()
clf.fit(X_train['docs'], y_8_natural)

# Weight vector of the fitted binary model, one entry per feature column.
weight = clf.coef_[0]

# Vocabulary lookup for the feature columns. get_feature_names() was removed in
# scikit-learn 1.2, so use get_feature_names_out() when the vectorizer provides it
# and keep the old call as a fallback for older releases.
# NOTE(review): assumes `cv` is a scikit-learn vectorizer — confirm against utils.
if hasattr(cv, 'get_feature_names_out'):
    words = cv.get_feature_names_out()
else:
    words = cv.get_feature_names()

# Feature indices sorted by descending weight.
indices = np.argsort(weight)[::-1]



In [39]:
# List the 100 features with the largest weights for the NaturalPlace model.
banner = '100 KEYWORD ASSOCIATED WITH [natural] CATEGORY'
print('-'*10, banner)
for idx in indices[:100]:
    token, coef = words[idx], np.around(weight[idx], 3)
    print('{} \t\t {}'.format(token, coef))

---------- 100 KEYWORD ASSOCIATED WITH [natural] CATEGORY
lake 		 4.844
cave 		 4.056
mountain 		 4.019
reservoir 		 3.966
tributary 		 3.92
river 		 3.884
crater 		 3.62
pond 		 3.419
stratovolcano 		 3.215
peak 		 3.134
volcanic 		 3.115
volcano 		 3.108
canal 		 2.907
stream 		 2.819
glacier 		 2.759
kilometre 		 2.492
pass 		 2.449
hill 		 2.426
rises 		 2.409
loch 		 2.402
mount 		 2.368
caves 		 2.354
summit 		 2.34
mercury 		 2.189
fjord 		 2.125
drains 		 2.063
waterway 		 2.038
artificial 		 2.035
alps 		 2.022
lagoon 		 1.913
highest 		 1.859
creek 		 1.848
shropshire 		 1.799
hills 		 1.773
catchment 		 1.772
luxembourg 		 1.759
titan 		 1.744
brazil 		 1.736
massif 		 1.733
mountains 		 1.719
cone 		 1.714
mars 		 1.712
inhabited 		 1.699
hesse 		 1.696
flows 		 1.688
estuary 		 1.686
swimming 		 1.673
mw 		 1.656
thrissur 		 1.64
tyrol 		 1.636
bay 		 1.62
summits 		 1.619
ridge 		 1.606
map 		 1.595
pike 		 1.593
germany 		 1.592
fiji 		 1.588
fishing 		 1.578
reservoirs 

<a id='village-9'></a>
# 9 village VS Rest

In [40]:
# Class 9 (Village) vs. rest: fit a binary logistic regression on the document features.
clf = sklearn.linear_model.LogisticRegression()
clf.fit(X_train['docs'], y_9_village)

# Weight vector of the fitted binary model, one entry per feature column.
weight = clf.coef_[0]

# Vocabulary lookup for the feature columns. get_feature_names() was removed in
# scikit-learn 1.2, so use get_feature_names_out() when the vectorizer provides it
# and keep the old call as a fallback for older releases.
# NOTE(review): assumes `cv` is a scikit-learn vectorizer — confirm against utils.
if hasattr(cv, 'get_feature_names_out'):
    words = cv.get_feature_names_out()
else:
    words = cv.get_feature_names()

# Feature indices sorted by descending weight.
indices = np.argsort(weight)[::-1]



In [41]:
# List the 100 features with the largest weights for the Village model.
banner = '100 KEYWORD ASSOCIATED WITH [village] CATEGORY'
print('-'*10, banner)
for idx in indices[:100]:
    token, coef = words[idx], np.around(weight[idx], 3)
    print('{} \t\t {}'.format(token, coef))

---------- 100 KEYWORD ASSOCIATED WITH [village] CATEGORY
village 		 6.05
population 		 3.173
census 		 2.811
settlement 		 2.729
estonia 		 2.262
here 		 2.217
panchayat 		 2.191
tehsil 		 2.16
municipality 		 2.101
croatia 		 2.059
palestinian 		 1.995
province 		 1.933
mandal 		 1.921
taluk 		 1.881
croix 		 1.86
commune 		 1.839
district 		 1.828
irish 		 1.816
inhabitants 		 1.807
town 		 1.804
administrative 		 1.715
hamlet 		 1.7
cork 		 1.679
block 		 1.648
herzegovina 		 1.606
hungary 		 1.574
voivodeship 		 1.562
community 		 1.519
uruguay 		 1.516
arabic 		 1.507
ghana 		 1.492
karnataka 		 1.489
india 		 1.453
birthplace 		 1.438
farming 		 1.436
place 		 1.434
kerala 		 1.433
suburb 		 1.43
samoa 		 1.427
goa 		 1.406
township 		 1.39
alberta 		 1.387
virgin 		 1.375
grenada 		 1.368
bosnia 		 1.359
residents 		 1.345
saskatchewan 		 1.314
etc 		 1.303
netherlands 		 1.289
home 		 1.267
capital 		 1.264
taluka 		 1.26
zone 		 1.26
governorate 		 1.236
iceland 		 1.23
altit

<a id='animal-10'></a>
# 10 animal VS Rest

In [42]:
# Class 10 (Animal) vs. rest: fit a binary logistic regression on the document features.
clf = sklearn.linear_model.LogisticRegression()
clf.fit(X_train['docs'], y_10_animal)

# Weight vector of the fitted binary model, one entry per feature column.
weight = clf.coef_[0]

# Vocabulary lookup for the feature columns. get_feature_names() was removed in
# scikit-learn 1.2, so use get_feature_names_out() when the vectorizer provides it
# and keep the old call as a fallback for older releases.
# NOTE(review): assumes `cv` is a scikit-learn vectorizer — confirm against utils.
if hasattr(cv, 'get_feature_names_out'):
    words = cv.get_feature_names_out()
else:
    words = cv.get_feature_names()

# Feature indices sorted by descending weight.
indices = np.argsort(weight)[::-1]



In [43]:
# List the 100 features with the largest weights for the Animal model.
banner = '100 KEYWORD ASSOCIATED WITH [animal] CATEGORY'
print('-'*10, banner)
for idx in indices[:100]:
    token, coef = words[idx], np.around(weight[idx], 3)
    print('{} \t\t {}'.format(token, coef))

---------- 100 KEYWORD ASSOCIATED WITH [animal] CATEGORY
racehorse 		 5.22
moth 		 4.68
beetle 		 4.647
frog 		 4.44
spider 		 3.969
thoroughbred 		 3.833
moths 		 3.819
butterfly 		 3.813
beetles 		 3.732
rodent 		 3.456
fish 		 3.451
salamander 		 3.418
lizard 		 3.417
bat 		 3.347
superfamily 		 3.254
fly 		 3.214
bug 		 3.198
bird 		 3.19
catfish 		 3.124
wasp 		 3.075
snake 		 3.035
centimetres 		 2.995
flies 		 2.926
bee 		 2.915
suborder 		 2.894
mites 		 2.864
insect 		 2.846
pest 		 2.79
mantis 		 2.779
breeds 		 2.755
crab 		 2.747
ant 		 2.708
blenny 		 2.658
toad 		 2.617
squirrel 		 2.554
subspecies 		 2.489
cited 		 2.463
owl 		 2.445
worm 		 2.42
passerine 		 2.389
trained 		 2.353
turtle 		 2.346
foaled 		 2.343
spiders 		 2.342
tailed 		 2.333
praying 		 2.328
feeds 		 2.323
predators 		 2.293
venomous 		 2.279
depths 		 2.277
extinct 		 2.274
rat 		 2.26
feed 		 2.243
finch 		 2.242
butterflies 		 2.24
eels 		 2.224
parrot 		 2.223
fishes 		 2.201
bred 		 2.201
gastro

<a id='plant-11'></a>
# 11 plant VS Rest

In [44]:
# Class 11 (Plant) vs. rest: fit a binary logistic regression on the document features.
clf = sklearn.linear_model.LogisticRegression()
clf.fit(X_train['docs'], y_11_plant)

# Weight vector of the fitted binary model, one entry per feature column.
weight = clf.coef_[0]

# Vocabulary lookup for the feature columns. get_feature_names() was removed in
# scikit-learn 1.2, so use get_feature_names_out() when the vectorizer provides it
# and keep the old call as a fallback for older releases.
# NOTE(review): assumes `cv` is a scikit-learn vectorizer — confirm against utils.
if hasattr(cv, 'get_feature_names_out'):
    words = cv.get_feature_names_out()
else:
    words = cv.get_feature_names()

# Feature indices sorted by descending weight.
indices = np.argsort(weight)[::-1]



In [45]:
# List the 100 features with the largest weights for the Plant model.
banner = '100 KEYWORD ASSOCIATED WITH [plant] CATEGORY'
print('-'*10, banner)
for idx in indices[:100]:
    token, coef = words[idx], np.around(weight[idx], 3)
    print('{} \t\t {}'.format(token, coef))

---------- 100 KEYWORD ASSOCIATED WITH [plant] CATEGORY
orchid 		 5.93
algae 		 5.394
shrub 		 5.268
fern 		 5.227
cultivar 		 4.739
ferns 		 4.621
plant 		 4.48
legume 		 4.16
grape 		 4.095
poaceae 		 3.985
conifer 		 3.957
moss 		 3.935
cactus 		 3.778
herb 		 3.574
orchidaceae 		 3.448
subsp 		 3.339
euphorbiaceae 		 3.224
bamboo 		 3.195
cultivars 		 3.156
tree 		 3.128
buddleja 		 3.128
plants 		 3.072
amaranthaceae 		 3.053
orchids 		 3.053
grass 		 3.043
fabaceae 		 2.989
rosaceae 		 2.948
vine 		 2.92
palm 		 2.904
asteraceae 		 2.872
flowering 		 2.868
tillandsia 		 2.82
ulmus 		 2.793
grows 		 2.699
pinus 		 2.621
drosera 		 2.579
lily 		 2.563
leaves 		 2.539
perennial 		 2.509
hybrid 		 2.503
flowers 		 2.467
botanical 		 2.459
var 		 2.439
grown 		 2.437
syn 		 2.406
cactaceae 		 2.401
cultivated 		 2.38
bromeliaceae 		 2.355
aechmea 		 2.345
phyllanthaceae 		 2.286
eucalyptus 		 2.247
carex 		 2.236
leaved 		 2.219
banksia 		 2.218
daisy 		 2.207
grasses 		 2.176
crataeg

<a id='album-12'></a>
# 12 album VS Rest

In [46]:
# Class 12 (Album) vs. rest: fit a binary logistic regression on the document features.
clf = sklearn.linear_model.LogisticRegression()
clf.fit(X_train['docs'], y_12_album)

# Weight vector of the fitted binary model, one entry per feature column.
weight = clf.coef_[0]

# Vocabulary lookup for the feature columns. get_feature_names() was removed in
# scikit-learn 1.2, so use get_feature_names_out() when the vectorizer provides it
# and keep the old call as a fallback for older releases.
# NOTE(review): assumes `cv` is a scikit-learn vectorizer — confirm against utils.
if hasattr(cv, 'get_feature_names_out'):
    words = cv.get_feature_names_out()
else:
    words = cv.get_feature_names()

# Feature indices sorted by descending weight.
indices = np.argsort(weight)[::-1]



In [47]:
# List the 100 features with the largest weights for the Album model.
banner = '100 KEYWORD ASSOCIATED WITH [album] CATEGORY'
print('-'*10, banner)
for idx in indices[:100]:
    token, coef = words[idx], np.around(weight[idx], 3)
    print('{} \t\t {}'.format(token, coef))

---------- 100 KEYWORD ASSOCIATED WITH [album] CATEGORY
album 		 6.257
ep 		 4.199
soundtrack 		 4.113
mixtape 		 3.676
70s 		 2.857
compilation 		 2.623
extended 		 2.574
released 		 2.518
soundtracks 		 2.34
nice 		 2.026
collaboration 		 1.953
by 		 1.872
release 		 1.862
interview 		 1.86
package 		 1.829
herself 		 1.807
cd 		 1.792
sessions 		 1.78
vol 		 1.712
mel 		 1.705
composed 		 1.672
live 		 1.645
tribute 		 1.624
sinatra 		 1.621
celebrate 		 1.616
orchestra 		 1.608
chosen 		 1.607
lp 		 1.604
copies 		 1.593
box 		 1.57
songs 		 1.555
tour 		 1.533
issued 		 1.531
session 		 1.521
sleeve 		 1.518
motion 		 1.512
tracks 		 1.493
atkins 		 1.482
consisted 		 1.432
disc 		 1.416
recorded 		 1.415
roots 		 1.382
stated 		 1.356
split 		 1.327
chet 		 1.325
reissue 		 1.324
vinyl 		 1.32
essential 		 1.312
benefit 		 1.305
powell 		 1.282
whole 		 1.278
studio 		 1.276
reissued 		 1.273
third 		 1.272
themes 		 1.266
download 		 1.266
allmusic 		 1.264
capital 		 1.234
maxi

<a id='film-13'></a>
# 13 film VS Rest

In [48]:
# Class 13 (Film) vs. rest: fit a binary logistic regression on the document features.
clf = sklearn.linear_model.LogisticRegression()
clf.fit(X_train['docs'], y_13_film)

# Weight vector of the fitted binary model, one entry per feature column.
weight = clf.coef_[0]

# Vocabulary lookup for the feature columns. get_feature_names() was removed in
# scikit-learn 1.2, so use get_feature_names_out() when the vectorizer provides it
# and keep the old call as a fallback for older releases.
# NOTE(review): assumes `cv` is a scikit-learn vectorizer — confirm against utils.
if hasattr(cv, 'get_feature_names_out'):
    words = cv.get_feature_names_out()
else:
    words = cv.get_feature_names()

# Feature indices sorted by descending weight.
indices = np.argsort(weight)[::-1]



In [49]:
# List the 100 features with the largest weights for the Film model.
banner = '100 KEYWORD ASSOCIATED WITH [film] CATEGORY'
print('-'*10, banner)
for idx in indices[:100]:
    token, coef = words[idx], np.around(weight[idx], 3)
    print('{} \t\t {}'.format(token, coef))

---------- 100 KEYWORD ASSOCIATED WITH [film] CATEGORY
film 		 5.458
documentary 		 3.743
directed 		 3.649
movie 		 3.48
miniseries 		 3.345
stars 		 2.781
starring 		 2.561
mini 		 2.545
aired 		 2.353
filmed 		 2.226
comedy 		 2.204
serial 		 2.187
drama 		 2.18
minute 		 2.034
shorts 		 1.97
directorial 		 1.905
argentine 		 1.894
dinosaur 		 1.878
shot 		 1.782
melodrama 		 1.737
rr 		 1.712
australian 		 1.663
minutes 		 1.601
inventor 		 1.543
theatrically 		 1.54
cartoon 		 1.537
animated 		 1.522
3d 		 1.492
mexican 		 1.487
expression 		 1.483
moves 		 1.482
translit 		 1.475
computer 		 1.474
booklet 		 1.468
layer 		 1.45
screened 		 1.437
assistant 		 1.434
nbc 		 1.43
gets 		 1.423
musical 		 1.421
gang 		 1.399
thriller 		 1.397
talents 		 1.396
kay 		 1.392
propaganda 		 1.381
segments 		 1.377
value 		 1.376
sarah 		 1.368
episodes 		 1.362
brunswick 		 1.35
sum 		 1.328
fleming 		 1.326
cheltenham 		 1.322
generated 		 1.321
dvd 		 1.32
journey 		 1.319
based 		 1.315

<a id='written-14'></a>
# 14 WrittenWork VS Rest

In [50]:
# Class 14 (WrittenWork) vs. rest: fit a binary logistic regression on the document features.
clf = sklearn.linear_model.LogisticRegression()
clf.fit(X_train['docs'], y_14_written)

# Weight vector of the fitted binary model, one entry per feature column.
weight = clf.coef_[0]

# Vocabulary lookup for the feature columns. get_feature_names() was removed in
# scikit-learn 1.2, so use get_feature_names_out() when the vectorizer provides it
# and keep the old call as a fallback for older releases.
# NOTE(review): assumes `cv` is a scikit-learn vectorizer — confirm against utils.
if hasattr(cv, 'get_feature_names_out'):
    words = cv.get_feature_names_out()
else:
    words = cv.get_feature_names()

# Feature indices sorted by descending weight.
indices = np.argsort(weight)[::-1]



In [51]:
# List the 100 features with the largest weights for the WrittenWork model.
banner = '100 KEYWORD ASSOCIATED WITH [written] CATEGORY'
print('-'*10, banner)
for idx in indices[:100]:
    token, coef = words[idx], np.around(weight[idx], 3)
    print('{} \t\t {}'.format(token, coef))

---------- 100 KEYWORD ASSOCIATED WITH [written] CATEGORY
newspaper 		 5.289
novel 		 4.123
magazine 		 3.845
journal 		 3.825
autobiography 		 3.691
manga 		 3.55
play 		 3.123
book 		 3.121
ova 		 2.85
poem 		 2.756
monthly 		 2.71
publication 		 2.622
published 		 2.515
memoir 		 2.465
by 		 2.421
newsletter 		 2.373
novella 		 2.363
quarterly 		 2.26
strip 		 2.172
paper 		 2.162
peer 		 2.152
encyclopedia 		 2.04
anime 		 2.032
isbn 		 2.029
dungeons 		 2.019
anthology 		 2.012
argues 		 2.011
circulation 		 1.998
biography 		 1.996
written 		 1.994
978 		 1.987
periodical 		 1.98
basis 		 1.978
coverage 		 1.907
introduces 		 1.894
ran 		 1.891
edition 		 1.883
panel 		 1.876
issue 		 1.843
garfield 		 1.835
broadway 		 1.835
discusses 		 1.823
pulitzer 		 1.805
trends 		 1.78
theatre 		 1.778
recounts 		 1.755
adapted 		 1.752
covering 		 1.739
volumes 		 1.716
perspective 		 1.7
tabloid 		 1.694
dragons 		 1.683
spike 		 1.659
hana 		 1.642
consequences 		 1.641
losing 		 1.641