# Imports

In [1]:
from gensim.models import KeyedVectors
from gensim.models import FastText
from parser import Parser
import numpy as np
import scipy.stats as st
import pandas as pd

In [51]:
pt_model = KeyedVectors.load_word2vec_format('wiki.pt/wiki.pt.vec')
#pt_model = FastText.load_fasttext_format('wiki.pt/wiki.pt.bin')

In [6]:
words = []
for word in pt_model.vocab:
    words.append(word)

In [7]:
print("Number of Tokens: {}".format(len(words)))

Number of Tokens: 592108


In [8]:
print("Dimension of a word vector: {}".format(
    len(pt_model[words[0]])
))

Dimension of a word vector: 300


In [9]:
words[1000]

'artes'

In [10]:
print("Vector components of a word: {}".format(
    pt_model[words[0]]
))

Vector components of a word: [-0.04817     0.11343    -0.21973    -0.096098   -0.0053306   0.20583
  0.15409     0.20406    -0.010307   -0.045906   -0.019991   -0.14881
  0.071691   -0.010353   -0.040639   -0.19218    -0.054845   -0.099474
 -0.032876   -0.19517     0.091734   -0.1428      0.031234    0.069291
 -0.014771   -0.0059432  -0.092041   -0.11749    -0.012621   -0.037947
  0.045571   -0.086255    0.0032468   0.038197    0.05541    -0.14174
  0.046568   -0.24105    -0.020556    0.075958    0.046163    0.14249
  0.053788    0.073211   -0.020421    0.068428    0.078169    0.10405
  0.058263   -0.22894    -0.083738    0.027697    0.060742    0.12547
  0.018675   -0.059984    0.027855    0.12125    -0.089253    0.053595
 -0.076205   -0.11818     0.016257    0.031477   -0.080403    0.079504
 -0.24153    -0.069171    0.019415   -0.040792    0.081118   -0.072191
 -0.026516   -0.17692    -0.033728   -0.1433     -0.0073834  -0.043411
 -0.16559     0.084007   -0.043258    0.075611   -0.03

In [11]:
find_similar_to = 'carro'

for similar_word in pt_model.similar_by_word(find_similar_to):
    print("Word: {0}, Similarity: {1:.2f}".format(
        similar_word[0], similar_word[1]
    ))

Word: carros, Similarity: 0.75
Word: caminhão, Similarity: 0.73
Word: automóvel, Similarity: 0.70
Word: carroção, Similarity: 0.69
Word: motorista, Similarity: 0.68
Word: caminhonete, Similarity: 0.67
Word: capotar, Similarity: 0.66
Word: furgão, Similarity: 0.65
Word: carroçeria, Similarity: 0.65
Word: capotando, Similarity: 0.65


  if np.issubdtype(vec.dtype, np.int):


In [12]:
word_add = ['dormir', 'beber']
word_sub = ['cama']


for resultant_word in pt_model.most_similar(
    positive=word_add, negative=word_sub
):
    print("Word : {0} , Similarity: {1:.2f}".format(
        resultant_word[0], resultant_word[1]
    ))

  if np.issubdtype(vec.dtype, np.int):


Word : comer , Similarity: 0.61
Word : beberem , Similarity: 0.60
Word : beberdes , Similarity: 0.59
Word : beberes , Similarity: 0.57
Word : bebemorar , Similarity: 0.57
Word : beberagem , Similarity: 0.56
Word : bebem , Similarity: 0.56
Word : bebera , Similarity: 0.56
Word : dormirem , Similarity: 0.55
Word : beberá , Similarity: 0.55


In [13]:
def gkern(kernlen=5, nsig=3):
    """Returns a 2D Gaussian kernel array."""

    interval = (2*nsig+1.)/(kernlen)
    x = np.linspace(-nsig-interval/2., nsig+interval/2., kernlen+1)
    kern1d = np.diff(st.norm.cdf(x))
    kernel_raw = np.sqrt(np.outer(kern1d, kern1d))
    kernel = kernel_raw/kernel_raw.sum()
    return kernel

In [14]:
from scipy.signal import convolve2d


class Classifier:
    
    def __init__(self, model=None):
        self.parser = Parser()
        if model is None:
            self.model = KeyedVectors.load_word2vec_format('wiki.pt/wiki.pt.vec')
        else:
            self.model = model
    
    def w2v(self, words):
        pass
    
    def calc_var(self, words):
            vs = np.array([self.model[word] for word in words])
            mean = np.mean(vs, axis=0)
            
            return np.sum((vs - mean)**2)/vs.shape[0]
            
            
    def calc_dists(self, word, kws):
        dists = []
        for kw in kws:
            dists += [self.model.similarity(word, kw)]
    
        return np.array(dists)
    
    def rm_unseen(self, words):
        return [word for word in words if word in self.model.wv.vocab] 
    
    def classify(self, url, kws, labels, dist_thresh=0.20, kws_thresh=0.49):
        kws = self.rm_unseen(kws)
        
        words = self.parser.parse(url)
        words = self.rm_unseen(words)
        for label in labels:
            label.keywords = self.rm_unseen(label.keywords)
        dists = []
        for word in words:
            dists += [self.calc_dists(word, kws)]
            
        dists = np.array(dists)
        
        #print(dists)
        #dists = convolve2d(dists, gkern(), mode='same')
        #print(dists)
        df = pd.DataFrame(dists, columns=kws)
        
        result = dict()
        for label in labels:
            print(df[label.keywords].mean(axis=0))
            print(label.name, (df[label.keywords].mean(axis=0) > dist_thresh).mean())
            result[label.name] = (df[label.keywords].mean(axis=0) > dist_thresh).mean() > kws_thresh
            
        return result        
        

In [15]:
class Label:
    def __init__(self, name, kws):
        self.keywords = kws
        self.name = name

In [160]:
gun_keywords = ['arma', 'munição', 'calibre', 'revólver', 'cano', 'carabina', 'espingarda']
cigar_keywords = ['cigarro', 'vape', 'narguile', 'fumar', 'tragar',
                  'tabaco', 'nicotina', 'vaporizador', 'ervas']
prost_keywords = ['sexo', 'prostituta', 'fetiche', 'cache', 'acompanhante', 'programa', 
                  'seios', 'bunda', 'travesti', 'gostosa']
med_keywords = ['remédio', 'aborto', 'comprimido', 'secundários', 'efeitos']
serv_keywords = ['operadora', 'cabo', 'assinatura', 'liberação', 'sem', 'aparelhos', 'net', 'vivo']

kws = gun_keywords + cigar_keywords + prost_keywords + med_keywords + serv_keywords
kws = list(set(kws))

In [161]:
gun_label = Label("arma", gun_keywords)
cigar_label = Label("cigarro", cigar_keywords)
prost_label = Label("prostituição", prost_keywords)
med_label = Label("remedios proibidos", med_keywords)
serv_label = Label("serviços ilegais", serv_keywords)

labels = [gun_label, cigar_label, prost_label, med_label, serv_label]

In [162]:
cls = Classifier(model=pt_model)

In [163]:
cls.classify("https://www.falconarmas.com.br/", kws, labels)

  if np.issubdtype(vec.dtype, np.int):


arma          0.253326
munição       0.255737
calibre       0.237513
revólver      0.237542
cano          0.225109
carabina      0.224969
espingarda    0.242755
dtype: float32
arma 1.0
cigarro        0.201649
narguile       0.194104
fumar          0.181008
tragar         0.151890
tabaco         0.175283
nicotina       0.196943
vaporizador    0.223267
ervas          0.157624
dtype: float32
cigarro 0.25
sexo            0.145493
prostituta      0.150997
fetiche         0.165805
cache           0.189345
acompanhante    0.181028
programa        0.166852
seios           0.167073
bunda           0.155649
travesti        0.153018
gostosa         0.169046
dtype: float32
prostituição 0.0
remédio        0.160872
aborto         0.149549
comprimido     0.247040
secundários    0.143253
efeitos        0.174954
dtype: float32
remedios proibidos 0.2
operadora     0.187996
cabo          0.155596
assinatura    0.168312
liberação     0.167180
sem           0.184102
aparelhos     0.207391
net           0.1

{'arma': True,
 'cigarro': False,
 'prostituição': False,
 'remedios proibidos': False,
 'serviços ilegais': False}

In [164]:
cls.classify("https://belicosarsenais.wixsite.com/armas", kws, labels)

  if np.issubdtype(vec.dtype, np.int):


arma          0.269945
munição       0.271822
calibre       0.250421
revólver      0.233669
cano          0.224187
carabina      0.221521
espingarda    0.248639
dtype: float32
arma 1.0
cigarro        0.193026
narguile       0.196481
fumar          0.186121
tragar         0.160582
tabaco         0.185632
nicotina       0.198560
vaporizador    0.205406
ervas          0.161726
dtype: float32
cigarro 0.125
sexo            0.143577
prostituta      0.149248
fetiche         0.164413
cache           0.172525
acompanhante    0.168777
programa        0.168876
seios           0.149663
bunda           0.145937
travesti        0.141096
gostosa         0.157661
dtype: float32
prostituição 0.0
remédio        0.168928
aborto         0.161862
comprimido     0.236990
secundários    0.157982
efeitos        0.187185
dtype: float32
remedios proibidos 0.2
operadora     0.182683
cabo          0.154909
assinatura    0.170537
liberação     0.185439
sem           0.201791
aparelhos     0.207564
net           0.

{'arma': True,
 'cigarro': False,
 'prostituição': False,
 'remedios proibidos': False,
 'serviços ilegais': False}

In [165]:
cls.classify("https://www.mundodovapor.com/", kws, labels)

  if np.issubdtype(vec.dtype, np.int):


arma          0.189442
munição       0.188115
calibre       0.168177
revólver      0.187231
cano          0.163297
carabina      0.154322
espingarda    0.169484
dtype: float32
arma 0.0
cigarro        0.213053
narguile       0.204581
fumar          0.202962
tragar         0.204456
tabaco         0.169153
nicotina       0.201056
vaporizador    0.221912
ervas          0.148763
dtype: float32
cigarro 0.75
sexo            0.159067
prostituta      0.151278
fetiche         0.177133
cache           0.203095
acompanhante    0.194989
programa        0.182642
seios           0.162874
bunda           0.170823
travesti        0.162515
gostosa         0.195989
dtype: float32
prostituição 0.1
remédio        0.175637
aborto         0.159074
comprimido     0.219911
secundários    0.142496
efeitos        0.179804
dtype: float32
remedios proibidos 0.2
operadora     0.179952
cabo          0.133944
assinatura    0.181491
liberação     0.174037
sem           0.197564
aparelhos     0.181980
net           0.1

{'arma': False,
 'cigarro': True,
 'prostituição': False,
 'remedios proibidos': False,
 'serviços ilegais': False}

In [166]:
cls.classify("https://www.elitesmokebr.com/", kws, labels)

  if np.issubdtype(vec.dtype, np.int):


arma          0.193164
munição       0.197624
calibre       0.162900
revólver      0.194959
cano          0.165957
carabina      0.159977
espingarda    0.181876
dtype: float32
arma 0.0
cigarro        0.257276
narguile       0.205394
fumar          0.212883
tragar         0.165370
tabaco         0.210259
nicotina       0.227994
vaporizador    0.236178
ervas          0.167323
dtype: float32
cigarro 0.75
sexo            0.149087
prostituta      0.160413
fetiche         0.167892
cache           0.196434
acompanhante    0.198016
programa        0.177076
seios           0.160446
bunda           0.159190
travesti        0.166035
gostosa         0.176605
dtype: float32
prostituição 0.0
remédio        0.178895
aborto         0.163219
comprimido     0.219879
secundários    0.143010
efeitos        0.178993
dtype: float32
remedios proibidos 0.2
operadora     0.191174
cabo          0.132208
assinatura    0.175557
liberação     0.178008
sem           0.168389
aparelhos     0.194539
net           0.1

{'arma': False,
 'cigarro': True,
 'prostituição': False,
 'remedios proibidos': False,
 'serviços ilegais': False}

In [167]:
cls.classify("https://www.vaporesabor.com.br/", kws, labels)

  if np.issubdtype(vec.dtype, np.int):


arma          0.194085
munição       0.194330
calibre       0.164809
revólver      0.197411
cano          0.164427
carabina      0.159500
espingarda    0.180159
dtype: float32
arma 0.0
cigarro        0.247953
narguile       0.209256
fumar          0.208849
tragar         0.175712
tabaco         0.200606
nicotina       0.220818
vaporizador    0.237907
ervas          0.168611
dtype: float32
cigarro 0.75
sexo            0.152867
prostituta      0.162277
fetiche         0.176254
cache           0.199786
acompanhante    0.208054
programa        0.181398
seios           0.166611
bunda           0.164968
travesti        0.171772
gostosa         0.195100
dtype: float32
prostituição 0.1
remédio        0.178672
aborto         0.155243
comprimido     0.223266
secundários    0.141194
efeitos        0.178203
dtype: float32
remedios proibidos 0.2
operadora     0.194568
cabo          0.128648
assinatura    0.176196
liberação     0.177226
sem           0.172929
aparelhos     0.199827
net           0.1

{'arma': False,
 'cigarro': True,
 'prostituição': False,
 'remedios proibidos': False,
 'serviços ilegais': False}

In [168]:
cls.classify("https://www.queenflavor.com/", kws, labels)

  if np.issubdtype(vec.dtype, np.int):


arma          0.184945
munição       0.188858
calibre       0.170565
revólver      0.177075
cano          0.158271
carabina      0.154797
espingarda    0.167729
dtype: float32
arma 0.0
cigarro        0.209731
narguile       0.205299
fumar          0.194032
tragar         0.179374
tabaco         0.187875
nicotina       0.213012
vaporizador    0.212595
ervas          0.164457
dtype: float32
cigarro 0.5
sexo            0.155054
prostituta      0.154011
fetiche         0.174853
cache           0.193667
acompanhante    0.188357
programa        0.176659
seios           0.159408
bunda           0.159698
travesti        0.163986
gostosa         0.193137
dtype: float32
prostituição 0.0
remédio        0.177837
aborto         0.152720
comprimido     0.219110
secundários    0.146909
efeitos        0.179084
dtype: float32
remedios proibidos 0.2
operadora     0.183081
cabo          0.139382
assinatura    0.168118
liberação     0.173791
sem           0.186022
aparelhos     0.188378
net           0.16

{'arma': False,
 'cigarro': True,
 'prostituição': False,
 'remedios proibidos': False,
 'serviços ilegais': False}

In [169]:
cls.classify("https://www.photoacompanhantes.com/", kws, labels)

  if np.issubdtype(vec.dtype, np.int):


arma          0.177170
munição       0.172994
calibre       0.149895
revólver      0.175618
cano          0.161377
carabina      0.150537
espingarda    0.167099
dtype: float32
arma 0.0
cigarro        0.205684
narguile       0.226911
fumar          0.216100
tragar         0.225093
tabaco         0.163891
nicotina       0.192966
vaporizador    0.182025
ervas          0.185856
dtype: float32
cigarro 0.5
sexo            0.195547
prostituta      0.225428
fetiche         0.213191
cache           0.169141
acompanhante    0.221100
programa        0.185064
seios           0.216384
bunda           0.221241
travesti        0.224739
gostosa         0.283087
dtype: float32
prostituição 0.7
remédio        0.207050
aborto         0.182920
comprimido     0.194882
secundários    0.145088
efeitos        0.172133
dtype: float32
remedios proibidos 0.2
operadora     0.182025
cabo          0.134444
assinatura    0.154250
liberação     0.169756
sem           0.224709
aparelhos     0.179068
net           0.14

{'arma': False,
 'cigarro': True,
 'prostituição': True,
 'remedios proibidos': False,
 'serviços ilegais': False}

In [170]:
cls.classify("https://garotacomlocal.com/", kws, labels)

  if np.issubdtype(vec.dtype, np.int):


arma          0.159080
munição       0.163765
calibre       0.147237
revólver      0.155631
cano          0.132529
carabina      0.133942
espingarda    0.140811
dtype: float32
arma 0.0
cigarro        0.179042
narguile       0.202099
fumar          0.188617
tragar         0.207189
tabaco         0.156194
nicotina       0.169515
vaporizador    0.158430
ervas          0.164858
dtype: float32
cigarro 0.25
sexo            0.187417
prostituta      0.183996
fetiche         0.186213
cache           0.174024
acompanhante    0.205391
programa        0.193971
seios           0.175544
bunda           0.170670
travesti        0.196618
gostosa         0.216442
dtype: float32
prostituição 0.2
remédio        0.173246
aborto         0.178729
comprimido     0.172596
secundários    0.152415
efeitos        0.163530
dtype: float32
remedios proibidos 0.0
operadora     0.195578
cabo          0.142647
assinatura    0.168243
liberação     0.176178
sem           0.224653
aparelhos     0.184020
net           0.1

{'arma': False,
 'cigarro': False,
 'prostituição': False,
 'remedios proibidos': False,
 'serviços ilegais': False}

In [171]:
cls.classify("https://www.oskaras.com/cytotec/", kws, labels)

  if np.issubdtype(vec.dtype, np.int):


arma          0.156135
munição       0.161215
calibre       0.139249
revólver      0.149096
cano          0.139225
carabina      0.138947
espingarda    0.141058
dtype: float32
arma 0.0
cigarro        0.165163
narguile       0.173357
fumar          0.174421
tragar         0.174184
tabaco         0.151072
nicotina       0.152970
vaporizador    0.154076
ervas          0.139252
dtype: float32
cigarro 0.0
sexo            0.153795
prostituta      0.170741
fetiche         0.148657
cache           0.167679
acompanhante    0.191994
programa        0.192505
seios           0.147959
bunda           0.155581
travesti        0.172277
gostosa         0.194487
dtype: float32
prostituição 0.0
remédio        0.155697
aborto         0.164171
comprimido     0.169827
secundários    0.141057
efeitos        0.157096
dtype: float32
remedios proibidos 0.0
operadora     0.188633
cabo          0.144580
assinatura    0.174625
liberação     0.171674
sem           0.200773
aparelhos     0.168622
net           0.16

{'arma': False,
 'cigarro': False,
 'prostituição': False,
 'remedios proibidos': False,
 'serviços ilegais': False}

In [172]:
cls.classify("http://www.cscentralvip.com/", kws, labels)

  if np.issubdtype(vec.dtype, np.int):


arma          0.175491
munição       0.188871
calibre       0.165555
revólver      0.163653
cano          0.146549
carabina      0.139545
espingarda    0.161846
dtype: float32
arma 0.0
cigarro        0.182682
narguile       0.191522
fumar          0.187680
tragar         0.207671
tabaco         0.164720
nicotina       0.190687
vaporizador    0.188993
ervas          0.151215
dtype: float32
cigarro 0.125
sexo            0.166769
prostituta      0.153627
fetiche         0.162840
cache           0.201256
acompanhante    0.191836
programa        0.210065
seios           0.159247
bunda           0.145613
travesti        0.166841
gostosa         0.177032
dtype: float32
prostituição 0.2
remédio        0.177598
aborto         0.177672
comprimido     0.224114
secundários    0.158619
efeitos        0.184305
dtype: float32
remedios proibidos 0.2
operadora     0.243432
cabo          0.165081
assinatura    0.197873
liberação     0.208543
sem           0.227901
aparelhos     0.213462
net           0.

{'arma': False,
 'cigarro': False,
 'prostituição': False,
 'remedios proibidos': False,
 'serviços ilegais': True}

In [145]:
cls.classify("https://www.brasiltatica.com.br/", kws, labels)



IndexError: string index out of range

In [74]:
cls.calc_dists('arma', ['munição', 'pistola', 'espingarda'])
ut.TestCase.assertTrue((cls.calc_dists('arma', ['munição', 'pistola', 'espingarda']) == np.array([0.677431  , 0.6866069 , 0.71047467])).all())

  if np.issubdtype(vec.dtype, np.int):


TypeError: assertTrue() missing 1 required positional argument: 'expr'

In [47]:
import unittest as ut
class TestClassifier(ut.TestCase):
    classifier =  Classifier()
    word = ['hahsbaja', 'casa', 'arma']

    def test_rm_unseen(self):
        self.assertEqual(self.classifier.rm_unseen(self.word), ['casa', 'arma'])

    def test_w2v_training(self):
        self.assertTrue(self.classifier.model.similarity('carro', 'carreta') > 0.7)
        self.assertTrue(self.classifier.model.similarity('pedra', 'pedreira') > 0.7)

if __name__ == '__main__':
    ut.main(argv=['first-arg-is-ignored'], exit=False)
    

KeyboardInterrupt: 

In [104]:
find_similar_to = 'rosa'

for similar_word in pt_model.similar_by_word(find_similar_to):
    print("Word: {0}, Similarity: {1:.2f}".format(
        similar_word[0], similar_word[1]
    ))

Word: rosa,, Similarity: 0.61
Word: rosabel, Similarity: 0.52
Word: rosas, Similarity: 0.52
Word: agujetas, Similarity: 0.51
Word: violeta, Similarity: 0.51
Word: branca,, Similarity: 0.49
Word: rosado, Similarity: 0.49
Word: geralda, Similarity: 0.49
Word: rosaeodora, Similarity: 0.48
Word: vosagiaca, Similarity: 0.48


  if np.issubdtype(vec.dtype, np.int):
