# Imports

In [21]:
from gensim.models import KeyedVectors
from gensim.models import FastText
from parser import Parser
import numpy as np
import scipy.stats as st
import pandas as pd

In [2]:
pt_model = KeyedVectors.load_word2vec_format('wiki.pt/wiki.pt.vec')
#pt_model = FastText.load_fasttext_format('wiki.pt/wiki.pt.bin')

In [3]:
words = []
for word in pt_model.vocab:
    words.append(word)

In [4]:
print("Number of Tokens: {}".format(len(words)))

Number of Tokens: 592108


In [5]:
print("Dimension of a word vector: {}".format(
    len(pt_model[words[0]])
))

Dimension of a word vector: 300


In [6]:
words[1000]

'artes'

In [7]:
print("Vector components of a word: {}".format(
    pt_model[words[0]]
))

Vector components of a word: [-0.04817     0.11343    -0.21973    -0.096098   -0.0053306   0.20582999
  0.15409     0.20406    -0.010307   -0.045906   -0.019991   -0.14881
  0.071691   -0.010353   -0.040639   -0.19217999 -0.054845   -0.099474
 -0.032876   -0.19517     0.091734   -0.1428      0.031234    0.069291
 -0.014771   -0.0059432  -0.092041   -0.11749    -0.012621   -0.037947
  0.045571   -0.086255    0.0032468   0.038197    0.05541    -0.14173999
  0.046568   -0.24105    -0.020556    0.075958    0.046163    0.14249
  0.053788    0.073211   -0.020421    0.068428    0.078169    0.10405
  0.058263   -0.22894    -0.083738    0.027697    0.060742    0.12547
  0.018675   -0.059984    0.027855    0.12125    -0.089253    0.053595
 -0.076205   -0.11818     0.016257    0.031477   -0.080403    0.079504
 -0.24153    -0.069171    0.019415   -0.040792    0.081118   -0.072191
 -0.026516   -0.17692    -0.033728   -0.1433     -0.0073834  -0.043411
 -0.16559     0.084007   -0.043258    0.075611  

In [8]:
find_similar_to = 'carro'

for similar_word in pt_model.similar_by_word(find_similar_to):
    print("Word: {0}, Similarity: {1:.2f}".format(
        similar_word[0], similar_word[1]
    ))

Word: carros, Similarity: 0.75
Word: caminhão, Similarity: 0.73
Word: automóvel, Similarity: 0.70
Word: carroção, Similarity: 0.69
Word: motorista, Similarity: 0.68
Word: caminhonete, Similarity: 0.67
Word: capotar, Similarity: 0.66
Word: furgão, Similarity: 0.65
Word: carroçeria, Similarity: 0.65
Word: capotando, Similarity: 0.65


In [9]:
word_add = ['dormir', 'beber']
word_sub = ['cama']


for resultant_word in pt_model.most_similar(
    positive=word_add, negative=word_sub
):
    print("Word : {0} , Similarity: {1:.2f}".format(
        resultant_word[0], resultant_word[1]
    ))

Word : comer , Similarity: 0.61
Word : beberem , Similarity: 0.60
Word : beberdes , Similarity: 0.59
Word : beberes , Similarity: 0.57
Word : bebemorar , Similarity: 0.57
Word : beberagem , Similarity: 0.56
Word : bebem , Similarity: 0.56
Word : bebera , Similarity: 0.56
Word : dormirem , Similarity: 0.55
Word : beberá , Similarity: 0.55


In [10]:
def gkern(kernlen=5, nsig=3):
    """Returns a 2D Gaussian kernel array."""

    interval = (2*nsig+1.)/(kernlen)
    x = np.linspace(-nsig-interval/2., nsig+interval/2., kernlen+1)
    kern1d = np.diff(st.norm.cdf(x))
    kernel_raw = np.sqrt(np.outer(kern1d, kern1d))
    kernel = kernel_raw/kernel_raw.sum()
    return kernel

In [83]:
from scipy.signal import convolve2d


class Classifier:
    
    def __init__(self, model=None):
        self.parser = Parser()
        if model is None:
            self.model = KeyedVectors.load_word2vec_format('wiki.pt/wiki.pt.vec')
        else:
            self.model = model
    
    def w2v(self, words):
        pass
    
    def calc_var(self, words):
            vs = np.array([self.model[word] for word in words])
            mean = np.mean(vs, axis=0)
            
            return np.sum((vs - mean)**2)/vs.shape[0]
            
            
    def calc_dists(self, word, kws):
        dists = []
        for kw in kws:
            dists += [self.model.similarity(word, kw)]
    
        return np.array(dists)
    
    def rm_unseen(self, words):
        return [word for word in words if word in self.model.wv.vocab] 
    
    def classify(self, url, kws, labels, dist_thresh=0.20, kws_thresh=0.49):
        kws = self.rm_unseen(kws)
        
        words = self.parser.parse(url)
        words = self.rm_unseen(words)
        for label in labels:
            label.keywords = self.rm_unseen(label.keywords)
        dists = []
        for word in words:
            dists += [self.calc_dists(word, kws)]
            
        dists = np.array(dists)
        
        #print(dists)
        #dists = convolve2d(dists, gkern(), mode='same')
        #print(dists)
        df = pd.DataFrame(dists, columns=kws)
        
        result = dict()
        for label in labels:
            print(df[label.keywords].mean(axis=0))
            print(label.name, (df[label.keywords].mean(axis=0) > dist_thresh).mean())
            result[label.name] = (df[label.keywords].mean(axis=0) > dist_thresh).mean() > kws_thresh
            
        return result        
        

In [84]:
class Label:
    def __init__(self, name, kws):
        self.keywords = kws
        self.name = name

In [105]:
gun_keywords = ['arma', 'munição', 'preço', 'comprar']
cigar_keywords = ['cigarro', 'vape', 'narguile', 'fumar', 'tragar',
                  'tabaco', 'nicotina', 'vaporizador', 'preço', 'comprar']
prost_keywords = ['sexo', 'prostituta', 'fetiche', 'cache']
med_keywords = ['remedio', 'aborto', 'comprimido', 'comprar', 'preço']

kws = gun_keywords + cigar_keywords + prost_keywords + med_keywords
kws = list(set(kws))

In [106]:
gun_label = Label("arma", gun_keywords)
cigar_label = Label("cigarro", cigar_keywords)
prost_label = Label("prottituição", prost_keywords)
med_label = Label("remedios proibidos", med_keywords)

labels = [gun_label, cigar_label, prost_label, med_label]

In [107]:
cls = Classifier(model=pt_model)

In [108]:
cls.classify("https://www.falconarmas.com.br/", kws, labels)



arma       0.253520
munição    0.255525
preço      0.212237
comprar    0.203844
dtype: float32
arma 1.0
cigarro        0.201608
narguile       0.193860
fumar          0.180427
tragar         0.152395
tabaco         0.175249
nicotina       0.196549
vaporizador    0.221791
preço          0.212237
comprar        0.203844
dtype: float32
cigarro 0.444444444444
sexo          0.145827
prostituta    0.151533
fetiche       0.166576
cache         0.189556
dtype: float32
prottituição 0.0
dosagem       0.217435
substancia    0.198227
remedio       0.141547
aborto        0.148929
comprimido    0.245910
comprar       0.203844
preço         0.212237
dtype: float32
remedios proibidos 0.571428571429


{'arma': True,
 'cigarro': False,
 'prottituição': False,
 'remedios proibidos': True}

In [109]:
cls.classify("https://belicosarsenais.wixsite.com/armas", kws, labels)



arma       0.269945
munição    0.271822
preço      0.193490
comprar    0.199469
dtype: float32
arma 0.5
cigarro        0.193026
narguile       0.196481
fumar          0.186121
tragar         0.160582
tabaco         0.185632
nicotina       0.198560
vaporizador    0.205406
preço          0.193490
comprar        0.199469
dtype: float32
cigarro 0.111111111111
sexo          0.143577
prostituta    0.149248
fetiche       0.164413
cache         0.172525
dtype: float32
prottituição 0.0
dosagem       0.211695
substancia    0.210297
remedio       0.147447
aborto        0.161862
comprimido    0.236990
comprar       0.199469
preço         0.193490
dtype: float32
remedios proibidos 0.428571428571


{'arma': False,
 'cigarro': False,
 'prottituição': False,
 'remedios proibidos': False}

In [110]:
cls.classify("https://www.mundodovapor.com/", kws, labels)



arma       0.189346
munição    0.188113
preço      0.216258
comprar    0.230047
dtype: float32
arma 0.5
cigarro        0.213257
narguile       0.204642
fumar          0.203056
tragar         0.204809
tabaco         0.169380
nicotina       0.201417
vaporizador    0.221866
preço          0.216258
comprar        0.230047
dtype: float32
cigarro 0.888888888889
sexo          0.159164
prostituta    0.151398
fetiche       0.177388
cache         0.202935
dtype: float32
prottituição 0.25
dosagem       0.203592
substancia    0.191731
remedio       0.146326
aborto        0.159347
comprimido    0.220126
comprar       0.230047
preço         0.216258
dtype: float32
remedios proibidos 0.571428571429


{'arma': False,
 'cigarro': True,
 'prottituição': False,
 'remedios proibidos': True}

In [111]:
cls.classify("https://www.elitesmokebr.com/", kws, labels)



arma       0.193336
munição    0.197074
preço      0.223227
comprar    0.221696
dtype: float32
arma 0.5
cigarro        0.252957
narguile       0.204864
fumar          0.210741
tragar         0.167100
tabaco         0.207222
nicotina       0.225927
vaporizador    0.234400
preço          0.223227
comprar        0.221696
dtype: float32
cigarro 0.888888888889
sexo          0.149551
prostituta    0.160125
fetiche       0.168180
cache         0.197216
dtype: float32
prottituição 0.0
dosagem       0.209294
substancia    0.186601
remedio       0.149772
aborto        0.162355
comprimido    0.218966
comprar       0.221696
preço         0.223227
dtype: float32
remedios proibidos 0.571428571429


{'arma': False,
 'cigarro': True,
 'prottituição': False,
 'remedios proibidos': True}

In [112]:
cls.classify("https://www.vaporesabor.com.br/", kws, labels)



arma       0.194407
munição    0.194837
preço      0.221747
comprar    0.230150
dtype: float32
arma 0.5
cigarro        0.249540
narguile       0.208948
fumar          0.209445
tragar         0.174099
tabaco         0.201565
nicotina       0.221553
vaporizador    0.238757
preço          0.221747
comprar        0.230150
dtype: float32
cigarro 0.888888888889
sexo          0.152412
prostituta    0.162190
fetiche       0.176345
cache         0.199159
dtype: float32
prottituição 0.0
dosagem       0.206222
substancia    0.181896
remedio       0.147814
aborto        0.155128
comprimido    0.223478
comprar       0.230150
preço         0.221747
dtype: float32
remedios proibidos 0.571428571429


{'arma': False,
 'cigarro': True,
 'prottituição': False,
 'remedios proibidos': True}

In [113]:
cls.classify("https://www.queenflavor.com/", kws, labels)



arma       0.184879
munição    0.188843
preço      0.223718
comprar    0.213079
dtype: float32
arma 0.5
cigarro        0.209974
narguile       0.205318
fumar          0.194229
tragar         0.179364
tabaco         0.188180
nicotina       0.213198
vaporizador    0.212433
preço          0.223718
comprar        0.213079
dtype: float32
cigarro 0.666666666667
sexo          0.154939
prostituta    0.153999
fetiche       0.174856
cache         0.193468
dtype: float32
prottituição 0.0
dosagem       0.211069
substancia    0.206004
remedio       0.153022
aborto        0.152899
comprimido    0.219088
comprar       0.213079
preço         0.223718
dtype: float32
remedios proibidos 0.714285714286


{'arma': False,
 'cigarro': True,
 'prottituição': False,
 'remedios proibidos': True}

In [114]:
cls.classify("https://www.photoacompanhantes.com/", kws, labels)



arma       0.177141
munição    0.173571
preço      0.199720
comprar    0.213049
dtype: float32
arma 0.25
cigarro        0.204323
narguile       0.227130
fumar          0.214656
tragar         0.226146
tabaco         0.166177
nicotina       0.190655
vaporizador    0.182930
preço          0.199720
comprar        0.213049
dtype: float32
cigarro 0.555555555556
sexo          0.193010
prostituta    0.224085
fetiche       0.210483
cache         0.166553
dtype: float32
prottituição 0.5
dosagem       0.196053
substancia    0.182847
remedio       0.170088
aborto        0.180973
comprimido    0.194651
comprar       0.213049
preço         0.199720
dtype: float32
remedios proibidos 0.142857142857


{'arma': False,
 'cigarro': True,
 'prottituição': False,
 'remedios proibidos': False}

In [115]:
cls.classify("https://garotacomlocal.com/", kws, labels)



arma       0.159080
munição    0.163765
preço      0.190162
comprar    0.216096
dtype: float32
arma 0.25
cigarro        0.179042
narguile       0.202099
fumar          0.188617
tragar         0.207189
tabaco         0.156194
nicotina       0.169515
vaporizador    0.158431
preço          0.190162
comprar        0.216096
dtype: float32
cigarro 0.333333333333
sexo          0.187417
prostituta    0.183996
fetiche       0.186213
cache         0.174024
dtype: float32
prottituição 0.0
dosagem       0.170265
substancia    0.168830
remedio       0.147367
aborto        0.178729
comprimido    0.172596
comprar       0.216096
preço         0.190162
dtype: float32
remedios proibidos 0.142857142857


{'arma': False,
 'cigarro': False,
 'prottituição': False,
 'remedios proibidos': False}

In [116]:
cls.classify("https://www.oskaras.com/cytotec/", kws, labels)



arma       0.155930
munição    0.160920
preço      0.176054
comprar    0.196128
dtype: float32
arma 0.0
cigarro        0.164970
narguile       0.173672
fumar          0.174711
tragar         0.175073
tabaco         0.150885
nicotina       0.153432
vaporizador    0.153965
preço          0.176054
comprar        0.196128
dtype: float32
cigarro 0.0
sexo          0.154838
prostituta    0.171297
fetiche       0.149536
cache         0.168723
dtype: float32
prottituição 0.0
dosagem       0.157164
substancia    0.152181
remedio       0.154409
aborto        0.164154
comprimido    0.169128
comprar       0.196128
preço         0.176054
dtype: float32
remedios proibidos 0.0


{'arma': False,
 'cigarro': False,
 'prottituição': False,
 'remedios proibidos': False}

In [101]:
cls.classify("https://www.brasiltatica.com.br/", kws, labels)



IndexError: string index out of range