# Set up the enviromnent

In [0]:
!pip install langid
!pip install -U sentence-transformers


In [0]:
!git clone https://github.com/LucaBassanese/Tesi

Cloning into 'Tesi'...
remote: Enumerating objects: 33, done.[K
remote: Counting objects: 100% (33/33), done.[K
remote: Compressing objects: 100% (25/25), done.[K
remote: Total 162 (delta 13), reused 17 (delta 8), pack-reused 129[K
Receiving objects: 100% (162/162), 289.80 MiB | 28.01 MiB/s, done.
Resolving deltas: 100% (43/43), done.
Checking out files: 100% (76/76), done.


In [0]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy


import matplotlib.pyplot as plt

#Keywords
import sklearn

#wordcloud
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

from nltk.corpus import stopwords

#transalte
import langid

import random


#cluster of papers
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

#cluster hierarchical
from sklearn.metrics.pairwise import cosine_similarity
from scipy.cluster.hierarchy import ward, dendrogram

#Import all the dependencies
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
pd.options.display.max_rows = 150

#SBERT
from sentence_transformers import SentenceTransformer

#cosine similarity
from scipy import spatial

#combinazioni
from itertools import combinations 


from nltk.util import ngrams

import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Titolo e Abstract

In [0]:
url =  'https://raw.githubusercontent.com/LucaBassanese/Tesi/master/data/abstract.xlsx'
df = pd.read_excel(url)
dft = df.copy()
df = df[df['Abstract inglese'].notnull()]
df = df.drop_duplicates(['Abstract inglese'])
df['titabs']=  df.Titolo + ' ' + df['Abstract inglese']

lingua = [langid.classify(testo)[0] for testo in df.Titolo]
lingua = pd.Series(lingua)
df = df[(lingua == 'en').values]

#tokenizer che toglie la punteggiatura
tokenizer = RegexpTokenizer(r'\w+')


In [0]:
#tokenizzo togliendo la punteggiatura
words_ta = [tokenizer.tokenize(fr.lower()) for fr in df.titabs] 
#creo bigrammi e trigrammi
bigrams_t = [list(ngrams(word,2)) for word in words_ta ]
bigrams_ta = [['_'.join(list(w)) for w in bi] for bi in bigrams_t]
trigrams_t = [list(ngrams(word,3)) for word in words_ta ]
trigrams_ta = [['_'.join(list(w)) for w in tri] for tri in trigrams_t]

In [0]:
#creo i tagged document per il doc2vec
tagged_data_ta = [TaggedDocument(words=words_ta[i], tags=[str(i)]) for i in range(len(df))]
tagged_bi_ta = [TaggedDocument(words=bigrams_ta[i], tags=[str(i)]) for i in range(len(df))]
tagged_tri_ta = [TaggedDocument(words=trigrams_ta[i], tags=[str(i)]) for i in range(len(df))]

# Titoli


In [0]:
dft = dft.drop_duplicates(['Titolo'])
# Seleziono sogli i titoli in inglese
lingua = [langid.classify(testo)[0] for testo in dft.Titolo]
lingua = pd.Series(lingua)
dft = dft[(lingua == 'en').values]

In [0]:
#tokenizzo togliendo la punteggiatura
words = [tokenizer.tokenize(fr.lower()) for fr in dft.Titolo ] 
#creo bigrammi e trigrammi
bigrams_t = [list(ngrams(word,2)) for word in words ]
bigrams_s = [['_'.join(list(w)) for w in bi] for bi in bigrams_t]
trigrams_t = [list(ngrams(word,3)) for word in words ]
trigrams_s = [['_'.join(list(w)) for w in tri] for tri in trigrams_t]

  """


In [0]:
bigrams = [uni + bi  for uni,bi in zip(words,bigrams_s)]
trigrams = [ bi + tri  for bi,tri in zip(bigrams,trigrams_s)]

In [0]:
#creo i tagged document per il doc2vec
tagged_data = [TaggedDocument(words=words[i], tags=[str(i)]) for i in range(len(dft))]
tagged_bi = [TaggedDocument(words=bigrams[i], tags=[str(i)]) for i in range(len(dft))]
tagged_tri = [TaggedDocument(words=trigrams[i], tags=[str(i)]) for i in range(len(dft))]


# Frasi unite

In [0]:
frasi = [' '.join(word) for word in words]
frasi_bi = [' '.join(bi) for bi in bigrams]
frasi_tri = [' '.join(tri) for tri in trigrams]
frasi_ta =  [' '.join(word) for word in words_ta]
frasi_bi_ta = [' '.join(bi) for bi in bigrams_ta]
frasi_tri_ta = [' '.join(tri) for tri in trigrams_ta]

In [0]:
frasi

# Distribuzione parole 


In [0]:
#creo lessico di tutte le parole
lessico = tokenizer.tokenize(' '.join(frasi ))
stop_words = set(stopwords.words('english')) 
lessico = [w for w in lessico if not w in stop_words] 
#creo la distribuzione delle parole nei titoli
distr = (pd.Series(lessico)).value_counts()
#elimino le parole troppo rare
distr = distr[distr >2] 

In [0]:
parover95 = list((distr[distr > np.percentile(distr, 95)]).index)
par75_80 = list((distr[distr.between(np.percentile(distr, 75), np.percentile(distr, 80))]).index)
par50_55 = list((distr[distr.between(np.percentile(distr, 50), np.percentile(distr, 55))]).index)

In [0]:
#indici per 95 percentile
parindoc_1 = [[d.count(wr) for wr in parover95]for d in frasi]
somma_1 = [sum(doc) for doc in parindoc_1]
ind_1 = sorted(range(len(somma_1)), key=lambda i: somma_1[i])[-10:]

#indici per parole tra il 75 e 80
parindoc_2 = [[d.count(wr) for wr in par75_80]for d in frasi]
somma_2 = [sum(doc) for doc in parindoc_2]
ind_2 = sorted(range(len(somma_2)), key=lambda i: somma_2[i])[-10:]

# indici per parole tra 50 e 55
parindoc_3 = [[d.count(wr) for wr in par50_55]for d in frasi]
somma_3 = [sum(doc) for doc in parindoc_3]
ind_3 = sorted(range(len(somma_3)), key=lambda i: somma_3[i])[-11:]

#creo indici
indici = (pd.Series(ind_1+ ind_2 + ind_3)).drop_duplicates()


# Distribuzione keyword

In [0]:
lingua = []
for testo in dft['Parole chiave']:
  if type(testo) == str:
          a = langid.classify(testo)[0]
  else:
           a =  'Niente'
  lingua.append(a)

In [0]:
lingua

In [0]:

lingua = pd.Series(lingua)


In [0]:

dft = dft[(lingua == 'en').values]

In [0]:
dft.keywords = dft['Parole chiave inglese']

  """Entry point for launching an IPython kernel.


In [0]:
dft.keywords

In [0]:
dft.keywords[(lingua == 'en').values] = dft['Parole chiave'][(lingua == 'en').values]

In [0]:
dft.keywords

In [0]:
chiavi = list(dft.keywords.dropna())
separati = [chiave.split(', ') for chiave in chiavi ]
from itertools import chain
separati = list(chain.from_iterable(separati))
separati2 = [chiave.split('; ') for chiave in separati ]
chiavifinal = list(chain.from_iterable(separati2))
chiavifinal = [parola.lower() for parola in chiavifinal]

In [0]:
# Distribuzione parole
distr = (pd.Series(chiavifinal)).value_counts()
distr = distr[distr >2]

In [0]:
parover95 = list((distr[distr > np.percentile(distr, 95)]).index)
par75_80 = list((distr[distr.between(np.percentile(distr, 75), np.percentile(distr, 80))]).index)
par50_55 = list((distr[distr.between(np.percentile(distr, 50), np.percentile(distr, 55))]).index)

In [0]:
print(len(par75_80))
print(len(parover95))
print(len(par50_55))

34
20
84


In [0]:
keywords = parover95[:11] + par75_80[:11] + par50_55[:12]

In [0]:
keywords

In [0]:
keywords.pop(1)
keywords.pop(13)
keywords.pop(24)
keywords.pop(28)

'japan'

In [0]:
len(keywords)

30

In [0]:
keywords = [' '.join(chiave.split('-')) for chiave in keywords]

In [0]:
keywords

In [0]:
pd.DataFrame(keywords, columns=['parole']).to_csv('keyword.txt',  index=None, header = False)

# Cosine similarity documenti scelti

## Hs

In [0]:
a = pd.read_csv('/content/Tesi/data/sent2vec_uni_hs_300.csv')

In [0]:
comb = combinations(indici.values, 2)
sims = []
ind_comb = []
for i in list(comb):
  sim = round(1-spatial.distance.cosine(a.iloc[i[0]], a.iloc[i[1]]),2)
  # print(f'First: {i[0]}, Second: {i[1]},\
  # Similarity: {sim}')
  sims.append( sim)
  ind_comb.append(str(i[0]) + ' - ' + str(i[1]))

In [0]:
matriciona = pd.DataFrame(columns= ind_comb)

In [0]:
matriciona = matriciona.append(pd.Series(sims, index=matriciona.columns ), ignore_index= True)

In [0]:
ind_mod = ['sent2vec_uni_hs_300']

In [0]:
a = pd.read_csv('/content/Tesi/data/sent2vec_uni_hs_400.csv')
#per tutti i modelli
comb = combinations(indici.values, 2)
sims = []
for i in list(comb):
  sim = round(1-spatial.distance.cosine(a.iloc[i[0]], a.iloc[i[1]]),2)
  sims.append( sim)

matriciona = matriciona.append(pd.Series(sims, index=matriciona.columns ), ignore_index= True)
ind_mod.append('sent2vec_uni_hs_400') 

In [0]:
a = pd.read_csv('/content/Tesi/data/sent2vec_uni_hs_500.csv')
#per tutti i modelli
comb = combinations(indici.values, 2)
sims = []
for i in list(comb):
  sim = round(1-spatial.distance.cosine(a.iloc[i[0]], a.iloc[i[1]]),2)
  sims.append( sim)

matriciona = matriciona.append(pd.Series(sims, index=matriciona.columns ), ignore_index= True)
ind_mod.append('sent2vec_uni_hs_500') 

## ns

In [0]:
a = pd.read_csv('/content/Tesi/data/sent2vec_uni_ns_300.csv')
#per tutti i modelli
comb = combinations(indici.values, 2)
sims = []
for i in list(comb):
  sim = round(1-spatial.distance.cosine(a.iloc[i[0]], a.iloc[i[1]]),2)
  sims.append( sim)

matriciona = matriciona.append(pd.Series(sims, index=matriciona.columns ), ignore_index= True)
ind_mod.append('sent2vec_uni_ns_300') 

In [0]:
a = pd.read_csv('/content/Tesi/data/sent2vec_uni_ns_400.csv')
#per tutti i modelli
comb = combinations(indici.values, 2)
sims = []
for i in list(comb):
  sim = round(1-spatial.distance.cosine(a.iloc[i[0]], a.iloc[i[1]]),2)
  sims.append( sim)

matriciona = matriciona.append(pd.Series(sims, index=matriciona.columns ), ignore_index= True)
ind_mod.append('sent2vec_uni_ns_400') 

In [0]:
a = pd.read_csv('/content/Tesi/data/sent2vec_uni_ns_500.csv')
#per tutti i modelli
comb = combinations(indici.values, 2)
sims = []
for i in list(comb):
  sim = round(1-spatial.distance.cosine(a.iloc[i[0]], a.iloc[i[1]]),2)
  sims.append( sim)

matriciona = matriciona.append(pd.Series(sims, index=matriciona.columns ), ignore_index= True)
ind_mod.append('sent2vec_uni_ns_500') 

## Softmax

In [0]:
a = pd.read_csv('/content/Tesi/data/sent2vec_uni_softmax_300.csv')
#per tutti i modelli
comb = combinations(indici.values, 2)
sims = []
for i in list(comb):
  sim = round(1-spatial.distance.cosine(a.iloc[i[0]], a.iloc[i[1]]),2)
  sims.append( sim)

matriciona = matriciona.append(pd.Series(sims, index=matriciona.columns ), ignore_index= True)
ind_mod.append('sent2vec_uni_softmax_300') 

In [0]:
a = pd.read_csv('/content/Tesi/data/sent2vec_uni_softmax_400.csv')
#per tutti i modelli
comb = combinations(indici.values, 2)
sims = []
for i in list(comb):
  sim = round(1-spatial.distance.cosine(a.iloc[i[0]], a.iloc[i[1]]),2)
  sims.append( sim)

matriciona = matriciona.append(pd.Series(sims, index=matriciona.columns ), ignore_index= True)
ind_mod.append('sent2vec_uni_softmax_400') 

In [0]:
a = pd.read_csv('/content/Tesi/data/sent2vec_uni_softmax_500.csv')
#per tutti i modelli
comb = combinations(indici.values, 2)
sims = []
for i in list(comb):
  sim = round(1-spatial.distance.cosine(a.iloc[i[0]], a.iloc[i[1]]),2)
  sims.append( sim)

matriciona = matriciona.append(pd.Series(sims, index=matriciona.columns ), ignore_index= True)
ind_mod.append('sent2vec_uni_softmax_500') 

In [0]:
matriciona.index = ind_mod

In [0]:
matriciona

## Bi hs

In [0]:
a = pd.read_csv('/content/Tesi/data/sent2vec_bi_hs_300.csv')
#per tutti i modelli
comb = combinations(indici.values, 2)
sims = []
for i in list(comb):
  sim = round(1-spatial.distance.cosine(a.iloc[i[0]], a.iloc[i[1]]),2)
  sims.append( sim)

matriciona = matriciona.append(pd.Series(sims, index=matriciona.columns ), ignore_index= True)
ind_mod.append('sent2vec_bi_hs_300') 

In [0]:
a = pd.read_csv('/content/Tesi/data/sent2vec_bi_hs_400.csv')
#per tutti i modelli
comb = combinations(indici.values, 2)
sims = []
for i in list(comb):
  sim = round(1-spatial.distance.cosine(a.iloc[i[0]], a.iloc[i[1]]),2)
  sims.append( sim)

matriciona = matriciona.append(pd.Series(sims, index=matriciona.columns ), ignore_index= True)
ind_mod.append('sent2vec_bi_hs_400') 

In [0]:
a = pd.read_csv('/content/Tesi/data/sent2vec_bi_hs_500.csv')
#per tutti i modelli
comb = combinations(indici.values, 2)
sims = []
for i in list(comb):
  sim = round(1-spatial.distance.cosine(a.iloc[i[0]], a.iloc[i[1]]),2)
  sims.append( sim)

matriciona = matriciona.append(pd.Series(sims, index=matriciona.columns ), ignore_index= True)
ind_mod.append('sent2vec_bi_hs_500') 

## Bi ns

In [0]:
a = pd.read_csv('/content/Tesi/data/sent2vec_bi_ns_300.csv')
#per tutti i modelli
comb = combinations(indici.values, 2)
sims = []
for i in list(comb):
  sim = round(1-spatial.distance.cosine(a.iloc[i[0]], a.iloc[i[1]]),2)
  sims.append( sim)

matriciona = matriciona.append(pd.Series(sims, index=matriciona.columns ), ignore_index= True)
ind_mod.append('sent2vec_bi_ns_300') 

In [0]:
a = pd.read_csv('/content/Tesi/data/sent2vec_bi_ns_400.csv')
#per tutti i modelli
comb = combinations(indici.values, 2)
sims = []
for i in list(comb):
  sim = round(1-spatial.distance.cosine(a.iloc[i[0]], a.iloc[i[1]]),2)
  sims.append( sim)

matriciona = matriciona.append(pd.Series(sims, index=matriciona.columns ), ignore_index= True)
ind_mod.append('sent2vec_bi_ns_400') 

In [0]:
a = pd.read_csv('/content/Tesi/data/sent2vec_bi_ns_500.csv')
#per tutti i modelli
comb = combinations(indici.values, 2)
sims = []
for i in list(comb):
  sim = round(1-spatial.distance.cosine(a.iloc[i[0]], a.iloc[i[1]]),2)
  sims.append( sim)

matriciona = matriciona.append(pd.Series(sims, index=matriciona.columns ), ignore_index= True)
ind_mod.append('sent2vec_bi_ns_500') 

## Bi softmax

In [0]:
a = pd.read_csv('/content/Tesi/data/sent2vec_bi_softmax_300.csv')
#per tutti i modelli
comb = combinations(indici.values, 2)
sims = []
for i in list(comb):
  sim = round(1-spatial.distance.cosine(a.iloc[i[0]], a.iloc[i[1]]),2)
  sims.append( sim)

matriciona = matriciona.append(pd.Series(sims, index=matriciona.columns ), ignore_index= True)
ind_mod.append('sent2vec_bi_softmax_300') 

In [0]:
a = pd.read_csv('/content/Tesi/data/sent2vec_bi_softmax_400.csv')
#per tutti i modelli
comb = combinations(indici.values, 2)
sims = []
for i in list(comb):
  sim = round(1-spatial.distance.cosine(a.iloc[i[0]], a.iloc[i[1]]),2)
  sims.append( sim)

matriciona = matriciona.append(pd.Series(sims, index=matriciona.columns ), ignore_index= True)
ind_mod.append('sent2vec_bi_softmax_400') 

In [0]:
a = pd.read_csv('/content/Tesi/data/sent2vec_bi_softmax_500.csv')
#per tutti i modelli
comb = combinations(indici.values, 2)
sims = []
for i in list(comb):
  sim = round(1-spatial.distance.cosine(a.iloc[i[0]], a.iloc[i[1]]),2)
  sims.append( sim)

matriciona = matriciona.append(pd.Series(sims, index=matriciona.columns ), ignore_index= True)
ind_mod.append('sent2vec_bi_softmax_500') 

## Tri hs

In [0]:
a = pd.read_csv('/content/Tesi/data/sent2vec_tri_hs_300.csv')
#per tutti i modelli
comb = combinations(indici.values, 2)
sims = []
for i in list(comb):
  sim = round(1-spatial.distance.cosine(a.iloc[i[0]], a.iloc[i[1]]),2)
  sims.append( sim)

matriciona = matriciona.append(pd.Series(sims, index=matriciona.columns ), ignore_index= True)
ind_mod.append('sent2vec_tri_hs_300') 

In [0]:
a = pd.read_csv('/content/Tesi/data/sent2vec_tri_hs_400.csv')
#per tutti i modelli
comb = combinations(indici.values, 2)
sims = []
for i in list(comb):
  sim = round(1-spatial.distance.cosine(a.iloc[i[0]], a.iloc[i[1]]),2)
  sims.append( sim)

matriciona = matriciona.append(pd.Series(sims, index=matriciona.columns ), ignore_index= True)
ind_mod.append('sent2vec_tri_hs_400') 

In [0]:
a = pd.read_csv('/content/Tesi/data/sent2vec_tri_hs_500.csv')
#per tutti i modelli
comb = combinations(indici.values, 2)
sims = []
for i in list(comb):
  sim = round(1-spatial.distance.cosine(a.iloc[i[0]], a.iloc[i[1]]),2)
  sims.append( sim)

matriciona = matriciona.append(pd.Series(sims, index=matriciona.columns ), ignore_index= True)
ind_mod.append('sent2vec_tri_hs_500') 

## Tri ns

In [0]:
a = pd.read_csv('/content/Tesi/data/sent2vec_tri_ns_300.csv')
#per tutti i modelli
comb = combinations(indici.values, 2)
sims = []
for i in list(comb):
  sim = round(1-spatial.distance.cosine(a.iloc[i[0]], a.iloc[i[1]]),2)
  sims.append( sim)

matriciona = matriciona.append(pd.Series(sims, index=matriciona.columns ), ignore_index= True)
ind_mod.append('sent2vec_tri_ns_300') 

In [0]:
a = pd.read_csv('/content/Tesi/data/sent2vec_tri_ns_400.csv')
#per tutti i modelli
comb = combinations(indici.values, 2)
sims = []
for i in list(comb):
  sim = round(1-spatial.distance.cosine(a.iloc[i[0]], a.iloc[i[1]]),2)
  sims.append( sim)

matriciona = matriciona.append(pd.Series(sims, index=matriciona.columns ), ignore_index= True)
ind_mod.append('sent2vec_tri_ns_400') 

In [0]:
a = pd.read_csv('/content/Tesi/data/sent2vec_tri_ns_500.csv')
#per tutti i modelli
comb = combinations(indici.values, 2)
sims = []
for i in list(comb):
  sim = round(1-spatial.distance.cosine(a.iloc[i[0]], a.iloc[i[1]]),2)
  sims.append( sim)

matriciona = matriciona.append(pd.Series(sims, index=matriciona.columns ), ignore_index= True)
ind_mod.append('sent2vec_tri_ns_500') 

## Tri softmax

In [0]:
a = pd.read_csv('/content/Tesi/data/sent2vec_tri_softmax_300.csv')
#per tutti i modelli
comb = combinations(indici.values, 2)
sims = []
for i in list(comb):
  sim = round(1-spatial.distance.cosine(a.iloc[i[0]], a.iloc[i[1]]),2)
  sims.append( sim)

matriciona = matriciona.append(pd.Series(sims, index=matriciona.columns ), ignore_index= True)
ind_mod.append('sent2vec_tri_softmax_300') 

In [0]:
a = pd.read_csv('/content/Tesi/data/sent2vec_tri_softmax_400.csv')
#per tutti i modelli
comb = combinations(indici.values, 2)
sims = []
for i in list(comb):
  sim = round(1-spatial.distance.cosine(a.iloc[i[0]], a.iloc[i[1]]),2)
  sims.append( sim)

matriciona = matriciona.append(pd.Series(sims, index=matriciona.columns ), ignore_index= True)
ind_mod.append('sent2vec_tri_softmax_400') 

In [0]:
a = pd.read_csv('/content/Tesi/data/sent2vec_tri_softmax_500.csv')
#per tutti i modelli
comb = combinations(indici.values, 2)
sims = []
for i in list(comb):
  sim = round(1-spatial.distance.cosine(a.iloc[i[0]], a.iloc[i[1]]),2)
  sims.append( sim)

matriciona = matriciona.append(pd.Series(sims, index=matriciona.columns ), ignore_index= True)
ind_mod.append('sent2vec_tri_softmax_500') 

In [0]:
matriciona.index = ind_mod

In [0]:
matriciona

Unnamed: 0,781 - 833,781 - 838,781 - 844,781 - 877,781 - 892,781 - 1326,781 - 103,781 - 842,781 - 1107,781 - 1356,781 - 1384,781 - 1395,781 - 1402,781 - 1404,781 - 1405,781 - 1475,781 - 0,781 - 187,781 - 504,781 - 1117,781 - 1124,781 - 590,781 - 787,781 - 1118,781 - 1205,781 - 634,781 - 952,781 - 445,781 - 587,833 - 838,833 - 844,833 - 877,833 - 892,833 - 1326,833 - 103,833 - 842,833 - 1107,833 - 1356,833 - 1384,833 - 1395,...,1117 - 634,1117 - 952,1117 - 445,1117 - 587,1124 - 590,1124 - 787,1124 - 1118,1124 - 1205,1124 - 634,1124 - 952,1124 - 445,1124 - 587,590 - 787,590 - 1118,590 - 1205,590 - 634,590 - 952,590 - 445,590 - 587,787 - 1118,787 - 1205,787 - 634,787 - 952,787 - 445,787 - 587,1118 - 1205,1118 - 634,1118 - 952,1118 - 445,1118 - 587,1205 - 634,1205 - 952,1205 - 445,1205 - 587,634 - 952,634 - 445,634 - 587,952 - 445,952 - 587,445 - 587
sent2vec_uni_hs_300,0.22,0.34,0.94,0.22,0.97,0.34,-0.17,0.32,0.58,0.5,-0.26,-0.08,0.54,0.78,0.72,-0.05,0.08,0.86,0.84,0.73,0.7,0.47,0.93,0.93,-0.05,0.86,0.71,0.9,0.81,0.38,0.03,0.71,0.3,0.38,0.71,0.88,0.63,0.88,0.58,0.79,...,0.76,0.76,0.77,0.86,0.88,0.86,0.86,0.59,0.76,0.82,0.56,0.83,0.72,0.67,0.79,0.56,0.83,0.41,0.74,0.99,0.2,0.91,0.84,0.89,0.94,0.15,0.94,0.8,0.9,0.94,0.05,0.47,-0.2,0.23,0.81,0.89,0.94,0.69,0.9,0.86
sent2vec_uni_hs_400,0.22,0.4,0.94,0.29,0.97,0.4,-0.14,0.33,0.59,0.46,-0.2,-0.03,0.54,0.78,0.71,0.01,0.15,0.88,0.85,0.75,0.69,0.5,0.92,0.92,-0.01,0.85,0.72,0.9,0.8,0.34,0.04,0.73,0.32,0.34,0.73,0.89,0.64,0.89,0.61,0.81,...,0.79,0.8,0.77,0.9,0.89,0.87,0.86,0.63,0.77,0.83,0.55,0.83,0.75,0.7,0.8,0.61,0.85,0.43,0.78,0.99,0.26,0.92,0.86,0.88,0.95,0.2,0.94,0.82,0.89,0.94,0.11,0.51,-0.17,0.28,0.82,0.89,0.94,0.7,0.9,0.85
sent2vec_uni_hs_500,0.19,0.55,0.95,0.24,0.98,0.55,-0.14,0.3,0.58,0.45,-0.24,-0.05,0.52,0.79,0.75,-0.02,0.09,0.87,0.84,0.83,0.73,0.58,0.95,0.95,-0.02,0.87,0.71,0.92,0.85,0.34,-0.01,0.76,0.27,0.34,0.76,0.92,0.69,0.9,0.65,0.83,...,0.82,0.75,0.85,0.91,0.91,0.86,0.85,0.59,0.79,0.85,0.58,0.86,0.76,0.72,0.75,0.64,0.88,0.48,0.8,0.99,0.17,0.92,0.82,0.91,0.95,0.12,0.94,0.79,0.92,0.95,0.07,0.52,-0.21,0.24,0.8,0.88,0.94,0.64,0.89,0.86
sent2vec_uni_ns_300,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
sent2vec_uni_ns_400,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
sent2vec_uni_ns_500,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
sent2vec_uni_softmax_300,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.99,0.99,1.0,1.0,1.0,0.99,0.99,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.99,1.0,0.99,1.0,1.0,1.0,1.0,1.0,1.0,1.0
sent2vec_uni_softmax_400,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.99,1.0,1.0,1.0,1.0,0.99,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.99,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
sent2vec_uni_softmax_500,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
sent2vec_bi_hs_300,0.22,0.35,0.94,0.26,0.97,0.35,-0.13,0.35,0.59,0.49,-0.26,-0.05,0.54,0.78,0.72,-0.01,0.12,0.86,0.85,0.73,0.71,0.49,0.93,0.93,-0.02,0.85,0.71,0.9,0.81,0.32,0.02,0.63,0.3,0.32,0.72,0.87,0.61,0.88,0.56,0.77,...,0.74,0.77,0.76,0.86,0.89,0.86,0.85,0.61,0.75,0.82,0.55,0.83,0.73,0.68,0.8,0.56,0.84,0.42,0.76,0.99,0.23,0.9,0.84,0.88,0.94,0.16,0.93,0.8,0.89,0.93,0.05,0.5,-0.18,0.26,0.79,0.88,0.93,0.69,0.9,0.85


## Doc2vec uni

In [0]:
a = pd.read_csv('/content/Tesi/data/doc2vec_uni_dm_300.csv')
#per tutti i modelli
comb = combinations(indici.values, 2)
sims = []
for i in list(comb):
  sim = round(1-spatial.distance.cosine(a.iloc[i[0]], a.iloc[i[1]]),2)
  sims.append( sim)

matriciona = matriciona.append(pd.Series(sims, index=matriciona.columns ), ignore_index= True)
ind_mod.append('doc2vec_uni_dm_300') 

In [0]:
a = pd.read_csv('/content/Tesi/data/doc2vec_uni_dm_400.csv')
#per tutti i modelli
comb = combinations(indici.values, 2)
sims = []
for i in list(comb):
  sim = round(1-spatial.distance.cosine(a.iloc[i[0]], a.iloc[i[1]]),2)
  sims.append( sim)

matriciona = matriciona.append(pd.Series(sims, index=matriciona.columns ), ignore_index= True)
ind_mod.append('doc2vec_uni_dm_400') 

In [0]:
a = pd.read_csv('/content/Tesi/data/doc2vec_uni_dm_500.csv')
#per tutti i modelli
comb = combinations(indici.values, 2)
sims = []
for i in list(comb):
  sim = round(1-spatial.distance.cosine(a.iloc[i[0]], a.iloc[i[1]]),2)
  sims.append( sim)

matriciona = matriciona.append(pd.Series(sims, index=matriciona.columns ), ignore_index= True)
ind_mod.append('doc2vec_uni_dm_500') 

In [0]:
a = pd.read_csv('/content/Tesi/data/doc2vec_uni_dbow_300.csv')
#per tutti i modelli
comb = combinations(indici.values, 2)
sims = []
for i in list(comb):
  sim = round(1-spatial.distance.cosine(a.iloc[i[0]], a.iloc[i[1]]),2)
  sims.append( sim)

matriciona = matriciona.append(pd.Series(sims, index=matriciona.columns ), ignore_index= True)
ind_mod.append('doc2vec_uni_dbow_300') 

In [0]:
a = pd.read_csv('/content/Tesi/data/doc2vec_uni_dbow_400.csv')
#per tutti i modelli
comb = combinations(indici.values, 2)
sims = []
for i in list(comb):
  sim = round(1-spatial.distance.cosine(a.iloc[i[0]], a.iloc[i[1]]),2)
  sims.append( sim)

matriciona = matriciona.append(pd.Series(sims, index=matriciona.columns ), ignore_index= True)
ind_mod.append('doc2vec_uni_dbow_400') 

In [0]:
a = pd.read_csv('/content/Tesi/data/doc2vec_uni_dbow_500.csv')
#per tutti i modelli
comb = combinations(indici.values, 2)
sims = []
for i in list(comb):
  sim = round(1-spatial.distance.cosine(a.iloc[i[0]], a.iloc[i[1]]),2)
  sims.append( sim)

matriciona = matriciona.append(pd.Series(sims, index=matriciona.columns ), ignore_index= True)
ind_mod.append('doc2vec_uni_dbow_500') 

## Doc2vec Bi

In [0]:
a = pd.read_csv('/content/Tesi/data/doc2vec_bi_dm_300.csv')
#per tutti i modelli
comb = combinations(indici.values, 2)
sims = []
for i in list(comb):
  sim = round(1-spatial.distance.cosine(a.iloc[i[0]], a.iloc[i[1]]),2)
  sims.append( sim)

matriciona = matriciona.append(pd.Series(sims, index=matriciona.columns ), ignore_index= True)
ind_mod.append('doc2vec_bi_dm_300')

In [0]:
a = pd.read_csv('/content/Tesi/data/doc2vec_bi_dm_400.csv')
#per tutti i modelli
comb = combinations(indici.values, 2)
sims = []
for i in list(comb):
  sim = round(1-spatial.distance.cosine(a.iloc[i[0]], a.iloc[i[1]]),2)
  sims.append( sim)

matriciona = matriciona.append(pd.Series(sims, index=matriciona.columns ), ignore_index= True)
ind_mod.append('doc2vec_bi_dm_400')

In [0]:
a = pd.read_csv('/content/Tesi/data/doc2vec_bi_dm_500.csv')
#per tutti i modelli
comb = combinations(indici.values, 2)
sims = []
for i in list(comb):
  sim = round(1-spatial.distance.cosine(a.iloc[i[0]], a.iloc[i[1]]),2)
  sims.append( sim)

matriciona = matriciona.append(pd.Series(sims, index=matriciona.columns ), ignore_index= True)
ind_mod.append('doc2vec_bi_dm_500')

In [0]:
a = pd.read_csv('/content/Tesi/data/doc2vec_bi_dbow_300.csv')
#per tutti i modelli
comb = combinations(indici.values, 2)
sims = []
for i in list(comb):
  sim = round(1-spatial.distance.cosine(a.iloc[i[0]], a.iloc[i[1]]),2)
  sims.append( sim)

matriciona = matriciona.append(pd.Series(sims, index=matriciona.columns ), ignore_index= True)
ind_mod.append('doc2vec_bi_dbow_300')

In [0]:
a = pd.read_csv('/content/Tesi/data/doc2vec_bi_dbow_400.csv')
#per tutti i modelli
comb = combinations(indici.values, 2)
sims = []
for i in list(comb):
  sim = round(1-spatial.distance.cosine(a.iloc[i[0]], a.iloc[i[1]]),2)
  sims.append( sim)

matriciona = matriciona.append(pd.Series(sims, index=matriciona.columns ), ignore_index= True)
ind_mod.append('doc2vec_bi_dbow_400')

In [0]:
a = pd.read_csv('/content/Tesi/data/doc2vec_bi_dbow_500.csv')
#per tutti i modelli
comb = combinations(indici.values, 2)
sims = []
for i in list(comb):
  sim = round(1-spatial.distance.cosine(a.iloc[i[0]], a.iloc[i[1]]),2)
  sims.append( sim)

matriciona = matriciona.append(pd.Series(sims, index=matriciona.columns ), ignore_index= True)
ind_mod.append('doc2vec_bi_dbow_500')

## Doc2vec tri

In [0]:
a = pd.read_csv('/content/Tesi/data/doc2vec_tri_dm_300.csv')
#per tutti i modelli
comb = combinations(indici.values, 2)
sims = []
for i in list(comb):
  sim = round(1-spatial.distance.cosine(a.iloc[i[0]], a.iloc[i[1]]),2)
  sims.append( sim)

matriciona = matriciona.append(pd.Series(sims, index=matriciona.columns ), ignore_index= True)
ind_mod.append('doc2vec_tri_dm_300')

In [0]:
a = pd.read_csv('/content/Tesi/data/doc2vec_tri_dm_400.csv')
#per tutti i modelli
comb = combinations(indici.values, 2)
sims = []
for i in list(comb):
  sim = round(1-spatial.distance.cosine(a.iloc[i[0]], a.iloc[i[1]]),2)
  sims.append( sim)

matriciona = matriciona.append(pd.Series(sims, index=matriciona.columns ), ignore_index= True)
ind_mod.append('doc2vec_tri_dm_400')

In [0]:
a = pd.read_csv('/content/Tesi/data/doc2vec_tri_dm_500.csv')
#per tutti i modelli
comb = combinations(indici.values, 2)
sims = []
for i in list(comb):
  sim = round(1-spatial.distance.cosine(a.iloc[i[0]], a.iloc[i[1]]),2)
  sims.append( sim)

matriciona = matriciona.append(pd.Series(sims, index=matriciona.columns ), ignore_index= True)
ind_mod.append('doc2vec_tri_dm_500')

In [0]:
a = pd.read_csv('/content/Tesi/data/doc2vec_tri_dbow_300.csv')
#per tutti i modelli
comb = combinations(indici.values, 2)
sims = []
for i in list(comb):
  sim = round(1-spatial.distance.cosine(a.iloc[i[0]], a.iloc[i[1]]),2)
  sims.append( sim)

matriciona = matriciona.append(pd.Series(sims, index=matriciona.columns ), ignore_index= True)
ind_mod.append('doc2vec_tri_dbow_300')

In [0]:
a = pd.read_csv('/content/Tesi/data/doc2vec_tri_dbow_400.csv')
#per tutti i modelli
comb = combinations(indici.values, 2)
sims = []
for i in list(comb):
  sim = round(1-spatial.distance.cosine(a.iloc[i[0]], a.iloc[i[1]]),2)
  sims.append( sim)

matriciona = matriciona.append(pd.Series(sims, index=matriciona.columns ), ignore_index= True)
ind_mod.append('doc2vec_tri_dbow_400')

In [0]:
a = pd.read_csv('/content/Tesi/data/doc2vec_tri_dbow_500.csv')
#per tutti i modelli
comb = combinations(indici.values, 2)
sims = []
for i in list(comb):
  sim = round(1-spatial.distance.cosine(a.iloc[i[0]], a.iloc[i[1]]),2)
  sims.append( sim)

matriciona = matriciona.append(pd.Series(sims, index=matriciona.columns ), ignore_index= True)
ind_mod.append('doc2vec_tri_dbow_500')

In [0]:
#matriciona.index = ind_mod

In [0]:
#matriciona.to_excel('matriciona.xlsx')

## Universal

In [0]:
a = pd.read_csv('/content/Tesi/data/universal.csv')
#per tutti i modelli
comb = combinations(indici.values, 2)
sims = []
for i in list(comb):
  sim = round(1-spatial.distance.cosine(a.iloc[i[0]], a.iloc[i[1]]),2)
  sims.append( sim)

matriciona = matriciona.append(pd.Series(sims, index=matriciona.columns ), ignore_index= True)
ind_mod.append('universal')

In [0]:
matriciona

Unnamed: 0,781 - 833,781 - 838,781 - 844,781 - 877,781 - 892,781 - 1326,781 - 103,781 - 842,781 - 1107,781 - 1356,781 - 1384,781 - 1395,781 - 1402,781 - 1404,781 - 1405,781 - 1475,781 - 0,781 - 187,781 - 504,781 - 1117,781 - 1124,781 - 590,781 - 787,781 - 1118,781 - 1205,781 - 634,781 - 952,781 - 445,781 - 587,833 - 838,833 - 844,833 - 877,833 - 892,833 - 1326,833 - 103,833 - 842,833 - 1107,833 - 1356,833 - 1384,833 - 1395,...,1117 - 634,1117 - 952,1117 - 445,1117 - 587,1124 - 590,1124 - 787,1124 - 1118,1124 - 1205,1124 - 634,1124 - 952,1124 - 445,1124 - 587,590 - 787,590 - 1118,590 - 1205,590 - 634,590 - 952,590 - 445,590 - 587,787 - 1118,787 - 1205,787 - 634,787 - 952,787 - 445,787 - 587,1118 - 1205,1118 - 634,1118 - 952,1118 - 445,1118 - 587,1205 - 634,1205 - 952,1205 - 445,1205 - 587,634 - 952,634 - 445,634 - 587,952 - 445,952 - 587,445 - 587
0,0.22,0.34,0.94,0.22,0.97,0.34,-0.17,0.32,0.58,0.5,-0.26,-0.08,0.54,0.78,0.72,-0.05,0.08,0.86,0.84,0.73,0.7,0.47,0.93,0.93,-0.05,0.86,0.71,0.9,0.81,0.38,0.03,0.71,0.3,0.38,0.71,0.88,0.63,0.88,0.58,0.79,...,0.76,0.76,0.77,0.86,0.88,0.86,0.86,0.59,0.76,0.82,0.56,0.83,0.72,0.67,0.79,0.56,0.83,0.41,0.74,0.99,0.2,0.91,0.84,0.89,0.94,0.15,0.94,0.8,0.9,0.94,0.05,0.47,-0.2,0.23,0.81,0.89,0.94,0.69,0.9,0.86
1,0.22,0.4,0.94,0.29,0.97,0.4,-0.14,0.33,0.59,0.46,-0.2,-0.03,0.54,0.78,0.71,0.01,0.15,0.88,0.85,0.75,0.69,0.5,0.92,0.92,-0.01,0.85,0.72,0.9,0.8,0.34,0.04,0.73,0.32,0.34,0.73,0.89,0.64,0.89,0.61,0.81,...,0.79,0.8,0.77,0.9,0.89,0.87,0.86,0.63,0.77,0.83,0.55,0.83,0.75,0.7,0.8,0.61,0.85,0.43,0.78,0.99,0.26,0.92,0.86,0.88,0.95,0.2,0.94,0.82,0.89,0.94,0.11,0.51,-0.17,0.28,0.82,0.89,0.94,0.7,0.9,0.85
2,0.19,0.55,0.95,0.24,0.98,0.55,-0.14,0.3,0.58,0.45,-0.24,-0.05,0.52,0.79,0.75,-0.02,0.09,0.87,0.84,0.83,0.73,0.58,0.95,0.95,-0.02,0.87,0.71,0.92,0.85,0.34,-0.01,0.76,0.27,0.34,0.76,0.92,0.69,0.9,0.65,0.83,...,0.82,0.75,0.85,0.91,0.91,0.86,0.85,0.59,0.79,0.85,0.58,0.86,0.76,0.72,0.75,0.64,0.88,0.48,0.8,0.99,0.17,0.92,0.82,0.91,0.95,0.12,0.94,0.79,0.92,0.95,0.07,0.52,-0.21,0.24,0.8,0.88,0.94,0.64,0.89,0.86
3,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
5,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
6,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.99,0.99,1.0,1.0,1.0,0.99,0.99,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.99,1.0,0.99,1.0,1.0,1.0,1.0,1.0,1.0,1.0
7,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.99,1.0,1.0,1.0,1.0,0.99,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.99,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
9,0.22,0.35,0.94,0.26,0.97,0.35,-0.13,0.35,0.59,0.49,-0.26,-0.05,0.54,0.78,0.72,-0.01,0.12,0.86,0.85,0.73,0.71,0.49,0.93,0.93,-0.02,0.85,0.71,0.9,0.81,0.32,0.02,0.63,0.3,0.32,0.72,0.87,0.61,0.88,0.56,0.77,...,0.74,0.77,0.76,0.86,0.89,0.86,0.85,0.61,0.75,0.82,0.55,0.83,0.73,0.68,0.8,0.56,0.84,0.42,0.76,0.99,0.23,0.9,0.84,0.88,0.94,0.16,0.93,0.8,0.89,0.93,0.05,0.5,-0.18,0.26,0.79,0.88,0.93,0.69,0.9,0.85


## Aggiunta titolo ed abstract a matriciona

In [0]:
comb = combinations(indici.values, 2)
titolo1 = []
titolo2 = []
abstract1 = []
abstract2 = []
for i in list(comb):
  titolo1.append(dft.Titolo.iloc[i[0]]) 
  titolo2.append(dft.Titolo.iloc[i[1]])
  abstract1.append(dft['Abstract inglese'].iloc[i[0]])
  abstract2.append(dft['Abstract inglese'].iloc[i[1]])


In [0]:
matriciona = matriciona.append(pd.Series(titolo1, index=matriciona.columns ), ignore_index= True)
matriciona = matriciona.append(pd.Series(titolo2, index=matriciona.columns ), ignore_index= True)
matriciona = matriciona.append(pd.Series(abstract1, index=matriciona.columns ), ignore_index= True)
matriciona = matriciona.append(pd.Series(abstract2, index=matriciona.columns ), ignore_index= True)

In [0]:
ind_mod.append('Titolo1') 
ind_mod.append('Titolo2')
ind_mod.append('Abstract1')
ind_mod.append('Abstract2')
matriciona.index = ind_mod

In [0]:
matriciona.to_excel('matriciona.xlsx')

# Frasi to txt

In [0]:
pd.DataFrame(frasi, columns=['frasi']).to_csv('frasi.txt',  index=None, header = False)
pd.DataFrame(frasi_bi, columns=['frasi']).to_csv('frasi_bi.txt',  index=None, header = False)
pd.DataFrame(frasi_tri, columns=['frasi']).to_csv('frasi_tri.txt',  index=None, header = False)

# Doc2vec


In [0]:
# esempio 
%%time
max_epochs = 100
vec_size = 500
alpha = 0.025

model = Doc2Vec(vector_size = vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                window = 3,
                #dm = 0,
                #dbow_words = 1
                dm_concat = 1
                )
  
model.build_vocab(tagged_tri)

for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(tagged_tri,
                total_examples=model.corpus_count,
                epochs=model.epochs)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha




## Data dm


In [0]:
%%time
max_epochs = 100
vec_size = 300
alpha = 0.025

model = Doc2Vec(vector_size = vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                window = 3,
                #dm = 0,
                #dbow_words = 1
                dm_concat = 1
                )
  
model.build_vocab(tagged_data)

for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.epochs)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha


model.save('doc2vec_uni_dm_300') 

# matrix = np.zeros((len(dft),300))
# for i in range(len(dft)):
#   matrix[i,:] = model.docvecs[i]

# a = pd.DataFrame(matrix)
# a.to_csv('/content/Tesi/data/doc2vec_uni_dm_300.csv', index= False)

In [0]:
%%time
max_epochs = 100
vec_size = 400
alpha = 0.025

model = Doc2Vec(vector_size = vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                window = 3,
                #dm = 0,
                #dbow_words = 1
                dm_concat = 1
                )
  
model.build_vocab(tagged_data)

for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.epochs)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

model.save('doc2vec_uni_dm_400') 

# matrix = np.zeros((len(dft),400))
# for i in range(len(dft)):
#   matrix[i,:] = model.docvecs[i]

# a = pd.DataFrame(matrix)
# a.to_csv('/content/Tesi/data/doc2vec_uni_dm_400.csv', index= False)

In [0]:
%%time
max_epochs = 100
vec_size = 500
alpha = 0.025

model = Doc2Vec(vector_size = vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                window = 3,
                #dm = 0,
                #dbow_words = 1
                dm_concat = 1
                )
  
model.build_vocab(tagged_data)

for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.epochs)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

# matrix = np.zeros((len(dft),500))
# for i in range(len(dft)):
#   matrix[i,:] = model.docvecs[i]

# a = pd.DataFrame(matrix)
# a.to_csv('/content/Tesi/data/doc2vec_uni_dm_500.csv', index= False)
    

model.save('doc2vec_uni_dm_500')


## Data dbow

In [0]:
%%time
max_epochs = 100
vec_size = 300
alpha = 0.025

model = Doc2Vec(vector_size = vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                window = 3,
                #dm = 0,
                dbow_words = 1
                #dm_concat = 1
                )
  
model.build_vocab(tagged_data)

for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.epochs)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

# matrix = np.zeros((len(dft),300))
# for i in range(len(dft)):
#   matrix[i,:] = model.docvecs[i]

# a = pd.DataFrame(matrix)
# a.to_csv('/content/Tesi/data/doc2vec_uni_dbow_300.csv', index= False)
    

model.save('doc2vec_uni_dbow_300')

In [0]:
%%time
max_epochs = 100
vec_size = 400
alpha = 0.025

model = Doc2Vec(vector_size = vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                window = 3,
                #dm = 0,
                dbow_words = 1
                #dm_concat = 1
                )
  
model.build_vocab(tagged_data)

for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.epochs)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha
    
# matrix = np.zeros((len(dft),400))
# for i in range(len(dft)):
#   matrix[i,:] = model.docvecs[i]

# a = pd.DataFrame(matrix)
# a.to_csv('/content/Tesi/data/doc2vec_uni_dbow_400.csv', index= False)

model.save('doc2vec_uni_dbow_400')

In [0]:
%%time
max_epochs = 100
vec_size = 500
alpha = 0.025

model = Doc2Vec(vector_size = vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                window = 3,
                #dm = 0,
                dbow_words = 1
                #dm_concat = 1
                )
  
model.build_vocab(tagged_data)

for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.epochs)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha
    
# matrix = np.zeros((len(dft),500))
# for i in range(len(dft)):
#   matrix[i,:] = model.docvecs[i]

# a = pd.DataFrame(matrix)
# a.to_csv('/content/Tesi/data/doc2vec_uni_dbow_500.csv', index= False)

model.save('doc2vec_uni_dbow_500')


## Bigram dm

In [0]:
%%time
max_epochs = 100
vec_size = 300
alpha = 0.025

model = Doc2Vec(vector_size = vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                window = 3,
                #dm = 0,
                #dbow_words = 1
                dm_concat = 1
                )
  
model.build_vocab(tagged_bi)

for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(tagged_bi,
                total_examples=model.corpus_count,
                epochs=model.epochs)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha
    
# matrix = np.zeros((len(dft),300))
# for i in range(len(dft)):
#   matrix[i,:] = model.docvecs[i]

# a = pd.DataFrame(matrix)
# a.to_csv('/content/Tesi/data/doc2vec_bi_dm_300.csv', index= False)
model.save('doc2vec_bi_dm_300')

iteration 0
iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19
iteration 20
iteration 21
iteration 22
iteration 23
iteration 24
iteration 25
iteration 26
iteration 27
iteration 28
iteration 29
iteration 30
iteration 31
iteration 32
iteration 33
iteration 34
iteration 35
iteration 36
iteration 37
iteration 38
iteration 39
iteration 40
iteration 41
iteration 42
iteration 43
iteration 44
iteration 45
iteration 46
iteration 47
iteration 48
iteration 49
iteration 50
iteration 51
iteration 52
iteration 53
iteration 54
iteration 55
iteration 56
iteration 57
iteration 58
iteration 59
iteration 60
iteration 61
iteration 62
iteration 63
iteration 64
iteration 65
iteration 66
iteration 67
iteration 68
iteration 69
iteration 70
iteration 71
iteration 72
iteration 73
iteration 74
iteration 75
iteration 76
iteration

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
%%time
max_epochs = 100
vec_size = 400
alpha = 0.025

model = Doc2Vec(vector_size = vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                window = 3,
                #dm = 0,
                #dbow_words = 1
                dm_concat = 1
                )
  
model.build_vocab(tagged_bi)

for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(tagged_bi,
                total_examples=model.corpus_count,
                epochs=model.epochs)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha
    
# matrix = np.zeros((len(dft),400))
# for i in range(len(dft)):
#   matrix[i,:] = model.docvecs[i]

# a = pd.DataFrame(matrix)
# a.to_csv('/content/Tesi/data/doc2vec_bi_dm_400.csv', index= False)
model.save('doc2vec_bi_dm_400')

iteration 0
iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19
iteration 20
iteration 21
iteration 22
iteration 23
iteration 24
iteration 25
iteration 26
iteration 27
iteration 28
iteration 29
iteration 30
iteration 31
iteration 32
iteration 33
iteration 34
iteration 35
iteration 36
iteration 37
iteration 38
iteration 39
iteration 40
iteration 41
iteration 42
iteration 43
iteration 44
iteration 45
iteration 46
iteration 47
iteration 48
iteration 49
iteration 50
iteration 51
iteration 52
iteration 53
iteration 54
iteration 55
iteration 56
iteration 57
iteration 58
iteration 59
iteration 60
iteration 61
iteration 62
iteration 63
iteration 64
iteration 65
iteration 66
iteration 67
iteration 68
iteration 69
iteration 70
iteration 71
iteration 72
iteration 73
iteration 74
iteration 75
iteration 76
iteration

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


CPU times: user 4min 7s, sys: 5.72 s, total: 4min 13s
Wall time: 2min 19s


In [0]:
%%time
max_epochs = 100
vec_size = 500
alpha = 0.025

model = Doc2Vec(vector_size = vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                window = 3,
                #dm = 0,
                #dbow_words = 1
                dm_concat = 1
                )
  
model.build_vocab(tagged_bi)

for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(tagged_bi,
                total_examples=model.corpus_count,
                epochs=model.epochs)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha
    
# matrix = np.zeros((len(dft),500))
# for i in range(len(dft)):
#   matrix[i,:] = model.docvecs[i]

# a = pd.DataFrame(matrix)
# a.to_csv('/content/Tesi/data/doc2vec_bi_dm_500.csv', index= False)
model.save('doc2vec_bi_dm_500')

iteration 0
iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19
iteration 20
iteration 21
iteration 22
iteration 23
iteration 24
iteration 25
iteration 26
iteration 27
iteration 28
iteration 29
iteration 30
iteration 31
iteration 32
iteration 33
iteration 34
iteration 35
iteration 36
iteration 37
iteration 38
iteration 39
iteration 40
iteration 41
iteration 42
iteration 43
iteration 44
iteration 45
iteration 46
iteration 47
iteration 48
iteration 49
iteration 50
iteration 51
iteration 52
iteration 53
iteration 54
iteration 55
iteration 56
iteration 57
iteration 58
iteration 59
iteration 60
iteration 61
iteration 62
iteration 63
iteration 64
iteration 65
iteration 66
iteration 67
iteration 68
iteration 69
iteration 70
iteration 71
iteration 72
iteration 73
iteration 74
iteration 75
iteration 76
iteration

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


CPU times: user 5min 4s, sys: 5.31 s, total: 5min 9s
Wall time: 2min 47s


## Bigram dbow

In [0]:
%%time
max_epochs = 100
vec_size = 300
alpha = 0.025

model = Doc2Vec(vector_size = vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                window = 3,
                #dm = 0,
                dbow_words = 1
                #dm_concat = 1
                )
  
model.build_vocab(tagged_bi)

for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(tagged_bi,
                total_examples=model.corpus_count,
                epochs=model.epochs)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha
    
# matrix = np.zeros((len(dft),300))
# for i in range(len(dft)):
#   matrix[i,:] = model.docvecs[i]

# a = pd.DataFrame(matrix)
# a.to_csv('/content/Tesi/data/doc2vec_bi_dbow_300.csv', index= False)
model.save('doc2vec_bi_dbow_300')

iteration 0
iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19
iteration 20
iteration 21
iteration 22
iteration 23
iteration 24
iteration 25
iteration 26
iteration 27
iteration 28
iteration 29
iteration 30
iteration 31
iteration 32
iteration 33
iteration 34
iteration 35
iteration 36
iteration 37
iteration 38
iteration 39
iteration 40
iteration 41
iteration 42
iteration 43
iteration 44
iteration 45
iteration 46
iteration 47
iteration 48
iteration 49
iteration 50
iteration 51
iteration 52
iteration 53
iteration 54
iteration 55
iteration 56
iteration 57
iteration 58
iteration 59
iteration 60
iteration 61
iteration 62
iteration 63
iteration 64
iteration 65
iteration 66
iteration 67
iteration 68
iteration 69
iteration 70
iteration 71
iteration 72
iteration 73
iteration 74
iteration 75
iteration 76
iteration

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


CPU times: user 2min 32s, sys: 21.1 s, total: 2min 53s
Wall time: 2min 3s


In [0]:
%%time
max_epochs = 100
vec_size = 400
alpha = 0.025

model = Doc2Vec(vector_size = vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                window = 3,
                #dm = 0,
                dbow_words = 1
                #dm_concat = 1
                )
  
model.build_vocab(tagged_bi)

for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(tagged_bi,
                total_examples=model.corpus_count,
                epochs=model.epochs)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha
    
# matrix = np.zeros((len(dft),400))
# for i in range(len(dft)):
#   matrix[i,:] = model.docvecs[i]

# a = pd.DataFrame(matrix)
# a.to_csv('/content/Tesi/data/doc2vec_bi_dbow_400.csv', index= False)
model.save('doc2vec_bi_dbow_400')

iteration 0
iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19
iteration 20
iteration 21
iteration 22
iteration 23
iteration 24
iteration 25
iteration 26
iteration 27
iteration 28
iteration 29
iteration 30
iteration 31
iteration 32
iteration 33
iteration 34
iteration 35
iteration 36
iteration 37
iteration 38
iteration 39
iteration 40
iteration 41
iteration 42
iteration 43
iteration 44
iteration 45
iteration 46
iteration 47
iteration 48
iteration 49
iteration 50
iteration 51
iteration 52
iteration 53
iteration 54
iteration 55
iteration 56
iteration 57
iteration 58
iteration 59
iteration 60
iteration 61
iteration 62
iteration 63
iteration 64
iteration 65
iteration 66
iteration 67
iteration 68
iteration 69
iteration 70
iteration 71
iteration 72
iteration 73
iteration 74
iteration 75
iteration 76
iteration

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
%%time
max_epochs = 100
vec_size = 500
alpha = 0.025

model = Doc2Vec(vector_size = vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                window = 3,
                #dm = 0,
                dbow_words = 1
                #dm_concat = 1
                )
  
model.build_vocab(tagged_bi)

for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(tagged_bi,
                total_examples=model.corpus_count,
                epochs=model.epochs)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha
    
# matrix = np.zeros((len(dft),500))
# for i in range(len(dft)):
#   matrix[i,:] = model.docvecs[i]

# a = pd.DataFrame(matrix)
# a.to_csv('/content/Tesi/data/doc2vec_bi_dbow_500.csv', index= False)
model.save('doc2vec_bi_dbow_500')

iteration 0
iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19
iteration 20
iteration 21
iteration 22
iteration 23
iteration 24
iteration 25
iteration 26
iteration 27
iteration 28
iteration 29
iteration 30
iteration 31
iteration 32
iteration 33
iteration 34
iteration 35
iteration 36
iteration 37
iteration 38
iteration 39
iteration 40
iteration 41
iteration 42
iteration 43
iteration 44
iteration 45
iteration 46
iteration 47
iteration 48
iteration 49
iteration 50
iteration 51
iteration 52
iteration 53
iteration 54
iteration 55
iteration 56
iteration 57
iteration 58
iteration 59
iteration 60
iteration 61
iteration 62
iteration 63
iteration 64
iteration 65
iteration 66
iteration 67
iteration 68
iteration 69
iteration 70
iteration 71
iteration 72
iteration 73
iteration 74
iteration 75
iteration 76
iteration

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


CPU times: user 2min 51s, sys: 21.3 s, total: 3min 12s
Wall time: 2min 14s


## Trigram dm

In [0]:
%%time
max_epochs = 100
vec_size = 300
alpha = 0.025

model = Doc2Vec(vector_size = vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                window = 3,
                #dm = 0,
                #dbow_words = 1
                dm_concat = 1
                )
  
model.build_vocab(tagged_tri)

for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(tagged_tri,
                total_examples=model.corpus_count,
                epochs=model.epochs)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha
    

# matrix = np.zeros((len(dft),300))
# for i in range(len(dft)):
#   matrix[i,:] = model.docvecs[i]

# a = pd.DataFrame(matrix)
# a.to_csv('/content/Tesi/data/doc2vec_tri_dm_300.csv', index= False)
model.save('doc2vec_tri_dm_300')

iteration 0
iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19
iteration 20
iteration 21
iteration 22
iteration 23
iteration 24
iteration 25
iteration 26
iteration 27
iteration 28
iteration 29
iteration 30
iteration 31
iteration 32
iteration 33
iteration 34
iteration 35
iteration 36
iteration 37
iteration 38
iteration 39
iteration 40
iteration 41
iteration 42
iteration 43
iteration 44
iteration 45
iteration 46
iteration 47
iteration 48
iteration 49
iteration 50
iteration 51
iteration 52
iteration 53
iteration 54
iteration 55
iteration 56
iteration 57
iteration 58
iteration 59
iteration 60
iteration 61
iteration 62
iteration 63
iteration 64
iteration 65
iteration 66
iteration 67
iteration 68
iteration 69
iteration 70
iteration 71
iteration 72
iteration 73
iteration 74
iteration 75
iteration 76
iteration

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


CPU times: user 5min 7s, sys: 5.43 s, total: 5min 13s
Wall time: 2min 54s


In [0]:
%%time
max_epochs = 100
vec_size = 400
alpha = 0.025

model = Doc2Vec(vector_size = vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                window = 3,
                #dm = 0,
                #dbow_words = 1
                dm_concat = 1
                )
  
model.build_vocab(tagged_tri)

for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(tagged_tri,
                total_examples=model.corpus_count,
                epochs=model.epochs)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha
    

# matrix = np.zeros((len(dft),400))
# for i in range(len(dft)):
#   matrix[i,:] = model.docvecs[i]

# a = pd.DataFrame(matrix)
# a.to_csv('/content/Tesi/data/doc2vec_tri_dm_400.csv', index= False)
model.save(F"/content/gdrive/My Drive/Modelli/doc2vec_tri_dm_400" )

In [0]:
model.save('doc2vec_tri_dm_400' )

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
%%time
max_epochs = 100
vec_size = 500
alpha = 0.025

model = Doc2Vec(vector_size = vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                window = 3,
                #dm = 0,
                #dbow_words = 1
                dm_concat = 1
                )
  
model.build_vocab(tagged_tri)

for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(tagged_tri,
                total_examples=model.corpus_count,
                epochs=model.epochs)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha
    
# matrix = np.zeros((len(dft),500))
# for i in range(len(dft)):
#   matrix[i,:] = model.docvecs[i]

# a = pd.DataFrame(matrix)
# a.to_csv('/content/Tesi/data/doc2vec_tri_dm_500.csv', index= False)

model.save('doc2vec_tri_dm_500')

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


## Trigram dbow

In [0]:
%%time
max_epochs = 100
vec_size = 300
alpha = 0.025

model = Doc2Vec(vector_size = vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                window = 3,
                #dm = 0,
                dbow_words = 1
                #dm_concat = 1
                )
  
model.build_vocab(tagged_tri)

for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(tagged_tri,
                total_examples=model.corpus_count,
                epochs=model.epochs)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha
    

# matrix = np.zeros((len(dft),300))
# for i in range(len(dft)):
#   matrix[i,:] = model.docvecs[i]

# a = pd.DataFrame(matrix)
# a.to_csv('/content/Tesi/data/doc2vec_tri_dbow_300.csv', index= False)
model.save('doc2vec_tri_dbow_300')

iteration 0
iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19
iteration 20
iteration 21
iteration 22
iteration 23
iteration 24
iteration 25
iteration 26
iteration 27
iteration 28
iteration 29
iteration 30
iteration 31
iteration 32
iteration 33
iteration 34
iteration 35
iteration 36
iteration 37
iteration 38
iteration 39
iteration 40
iteration 41
iteration 42
iteration 43
iteration 44
iteration 45
iteration 46
iteration 47
iteration 48
iteration 49
iteration 50
iteration 51
iteration 52
iteration 53
iteration 54
iteration 55
iteration 56
iteration 57
iteration 58
iteration 59
iteration 60
iteration 61
iteration 62
iteration 63
iteration 64
iteration 65
iteration 66
iteration 67
iteration 68
iteration 69
iteration 70
iteration 71
iteration 72
iteration 73
iteration 74
iteration 75
iteration 76
iteration

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


CPU times: user 3min 3s, sys: 18.4 s, total: 3min 22s
Wall time: 2min 15s


In [0]:
%%time
max_epochs = 100
vec_size = 400
alpha = 0.025

model = Doc2Vec(vector_size = vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                window = 3,
                #dm = 0,
                dbow_words = 1
                #dm_concat = 1
                )
  
model.build_vocab(tagged_tri)

for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(tagged_tri,
                total_examples=model.corpus_count,
                epochs=model.epochs)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha
    
# matrix = np.zeros((len(dft),400))
# for i in range(len(dft)):
#   matrix[i,:] = model.docvecs[i]

# a = pd.DataFrame(matrix)
# a.to_csv('/content/Tesi/data/doc2vec_tri_dbow_400.csv', index= False)
model.save('doc2vec_tri_dbow_400')

iteration 0
iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19
iteration 20
iteration 21
iteration 22
iteration 23
iteration 24
iteration 25
iteration 26
iteration 27
iteration 28
iteration 29
iteration 30
iteration 31
iteration 32
iteration 33
iteration 34
iteration 35
iteration 36
iteration 37
iteration 38
iteration 39
iteration 40
iteration 41
iteration 42
iteration 43
iteration 44
iteration 45
iteration 46
iteration 47
iteration 48
iteration 49
iteration 50
iteration 51
iteration 52
iteration 53
iteration 54
iteration 55
iteration 56
iteration 57
iteration 58
iteration 59
iteration 60
iteration 61
iteration 62
iteration 63
iteration 64
iteration 65
iteration 66
iteration 67
iteration 68
iteration 69
iteration 70
iteration 71
iteration 72
iteration 73
iteration 74
iteration 75
iteration 76
iteration

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


CPU times: user 3min 17s, sys: 17.3 s, total: 3min 34s
Wall time: 2min 20s


In [0]:
%%time
max_epochs = 100
vec_size = 500
alpha = 0.025

model = Doc2Vec(vector_size = vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                window = 3,
                #dm = 0,
                dbow_words = 1
                #dm_concat = 1
                )
  
model.build_vocab(tagged_tri)

for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(tagged_tri,
                total_examples=model.corpus_count,
                epochs=model.epochs)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha
    
# matrix = np.zeros((len(dft),500))
# for i in range(len(dft)):
#   matrix[i,:] = model.docvecs[i]

# a = pd.DataFrame(matrix)
# a.to_csv('/content/Tesi/data/doc2vec_tri_dbow_500.csv', index= False)
model.save('doc2vec_tri_dbow_500')

# Similarity doc2vec

In [0]:
#cosine similarity
from scipy import spatial
vec1 = model.infer_vector('economics and econometrics'.split())
vec2 = model.infer_vector('innovation'.split())

similairty = 1 - spatial.distance.cosine(vec1, vec2)


# Load doc2vec-creazione matriciona_key

In [0]:
parole = pd.read_csv('/content/Tesi/data/keyword.txt', header = None, names= ['parole'])

In [0]:
matriciona= pd.read_excel('/content/Tesi/data/matricionasent.xlsx', index_col=0)

In [0]:
parole

In [0]:
#tokenizzo togliendo la punteggiatura
words = [tokenizer.tokenize(fr.lower()) for fr in parole.parole ] 


In [0]:
words

In [0]:

#creo bigrammi e trigrammi
bigrams_t = [list(ngrams(word,2)) for word in words ]
bigrams_s = [['_'.join(list(w)) for w in bi] for bi in bigrams_t]
trigrams_t = [list(ngrams(word,3)) for word in words ]
trigrams_s = [['_'.join(list(w)) for w in tri] for tri in trigrams_t]

  This is separate from the ipykernel package so we can avoid doing imports until


In [0]:
bigrams_key = [uni + bi  for uni,bi in zip(words,bigrams_s)]
trigrams_key = [ bi + tri  for bi,tri in zip(bigrams,trigrams_s)]

In [0]:
bigrams_key = [' '.join(bi) for bi in bigrams_key]
trigrams_key = [' '.join(tri) for tri in trigrams_key]

In [0]:
pd.DataFrame(bigrams_key, columns=['bigrammi'])

In [0]:
bigrams_key = pd.DataFrame(bigrams_key, columns=['bigrammi'])
trigrams_key = pd.DataFrame(trigrams_key, columns=['trigrammi'])

## Uni

In [0]:
model = Doc2Vec.load(r'/content/drive/My Drive/Modelli/doc2vec_uni_dbow_300')


# matrix = np.zeros((len(dft),300))
# for i in range(len(dft)):
#   matrix[i,:] = model.docvecs[i]

# a = pd.DataFrame(matrix)
# a.to_csv('doc2vec_uni_dbow_300.csv', index= False)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
# from scipy import spatial
# from itertools import combinations 
# vec1 = model.infer_vector('economics and econometrics'.split())
# vec2 = model.infer_vector('innovation'.split())

# similairty = 1 - spatial.distance.cosine(vec1, vec2)

In [0]:
ind_mod = matriciona.index

In [0]:
ind_mod = list(ind_mod)

In [0]:
from scipy import spatial
from itertools import combinations 
comb = combinations(list(parole.parole), 2)
sims = []

for i in list(comb):
  v1 = model.infer_vector(i[0].split())
  v2 = model.infer_vector(i[1].split())
  sim = round(1-spatial.distance.cosine(v1, v2), 3)
  # print(f'First: {i[0]}, Second: {i[1]},\
  # Similarity: {sim}')
  sims.append(sim)


matriciona = matriciona.append(pd.Series(sims, index=matriciona.columns ), ignore_index= True)
ind_mod.append('doc2vec_uni_dbow_300')

In [0]:
model = Doc2Vec.load(r'/content/drive/My Drive/Modelli/doc2vec_uni_dbow_400')

# matrix = np.zeros((len(dft),400))
# for i in range(len(dft)):
#   matrix[i,:] = model.docvecs[i]

# a = pd.DataFrame(matrix)
# a.to_csv('doc2vec_uni_dbow_400.csv', index= False)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
comb = combinations(list(parole.parole), 2)
sims = []

for i in list(comb):
  v1 = model.infer_vector(i[0].split())
  v2 = model.infer_vector(i[1].split())
  sim = round(1-spatial.distance.cosine(v1, v2), 3)
  # print(f'First: {i[0]}, Second: {i[1]},\
  # Similarity: {sim}')
  sims.append(sim)


matriciona = matriciona.append(pd.Series(sims, index=matriciona.columns ), ignore_index= True)
ind_mod.append('doc2vec_uni_dbow_400')

In [0]:
model = Doc2Vec.load(r'/content/drive/My Drive/Modelli/doc2vec_uni_dbow_500')

# matrix = np.zeros((len(dft),500))
# for i in range(len(dft)):
#   matrix[i,:] = model.docvecs[i]

# a = pd.DataFrame(matrix)
# a.to_csv('doc2vec_uni_dbow_500.csv', index= False)


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
comb = combinations(list(parole.parole), 2)
sims = []

for i in list(comb):
  v1 = model.infer_vector(i[0].split())
  v2 = model.infer_vector(i[1].split())
  sim = round(1-spatial.distance.cosine(v1, v2), 3)
  # print(f'First: {i[0]}, Second: {i[1]},\
  # Similarity: {sim}')
  sims.append(sim)


matriciona = matriciona.append(pd.Series(sims, index=matriciona.columns ), ignore_index= True)
ind_mod.append('doc2vec_uni_dbow_500')

In [0]:
model = Doc2Vec.load(r'/content/drive/My Drive/Modelli/doc2vec_uni_dm_300')

# matrix = np.zeros((len(dft),300))
# for i in range(len(dft)):
#   matrix[i,:] = model.docvecs[i]

# a = pd.DataFrame(matrix)
# a.to_csv('doc2vec_uni_dm_300.csv', index= False)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
comb = combinations(list(parole.parole), 2)
sims = []

for i in list(comb):
  v1 = model.infer_vector(i[0].split())
  v2 = model.infer_vector(i[1].split())
  sim = round(1-spatial.distance.cosine(v1, v2), 3)
  # print(f'First: {i[0]}, Second: {i[1]},\
  # Similarity: {sim}')
  sims.append(sim)


matriciona = matriciona.append(pd.Series(sims, index=matriciona.columns ), ignore_index= True)
ind_mod.append('doc2vec_uni_dm_300')

In [0]:
model = Doc2Vec.load(r'/content/drive/My Drive/Modelli/doc2vec_uni_dm_400')
# matrix = np.zeros((len(dft),400))
# for i in range(len(dft)):
#   matrix[i,:] = model.docvecs[i]

# a = pd.DataFrame(matrix)
# a.to_csv('doc2vec_uni_dm_400.csv', index= False)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
comb = combinations(list(parole.parole), 2)
sims = []

for i in list(comb):
  v1 = model.infer_vector(i[0].split())
  v2 = model.infer_vector(i[1].split())
  sim = round(1-spatial.distance.cosine(v1, v2), 3)
  # print(f'First: {i[0]}, Second: {i[1]},\
  # Similarity: {sim}')
  sims.append(sim)


matriciona = matriciona.append(pd.Series(sims, index=matriciona.columns ), ignore_index= True)
ind_mod.append('doc2vec_uni_dm_400')

In [0]:
model = Doc2Vec.load(r'/content/drive/My Drive/Modelli/doc2vec_uni_dm_500')
# matrix = np.zeros((len(dft),500))
# for i in range(len(dft)):
#   matrix[i,:] = model.docvecs[i]

# a = pd.DataFrame(matrix)
# a.to_csv('doc2vec_uni_dm_500.csv', index= False)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
comb = combinations(list(parole.parole), 2)
sims = []

for i in list(comb):
  v1 = model.infer_vector(i[0].split())
  v2 = model.infer_vector(i[1].split())
  sim = round(1-spatial.distance.cosine(v1, v2), 3)
  # print(f'First: {i[0]}, Second: {i[1]},\
  # Similarity: {sim}')
  sims.append(sim)


matriciona = matriciona.append(pd.Series(sims, index=matriciona.columns ), ignore_index= True)
ind_mod.append('doc2vec_uni_dm_500')

In [0]:
matriciona.index = ind_mod

In [0]:
matriciona

## Bi

In [0]:
model = Doc2Vec.load(r'/content/drive/My Drive/Modelli/doc2vec_bi_dbow_300')
# matrix = np.zeros((len(dft),300))
# for i in range(len(dft)):
#   matrix[i,:] = model.docvecs[i]

# a = pd.DataFrame(matrix)
# a.to_csv('doc2vec_bi_dbow_300.csv', index= False)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
comb = combinations(list(bigrams_key.bigrammi), 2)
sims = []

for i in list(comb):
  v1 = model.infer_vector(i[0].split())
  v2 = model.infer_vector(i[1].split())
  sim = round(1-spatial.distance.cosine(v1, v2), 3)
  # print(f'First: {i[0]}, Second: {i[1]},\
  # Similarity: {sim}')
  sims.append(sim)


matriciona = matriciona.append(pd.Series(sims, index=matriciona.columns ), ignore_index= True)
ind_mod.append('doc2vec_bi_dbow_300')

In [0]:
model = Doc2Vec.load(r'/content/drive/My Drive/Modelli/doc2vec_bi_dbow_400')
# matrix = np.zeros((len(dft),400))
# for i in range(len(dft)):
#   matrix[i,:] = model.docvecs[i]

# a = pd.DataFrame(matrix)
# a.to_csv('doc2vec_bi_dbow_400.csv', index= False)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
comb = combinations(list(bigrams_key.bigrammi), 2)
sims = []

for i in list(comb):
  v1 = model.infer_vector(i[0].split())
  v2 = model.infer_vector(i[1].split())
  sim = round(1-spatial.distance.cosine(v1, v2), 3)
  # print(f'First: {i[0]}, Second: {i[1]},\
  # Similarity: {sim}')
  sims.append(sim)


matriciona = matriciona.append(pd.Series(sims, index=matriciona.columns ), ignore_index= True)
ind_mod.append('doc2vec_bi_dbow_400')

In [0]:
model = Doc2Vec.load(r'/content/drive/My Drive/Modelli/doc2vec_bi_dbow_500')
# matrix = np.zeros((len(dft),500))
# for i in range(len(dft)):
#   matrix[i,:] = model.docvecs[i]

# a = pd.DataFrame(matrix)
# a.to_csv('doc2vec_bi_dbow_500.csv', index= False)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
comb = combinations(list(bigrams_key.bigrammi), 2)
sims = []

for i in list(comb):
  v1 = model.infer_vector(i[0].split())
  v2 = model.infer_vector(i[1].split())
  sim = round(1-spatial.distance.cosine(v1, v2), 3)
  # print(f'First: {i[0]}, Second: {i[1]},\
  # Similarity: {sim}')
  sims.append(sim)


matriciona = matriciona.append(pd.Series(sims, index=matriciona.columns ), ignore_index= True)
ind_mod.append('doc2vec_bi_dbow_500')

In [0]:
model = Doc2Vec.load(r'/content/drive/My Drive/Modelli/doc2vec_bi_dm_300')
# matrix = np.zeros((len(dft),300))
# for i in range(len(dft)):
#   matrix[i,:] = model.docvecs[i]

# a = pd.DataFrame(matrix)
# a.to_csv('doc2vec_bi_dm_300.csv', index= False)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
comb = combinations(list(bigrams_key.bigrammi), 2)
sims = []

for i in list(comb):
  v1 = model.infer_vector(i[0].split())
  v2 = model.infer_vector(i[1].split())
  sim = round(1-spatial.distance.cosine(v1, v2), 3)
  # print(f'First: {i[0]}, Second: {i[1]},\
  # Similarity: {sim}')
  sims.append(sim)


matriciona = matriciona.append(pd.Series(sims, index=matriciona.columns ), ignore_index= True)
ind_mod.append('doc2vec_bi_dm_300')

In [0]:
model = Doc2Vec.load(r'/content/drive/My Drive/Modelli/doc2vec_bi_dm_400')
# matrix = np.zeros((len(dft),400))
# for i in range(len(dft)):
#   matrix[i,:] = model.docvecs[i]

# a = pd.DataFrame(matrix)
# a.to_csv('doc2vec_bi_dm_400.csv', index= False)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
comb = combinations(list(bigrams_key.bigrammi), 2)
sims = []

for i in list(comb):
  v1 = model.infer_vector(i[0].split())
  v2 = model.infer_vector(i[1].split())
  sim = round(1-spatial.distance.cosine(v1, v2), 3)
  # print(f'First: {i[0]}, Second: {i[1]},\
  # Similarity: {sim}')
  sims.append(sim)


matriciona = matriciona.append(pd.Series(sims, index=matriciona.columns ), ignore_index= True)
ind_mod.append('doc2vec_bi_dm_400')

In [0]:
model = Doc2Vec.load(r'/content/drive/My Drive/Modelli/doc2vec_bi_dm_500')
# matrix = np.zeros((len(dft),500))
# for i in range(len(dft)):
#   matrix[i,:] = model.docvecs[i]

# a = pd.DataFrame(matrix)
# a.to_csv('doc2vec_bi_dm_500.csv', index= False)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
comb = combinations(list(bigrams_key.bigrammi), 2)
sims = []

for i in list(comb):
  v1 = model.infer_vector(i[0].split())
  v2 = model.infer_vector(i[1].split())
  sim = round(1-spatial.distance.cosine(v1, v2), 3)
  # print(f'First: {i[0]}, Second: {i[1]},\
  # Similarity: {sim}')
  sims.append(sim)


matriciona = matriciona.append(pd.Series(sims, index=matriciona.columns ), ignore_index= True)
ind_mod.append('doc2vec_bi_dbow_500')

## Tri

In [0]:
model = Doc2Vec.load(r'/content/drive/My Drive/Modelli/doc2vec_tri_dm_300')

# matrix = np.zeros((len(dft),300))
# for i in range(len(dft)):
#   matrix[i,:] = model.docvecs[i]

# a = pd.DataFrame(matrix)
# a.to_csv('doc2vec_tri_dm_300.csv', index= False)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
comb = combinations(list(trigrams_key.trigrammi), 2)
sims = []

for i in list(comb):
  v1 = model.infer_vector(i[0].split())
  v2 = model.infer_vector(i[1].split())
  sim = round(1-spatial.distance.cosine(v1, v2), 3)
  # print(f'First: {i[0]}, Second: {i[1]},\
  # Similarity: {sim}')
  sims.append(sim)


matriciona = matriciona.append(pd.Series(sims, index=matriciona.columns ), ignore_index= True)
ind_mod.append('doc2vec_tri_dm_300')

In [0]:
model = Doc2Vec.load(r'/content/drive/My Drive/Modelli/doc2vec_tri_dm_400')
# matrix = np.zeros((len(dft),400))
# for i in range(len(dft)):
#   matrix[i,:] = model.docvecs[i]

# a = pd.DataFrame(matrix)
# a.to_csv('doc2vec_tri_dm_400.csv', index= False)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
comb = combinations(list(trigrams_key.trigrammi), 2)
sims = []

for i in list(comb):
  v1 = model.infer_vector(i[0].split())
  v2 = model.infer_vector(i[1].split())
  sim = round(1-spatial.distance.cosine(v1, v2), 3)
  # print(f'First: {i[0]}, Second: {i[1]},\
  # Similarity: {sim}')
  sims.append(sim)


matriciona = matriciona.append(pd.Series(sims, index=matriciona.columns ), ignore_index= True)
ind_mod.append('doc2vec_tri_dm_400')

In [0]:
model = Doc2Vec.load(r'/content/drive/My Drive/Modelli/doc2vec_tri_dm_500')
# matrix = np.zeros((len(dft),500))
# for i in range(len(dft)):
#   matrix[i,:] = model.docvecs[i]

# a = pd.DataFrame(matrix)
# a.to_csv('doc2vec_tri_dm_500.csv', index= False)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
comb = combinations(list(trigrams_key.trigrammi), 2)
sims = []

for i in list(comb):
  v1 = model.infer_vector(i[0].split())
  v2 = model.infer_vector(i[1].split())
  sim = round(1-spatial.distance.cosine(v1, v2), 3)
  # print(f'First: {i[0]}, Second: {i[1]},\
  # Similarity: {sim}')
  sims.append(sim)


matriciona = matriciona.append(pd.Series(sims, index=matriciona.columns ), ignore_index= True)
ind_mod.append('doc2vec_tri_dm_500')

In [0]:
model = Doc2Vec.load(r'/content/drive/My Drive/Modelli/doc2vec_tri_dbow_300')
# matrix = np.zeros((len(dft),300))
# for i in range(len(dft)):
#   matrix[i,:] = model.docvecs[i]

# a = pd.DataFrame(matrix)
# a.to_csv('doc2vec_tri_dbow_300.csv', index= False)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
comb = combinations(list(trigrams_key.trigrammi), 2)
sims = []

for i in list(comb):
  v1 = model.infer_vector(i[0].split())
  v2 = model.infer_vector(i[1].split())
  sim = round(1-spatial.distance.cosine(v1, v2), 3)
  # print(f'First: {i[0]}, Second: {i[1]},\
  # Similarity: {sim}')
  sims.append(sim)


matriciona = matriciona.append(pd.Series(sims, index=matriciona.columns ), ignore_index= True)
ind_mod.append('doc2vec_tri_dbow_300')

In [0]:
model = Doc2Vec.load(r'/content/drive/My Drive/Modelli/doc2vec_tri_dbow_400')
# matrix = np.zeros((len(dft),400))
# for i in range(len(dft)):
#   matrix[i,:] = model.docvecs[i]

# a = pd.DataFrame(matrix)
# a.to_csv('doc2vec_tri_dbow_400.csv', index= False)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
comb = combinations(list(trigrams_key.trigrammi), 2)
sims = []

for i in list(comb):
  v1 = model.infer_vector(i[0].split())
  v2 = model.infer_vector(i[1].split())
  sim = round(1-spatial.distance.cosine(v1, v2), 3)
  # print(f'First: {i[0]}, Second: {i[1]},\
  # Similarity: {sim}')
  sims.append(sim)


matriciona = matriciona.append(pd.Series(sims, index=matriciona.columns ), ignore_index= True)
ind_mod.append('doc2vec_tri_dbow_400')

In [0]:
model = Doc2Vec.load(r'/content/drive/My Drive/Modelli/doc2vec_tri_dbow_500')
# matrix = np.zeros((len(dft),500))
# for i in range(len(dft)):
#   matrix[i,:] = model.docvecs[i]

# a = pd.DataFrame(matrix)
# a.to_csv('doc2vec_tri_dbow_500.csv', index= False)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
comb = combinations(list(trigrams_key.trigrammi), 2)
sims = []

for i in list(comb):
  v1 = model.infer_vector(i[0].split())
  v2 = model.infer_vector(i[1].split())
  sim = round(1-spatial.distance.cosine(v1, v2), 3)
  # print(f'First: {i[0]}, Second: {i[1]},\
  # Similarity: {sim}')
  sims.append(sim)


matriciona = matriciona.append(pd.Series(sims, index=matriciona.columns ), ignore_index= True)
ind_mod.append('doc2vec_tri_dbow_500')

In [0]:
matriciona.index = ind_mod

In [0]:
matriciona.to_excel('matriciona_key.xlsx')

# SBERT

In [0]:
model = SentenceTransformer('bert-base-nli-mean-tokens')
sentence_embeddings = model.encode(frasi)
sbert = np.stack( sentence_embeddings, axis=0 )

100%|██████████| 405M/405M [00:24<00:00, 16.4MB/s]


In [0]:
model = SentenceTransformer('bert-large-nli-stsb-mean-tokens')

100%|██████████| 1.24G/1.24G [00:18<00:00, 67.1MB/s]


In [0]:
import pandas as pd
pd.read_csv??

In [0]:
sentence_embeddings = model.encode(frasi)
sbert = np.stack( sentence_embeddings, axis=0 )

In [0]:
sbert.shape

(1548, 1024)

In [0]:
sbert

array([[-1.1434323 , -0.18188004,  0.91623545, ..., -0.48097208,
        -0.9856035 ,  0.8776383 ],
       [-0.40389788, -0.41126215,  0.42457682, ...,  0.5315726 ,
        -0.87461996,  0.44353035],
       [-0.24677515, -0.12850131,  0.44034338, ..., -0.7237369 ,
        -0.31194127, -0.29219583],
       ...,
       [-0.3599878 ,  0.2995323 ,  0.62660736, ..., -0.2494359 ,
        -0.6457086 ,  0.389404  ],
       [-0.49856946,  0.05594404,  0.3238621 , ..., -0.1141041 ,
        -0.24745905,  0.60181206],
       [ 0.03410498, -0.08580752,  1.8610344 , ..., -1.1958299 ,
        -1.1618495 , -0.1159507 ]], dtype=float32)