In [3]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import nltk
import gensim
import os
import collections
import smart_open
import random

# Set file names for train and test data
test_data_dir = '{}'.format(os.sep).join([gensim.__path__[0], 'test', 'test_data'])
lee_train_file = test_data_dir + os.sep + 'lee_background.cor'
lee_test_file = test_data_dir + os.sep + 'lee.cor'

#create the model
from sklearn.cluster import AffinityPropagation
from sklearn.metrics.pairwise import cosine_distances

data = [
"Rete_di_computer Connessione Comunicazione Dato Livello_fisico Informazione Città_intelligente Azienda",
"Comune Sestriere Turismo Via_Lattea_(comprensorio_sciistico) Torino Wireless Proprietà_(diritto) Casa Servizio",
"Taxi Servizio Filosofia Acronimo Torino Europa Wireless",
"Monaco di Baviera Siemens_(azienda) Kickoff Internet_delle_cose Euro",
"Ministero_dei_trasporti Torino Wireless Comune_medievale Attraversamento_pedonale",
"Unione_europea Torino Wireless Stati_Uniti_d'America Canada Piccola_e_media_impresa Mercato Città intelligente"



        ]
def read_corpus(fname, tokens_only=False):
    with smart_open.smart_open(fname, encoding="iso-8859-1") as f:
        for i, line in enumerate(f):
            if tokens_only:
                # 'yield' returns a generator, useful for large set of data (in terms of memory consumption)
                yield gensim.utils.simple_preprocess(line)
            else:
                # For training data, add tags
                yield gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(line), [i])
train_corpus = list(read_corpus(lee_train_file))  # train data has tag associated to each document
test_corpus = list(read_corpus(lee_test_file, tokens_only=True))
                

#tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(data)]
tagged_data = train_corpus
# print (tagged_data)


max_epochs = 300
vec_size = 35
alpha = 0.030

model = Doc2Vec(vector_size=vec_size,
                alpha=alpha,
                min_alpha=0.00030,
                min_count=2,
                dm=0) #dm=0 means "distributed bag of words"

#C'è ancora da giocare un po' con il tuning dei parametri. Io mi sono messa in condizione di farlo e ci posso pure guardare nei prox giorni

model.build_vocab(tagged_data)


for epoch in range(max_epochs):
    if(epoch%50==0):
        print('iteration {0}'.format(epoch))
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.epochs)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

model.save("d2v.model")
print("Model Saved")


iteration 0
iteration 50
iteration 100
iteration 150
iteration 200
iteration 250
Model Saved


In [4]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/nick/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
import numpy as np
from gensim.models.doc2vec import Doc2Vec
from nltk import word_tokenize
from sklearn.datasets import make_blobs
from sklearn.metrics.pairwise import cosine_distances, pairwise_distances
from scipy import sparse
from sklearn.cluster import KMeans, DBSCAN, AffinityPropagation
from sklearn.decomposition import PCA
from sklearn.neighbors.ball_tree import BallTree
from sklearn.preprocessing import StandardScaler

model= Doc2Vec.load("d2v.model")
print (model)
#print(model.vocabulary)
#to find the vector of a document which is not in training data
#test_data = word_tokenize("Rete_di_computer Connessione Comunicazione Dato Livello_fisico Informazione Città_intelligente Azienda".lower())
test_data = train_corpus[0].words # test_data is a list of taggedDocuments,as returned by read_corpus
print("TEST DATA TO INFER:{}".format(' '.join(test_data)))
v1 = model.infer_vector(test_data)
# print("V1_infer", v1)

# Find the top-N(topn default = 10) most similar docvecs from the training set
# This method computes cosine similarity between a simple mean of the projection weight vectors of the given docs
# Returns: Sequence of (doctag/index, similarity).
# Return type: list of ({str, int}, float)
similar_docs = model.docvecs.most_similar([v1], topn = 3)

# n_similarity(ds1, ds2) Compute cosine similarity between two sets of docvecs from the trained set.

# print all similar documents with their score
for doc_tag, similarity in similar_docs:
    print("\nSimilar Doc-->(doctag:{0},score:{1}):<<{2}>>".format(doc_tag, similarity, ' '.join(train_corpus[int(doc_tag)].words)))
# data contains words like: "i, love, chatbots".. ' '.join puts together strings separating them with a white space

# #############################################################################
vecs = []
# docvecs (list of Doc2VecKeyedVectors) 
# – Vector representations of the documents in the corpus. Each vector has size == vector_size
# check notes for more detailed info
for doc in iter(range(0, len(model.docvecs))):
    doc_vec = model.docvecs[doc]
    vecs.append(doc_vec.reshape((1, 35)))

# print(vecs[0]) not really interesting, they're just 1x35 vectors
# print(vecs[1])
print(model.docvecs.offset2doctag)
        #  print model.docvecs.doctags.keys()
doc_vecs = np.array(vecs, dtype='float')  # TSNE expects float type values

# print doc_vecs
docs = []
for i in doc_vecs:
    docs.append(i[0])
    # print  docs

# print vocabulary -list of words known-
# print(model.wv.vocab)

print ("Clustering vectors by DBSCAN")

# lets try lower eps (eps: the minimum distance between two points. It means that if
# the distance between two points is lower or equal to this value (eps), these points are considered neighbors)
#  As a general rule, a minimum minPoints can be derived from a
# number of dimensions (D) in the data set, as minPoints ≥ D + 1
db = DBSCAN(eps=3.0,min_samples=4, metric='euclidean').fit(pairwise_distances(docs, metric='cosine'))
#db = DBSCAN(eps=0.9, min_samples=4, metric='cosine',algorithm='brute').fit(docs)
labels = db.labels_
print (db.labels_)
    # Number of clusters in labels, ignoring noise if present.
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)

print('Estimated number of clusters: %d' % n_clusters)

word_centroid_map = dict(zip(model.docvecs.offset2doctag, labels))

print (list(word_centroid_map))


ids = []
rest = []
for cluster in iter(range(0, n_clusters)):
    for key, item in word_centroid_map.items():
        if item == cluster:
            ids.append(key)
            rest.append(item)
print (ids)
print (rest)


Doc2Vec(dbow,d35,n5,mc2,s0.001,t3)
TEST DATA TO INFER:hundreds of people have been forced to vacate their homes in the southern highlands of new south wales as strong winds today pushed huge bushfire towards the town of hill top new blaze near goulburn south west of sydney has forced the closure of the hume highway at about pm aedt marked deterioration in the weather as storm cell moved east across the blue mountains forced authorities to make decision to evacuate people from homes in outlying streets at hill top in the new south wales southern highlands an estimated residents have left their homes for nearby mittagong the new south wales rural fire service says the weather conditions which caused the fire to burn in finger formation have now eased and about fire units in and around hill top are optimistic of defending all properties as more than blazes burn on new year eve in new south wales fire crews have been called to new fire at gunning south of goulburn while few details are ava

  if np.issubdtype(vec.dtype, np.int):


In [28]:
# let's try another way for clustering data: K-Mean, an even more popular algorithm,
# which I know from the introductory course on AI, so it might be smarter 
# to utilize algorithms which I know and can talk about in the presentation
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# build k-means model
# we could set-up an incremental version, increasing
# number of clusters at each iteration
kmeans = KMeans(n_clusters = 5, verbose=0) 
docs = []
for doc in iter(range(0, len(model.docvecs))):
    doc_vec = model.docvecs[doc]
    docs.append(doc_vec.reshape((1, 35)))
doc_vecs = np.array(docs).astype(float) 
# scikit-learn expects 2d num arrays for the training dataset for a fit function. 
# The dataset you are passing in is a 3d array you need to reshape the array into a 2d
nsamples, nx, ny = doc_vecs.shape
d2_train_dataset = doc_vecs.reshape((nsamples,nx*ny)) 
#First dimension is maintained and the other two dimensions are flattened (so 28x28 becomes 784

kmeans.fit(d2_train_dataset) # data, as vectors of documents

# let's see if it works by trying to predict the cluster of all elements in the training set 
correct = 0
for i in range(len(d2_train_dataset)):
    predict_me = d2_train_dataset[i]
    predict_me = predict_me.reshape(-1, len(predict_me))
    prediction = kmeans.predict(predict_me)
    if((d2_train_dataset[prediction[0]]-d2_train_dataset[i]).all()):
        correct += 1

print(correct/len(d2_train_dataset))


1.0


## Data Visualisation (fail)

In [7]:
from sklearn.decomposition import PCA
import numpy as np

pca = PCA(n_components=2)
X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
pca_result = pca.fit_transform(X)

df = dict()
df['pca-one'] = pca_result[:,0]
df['pca-two'] = pca_result[:,1] 
#df['pca-three'] = pca_result[:,2]

from ggplot import *


chart = ggplot( X, aes(x='pca-one', y='pca-two', color='label') ) \
        + geom_point(size=75,alpha=0.8) \
        + ggtitle("Some title")
chart


print ('Explained variation per principal component: {0}'.format(pca.explained_variance_ratio_))

ModuleNotFoundError: No module named 'ggplot'