In [1]:
# PCA imports
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
%matplotlib inline

In [2]:
# Doc2Vec imports
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import gensim

import json # to open our data file
DATA_FILENAME = "trend_analisys.json"
# open json file
with open(DATA_FILENAME, "r") as json_file:
    json_data = json.load(json_file)
# we're expecting a list now, since our json file is a json array
assert type(json_data) is list

## Version 1 - Training Model with Abstract field (whole text)

In [3]:
# we have our json data now, let's go ahead and divide into training and test set
n_examples =  len(json_data)
# how much of the data we're going to be using for training and for testing
# default values: 80% train, 20% test
TRAIN_DATA_LENGTH = 8 * n_examples // 10
TEST_DATA_LENGTH = n_examples - TRAIN_DATA_LENGTH
ABSTRACT_FIELD_NAME = 'abstract'

print("Total examples: {0}, number of train examples: {1}, number of test examples: {2}".format(n_examples,TRAIN_DATA_LENGTH, TEST_DATA_LENGTH))

# TODO: Randomize selection of examples, don't just take the first ones
# build training corpus: take the needed abstract, preprocess them (tokenize, delete spaces..)
# and create the TaggedDocument needed for training
train_corpus = [gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(dictionary[ABSTRACT_FIELD_NAME]), [i]) for i, dictionary in enumerate(json_data) if i<TRAIN_DATA_LENGTH]
assert len(train_corpus)==TRAIN_DATA_LENGTH
print(train_corpus[:1])

Total examples: 293, number of train examples: 234, number of test examples: 59
[TaggedDocument(words=['il', 'mercato', 'degli', 'smartphone', 'si', 'fa', 'ogni', 'giorno', 'più', 'ricco', 'le', 'possibilità', 'di', 'scelta', 'tra', 'prodotti', 'di', 'buon', 'ottima', 'qualità', 'prezzi', 'diversi', 'sono', 'in', 'costante', 'crescita', 'certamente', 'tra', 'gli', 'smartphone', 'che', 'vi', 'consigliamo', 'prendere', 'in', 'considerazione', 'tra', 'quelli', 'usciti', 'di', 'recente', 'il', 'thinq', 'di', 'lg', 'sottile', 'elegante', 'il', 'modello', 'che', 'abbiamo', 'provato', 'aveva', 'un', 'elegantissimo', 'colore', 'blu', 'comodo', 'di', 'dimensioni', 'giuste', 'il', 'thinq', 'ha', 'tutte', 'le', 'caratteristiche', 'necessarie', 'per', 'accontentare', 'anche', 'il', 'pubblico', 'più', 'esigente', 'mobile', 'platform', 'qualcomm', 'snapdragon', 'gb', 'di', 'ram', 'gb', 'di', 'memoria', 'interna', 'espandibile', 'ovvia', 'dotazione', 'di', 'accelerometro', 'giroscopio', 'magnetometro

In [None]:
# create the doc2vec model
# TODO: tune this parameters (personally, I think we could use a bigger vec_size, like 50)
max_epochs = 300
vec_size = 35
alpha = 0.030
MODEL_NAME = "d2v_trend_analisys_model.model"

model = Doc2Vec(vector_size=vec_size,
                alpha=alpha,
                min_alpha=0.00030,
                min_count=2,
                dm=0) #dm=0 means "distributed bag of words"
# build our vocabulary of words (all the uniques words encountered inside our corpus, needed for training)
model.build_vocab(train_corpus)

# train the model on the given data!
for epoch in range(max_epochs):
    if(epoch%50==0):
        print('iteration {0}'.format(epoch))
    model.train(train_corpus,
                total_examples = model.corpus_count,
                epochs = model.epochs)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

model.save(MODEL_NAME)
print("Model Saved")


iteration 0
iteration 50
iteration 100


## Visualizing Data
credits: https://towardsdatascience.com/pca-using-python-scikit-learn-e653f8989e60

In [None]:
# let's try to visualize all document_vectors
# get all vectors of documents we created from model training
docs_vecs = []
# docvecs (list of Doc2VecKeyedVectors) 
# – Vector representations of the documents in the corpus. Each vector has size == vector_size
for doc in iter(range(0, len(model.docvecs))):
    docs_vecs.append(model.docvecs[doc])

# loading dataset into Pandas DataFrame
df = pd.DataFrame.from_records(docs_vecs)
df.head()

#df[['target']]

In [None]:
# PCA is effected by scale so you need to scale the features in your data before applying PCA. 
features = [i for i in range(vec_size)]

x = df.loc[:, features].values # get features values
print(x)
# we don't have target here y = df.loc[:,['target']].values # get target values (guess kind of flower/Iris)

# standardize data
x = StandardScaler().fit_transform(x) # scale data (especially in case different measures are used)
# pd.DataFrame(data = x, columns = features).head() # show first data


## 2D Projection with PCA

In [None]:
# build PCA model in 2D
pca = PCA(n_components=2) # The new components are just the two main dimensions of variation.

principalComponents = pca.fit_transform(x)

principalDf = pd.DataFrame(data = principalComponents
             , columns = ['principal component 1', 'principal component 2'])
principalDf.head()
# these components drawn don't hold a lot of information 'per-se', they're just the result 
# of dimension-reduction

In [None]:
# we can now visualize the data by plotting them

# finalDf is the final DataFrame before plotting the data (nothing great, just taking result of PCA and attaching target to it)
#finalDf = pd.concat([principalDf, df[['target']]], axis = 1)

finalDf = principalDf # we don't have targets to show

fig = plt.figure(figsize = (8,8))
ax = fig.add_subplot(1,1,1) 
ax.set_xlabel('Principal Component 1', fontsize = 15)
ax.set_ylabel('Principal Component 2', fontsize = 15)
ax.set_title('2 Component PCA', fontsize = 20)

ax.scatter(finalDf.loc[:, 'principal component 1']
          , finalDf.loc[:,'principal component 2']
          , c = 'r'
          , s = 50)

#targets = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
#colors = ['r', 'g', 'b']
#for target, color in zip(targets, colors):
#    indicesToKeep = finalDf['target'] == target
#    ax.scatter(finalDf.loc[indicesToKeep, 'principal component 1']
#               , finalDf.loc[indicesToKeep, 'principal component 2']
#               , c = color
#               , s = 50)
#ax.legend(targets)
ax.grid()
print("Number of points shown ", len(finalDf))



In [None]:
import plotly.plotly as py
import plotly.figure_factory as ff
import plotly.tools as tls
import plotly.graph_objs as go

tls.set_credentials_file(username='D4nt3', api_key='FdMB4O6qCfciGDOnLvdQ')