In [1]:
import pandas as pd
import gensim
import numpy as np
from gensim.models import Word2Vec
from time import time  # To time our operations
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

data = pd.read_json("noStopModded.json")
stemmed_words = pd.Series(' '.join(data['stemmed']).split()).value_counts()
stemmed_words1 = stemmed_words[stemmed_words > 3]
data['stemmed_counted'] = data['stemmed'].apply(lambda x: " ".join(x for x in x.split() if x in stemmed_words1))




In [2]:
sentences = data[data["year"] == 2016]["stemmed_counted"].tolist()
sentences

['foxnew live member famili pm ring new year togeth make america great',
 'happi new year amp thank',
 '',
 'happi new year maralago thank great famili support',
 'hillari said wa war explan lie benghazi allow wh vote trump potu',
 'cnn realdonaldtrump theyr spend million still go win go donald trump',
 'well year ha offici begun mani stop plan work veri hard win turn countri around',
 'realdonaldtrump love u trump famili god bless',
 'jodil792 stand spread presid 2016 wake amp sleep pray amp',
 'im one biggest fan mr trump cant wait make america great never forget support',
 'huckabe good need get behind realdonaldtrump agre',
 'go mississippi tomorrow night hear crowd go massiv look forward',
 'never interest polit becaus want get polit scienc degre trump2016 great',
 'person hillari clinton least want run far largest voter turnout ever swamp',
 'thank much name 2015 man year thi inde great honor',
 'votetrump2016 amp togeth makeamericagreatagain thank support',
 'massiv crowd expect

In [3]:
tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(sentences)]
tagged_data

[TaggedDocument(words=['foxnew', 'live', 'member', 'famili', 'pm', 'ring', 'new', 'year', 'togeth', 'make', 'america', 'great'], tags=['0']),
 TaggedDocument(words=['happi', 'new', 'year', 'amp', 'thank'], tags=['1']),
 TaggedDocument(words=[], tags=['2']),
 TaggedDocument(words=['happi', 'new', 'year', 'maralago', 'thank', 'great', 'famili', 'support'], tags=['3']),
 TaggedDocument(words=['hillari', 'said', 'wa', 'war', 'explan', 'lie', 'benghazi', 'allow', 'wh', 'vote', 'trump', 'potu'], tags=['4']),
 TaggedDocument(words=['cnn', 'realdonaldtrump', 'theyr', 'spend', 'million', 'still', 'go', 'win', 'go', 'donald', 'trump'], tags=['5']),
 TaggedDocument(words=['well', 'year', 'ha', 'offici', 'begun', 'mani', 'stop', 'plan', 'work', 'veri', 'hard', 'win', 'turn', 'countri', 'around'], tags=['6']),
 TaggedDocument(words=['realdonaldtrump', 'love', 'u', 'trump', 'famili', 'god', 'bless'], tags=['7']),
 TaggedDocument(words=['jodil792', 'stand', 'spread', 'presid', '2016', 'wake', 'amp', 

In [4]:
max_epochs = 100
vec_size = 20
alpha = 0.025

model = Doc2Vec(size=vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                dm =1)
  
model.build_vocab(tagged_data)




In [5]:
for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.iter)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha



iteration 0


  """


iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19
iteration 20
iteration 21
iteration 22
iteration 23
iteration 24
iteration 25
iteration 26
iteration 27
iteration 28
iteration 29
iteration 30
iteration 31
iteration 32
iteration 33
iteration 34
iteration 35
iteration 36
iteration 37
iteration 38
iteration 39
iteration 40
iteration 41
iteration 42
iteration 43
iteration 44
iteration 45
iteration 46
iteration 47
iteration 48
iteration 49
iteration 50
iteration 51
iteration 52
iteration 53
iteration 54
iteration 55
iteration 56
iteration 57
iteration 58
iteration 59
iteration 60
iteration 61
iteration 62
iteration 63
iteration 64
iteration 65
iteration 66
iteration 67
iteration 68
iteration 69
iteration 70
iteration 71
iteration 72
iteration 73
iteration 74
iteration 75
iteration 76
iteration 77
iteratio

In [6]:
test_data = word_tokenize("mexico wall year great country")

v1 = model.infer_vector(test_data)
print("V1_infer", v1)

# to find most similar doc using tags
similar_doc = model.docvecs.most_similar("1")
print(similar_doc)


# to find vector of doc in training data using tags or in other words, printing the vector of document at index 1 in training data
print(model.docvecs['1'])



V1_infer [-0.01242636 -0.08125022 -0.06801497  0.0438619   0.06372025 -0.15879166
  0.09508386  0.03828781 -0.24038483  0.18821862 -0.15045753 -0.10396171
 -0.06249496  0.13536678 -0.00690989 -0.04156863 -0.0214142   0.09683295
 -0.00984927  0.01431439]
[('3', 0.9048179388046265), ('4201', 0.8213292360305786), ('2398', 0.8120646476745605), ('2128', 0.7824974060058594), ('2124', 0.7821071147918701), ('2991', 0.7740702629089355), ('2210', 0.761194109916687), ('4221', 0.7600843906402588), ('331', 0.7532159090042114), ('1672', 0.751181423664093)]
[-1.366704    1.0714912   0.16714197 -0.80326587 -0.42610678 -0.6153329
 -0.06480563  1.7138712  -5.593838    0.07779804 -1.6117653   0.70242065
 -1.9184773   2.4012358  -2.771469   -2.0525253  -0.9499078  -0.34409392
  0.47594926  1.5390769 ]


  if np.issubdtype(vec.dtype, np.int):


In [7]:
from sklearn.metrics import silhouette_score

start_alpha=0.01
infer_epoch=1000

X=[]
for d in sentences:
     
    X.append( model.infer_vector(d, alpha=start_alpha, steps=infer_epoch) )

k=5
 
from sklearn.cluster import Birch
 
brc = Birch(branching_factor=50, n_clusters=k, threshold=0.1, compute_labels=True)
brc.fit(X)
 
clusters = brc.predict(X)
 
labels = brc.labels_
 
print ("Clusters: ")
print (clusters)
 
 
silhouette_score = silhouette_score(X, labels, metric='euclidean')
 
print ("Silhouette_score: ")
print (silhouette_score)

#silhouette
#0.140 noStop_modded   5 cluster
#0.169 stemmed (no counted)
#0.118 temmed_counted

Clusters: 
[0 0 1 ... 2 1 2]
Silhouette_score: 
0.11873257


  return distances if squared else np.sqrt(distances, out=distances)


In [11]:
data1 = data[data["year"] == 2016]
data1["labels"] = labels
data1.drop(["id_str","in_reply_to_user_id_str","is_retweet","source","created_at"],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


KeyError: "['id_str' 'in_reply_to_user_id_str' 'is_retweet' 'source' 'created_at'] not found in axis"

In [12]:
data1[data1["labels"] == 3]

Unnamed: 0,created_at,favorite_count,id_str,in_reply_to_user_id_str,is_retweet,retweet_count,source,text,modded_text,@johnboehner,...,@cnn,month,year,hour,week_year,date,stemmed,noStop_modded,stemmed_counted,labels
25973,2016-01-01 23:06:09,3066,683061678246903808,,False,893,Twitter for Android,"""@JodiL792: We are Standing with you! Spreadin...",jodil792 we are standing with you spreading th...,0,...,0,1,2016,23,53,2016-01-01,jodil792 stand spread wordtrump presid 2016 wa...,jodil792 standing spreading wordtrump presiden...,jodil792 stand spread presid 2016 wake amp sle...,3
25976,2016-01-01 23:24:13,5481,683066224251645952,,False,1688,Twitter for Android,I will be going to Mississippi tomorrow night ...,i will be going to mississippi tomorrow night ...,0,...,0,1,2016,23,53,2016-01-01,go mississippi tomorrow night hear crowd go ma...,going mississippi tomorrow night hear crowds g...,go mississippi tomorrow night hear crowd go ma...,3
25981,2016-01-02 04:17:23,6363,683140004625936384,,False,2145,Twitter for iPhone,Massive crowds expected in Mississippi tomorro...,massive crowds expected in mississippi tomorro...,0,...,0,1,2016,4,53,2016-01-02,massiv crowd expect mississippi tomorrow night...,massive crowds expected mississippi tomorrow n...,massiv crowd expect mississippi tomorrow night...,3
25989,2016-01-02 22:42:16,5756,683418056417251328,,False,1609,Twitter for iPhone,"Heading to Biloxi, Mississippi. Massive crowds...",heading to biloxi mississippi massive crowds e...,0,...,0,1,2016,22,53,2016-01-02,head biloxi mississippi massiv crowd expect th...,heading biloxi mississippi massive crowds expe...,head biloxi mississippi massiv crowd expect th...,3
25992,2016-01-03 04:45:45,4528,683509528453869568,,False,1706,Twitter for Android,"""@Granite_Hope: @brandonstinney How can you de...",granite_hope brandonstinney how can you deny t...,0,...,0,1,2016,4,53,2016-01-03,granite_hop brandonstinney deni thi facthillar...,granite_hope brandonstinney deny facthillarycl...,deni thi barackobama two peopl made isi big pr...,3
25993,2016-01-03 04:53:31,5815,683511482684936192,,False,2472,Twitter for iPhone,"Thank you #Biloxi, #Mississippi! Remember this...",thank you biloxi mississippi remember this nig...,0,...,0,1,2016,4,53,2016-01-03,thank biloxi mississippi rememb thi night amp ...,thank biloxi mississippi remember night amp sp...,thank biloxi mississippi rememb thi night amp ...,3
26000,2016-01-03 16:45:35,6401,683690682649833472,,False,2307,Twitter for iPhone,"Thank you for your support in Biloxi, MS! Let'...",thank you for your support in biloxi ms lets a...,0,...,0,1,2016,16,53,2016-01-03,thank support biloxi ms let get amp vote 2016 ...,thank support biloxi ms lets get amp vote 2016...,thank support biloxi ms let get amp vote 2016 ...,3
26001,2016-01-03 17:55:26,6775,683708260784967680,,False,2560,Twitter for Android,"Does anybody remember when Bill Clinton, in 20...",does anybody remember when bill clinton in 200...,0,...,0,1,2016,17,53,2016-01-03,doe anybodi rememb bill clinton 2008 work long...,anybody remember bill clinton 2008 worked long...,doe anybodi rememb bill clinton 2008 work long...,3
26002,2016-01-03 18:13:29,3861,683712803392323584,,False,1343,Twitter for Android,".@chucktodd said today on @meetthepress that ""...",chucktodd said today on meetthepress that atta...,0,...,0,1,2016,18,53,2016-01-03,chucktodd said today meetthepress attack bill ...,chucktodd said today meetthepress attacking bi...,chucktodd said today meetthepress attack bill ...,3
26011,2016-01-04 17:31:48,6957,684064699760115712,,False,2209,Twitter Web Client,"I look forward to being in Lowell, Massachuset...",i look forward to being in lowell massachusett...,0,...,0,1,2016,17,1,2016-01-04,look forward lowel massachusett today hear ver...,look forward lowell massachusetts today hear b...,look forward lowel massachusett today hear ver...,3


In [10]:
from sklearn.cluster import KMeans

true_k = 300
kmeans = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=10)
kmeans.fit(X)
print(kmeans.inertia_)
#con l'ultima maniera è più bassa di 1000 rispetto a fede, viene tipo 2400 alei 3800
print("Top terms per cluster:")
#d = silhouette_score(X, kmeans.labels_,metric='euclidean')

172769.45666759193
Top terms per cluster:
