In [1]:
import random
import numpy as np
import pickle

from sklearn.decomposition import LatentDirichletAllocation
from sklearn import preprocessing as ppr
from scipy import sparse

In [2]:
def tfidf(count_matrix):
    """Turn a document-term count matrix into an L2-normalised tf-idf matrix.

    Parameters
    ----------
    count_matrix : array-like of shape (n_documents, n_terms)
        Number of occurrences of every term in every document.

    Returns
    -------
    numpy.ndarray of shape (n_documents, n_terms)
        ``tf * idf`` passed through ``ppr.normalize(..., norm='l2')``, where
        ``tf`` is the raw count and
        ``idf = log((n_documents + 1) / (doc_freq + 1)) + 1``.
    """
    counts = np.asarray(count_matrix, dtype=float)
    total_number_of_documents, _ = counts.shape

    # Document frequency: in how many documents each term occurs at least once.
    documents_containing_the_word = np.count_nonzero(counts, axis=0)

    # The +1 in numerator and denominator keeps the ratio non-zero for terms
    # that never occur, so the reciprocal below is always finite.
    df = (documents_containing_the_word + 1) / (total_number_of_documents + 1)
    idf = np.log(np.reciprocal(df)) + 1

    # Broadcasting scales column j of every row by idf[j] in a single step,
    # replacing the former per-document Python loop.
    tf_idf = counts * idf

    tf_idf = ppr.normalize(tf_idf, norm='l2')
    return tf_idf

In [20]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted feature names of every topic in ``model``.

    ``model.components_`` is iterated row by row; for each row the
    ``n_top_words`` entries with the largest values are looked up in
    ``feature_names`` and printed one per line under a 1-based
    ``Topic <k>`` header, followed by blank lines as a separator.
    """
    for topic_number, weights in enumerate(model.components_, start=1):
        # Sort ascending, flip to descending, keep the leading entries.
        ranked = weights.argsort()[::-1][:n_top_words]
        header = "Topic %d \n" % topic_number
        body = "\n".join([feature_names[idx] for idx in ranked])
        print(header + body)
        print('\n')
    print()

In [4]:
# Read every pseudo-document into memory, one list entry per line of the file
# (readlines() keeps the trailing newline of each line).
# The context manager closes the file even if reading raises.
with open('LDA_pseudodocuments.train' , 'r') as data_file:
    lines = data_file.readlines()
print(len(lines))
# Optional seeded shuffle of the documents, currently disabled:
# random.Random(78).shuffle(lines)

45388


In [5]:
# Load the vocabulary file (one token per line) and build both lookup tables.
# The first three lines are skipped, so a token's id is its line index minus 3.
voc_dict = {}       # id -> token
voc_dict_inv = {}   # token -> id
with open('vocab.bpe.from','r') as f:
    keys = f.read().splitlines()

for line_index, key in enumerate(keys):
    if line_index > 2:
        token_id = line_index - 3
        voc_dict[token_id] = key
        voc_dict_inv[key] = token_id

# Total number of lines in the vocabulary file, skipped ones included;
# later cells read `i`, so it is kept under that name.
i = len(keys)
print('\nVocabulary size:')
print(i)


Vocabulary size:
15003


In [6]:
# Dense count matrix: one row per pseudo-document, one column per vocabulary id.
# The width is taken from the vocabulary itself instead of the counter `i`
# left behind by the vocabulary cell: later cells rebind `i` as a loop index,
# so re-running this cell out of order would otherwise get the wrong width.
count_matrix = np.zeros([len(lines), len(voc_dict)])

for l, pseudodocument in enumerate(lines):
    words = pseudodocument.split(' ')
    for w in words:
        # Splitting on single spaces can leave a bare '\n' token at the end of
        # a line (presumably every line ends with ' \n' -- confirm); skip it,
        # it is not a vocabulary entry.
        if w != '\n':
            count_matrix[l, voc_dict_inv[w]] += 1

In [7]:
# Display the dimensions of the count matrix:
# (number of pseudo-documents, number of vocabulary ids).
count_matrix.shape

(45388, 15000)

# Κόβουμε όσα υπερβαίνουν την ζητούμενη συχνότητα 0.9

In [10]:
# Remove every term that appears in at least MAX_DOC_FREQUENCY of the
# pseudo-documents and re-index the surviving terms contiguously from 0.
# All sizes are derived from count_matrix, so the cell no longer depends on
# hard-coded dimensions (including the number of survivors, which is only
# known after the filter has run).
MAX_DOC_FREQUENCY = 0.9

n_documents = count_matrix.shape[0]

# Fraction of documents in which each term occurs at least once.
doc_frequency = np.count_nonzero(count_matrix, axis=0) / n_documents
kept_columns = np.flatnonzero(doc_frequency < MAX_DOC_FREQUENCY).tolist()

# Surviving tokens, in their original vocabulary order.
small_keys = [voc_dict[column] for column in kept_columns]

voc_dict_clean = dict(enumerate(small_keys))                                    # new id -> token
voc_dict_inv_clean = {token: new_id for new_id, token in enumerate(small_keys)}  # token -> new id

# Copy of the kept columns only, in the same order as small_keys.
count_matrix_small = count_matrix[:, kept_columns]

counter = non_stop_words = len(small_keys)
print(non_stop_words)

14599


In [11]:
# small_keys

In [12]:
# Weight the filtered count matrix with the tfidf() helper defined above
# (returns a matrix of the same dimensions).
tfidf_pseudodocuments_small = tfidf(count_matrix_small)

# Grid Search

In [11]:
# dt = [0.5, 0.7, 1]
# tw = [0.01, 0.05, 0.1, 0.5]

# for d in dt:
#     for t in tw:
#         print('\n\n FOR doc_topic_prior = '+ str(d) +' topic_word_prior =  '+str(t)+ '............  \n\n')
#         lda = LatentDirichletAllocation(n_components=40, doc_topic_prior=d, topic_word_prior=t, learning_method = 'online', random_state = 42)

#         lda.fit(tfidf_pseudodocuments_small)
        
#         print_top_words(lda, small_keys, 12)



 FOR doc_topic_prior = 0.5 topic_word_prior =  0.01............  


Topic #0: ▁water ▁food ▁eat ▁weight ▁eating ▁meat ▁fat ▁coffee ▁drink ▁cheese ▁chicken ▁milk


Topic #1: ▁song ▁album ▁smoke ▁songs ▁drugs ▁dick ▁Sim ▁weed ▁plant hh ▁awkward ▁smoking


Topic #2: AN IN Q ▁review AD IT AL ▁log ▁AT OW U ▁THE


Topic #3: ▁bot ▁Gen ▁crowd ▁banned ▁enjoyed ▁minimum ▁obvious ▁mis ▁kicked ▁stupid box els


Topic #4: ▁stream com Man ▁stage ▁boss ▁quit ik ▁fighting ▁shield ▁Le ▁epic ora


Topic #5: ▁PC ▁min ▁reasonable ▁chat ep ▁companies ▁mode game ▁account ▁require ▁relatively ▁ban


Topic #6: ▁map OT RE ▁character ▁IN ▁counter ▁art ▁shoot AT ▁army ▁Japan ▁PS


Topic #7: ▁engine ▁grind ▁comp ▁bug ▁kills ability ▁air omb ▁plane & ▁fly ▁war


Topic #8: ▁majority ▁Japanese ▁related ▁gen ▁Mon ▁talked ▁rem ▁fore ▁Time ony ▁truly ▁cycle


Topic #9: ▁driver ▁rent ▁truck ▁camera ▁laughing ▁pat ▁lazy ▁display ▁tour ▁office ▁itself ene


Topic #10: ▁game ▁play ▁games ▁players ▁playing ▁played ▁player

Topic #0: ▁game ▁games ▁play ^ ▁character ▁characters ▁players ~ ▁playing ▁damage ▁her ▁V


Topic #1: ▁Bojack ▁episode ▁Bitcoin uj ▁crypto ▁palette ▁BTC message contact ▁she ▁episodes ▁Doctor


Topic #2: ▁Bitcoin ▁Bojack ▁episode ^ ▁crypto ▁BTC uj ▁palette message contact ▁her ▁she


Topic #3: ▁Bojack ▁Bitcoin uj ▁episode ▁crypto ▁BTC ▁palette ▁she message contact ^ ▁episodes


Topic #4: ▁Bojack ▁episode uj ^ ▁Bitcoin ▁palette ▁she ▁crypto message contact ▁her ▁BTC


Topic #5: ▁Bitcoin ▁BTC ▁crypto ▁Bojack ▁episode uj ▁palette coin message chain ▁bitcoin contact


Topic #6: ▁Bojack ▁Bitcoin ▁episode ▁crypto uj ▁BTC ^ ▁palette message ▁her contact ▁she


Topic #7: ▁Bitcoin ▁Bojack ▁episode ▁crypto ▁BTC uj ▁palette message contact ▁she ▁episodes ▁Doctor


Topic #8: ▁Bitcoin ▁Bojack ▁episode ▁crypto ▁BTC uj ▁palette message ▁she contact ^ ▁her


Topic #9: ▁Bojack ▁episode uj ▁palette ▁Bitcoin ^ ▁crypto message ▁she contact ▁BTC ▁episodes


Topic #10: ▁Bojack ▁Bitcoin ▁episode ▁crypto uj ▁

Topic #0: ▁phone ▁Apple 7 ▁Google ▁phones ▁iPhone ▁engine ▁app ▁battery ▁camera ▁price ▁screen


Topic #1: ▁water ▁meat ▁eat ▁food ▁cheese ▁dry ▁salt ▁plant ▁chicken ▁sauce ▁taste ▁fish


Topic #2: ▁golf ▁Friday ▁Ferrari ▁Howard EC ▁mark aze ▁9 ▁Em 7 ▁slam ▁longest


Topic #3: ▁accounts ▁money ▁Mueller ▁calls ▁Elon ▁job ▁cases ▁management ▁diversity ▁interview ▁case ▁sites


Topic #4: ▁fight ▁Khabib ▁fighting ▁fights ▁fighter ai ▁belt ~ ▁SP Z ▁fighters ou


Topic #5: ▁$ ▁car ▁pay ▁money ▁city ▁cars ▁company ▁driving ▁road ▁drive ▁insurance ▁rent


Topic #6: ▁War ▁character ar ath ▁DM ▁boss ▁level ▁kill ▁attack ▁war ▁damage ▁spell


Topic #7: ia ▁German ian ▁war ▁English ▁race ▁French ▁Roman ▁flag ▁Spanish ▁British ▁Italy


Topic #8: ▁her ▁subs ▁she elle ▁Japanese ▁Chad _ girl ▁idol ie aid ▁Part


Topic #9: ▁Emperor ▁fandom ▁Ain ▁shiny ▁Venom ▁Finally ▁Love ▁colors ror ▁downvotes ▁beauty YY


Topic #10: ▁team ▁game ▁season ▁teams ▁players ▁league ▁player ▁play ▁fans ▁defense ▁football ▁

Topic #0: ▁skins ▁update ▁engine ▁screen ▁button ▁skin ▁auto ▁map ▁hardware ▁pc ▁PC ▁Windows


Topic #1: ▁her ▁she ▁calories ▁clothes ▁myself ▁water ▁eat ▁dick ▁diet ▁weight ▁cal ▁flavor


Topic #2: ▁KD ▁golf ▁expectations ▁coast ▁mark ^ ▁Colorado ▁Friday ▁whoever ▁retired ▁soft oops


Topic #3: ▁corrupt ▁Florida ▁ped ▁primary ▁MAGA ▁she ▁supreme ortion ▁bias ▁Georgia ▁politically onald


Topic #4: ▁fight ▁Khabib ▁fights ▁fighting ▁game ana ▁fighter ▁stage ▁VR ▁rounds ▁banner ▁stream


Topic #5: ▁Trump ▁vote ▁government ▁political ▁Kavanaugh ▁state ▁party ▁Republicans ▁voting ▁election ▁country ▁GOP


Topic #6: ▁character ▁characters ▁Smash ▁Dark ▁War ▁Lu ▁IN ▁game ▁Fire ora ▁spells ▁Knight


Topic #7: ▁women ▁justice ▁Court ▁candidate ▁war ists ▁Ted ▁Democrat ism ▁proven archy ▁Nazi


Topic #8: ▁her ▁she ▁Pokemon ▁episode elle ▁trans ▁episodes < ▁idol ▁Japanese ▁crush ▁creepy


Topic #9: ▁she ▁Sox ▁her ▁Peter aim ▁Emperor ▁fandom ▁Halloween ▁Love ▁phot ERS ▁IM


Topic #10: ▁games ▁Nin

# Από το grid search επιλέγουμε doc-topic-prior = 0.7, topic-word-prior=0.01

In [13]:
# Final topic model: 40 topics, online variational updates, fixed seed.
lda = LatentDirichletAllocation(
    n_components=40,
    doc_topic_prior=0.7,
    topic_word_prior=0.01,
    learning_method='online',
    random_state=42,
)

# NOTE(review): scikit-learn describes LDA as a model over term counts, while
# the input here is the L2-normalised tf-idf matrix -- confirm this is intended.
lda.fit(tfidf_pseudodocuments_small)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=0.7,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=40, n_jobs=1, n_topics=None, perp_tol=0.1,
             random_state=42, topic_word_prior=0.01,
             total_samples=1000000.0, verbose=0)

In [21]:
# Show the 5 highest-weighted tokens of every fitted topic; small_keys maps
# the model's column indices back to token strings.
print_top_words(lda, small_keys, 5)

Topic 1 
▁game
▁play
▁players
▁map
▁PC


Topic 2 
▁gun
▁shoot
▁guns
▁bugs
▁Fallout


Topic 3 
▁golf
▁mark
▁beef
▁Ferrari
▁Turn


Topic 4 
▁accounts
▁cases
▁calls
▁dismiss
▁punishment


Topic 5 
▁fight
ana
▁Khabib
▁fighting
▁fights


Topic 6 
▁$
▁pay
▁city
▁money
▁car


Topic 7 
▁boss
▁attack
▁level
▁+
▁War


Topic 8 
▁trump
▁race
▁votes
▁flag
▁Dems


Topic 9 
▁Pokemon
▁her
anda
elle
▁Thanos


Topic 10 
▁Halloween
▁suit
▁horror
▁origin
▁MCU


Topic 11 
▁Toronto
q
▁LeBron
▁ND
▁finals


Topic 12 
vy
iny
▁Eminem
▁tracks
ble


Topic 13 
▁game
▁games
]
▁card
▁cards


Topic 14 
▁WWE
▁match
ack
▁Warriors
▁Rock


Topic 15 
SU
▁wrestling
▁neither
▁Lewis
▁CF


Topic 16 
^
▁^
▁u
▁upvote
▁memes


Topic 17 
▁Star
lo
▁Wars
▁Paul
▁star


Topic 18 
▁car
▁cars
▁water
▁ride
▁Conor


Topic 19 
^
~
▁K
ak
▁anime


Topic 20 
▁rep
zer
ore
oud
▁Liverpool


Topic 21 
▁wood
▁Bojack
▁Bama
▁Lakers
▁AC


Topic 22 
▁movie
▁show
▁episode
▁movies
▁scene


Topic 23 
▁her
▁she
▁women
▁sex
▁She


Topic 24 
▁phone
▁Apple


In [16]:
# tfidf_pseudodocuments.shape
# Sanity check: topic-word weight matrix is (number of topics, number of kept terms).
print(lda.components_.shape)

(40, 14599)


In [19]:
# Expand the fitted topic-word weights back onto the full vocabulary file, so
# that row r of LDA_matrix belongs to line r of 'vocab.bpe.from'.
# Lines whose token is not in voc_dict_inv_clean keep an all-zero row.
tr_lda = np.transpose(lda.components_)   # (number of kept terms, number of topics)

with open('vocab.bpe.from','r') as f:
    keys = f.read().splitlines()

# Shape comes from the vocabulary file and the fitted model rather than from
# hard-coded numbers, so a different vocabulary size or topic count still works.
LDA_matrix = np.zeros([len(keys), tr_lda.shape[1]])

for i, key in enumerate(keys):
    if key in voc_dict_inv_clean:
        LDA_matrix[i,:] += tr_lda[voc_dict_inv_clean[key],:]

In [26]:
# Serialise the per-token topic-weight matrix to disk with pickle.
with open('LDA_vectors.pickle', 'wb') as out_file:
    pickle.dump(LDA_matrix, out_file)

In [None]:
# All-zero matrix of shape (15003, 40), written to its own pickle file
# (presumably a "no topic information" baseline -- confirm).
# It gets its own name: rebinding LDA_matrix here would silently replace the
# real topic vectors, and re-running the earlier save cell afterwards would
# overwrite 'LDA_vectors.pickle' with zeros.
zero_LDA_matrix = np.zeros([15003, 40])

with open('zero_LDA_vectors.pickle', 'wb') as h:
    pickle.dump(zero_LDA_matrix, h)