In [1]:
import random
import numpy as np
import pickle

from sklearn.decomposition import LatentDirichletAllocation
from sklearn import preprocessing as ppr
from scipy import sparse

In [2]:
def tfidf(count_matrix):
    '''Turn a document-term count matrix into an L2-normalised tf-idf matrix.

    Parameters
    ----------
    count_matrix : 2-D dense array of shape (N_doc, N_term)
        Number of occurrences of every term in every document.

    Returns
    -------
    2-D float array of shape (N_doc, N_term)
        tf-idf weights with every row scaled to unit L2 norm
        (all-zero rows are left as zeros by the normaliser).

    Notes
    -----
    The inverse document frequency is smoothed:
        idf = ln((N_doc + 1) / (doc_count + 1)) + 1
    so terms that occur in no document do not cause a division by zero.
    '''
    # Unpacking also rejects inputs that are not two-dimensional.
    total_number_of_documents, _ = count_matrix.shape

    # Number of documents in which each term occurs at least once.
    documents_containing_the_word = np.count_nonzero(count_matrix, axis=0)

    df = (documents_containing_the_word + 1) / (total_number_of_documents + 1)
    idf = np.log(np.reciprocal(df)) + 1

    # Broadcasting scales every row by the idf vector in a single pass,
    # replacing the former per-document Python loop and its pre-allocated buffer.
    tf_idf = np.multiply(count_matrix, idf)

    # tf_idf is a fresh local array, so it can be normalised in place
    # instead of allocating yet another full-size copy.
    return ppr.normalize(tf_idf, norm='l2', copy=False)

In [3]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic of a fitted model.

    model         : object exposing ``components_`` (one weight row per topic).
    feature_names : sequence mapping a column index to its term string.
    n_top_words   : how many terms to list per topic, strongest first.

    Each topic is printed on its own line followed by blank lines; one more
    blank line is printed after the last topic.
    """
    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading entries.
        strongest = topic.argsort()[::-1][:n_top_words]
        top_terms = [feature_names[term_id] for term_id in strongest]
        print("Topic #%d: " % topic_idx + " ".join(top_terms))
        print('\n')
    print()

In [4]:
# Read every line of the training file into memory. The context manager
# releases the file handle even if reading raises, which the previous
# open()/close() pair did not guarantee.
with open('LDA_pseudodocuments.train' , 'r') as data_file:
    lines = data_file.readlines()
print(len(lines))
# Optional deterministic shuffle of the lines (currently disabled):
# random.Random(78).shuffle(lines)

45388


In [5]:
# Two-way vocabulary maps: index -> term and term -> index.
voc_dict = {}
voc_dict_inv = {}

with open('vocab.bpe.from','r') as f:
    keys = f.read().splitlines()

# The first three lines of the vocabulary file are skipped, so term indices
# start at 0 on the fourth line (presumably the first three are special
# tokens - TODO confirm).
for line_no, key in enumerate(keys):
    if line_no < 3:
        continue
    voc_dict[line_no - 3] = key
    voc_dict_inv[key] = line_no - 3

# `i` is the total number of lines in the vocabulary file (skipped ones
# included); the count-matrix cell sizes its columns as i-3.
i = len(keys)
print('\nVocabulary size:')
print(i)


Vocabulary size:
15003


In [6]:
# Dense document-term count matrix: one row per pseudo-document,
# one column per vocabulary term (the three skipped entries excluded).
count_matrix = np.zeros([len(lines), i-3])

for row, pseudodocument in enumerate(lines):
    for token in pseudodocument.split(' '):
        # Splitting on single spaces leaves the newline as a token of its
        # own; it is not a vocabulary term.
        if token == '\n':
            continue
        count_matrix[row, voc_dict_inv[token]] += 1

In [7]:
# Sanity check: (number of pseudo-documents, number of vocabulary terms).
count_matrix.shape

(45388, 15000)

# Drop the terms whose document frequency reaches the requested 0.9 threshold (terms present in 90% or more of the pseudo-documents)

In [8]:
# Terms that occur in this fraction of the documents or more are treated as
# stop words and removed from the vocabulary.
MAX_DOC_FREQ = 0.9

# Derive the sizes from the data instead of hard-coding them, so the cell
# keeps working when the corpus or the vocabulary changes.
n_docs, n_terms = count_matrix.shape

# Fraction of documents in which each term occurs at least once.
doc_freq = np.count_nonzero(count_matrix, axis=0) / n_docs
kept_term_ids = np.flatnonzero(doc_freq < MAX_DOC_FREQ).tolist()

small_keys = []
voc_dict_clean = {}
voc_dict_inv_clean = {}

# Re-index the surviving terms contiguously from 0, in their original order.
for new_id, old_id in enumerate(kept_term_ids):
    term = voc_dict[old_id]
    print(term)
    small_keys.append(term)
    voc_dict_clean[new_id] = term
    voc_dict_inv_clean[term] = new_id

# Columns of the surviving terms, in the same order as small_keys.
count_matrix_small = np.take(count_matrix, kept_term_ids, axis=1)

non_stop_words = len(small_keys)
counter = non_stop_words
print(non_stop_words)

▁her
▁she
▁game
▁play
▁team
▁money
▁K
▁She
7
▁$
▁show
▁games
^
▁W
▁against
T
M
P
▁world
▁E
▁9
▁sub
ar
▁&
▁name
▁season
A
▁left
..
▁t
C
~
▁V
▁top
▁m
▁gonna
us
▁looks
▁Thanks
▁d
ies
L
(
▁guys
▁b
▁help
▁problem
▁comment
▁live
▁fun
▁call
G
▁J
▁8
▁yeah
▁week
▁run
w
▁playing
est
am
▁7
et
▁c
▁talking
v
▁car
▁pay
os
▁w
R
E
▁job
▁change
▁g
▁looking
▁free
▁case
▁super
▁h
ie
▁nice
ia
▁l
▁days
▁using
▁hate
▁full
▁means
▁O
▁yet
▁players
▁buy
z
▁story
▁care
▁win
▁side
ant
▁between
▁head
▁watch
▁For
▁school
F
▁stop
ist
▁women
▁heard
ent
ion
▁kids
▁each
O
▁Can
ch
▁called
im
▁wanted
▁second
N
#
ity
H
▁home
▁aren
▁Do
▁that’s
▁mind
▁half
▁Trump
▁cool
▁came
▁power
▁didn’t
▁hit
▁st
▁thinking
▁exactly
▁level
▁I’ve
▁Like
▁n
el
W
▁All
▁likely
ated
▁saw
K
▁played
▁started
▁US
▁To
▁^
▁That’s
▁basically
▁etc
▁says
▁Then
▁happened
▁ones
ating
▁line
▁k
▁weird
▁OP
▁friends
▁hand
▁needs
▁found
▁Don
▁during
▁friend
▁fine
▁rather
ad
▁player
▁happen
▁took
▁character
ism
▁white
▁Because
▁s
ur
▁At
id
▁seem
▁yes
▁must
▁so

▁decision
▁morning
ase
off
oh
▁ya
▁Ok
▁role
izing
▁pop
that
▁plus
▁minute
▁ret
▁mess
▁AND
▁board
▁laws
▁loss
▁Someone
▁holding
ere
▁mental
▁Definitely
▁Imagine
▁Canada
▁became
▁noticed
ld
ame
▁she’s
▁harder
▁De
▁bitch
▁info
▁born
▁Except
▁rep
▁ind
▁beautiful
▁built
ified
ock
az
▁evil
▁mar
▁data
▁continue
▁exp
com
end
IT
▁none
▁sitting
▁You’re
▁stories
AN
uff
eg
ows
▁sold
▁suck
my
▁switch
▁anime
▁finish
▁jump
Q
▁father
▁su
res
▁moved
▁ins
▁tank
ren
ved
▁released
like
ents
▁cold
▁likes
▁film
▁opposite
▁ship
▁interest
▁shame
▁ahead
▁politics
▁porn
▁rid
▁offense
▁mistake
▁wins
▁expected
▁Nice
▁four
ities
▁Fl
▁cap
ones
▁offer
bl
gr
▁wild
▁possibly
ered
▁skill
▁constantly
▁draw
ck
▁barely
▁shut
▁paper
▁voting
ives
▁realized
▁page
▁hilarious
which
▁perfectly
▁focus
▁Seems
▁Q
ai
▁faster
▁Google
▁Big
▁jobs
▁research
▁afford
ster
▁imp
▁China
ss
▁weapons
▁statement
ble
▁min
▁theory
ages
▁military
ok
▁There’s
▁husband
▁asshole
▁tough
▁arm
▁regardless
▁excited
▁despite
▁cash
▁create
▁TO
ces
▁Someti

uss
▁claims
ction
▁fights
▁frame
ef
▁greatest
▁scenario
▁aside
▁cats
▁solo
▁competitive
▁generation
▁accident
▁gang
▁rough
game
▁guilty
▁economy
Man
▁responsible
▁Out
reat
ower
OU
▁leader
▁enjoyed
▁steal
with
▁challenge
▁metal
▁dev
Th
▁Pre
▁civil
▁glass
omb
IL
▁quest
▁Twitter
▁casual
ipp
▁religious
▁defensive
ode
hip
▁Joe
▁appear
▁mission
▁Bar
▁Better
▁tag
▁RB
▁explanation
▁products
▁equal
▁mark
▁gym
▁personality
▁covered
▁edge
▁yours
ears
▁significant
▁bug
▁sur
▁EDIT
▁Ford
▁highest
▁att
▁threw
▁pizza
ining
▁brown
▁dunno
▁Many
▁Glad
▁wont
▁perhaps
OM
▁location
▁sport
▁hat
▁camp
AS
ires
▁gaming
ern
▁Anyone
▁particularly
▁yea
▁Mc
▁pushing
▁flying
▁custom
▁FBI
▁Australia
unt
▁HE
▁United
▁goals
▁opening
ishing
▁decade
▁fantasy
▁neck
▁ticket
aining
ring
▁foreign
▁decisions
▁stretch
▁drops
▁empty
itch
board
▁dig
▁Two
ending
▁dudes
▁factor
▁paint
ead
▁via
iting
▁era
ups
▁log
iest
▁Game
▁enemies
▁bringing
ology
rist
af
▁busy
▁parent
rated
ung
bit
key
min
▁pat
▁Has
▁AT
ret
oooo
▁Id
▁console
▁ad

ji
ulated
▁bias
▁object
iced
▁canon
▁confidence
aur
▁youtube
▁invest
ilt
▁responsibility
▁Marvel
VE
▁pics
▁boot
▁addition
▁ac
▁Mal
▁SU
▁vegan
ada
ords
nder
▁existence
▁SA
alt
▁CH
▁bas
oud
▁region
▁Jim
▁Senate
▁DE
ext
aff
▁relationships
▁global
▁wave
▁Chris
▁typical
▁existed
▁steps
▁viol
▁print
▁WW
▁snap
▁thousand
▁percent
▁whoever
ym
▁actor
usion
itt
▁consent
▁bugs
▁answers
▁sec
▁politicians
▁comics
▁announced
▁mentally
▁poster
▁Season
lers
edly
▁Char
▁policies
▁div
▁expansion
▁Sun
oof
▁trap
▁network
▁Ass
▁wonderful
▁aspect
▁bum
chan
▁Top
cast
▁Funny
izz
▁loose
▁collection
▁compl
rest
▁smash
inc
▁height
AA
▁Pet
▁developed
▁focused
▁smooth
ention
ford
▁international
▁Air
▁skins
▁Considering
ags
▁Ev
some
▁exception
▁Sy
▁elite
▁angle
▁critical
▁Death
gen
▁testing
▁qual
▁initial
▁creating
zz
CE
▁NBA
▁tanks
▁Act
▁Ro
|
▁measure
▁ate
▁exclusive
▁charged
▁reveal
▁plate
▁hom
▁weren’t
▁detect
▁adds
pre
▁counts
lines
▁Sadly
▁serve
▁acceptable
▁Mil
▁Ant
▁corrupt
▁rise
▁Kinda
▁legitimate
arian
gl
▁

▁Kevin
▁MA
▁plants
esting
▁viable
▁ordered
▁rh
▁FF
▁Clearly
▁opponents
▁proceed
▁Des
▁monsters
▁biased
▁Nick
▁uncle
▁sexy
▁ears
▁abs
▁artists
▁ham
iders
▁WWE
▁reasoning
▁fee
dle
▁seed
▁they’d
▁Washington
▁MM
▁funding
▁overl
▁technical
▁teaching
▁Bet
▁Turn
lessly
▁trolling
▁programs
▁bombs
▁broad
icted
▁Kn
▁quests
▁electric
▁rational
part
▁exam
▁ive
▁Use
▁Knight
▁NYC
ami
▁resist
▁crush
▁honor
▁fetish
ogan
▁removing
▁mentality
iger
▁permanent
▁FO
▁childhood
oney
▁Ireland
▁soldiers
▁combination
▁spl
itude
▁pays
▁witch
▁root
maybe
▁Women
▁dozen
▁tack
▁goo
▁emb
▁championship
▁Hill
water
ivity
▁unp
▁cousin
got
▁Half
apt
▁Pokemon
▁)
▁pathetic
war
▁affected
▁Ye
▁desperate
▁discovered
▁generic
▁Toronto
aka
▁starter
▁handed
▁rifle
err
▁ra
▁footage
▁orders
acking
▁blocked
▁remembered
▁Thor
▁Ter
▁Cur
▁screenshot
anda
away
▁hon
▁toward
▁teammates
▁Dam
▁decl
▁links
▁Pol
▁calories
▁prem
▁facing
there
▁sweat
▁grad
▁ultimately
▁Neither
ummy
▁audio
▁XD
▁Besides
▁esc
▁Sad
each
▁eh
▁replied
▁Pal
▁Poor
▁la

fit
▁vanilla
▁string
vent
utes
▁planes
▁fewer
▁CP
▁leadership
▁Titan
erved
▁Wall
▁MT
BC
▁sized
▁SF
rid
▁pra
▁Chad
▁empire
VER
▁lip
▁responding
elling
▁fourth
TF
MM
▁Kim
▁SJW
▁boots
▁Acc
▁II
▁UFC
ether
inity
▁silent
▁Queen
▁cruel
▁inches
▁communication
▁bab
▁spoke
idents
▁surprisingly
▁chosen
▁dates
▁alleg
▁chunk
▁funds
sed
▁Jul
▁waited
▁caring
▁CPU
▁insight
▁wr
▁broadcast
▁worker
▁Off
▁Hello
▁remains
▁Ins
ession
▁loaded
ully
▁stra
▁hurting
▁douche
▁Cup
▁African
▁protected
▁flesh
▁Conf
▁representation
ume
▁Ka
▁Jedi
por
▁panel
▁Bel
▁rewatch
▁Username
erry
▁ble
▁Isn’t
irm
▁quant
▁responded
▁communities
▁Took
▁peoples
▁Inst
▁doc
AME
▁manual
▁challenges
▁snake
uffy
▁atm
▁Anyway
hole
▁Thursday
▁encounter
▁Pen
▁albums
▁casting
iving
▁Luckily
▁intentional
▁performed
SS
ston
ez
▁occup
▁queen
hey
▁Perfect
▁assets
imp
riv
▁wasting
ira
azz
tern
rad
▁CEO
▁Cha
ray
▁invented
▁tem
▁Adam
▁wrap
▁Hes
rif
icle
▁Cheers
fort
▁Ren
▁blade
lee
▁La
▁Inc
▁closely
ho
▁exclusively
▁shitpost
▁rein
▁severe
▁chief
ro

▁investigate
▁lil
▁spreading
will
▁extend
▁Whats
▁powder
▁School
▁welfare
arians
▁Using
▁agg
Can
▁campus
▁wizard
▁autism
▁searching
▁Point
▁ann
▁KD
▁corrected
▁cle
▁Ba
▁Assuming
▁joining
▁politically
▁Mind
▁excuses
▁floating
▁extension
NC
▁accomplish
▁NHL
▁Enjoy
▁Mod
▁serial
▁adopted
▁underestimate
▁impeach
athetic
▁prepare
▁technique
▁“f
▁Note
▁versus
otes
new
▁deploy
▁aver
ND
wor
▁moderate
PC
roy
▁restr
▁vers
▁Mary
onald
▁cure
▁bal
imal
isa
▁Completely
▁happily
▁luxury
▁receiver
pot
▁separ
PP
▁german
▁opens
itty
▁friendship
▁marks
▁preference
onic
▁voices
▁brick
▁Na
▁camps
▁spir
▁appreci
▁seemingly
itting
▁evolution
▁tackle
iner
▁defence
▁icon
▁goods
AST
▁trucks
▁preseason
leting
▁que
something
lyn
▁dynamic
▁incel
▁fulf
▁Coll
▁poverty
▁cou
▁chew
▁SB
▁troops
▁Happ
ras
▁theyre
▁lightning
▁reform
▁Eminem
▁NEVER
▁Jay
▁Everybody
▁Shadow
▁ultra
▁essential
▁RO
irect
▁ward
▁computers
aaa
▁qualify
▁ly
▁Nike
ulations
▁notch
cking
▁leather
atable
▁dated
▁application
▁dominant
▁hal
▁Open
▁Hol
▁i

▁During
▁importantly
▁urban
▁differ
▁knight
▁cried
▁rejected
▁advent
▁worn
▁resource
▁treats
umblr
▁centre
▁whites
ESS
▁resem
▁Turkey
▁Virginia
▁liter
▁sits
▁participate
▁dur
▁arrow
▁Odd
▁headed
▁sympathy
build
▁Graham
cers
▁Thankfully
omed
▁worship
▁Emperor
▁afternoon
▁Speaking
▁trauma
▁Rid
▁Street
▁fascinating
▁drank
▁crossing
▁contin
▁Hall
▁banana
▁chasing
▁Ark
▁surrounded
▁Os
icide
▁heroin
arter
fting
▁incels
▁tent
ties
▁Tbf
supp
▁exploit
aph
▁ESPN
▁fabric
▁MAN
▁Patrick
▁lawn
▁heading
ushing
rooms
▁expense
▁Cos
▁AA
▁doll
▁mach
▁slept
▁Lear
▁sane
▁fallout
bly
▁mold
▁Horde
▁platforms
eaking
▁buried
fers
▁Bills
▁entering
▁rack
▁compens
vation
▁Greek
▁defenders
▁Colorado
▁reboot
▁horde
aire
▁strictly
▁smoked
uns
▁belly
HAHA
▁enforce
ption
▁Baby
▁wiki
▁Alliance
▁Davis
▁Bruce
▁browser
▁efforts
▁cri
jin
ument
▁axe
▁arri
▁cultures
▁promotion
gb
▁hooked
▁papers
▁french
▁sniper
▁hats
▁aunt
▁dice
’re
▁texts
▁couldnt
velop
▁tube
▁refuses
agging
▁lat
▁frequent
damn
▁clim
▁ping
▁ut
▁competing
▁F

▁ML
UST
▁Mercedes
▁Atl
▁slice
▁operation
▁mates
THING
▁MORE
▁infuriating
▁dat
actually
▁“D
▁Egypt
▁disrespectful
▁paranoid
▁Throw
▁moderators
▁poss
▁magazine
secut
▁engaging
▁omg
wide
owski
▁TM
▁entered
screen
▁upgraded
tra
▁payments
▁pedest
▁arrive
▁TIME
▁flies
▁accountable
▁custody
▁countless
▁cores
▁highlights
▁Build
rous
▁corpse
▁misses
▁Canadians
ribute
ogy
▁Want
▁mechanical
▁darker
▁morons
▁cycl
▁layers
trans
aria
▁legisl
cket
▁organized
olver
▁pose
▁consumption
▁alpha
▁Calling
▁Perfectly
ports
▁Dra
▁Rockstar
▁Listen
▁bri
uber
▁sponsor
▁penet
oped
▁JP
▁dems
▁climbing
▁nig
itic
ria
▁locks
▁cis
▁Truly
▁inspiration
▁rainbow
▁wal
▁sued
▁lun
▁blatantly
▁obscure
oration
▁helicopter
inf
▁stare
▁Ahhh
▁You’ll
▁Wii
▁frustration
▁ally
OTH
▁Coast
▁We’ve
▁innoc
▁rhetoric
▁gifts
▁knives
▁targeting
▁choke
▁Tar
▁supplies
▁explod
▁jaw
▁grant
▁spinning
▁GF
▁jar
▁McD
▁Om
▁et
▁Jersey
▁mocking
▁neglect
▁HC
▁pant
▁wiped
▁inclined
▁profess
▁taller
▁“M
▁insert
▁diverse
ribution
▁skeptical
print
bid
▁Wol

▁Lucas
ifier
inch
▁popcorn
▁viewing
umer
Jo
▁vibes
▁airl
▁crunch
▁leverage
▁Times
▁font
▁admins
ussion
▁hipp
▁attraction
▁bee
▁hateful
igi
▁circlejerk
▁Barb
▁Rosen
▁boil
▁labeled
▁shallow
▁Doc
▁chickens
▁stamp
▁Barry
▁nomin
▁electoral
▁elimin
▁Cook
▁lacks
beard
▁whining
▁sessions
▁murdering
▁matched
▁Champ
▁grenade
▁mush
▁we’d
▁GOOD
▁Hah
▁idiotic
▁Fant
▁nerve
▁cookie
▁Spotify
▁pillow
▁minions
▁execut
▁underground
eat
ooking
▁lik
▁tension
uted
▁winners
▁refresh
▁tourist
Cl
ophobia
▁atheist
points
▁tid
plate
uition
▁holidays
▁overtime
▁BMW
▁remo
▁thankfully
▁princess
▁Food
▁rides
cell
▁Brew
igious
might
VP
▁WS
▁pity
icating
▁asset
make
▁cooldown
▁Ironically
ivid
▁Arsenal
▁noticing
inding
▁launcher
▁awake
▁wel
ptic
▁dot
fem
▁elves
▁Uncle
▁craw
▁flipping
▁robbed
▁Europeans
▁laundry
esty
olds
▁Btw
▁Galaxy
▁Blake
▁nonetheless
▁excessive
axx
▁lick
▁Cas
▁Ess
▁Bungie
▁inb
▁immature
rels
▁Cooper
olar
▁conver
▁indo
▁Government
▁Serena
▁entrance
vey
add
OTUS
▁HER
▁cul
▁Dodgers
▁partly
iri
▁Footbal

contact
▁crystal
bb
aughs
▁Seth
eee
▁passenger
round
▁achievements
dep
▁scumb
▁employers
▁intr
actions
▁Hea
▁Eren
▁BJ
graph
before
YS
▁humour
imer
▁Cow
▁Press
lords
▁alternatives
vor
▁Dist
▁deposit
▁NFC
▁notifications
▁hel
▁Alonso
▁printing
pit
▁digit
▁Nebraska
▁athe
▁sett
since
▁debating
▁AFAIK
▁Rap
▁glance
esters
One
▁fry
▁Thus
▁Offic
▁DOES
▁fictional
▁restart
▁illegally
▁vodka
▁decrease
▁relatable
▁keeper
▁incons
▁hog
▁versa
▁priorit
▁hij
▁placement
▁Luck
essel
▁counters
tor
▁density
pad
uting
▁Amy
▁soph
▁flashbacks
▁recruiting
▁Serious
owed
▁datab
▁Isa
▁Watson
ixt
▁Music
▁ABOUT
▁orbs
▁stunt
▁residents
▁Palest
▁Hon
mmmm
▁reception
bu
▁fanf
ridge
▁ALWAYS
▁Close
▁independence
▁mixing
▁CG
▁mines
▁prank
▁equipped
▁Ze
▁tagged
▁pork
▁elf
▁pumped
▁Parker
actic
▁thankful
peror
oping
▁gor
▁eliminate
▁sway
▁manages
hate
▁Bark
fed
▁underage
onder
▁illust
▁incest
▁emphas
▁Loki
▁waters
▁HIM
▁prosec
▁EN
oro
▁disqual
▁disability
▁converted
▁contributing
▁suicidal
log
▁elevator
▁junior
▁FM
▁restric

phones
▁Gor
duc
▁Drop
▁decently
▁Liber
cra
▁exch
▁photoshop
chool
▁felony
▁amaz
▁immigrant
▁diseases
▁Cous
▁venom
▁stain
▁worldwide
ieving
▁breasts
▁threshold
▁Blizz
▁Beautiful
▁pens
onda
har
▁dads
▁stro
▁chairs
▁Gw
▁repetitive
▁Legit
▁Quest
▁shithole
▁louder
▁recovered
▁electronics
▁layout
▁personnel
writing
▁Done
asive
SO
▁crypt
▁oun
God
▁mm
▁invaded
xit
▁disadvantage
▁pup
isive
▁Incred
▁taco
▁BEC
jer
▁nearest
▁hospitals
rolling
▁Connor
▁Parad
▁Hal
▁snacks
▁destro
▁rappers
▁horny
omo
gas
▁Belgium
attering
▁statistically
rics
▁flames
▁HIS
▁Nixon
▁Chrome
▁humid
▁Gods
▁Early
▁slack
▁ports
▁duration
▁Illinois
▁surpass
▁LR
character
ringe
▁sincerely
▁Public
▁vaccine
▁Herm
▁poisoning
▁Center
▁possess
▁prequel
alu
obby
▁defe
▁earnings
▁assumes
▁freaked
▁whiskey
▁Advent
▁Strong
▁TI
OY
ibilities
▁haircut
▁bisexual
▁priorities
▁Oz
errym
▁vide
▁wides
▁criticizing
allen
▁nerds
Some
▁edges
habib
▁Steel
bucks
reme
▁Fnatic
▁stubborn
ocus
geable
responsible
▁nod
▁inspir
▁separately
▁nat
▁Ohh
▁warr
▁

ijuana
▁Edd
ghan
cember
olen
intend
fortunate
▁gimm
yssey
ropriate
▁inaccur
▁calend
ausible
isexual
onnell
▁]
specially
▁USS
oices
riend
▁athlet
essage
▁incon
accept
mao
uall
vember
terms
scar
logy
▁sla
▁trem
whi
thusi
lc
alax
▁Charlott
▁tariff
vard
itage
▁intri
▁Yout
ufact
zard
uum
▁electron
byn
▁Aug
▁phen
ructure
ttes
▁Amaz
allas
▁autoc
isd
attoo
geous
▁fol
enix
▁nerv
ateur
odgers
reland
uine
▁Fors
▁Ferr
ophies
▁hous
▁immigr
elsea
nesday
▁consequ
accur
unte
indsight
irgin
fortable
cedes
▁coc
stable
▁inapp
apore
ueller
utions
▁intellig
acceptable
▁overse
▁Conserv
▁naz
▁enem
▁epis
ername
oln
GBT
render
etheless
▁paran
▁convin
ugees
akistan
▁incred
Bay
ilies
bis
ouston
yond
evit
▁rapp
aghetti
▁bacter
▁unde
owa
▁controll
ront
esda
ackson
▁ou
izzard
atell
▁Seren
odka
▁cir
uana
▁simult
▁Somet
priced
▁breat
▁Anth
▁thresh
roit
isode
▁perman
▁controvers
abama
▁obnox
gypt
iversary
uild
iosity
ormous
▁stam
▁Minnes
perial
onso
▁Afghan
atever
aughters
▁Veget
▁Rober
▁anx
rational
amond
Bron
▁downv

In [9]:
# Display the terms that survived the document-frequency filter.
small_keys

['▁her',
 '▁she',
 '▁game',
 '▁play',
 '▁team',
 '▁money',
 '▁K',
 '▁She',
 '7',
 '▁$',
 '▁show',
 '▁games',
 '^',
 '▁W',
 '▁against',
 'T',
 'M',
 'P',
 '▁world',
 '▁E',
 '▁9',
 '▁sub',
 'ar',
 '▁&',
 '▁name',
 '▁season',
 'A',
 '▁left',
 '..',
 '▁t',
 'C',
 '~',
 '▁V',
 '▁top',
 '▁m',
 '▁gonna',
 'us',
 '▁looks',
 '▁Thanks',
 '▁d',
 'ies',
 'L',
 '(',
 '▁guys',
 '▁b',
 '▁help',
 '▁problem',
 '▁comment',
 '▁live',
 '▁fun',
 '▁call',
 'G',
 '▁J',
 '▁8',
 '▁yeah',
 '▁week',
 '▁run',
 'w',
 '▁playing',
 'est',
 'am',
 '▁7',
 'et',
 '▁c',
 '▁talking',
 'v',
 '▁car',
 '▁pay',
 'os',
 '▁w',
 'R',
 'E',
 '▁job',
 '▁change',
 '▁g',
 '▁looking',
 '▁free',
 '▁case',
 '▁super',
 '▁h',
 'ie',
 '▁nice',
 'ia',
 '▁l',
 '▁days',
 '▁using',
 '▁hate',
 '▁full',
 '▁means',
 '▁O',
 '▁yet',
 '▁players',
 '▁buy',
 'z',
 '▁story',
 '▁care',
 '▁win',
 '▁side',
 'ant',
 '▁between',
 '▁head',
 '▁watch',
 '▁For',
 '▁school',
 'F',
 '▁stop',
 'ist',
 '▁women',
 '▁heard',
 'ent',
 'ion',
 '▁kids',
 '▁each',
 'O'

In [10]:
# L2-normalised tf-idf weights of the filtered count matrix; this is the
# matrix the LDA models below are fitted on.
# NOTE(review): LDA is usually fitted on raw term counts; fitting it on
# normalised tf-idf weights is unconventional - confirm this is intended.
tfidf_pseudodocuments_small = tfidf(count_matrix_small)

# Grid Search

In [11]:
# Candidate values for the two Dirichlet priors.
dt = [0.5, 0.7, 1]            # doc_topic_prior
tw = [0.01, 0.05, 0.1, 0.5]   # topic_word_prior

# Fit one 40-topic model per prior combination and print its top words so
# the topics can be compared by eye.
for doc_prior in dt:
    for word_prior in tw:
        banner = '\n\n FOR doc_topic_prior = %s topic_word_prior =  %s............  \n\n' % (doc_prior, word_prior)
        print(banner)

        lda = LatentDirichletAllocation(
            n_components=40,
            doc_topic_prior=doc_prior,
            topic_word_prior=word_prior,
            learning_method='online',
            random_state=42,
        )
        lda.fit(tfidf_pseudodocuments_small)

        print_top_words(lda, small_keys, 12)



 FOR doc_topic_prior = 0.5 topic_word_prior =  0.01............  


Topic #0: ▁water ▁food ▁eat ▁weight ▁eating ▁meat ▁fat ▁coffee ▁drink ▁cheese ▁chicken ▁milk


Topic #1: ▁song ▁album ▁smoke ▁songs ▁drugs ▁dick ▁Sim ▁weed ▁plant hh ▁awkward ▁smoking


Topic #2: AN IN Q ▁review AD IT AL ▁log ▁AT OW U ▁THE


Topic #3: ▁bot ▁Gen ▁crowd ▁banned ▁enjoyed ▁minimum ▁obvious ▁mis ▁kicked ▁stupid box els


Topic #4: ▁stream com Man ▁stage ▁boss ▁quit ik ▁fighting ▁shield ▁Le ▁epic ora


Topic #5: ▁PC ▁min ▁reasonable ▁chat ep ▁companies ▁mode game ▁account ▁require ▁relatively ▁ban


Topic #6: ▁map OT RE ▁character ▁IN ▁counter ▁art ▁shoot AT ▁army ▁Japan ▁PS


Topic #7: ▁engine ▁grind ▁comp ▁bug ▁kills ability ▁air omb ▁plane & ▁fly ▁war


Topic #8: ▁majority ▁Japanese ▁related ▁gen ▁Mon ▁talked ▁rem ▁fore ▁Time ony ▁truly ▁cycle


Topic #9: ▁driver ▁rent ▁truck ▁camera ▁laughing ▁pat ▁lazy ▁display ▁tour ▁office ▁itself ene


Topic #10: ▁game ▁play ▁games ▁players ▁playing ▁played ▁player

Topic #0: ▁game ▁games ▁play ^ ▁character ▁characters ▁players ~ ▁playing ▁damage ▁her ▁V


Topic #1: ▁Bojack ▁episode ▁Bitcoin uj ▁crypto ▁palette ▁BTC message contact ▁she ▁episodes ▁Doctor


Topic #2: ▁Bitcoin ▁Bojack ▁episode ^ ▁crypto ▁BTC uj ▁palette message contact ▁her ▁she


Topic #3: ▁Bojack ▁Bitcoin uj ▁episode ▁crypto ▁BTC ▁palette ▁she message contact ^ ▁episodes


Topic #4: ▁Bojack ▁episode uj ^ ▁Bitcoin ▁palette ▁she ▁crypto message contact ▁her ▁BTC


Topic #5: ▁Bitcoin ▁BTC ▁crypto ▁Bojack ▁episode uj ▁palette coin message chain ▁bitcoin contact


Topic #6: ▁Bojack ▁Bitcoin ▁episode ▁crypto uj ▁BTC ^ ▁palette message ▁her contact ▁she


Topic #7: ▁Bitcoin ▁Bojack ▁episode ▁crypto ▁BTC uj ▁palette message contact ▁she ▁episodes ▁Doctor


Topic #8: ▁Bitcoin ▁Bojack ▁episode ▁crypto ▁BTC uj ▁palette message ▁she contact ^ ▁her


Topic #9: ▁Bojack ▁episode uj ▁palette ▁Bitcoin ^ ▁crypto message ▁she contact ▁BTC ▁episodes


Topic #10: ▁Bojack ▁Bitcoin ▁episode ▁crypto uj ▁

Topic #0: ▁phone ▁Apple 7 ▁Google ▁phones ▁iPhone ▁engine ▁app ▁battery ▁camera ▁price ▁screen


Topic #1: ▁water ▁meat ▁eat ▁food ▁cheese ▁dry ▁salt ▁plant ▁chicken ▁sauce ▁taste ▁fish


Topic #2: ▁golf ▁Friday ▁Ferrari ▁Howard EC ▁mark aze ▁9 ▁Em 7 ▁slam ▁longest


Topic #3: ▁accounts ▁money ▁Mueller ▁calls ▁Elon ▁job ▁cases ▁management ▁diversity ▁interview ▁case ▁sites


Topic #4: ▁fight ▁Khabib ▁fighting ▁fights ▁fighter ai ▁belt ~ ▁SP Z ▁fighters ou


Topic #5: ▁$ ▁car ▁pay ▁money ▁city ▁cars ▁company ▁driving ▁road ▁drive ▁insurance ▁rent


Topic #6: ▁War ▁character ar ath ▁DM ▁boss ▁level ▁kill ▁attack ▁war ▁damage ▁spell


Topic #7: ia ▁German ian ▁war ▁English ▁race ▁French ▁Roman ▁flag ▁Spanish ▁British ▁Italy


Topic #8: ▁her ▁subs ▁she elle ▁Japanese ▁Chad _ girl ▁idol ie aid ▁Part


Topic #9: ▁Emperor ▁fandom ▁Ain ▁shiny ▁Venom ▁Finally ▁Love ▁colors ror ▁downvotes ▁beauty YY


Topic #10: ▁team ▁game ▁season ▁teams ▁players ▁league ▁player ▁play ▁fans ▁defense ▁football ▁

Topic #0: ▁skins ▁update ▁engine ▁screen ▁button ▁skin ▁auto ▁map ▁hardware ▁pc ▁PC ▁Windows


Topic #1: ▁her ▁she ▁calories ▁clothes ▁myself ▁water ▁eat ▁dick ▁diet ▁weight ▁cal ▁flavor


Topic #2: ▁KD ▁golf ▁expectations ▁coast ▁mark ^ ▁Colorado ▁Friday ▁whoever ▁retired ▁soft oops


Topic #3: ▁corrupt ▁Florida ▁ped ▁primary ▁MAGA ▁she ▁supreme ortion ▁bias ▁Georgia ▁politically onald


Topic #4: ▁fight ▁Khabib ▁fights ▁fighting ▁game ana ▁fighter ▁stage ▁VR ▁rounds ▁banner ▁stream


Topic #5: ▁Trump ▁vote ▁government ▁political ▁Kavanaugh ▁state ▁party ▁Republicans ▁voting ▁election ▁country ▁GOP


Topic #6: ▁character ▁characters ▁Smash ▁Dark ▁War ▁Lu ▁IN ▁game ▁Fire ora ▁spells ▁Knight


Topic #7: ▁women ▁justice ▁Court ▁candidate ▁war ists ▁Ted ▁Democrat ism ▁proven archy ▁Nazi


Topic #8: ▁her ▁she ▁Pokemon ▁episode elle ▁trans ▁episodes < ▁idol ▁Japanese ▁crush ▁creepy


Topic #9: ▁she ▁Sox ▁her ▁Peter aim ▁Emperor ▁fandom ▁Halloween ▁Love ▁phot ERS ▁IM


Topic #10: ▁games ▁Nin

# From the grid search we select doc-topic-prior = 0.7, topic-word-prior = 0.01

In [13]:
# Final model, refitted with the priors chosen from the grid search.
lda = LatentDirichletAllocation(
    n_components=40,
    doc_topic_prior=0.7,
    topic_word_prior=0.01,
    learning_method='online',
    random_state=42,
)

lda.fit(tfidf_pseudodocuments_small)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=0.7,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=40, n_jobs=1, n_topics=None, perp_tol=0.1,
             random_state=42, topic_word_prior=0.01,
             total_samples=1000000.0, verbose=0)

In [14]:
# Show the 15 highest-weighted terms of every topic of the fitted model.
print_top_words(lda, small_keys, 15)

Topic #0: ▁game ▁play ▁players ▁map ▁PC ▁damage ▁playing ▁player ▁raid ▁mode ▁tank ▁maps ▁patch ▁kills ▁stream


Topic #1: ▁gun ▁shoot ▁guns ▁bugs ▁Fallout ▁wall ▁shooter ▁pub ▁BO ▁filter ▁bullet ▁settings ▁barrel ▁ammo ▁survival


Topic #2: ▁golf ▁mark ▁beef ▁Ferrari ▁Turn ▁pod oops ▁Friday EC ▁slam ▁retired mg ▁uniform ▁9 ▁coast


Topic #3: ▁accounts ▁cases ▁calls ▁dismiss ▁punishment ▁London ▁stupid ▁aren ▁basis ▁case ▁MAGA ▁rapist ▁diversity ▁mis ▁Facebook


Topic #4: ▁fight ana ▁Khabib ▁fighting ▁fights ▁vs ▁stage ▁belt ▁Tony z ▁punch ▁tournament ▁fighter ▁knock ▁round


Topic #5: ▁$ ▁pay ▁city ▁money ▁car ▁police ▁company ▁insurance ▁rent ▁traffic ▁paid ▁service ▁area ▁legal ▁paying


Topic #6: ▁boss ▁attack ▁level ▁+ ▁War ▁weapon ▁damage ath ▁buff ▁units ar ▁quest ▁armor ▁DM ▁spell


Topic #7: ▁trump ▁race ▁votes ▁flag ▁Dems ists ▁white ▁idiots ▁Ted ▁Democrat ▁European ism ▁nonsense ist ▁victims


Topic #8: ▁Pokemon ▁her anda elle ▁Thanos ony aid ie ▁she ▁Cha ▁Finn ▁related ▁Tim

In [16]:
# Shape of the topic-term weight matrix: (number of topics, number of retained terms).
print(lda.components_.shape)

(40, 14599)


In [17]:
# Column indices in the filtered vocabulary of the terms inspected in the next cell.
# NOTE(review): the indices are hand-picked; the notebook does not say why these ten.
index_list = [10, 9134, 1134, 1001, 1080, 13651, 9574, 11627,1432, 7172]

In [18]:
# For every selected term print the term itself, its weight in each topic,
# and the index of the topic where that weight is largest.
for term_idx in index_list:
    print(voc_dict_clean[term_idx])
    topic_weights = lda.components_[:, term_idx]
    print(topic_weights)
    print(np.argmax(topic_weights))

▁show
[1.00008726e-02 1.00116561e-01 7.87742813e-01 8.20382005e-01
 7.06752967e+00 1.02247245e-02 1.11733095e-02 4.97690040e-02
 4.16097749e+01 3.18671291e+01 1.48395025e+00 5.97044694e+00
 5.76559891e-02 3.13911935e+01 1.22307716e+01 4.63963939e+00
 6.49966844e+01 1.11289512e+00 4.60632857e+01 1.26477152e-02
 5.96593411e-01 3.52885362e+02 2.52244894e+00 1.16103567e-02
 7.96083006e-01 3.55557966e+00 6.91506632e+00 2.47282282e-02
 7.47250094e-01 9.30957428e+01 1.17536359e+01 1.24765040e+00
 1.36307140e-02 1.00144423e-02 2.00235603e-02 1.00084128e-02
 1.00106050e-02 2.79898762e+01 1.00010484e-02 4.31925903e-02]
21
omi
[1.01321718e-02 1.00000000e-02 1.00000000e-02 1.00000000e-02
 1.00000000e-02 1.00000000e-02 1.00000000e-02 1.00156308e-02
 1.00000000e-02 1.00000000e-02 1.00000000e-02 1.00000000e-02
 1.00000000e-02 1.00000000e-02 5.98404420e+01 1.00000000e-02
 1.00000000e-02 8.29419954e-01 1.00000000e-02 1.00000000e-02
 1.00000000e-02 1.00000000e-02 1.00000000e-02 1.00000000e-02
 1.0000000

In [19]:
# One row per retained term, one column per topic.
tr_lda = np.transpose(lda.components_)

with open('vocab.bpe.from','r') as f:
    keys = f.read().splitlines()

# One row per line of the vocabulary file, in file order. The sizes are
# derived from the file and the fitted model rather than hard-coded, so the
# cell keeps working if the vocabulary or the number of topics changes.
LDA_matrix = np.zeros([len(keys), tr_lda.shape[1]])

for i, key in enumerate(keys):
    # Entries that are not in the filtered vocabulary keep an all-zero row.
    clean_idx = voc_dict_inv_clean.get(key)
    if clean_idx is not None:
        LDA_matrix[i, :] += tr_lda[clean_idx, :]

In [26]:
# Persist the per-vocabulary-entry topic vectors for later use.
with open('LDA_vectors.pickle', 'wb') as pickle_file:
    pickle.dump(LDA_matrix, pickle_file)