# LDA-RDF Topic Modelling

In [278]:
from rdflib import Graph
from rdflib.namespace import RDF, FOAF

import rdflib
import urllib

### Cargando datos 

In [279]:
#Se crea un grafo y luego cargamos los valores de prueba
g = Graph()
g.parse("sampleBig.ttl", format="turtle")

<Graph identifier=N6f7046917b784a4189d55c22cf1ffbf1 (<class 'rdflib.graph.Graph'>)>

### Creando documentos

In [280]:
def de_uri(uri):
    """
        Función que toma una URI y la convierte en un string.
    """
    return uri.split("/")[-1].replace("_"," ").split("#")[-1]

In [281]:
documents = []
#Se buscan todas las personas
for s in g.subjects(RDF.type, FOAF.Person):
    subject = de_uri(s)
    
    #Se comienza el documento con el nombre del recurso
    document = subject
    
    #Se buscan los triples asociados a ese recurso y se van agregando al documento
    for p,o in g.predicate_objects(s):
        
        prop = de_uri(p)
        if not(prop == u'birthDate' or prop == u'deathDate'):
            obj = de_uri(o)
            document += " " + prop + " " + obj + " "
    
    #Se agregan a una lista de documentos
    documents.append(document)
    
    
    #Guardamos un documento de prueba
    if subject == "August Horch":
        test_document = document

In [282]:
#Un ejemplo de como se ve el documento
august = rdflib.term.URIRef(u'http://dbpedia.org/resource/August_Horch')
for s,p,o in g.triples((august, None, None)):
    subject = de_uri(s)
    prop = de_uri(p)
    obj = de_uri(o)
    print (subject,prop,obj)
print test_document

(u'August Horch', u'surname', u'Horch')
(u'August Horch', u'deathPlace', u'Bavaria')
(u'August Horch', u'description', u'German engineer and automobile pioneer')
(u'August Horch', u'deathDate', u'1951-02-03')
(u'August Horch', u'type', u'Person')
(u'August Horch', u'name', u'August Horch')
(u'August Horch', u'givenName', u'August')
(u'August Horch', u'birthDate', u'1868-10-12')
(u'August Horch', u'birthPlace', u'Rhine Province')
(u'August Horch', u'deathPlace', u'M\xfcnchberg')
(u'August Horch', u'birthPlace', u'Winningen')
August Horch surname Horch  deathPlace Bavaria  description German engineer and automobile pioneer  type Person  name August Horch  givenName August  birthPlace Rhine Province  deathPlace Münchberg  birthPlace Winningen 


## Latent Dirichlet Allocation (LDA)

In [283]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

### Extracción de caracteristicas
Primero se necesita transformar los documentos a una lista de frecuencias de palabras (bag of words)

In [297]:
#Se crea un CountVectorizer que analiza los documentos, y busca las palabras más representativas
vectorizer = CountVectorizer(max_df=0.5, min_df=0.001,
                                max_features=1000)

In [299]:
#Se transforman los documentos en una matriz de caracteristicas
X = vectorizer.fit_transform(documents)
X.shape

(2366, 1000)

In [300]:
feature_names = vectorizer.get_feature_names()
feature_names

[u'01',
 u'02',
 u'03',
 u'04',
 u'07',
 u'08',
 u'10',
 u'11',
 u'13',
 u'1385',
 u'14',
 u'1st',
 u'4th',
 u'abbey',
 u'abbot',
 u'abbott',
 u'abd',
 u'abdera',
 u'abdul',
 u'abel',
 u'abercromby',
 u'aberdeen',
 u'abu',
 u'academic',
 u'academy',
 u'activist',
 u'actor',
 u'actress',
 u'adam',
 u'adams',
 u'adelaide',
 u'admiral',
 u'adrian',
 u'advisor',
 u'advocate',
 u'afghan',
 u'afghanistan',
 u'africa',
 u'african',
 u'agricola',
 u'ahmed',
 u'akron',
 u'al',
 u'alabama',
 u'alan',
 u'albania',
 u'albert',
 u'alexander',
 u'alexandria',
 u'algeria',
 u'allen',
 u'alto',
 u'am',
 u'ambrose',
 u'american',
 u'amsterdam',
 u'an',
 u'anagni',
 u'anastasius',
 u'ancient',
 u'and',
 u'anderson',
 u'andreas',
 u'andrew',
 u'andrews',
 u'andr\xe9',
 u'angeles',
 u'anhalt',
 u'ansbach',
 u'anthony',
 u'anthropologist',
 u'antioch',
 u'anton',
 u'antonio',
 u'apostle',
 u'apostolic',
 u'arabia',
 u'aragon',
 u'archbishop',
 u'architect',
 u'arizona',
 u'armstrong',
 u'army',
 u'arnold',

### LDA
Luego de transformar los documentos a vectores de ocurrencia, se entrena el modelo LDA

In [301]:
lda = LatentDirichletAllocation()

In [302]:
lda.fit(X)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_jobs=1, n_topics=10, perp_tol=0.1, random_state=None,
             topic_word_prior=None, total_samples=1000000.0, verbose=0)

Luego de entrenar el modelo, vemos las n primeras palabras de cada tópico, con esto podemos asignarle un nombre a cada tópico.

In [304]:
n_top_words = 20
print_top_words(lda, feature_names, n_top_words)

Topic #0:
thomas of saint and martin president bill van bishop francisco empire ottoman minister egypt christian kenneth ludwig politician prime sultan
Topic #1:
of the england and kingdom king henry republic hungary austria edward poland vienna london francis karl in founder austrian duke
Topic #2:
john william robert george and composer of american brandenburg arthur smith scotland english writer frederick novelist st scottish chemist elector
Topic #3:
pope roman rome of empire papal italy france ii emperor holy states paris iii state saxony mary gregory jacques clement
Topic #4:
russian alexander soviet north union empire canada jan romania venice sweden moscow anderson canadian professor walter wayne jonathan mexico designer
Topic #5:
american california director actor musician film and south singer lee producer los angeles british writer illinois roger songwriter gary wisconsin
Topic #6:
italian de and american poet louis french pennsylvania writer italy author stephen maryland je

Ahora podemos entregarle nuevos documentos ya transformados en vectores de frecuencia al modelo LDA y ver a qué temas corresponde

In [306]:
lda.transform(X)

array([[ 0.1       ,  7.0998976 ,  0.10002125, ...,  0.1000113 ,
         0.1       ,  0.10001586],
       [ 0.1       ,  0.1       ,  4.76031331, ...,  0.10001538,
         0.10000007,  0.1       ],
       [ 0.10000844,  0.10000503,  0.10000572, ...,  0.10000171,
         0.10001832,  0.1000096 ],
       ..., 
       [ 0.1000092 ,  0.10000937,  1.33823092, ...,  0.10000533,
         2.4298062 ,  5.45968314],
       [ 0.1       ,  0.10000421,  0.10001437, ...,  2.88743182,
         0.10000977,  2.10000481],
       [ 3.10000202,  0.1       ,  0.1       , ...,  2.18370233,
         0.1       ,  0.10000095]])