# LDA-RDF Topic Modelling

In [310]:
from rdflib import Graph
from rdflib.namespace import RDF, FOAF

import rdflib
import urllib

### Cargando datos 

In [311]:
#Se crea un grafo y luego cargamos los valores de prueba
g = Graph()
g.parse("sampleBig.ttl", format="turtle")

<Graph identifier=N7d86a68da07b43ceb41d0f56df644646 (<class 'rdflib.graph.Graph'>)>

### Creando documentos

In [312]:
def de_uri(uri):
    """
        Función que toma una URI y la convierte en un string.
    """
    return uri.split("/")[-1].replace("_"," ").split("#")[-1]

In [313]:
documents = []
#Se buscan todas las personas
for s in g.subjects(RDF.type, FOAF.Person):
    subject = de_uri(s)
    
    #Se comienza el documento con el nombre del recurso
    document = subject
    
    #Se buscan los triples asociados a ese recurso y se van agregando al documento
    for p,o in g.predicate_objects(s):
        
        prop = de_uri(p)
        if not(prop == u'birthDate' or prop == u'deathDate'):
            obj = de_uri(o)
            document += " " + prop + " " + obj + " "
    
    #Se agregan a una lista de documentos
    documents.append(document)
    
    
    #Guardamos un documento de prueba
    if subject == "August Horch":
        test_document = document

In [314]:
#Un ejemplo de como se ve el documento
august = rdflib.term.URIRef(u'http://dbpedia.org/resource/August_Horch')
for s,p,o in g.triples((august, None, None)):
    subject = de_uri(s)
    prop = de_uri(p)
    obj = de_uri(o)
    print (subject,prop,obj)
print test_document

(u'August Horch', u'description', u'German engineer and automobile pioneer')
(u'August Horch', u'deathDate', u'1951-02-03')
(u'August Horch', u'givenName', u'August')
(u'August Horch', u'birthDate', u'1868-10-12')
(u'August Horch', u'birthPlace', u'Winningen')
(u'August Horch', u'surname', u'Horch')
(u'August Horch', u'deathPlace', u'Bavaria')
(u'August Horch', u'name', u'August Horch')
(u'August Horch', u'birthPlace', u'Rhine Province')
(u'August Horch', u'deathPlace', u'M\xfcnchberg')
(u'August Horch', u'type', u'Person')
August Horch description German engineer and automobile pioneer  givenName August  birthPlace Winningen  surname Horch  deathPlace Bavaria  name August Horch  birthPlace Rhine Province  deathPlace Münchberg  type Person 


## Latent Dirichlet Allocation (LDA)

In [315]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

### Extracción de caracteristicas
Primero se necesita transformar los documentos a una lista de frecuencias de palabras (bag of words)

In [316]:
#Se crea un CountVectorizer que analiza los documentos, y busca las palabras más representativas
vectorizer = CountVectorizer(max_df=0.5, min_df=0.01,
                                max_features=1000)

In [317]:
#Se transforman los documentos en una matriz de caracteristicas
X = vectorizer.fit_transform(documents)
X.shape

(2366, 110)

In [318]:
feature_names = vectorizer.get_feature_names()
feature_names

[u'actor',
 u'actress',
 u'american',
 u'ancient',
 u'and',
 u'angeles',
 u'architect',
 u'artist',
 u'astronomer',
 u'australian',
 u'austria',
 u'author',
 u'british',
 u'brooklyn',
 u'california',
 u'charles',
 u'chemist',
 u'chicago',
 u'christian',
 u'city',
 u'composer',
 u'computer',
 u'county',
 u'david',
 u'de',
 u'director',
 u'emperor',
 u'empire',
 u'engineer',
 u'england',
 u'english',
 u'fiction',
 u'film',
 u'founder',
 u'france',
 u'french',
 u'general',
 u'george',
 u'german',
 u'germany',
 u'greek',
 u'henry',
 u'historian',
 u'holy',
 u'hungary',
 u'ii',
 u'illinois',
 u'in',
 u'italian',
 u'italy',
 u'james',
 u'japanese',
 u'john',
 u'king',
 u'kingdom',
 u'london',
 u'los',
 u'massachusetts',
 u'mathematician',
 u'musician',
 u'new',
 u'novelist',
 u'of',
 u'painter',
 u'papal',
 u'paris',
 u'paul',
 u'pennsylvania',
 u'philosopher',
 u'physicist',
 u'player',
 u'playwright',
 u'poet',
 u'poland',
 u'politician',
 u'pope',
 u'president',
 u'producer',
 u'province'

### LDA
Luego de transformar los documentos a vectores de ocurrencia, se entrena el modelo LDA

In [319]:
lda = LatentDirichletAllocation()

In [320]:
lda.fit(X)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_jobs=1, n_topics=10, perp_tol=0.1, random_state=None,
             topic_word_prior=None, total_samples=1000000.0, verbose=0)

Luego de entrenar el modelo, vemos las n primeras palabras de cada tópico, con esto podemos asignarle un nombre a cada tópico.

In [321]:
n_top_words = 20
print_top_words(lda, feature_names, n_top_words)

Topic #0:
john england and london philosopher mathematician kingdom artist british english ancient greek of astronomer poet playwright hungary german italy writer
Topic #1:
new york american city and author scientist san brooklyn computer actor massachusetts california playwright painter state states united politician mathematician
Topic #2:
writer thomas david english south american fiction and science county australian author london composer producer england singer of california novelist
Topic #3:
pope roman empire rome italy saint charles papal henry states holy state province player kingdom and republic christian of composer
Topic #4:
california american director musician film russian massachusetts and actor illinois producer los historian angeles west composer chicago screenwriter scotland novelist
Topic #5:
william singer american actor ii and physicist actress poland engineer chemist texas architect songwriter british in vienna english mathematician the
Topic #6:
united states d

Ahora podemos entregarle nuevos documentos ya transformados en vectores de frecuencia al modelo LDA y ver a qué temas corresponde

In [322]:
lda.transform(X)

array([[ 0.1       ,  2.10000034,  2.09999966, ...,  0.1       ,
         0.1       ,  0.1       ],
       [ 0.1       ,  0.1       ,  0.1       , ...,  0.10003642,
         0.1       ,  0.1       ],
       [ 1.70799038,  0.1000073 ,  1.29977914, ...,  0.10000979,
         0.10001902,  0.10000069],
       ..., 
       [ 1.09998241,  1.10059784,  0.1       , ...,  0.1       ,
         0.1       ,  0.10001607],
       [ 0.1       ,  1.51290653,  0.10000476, ...,  0.1       ,
         0.10000299,  0.1       ],
       [ 3.78901791,  3.40497885,  0.10000716, ...,  0.10000925,
         0.10000709,  1.10589967]])

# Parece que el dataset no es muy bueno porque los temas que encuentra no son muy representativos