Generación de un corpus limitado

In [1]:
def corpus(input_file, output_file, umbral = 5):
    """Write a reduced copy of a tab-separated synopsis/genre file.

    The input is read as UTF-8 and its first line is treated as a header and
    skipped. Each remaining line is expected to be ``synopsis<TAB>genre``,
    where the synopsis is a sequence of space-separated words. Words whose
    total number of occurrences over all synopses is below ``umbral`` are
    removed, and the filtered rows are written to ``output_file`` under the
    header ``sinopsis<TAB>genero``.

    Args:
        input_file: Path of the source file.
        output_file: Path of the file to create (overwritten if it exists).
        umbral: Minimum number of occurrences a word needs to be kept.

    Notes:
        Lines without a tab (for example a trailing blank line) are skipped.
        Rows whose synopsis has no words left after filtering are dropped,
        because they carry no information for the models.
    """
    from collections import Counter

    with open(input_file, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # Split on the first tab only, so a stray tab inside the genre column
    # cannot break the row.
    rows = []
    for line in lines[1:]:
        synopsis, separator, genre = line.rstrip("\r\n").partition("\t")
        if not separator:
            continue
        rows.append((synopsis.split(" "), genre))

    # Frequency of every word across all synopses
    corpus_frequency = Counter(word for words, _ in rows for word in words)

    # Drop the words that appear fewer times than the threshold
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write("sinopsis\tgenero\n")
        for words, genre in rows:
            kept = [word for word in words if word and corpus_frequency[word] >= umbral]
            if kept:
                file.write(f"{' '.join(kept)}\t{genre}\n")

In [2]:
# Build the reduced corpus file from the stemmed data set
input_file = '../data/stemming_data.txt'
output_file = '../data/corpus_data.txt'
outpu_file = output_file  # misspelled original name, kept in case another cell still reads it
corpus(input_file, output_file)


En esta sección definimos las funciones para entrenar los modelos (CBOW, SG y BAYES) y para predecir con ellos

In [11]:
# Load the Spanish stopword list and define the module-level state shared by
# the helper functions of this notebook.

import nltk
# Fetch the NLTK resources: 'stopwords' backs stop_words below; 'punkt' is
# presumably required by word_tokenize, used in normalize() - TODO confirm
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords

# Tokens whose lowercase form is in this set are rejected by word_accepted()
stop_words = set(stopwords.words('spanish'))
# Tokens equal to one of these strings are rejected by word_accepted()
special_chars = [ "''", "...", "``", "<<", ">>", '""', "”", "“"]
# Maps each category name to its integer alias; filled on demand by categorize()
category_alias = {}

def word_accepted(word):
    """Tell whether a token should be kept.

    A token is rejected when it is one of ``special_chars`` or when its
    lowercase form is in ``stop_words``; every other token is accepted.
    """
    if word in special_chars:
        return False
    return word.lower() not in stop_words

def vectorization(model_type, model = None, corpus = None):
    """Return the vector representation used by the given model type.

    Args:
        model_type: 'CBOW' or 'SG' for word embeddings, 'BAYES' for a
            bag-of-words vectorizer.
        model: Object exposing ``wv.index_to_key`` and ``wv[word]``; only
            read for 'CBOW'/'SG'.
        corpus: Iterable of texts, each an iterable of words; only read for
            'CBOW'/'SG'.

    Returns:
        For 'CBOW'/'SG': a list with one entry per text, each entry being the
        list of ``model.wv[word]`` values for the words of that text that are
        in the model vocabulary (other words are skipped, so an entry can be
        empty).
        For 'BAYES': a new, unfitted ``CountVectorizer``.

    Raises:
        ValueError: If ``model_type`` is not one of the supported values.
    """
    if model_type in ['CBOW','SG']:
        # Build a set once so each membership test does not scan the whole
        # vocabulary sequence.
        keywords = set(model.wv.index_to_key)
        return [[model.wv[word] for word in text if word in keywords] for text in corpus]

    if model_type == 'BAYES':
        from sklearn.feature_extraction.text import CountVectorizer
        return CountVectorizer()

    raise ValueError(f"unknown model type {model_type!r}")


def categorize(category, inverse = 0):
    """Translate between category names and their integer aliases.

    With a falsy ``inverse``, ``category`` is a name: its alias is returned,
    registering the name in ``category_alias`` first if it is new (0 for the
    first name, otherwise one more than the largest alias so far).

    With a truthy ``inverse``, ``category`` is an alias: the first name
    registered with that alias is returned, or the string "None" when no
    name has it.
    """
    if inverse:
        # Reverse lookup: alias -> name
        matches = (name for name, alias in category_alias.items() if alias == category)
        return next(matches, "None")

    if category not in category_alias:
        next_alias = max(category_alias.values()) + 1 if category_alias else 0
        category_alias[category] = next_alias
    return category_alias[category]

def normalize(text):
    """Tokenize, filter, lowercase and stem every string in ``text``.

    Args:
        text: Iterable of strings.

    Returns:
        A list with one entry per input string. Each entry is the list of
        Spanish Snowball stems of the tokens that pass ``word_accepted`` and
        are not made only of digits (optionally surrounded by whitespace).
    """
    import re
    from nltk.tokenize import word_tokenize
    from nltk.stem import SnowballStemmer

    stemmer = SnowballStemmer('spanish')
    # Raw string: "\s" and "\d" are invalid escape sequences in a plain literal
    only_digits = re.compile(r"^\s*\d+\s*$")

    tokenize = []
    for s in text:
        tmp = [
            token.lower()
            for token in word_tokenize(s)
            if word_accepted(token) and not only_digits.search(token)
        ]
        # The kept tokens are joined and tokenized a second time before stemming
        tokenize.append([stemmer.stem(token) for token in word_tokenize(' '.join(tmp))])
    return tokenize

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/moffinguer/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/moffinguer/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [13]:
## Load a model to train
def training(input_file, model_type='CBOW'):
    """Train a genre classifier from a tab-separated synopsis/genre file.

    Args:
        input_file: Path of a UTF-8 file with a header line followed by
            ``synopsis<TAB>genre`` rows. The 'BAYES' branch reads it with
            pandas and uses the columns ``sinopsis`` and ``genero``.
        model_type: 'CBOW' or 'SG' (Word2Vec embeddings plus an SVC), or
            'BAYES' (bag of words plus multinomial naive Bayes).

    Returns:
        For 'CBOW'/'SG': ``(model, weight, clf)`` - the Word2Vec model, the
        list of per-example mean word vectors, and the fitted
        StandardScaler + SVC pipeline.
        For 'BAYES': ``(model, vector)`` - the fitted MultinomialNB and the
        fitted CountVectorizer.
        For any other value an error is printed and None is returned.
    """
    if model_type in ['CBOW','SG']:
        import numpy as np
        from gensim.models import Word2Vec
        from sklearn.pipeline import make_pipeline
        from sklearn.preprocessing import StandardScaler
        from sklearn.svm import SVC

        # Word list and genre of every example film
        corpus = []
        topic = []
        with open(input_file, 'r', encoding='utf-8') as file:
            lines = file.readlines()
        for line in lines[1:]:
            tmp = line.split("\t")
            if len(tmp) < 2:
                # Blank or malformed row: there is no genre column to read
                continue
            corpus.append(tmp[0].split(" "))
            topic.append(tmp[1].strip())

        model = Word2Vec(sentences=corpus, sg = ( 0 if model_type == 'CBOW' else 1 ) )
        vector = vectorization(model_type, model, corpus)

        # Average the word vectors of each example into one fixed-size vector.
        # Examples with no word in the model vocabulary are left out: their
        # mean would be NaN and could not be stacked with the other rows.
        weight = []
        labels = []
        for weights, theme in zip(vector, topic):
            if not weights:
                continue
            weight.append(np.mean(np.array(weights), axis=0))
            labels.append(categorize(theme))

        clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
        clf.fit(weight, labels)
        return (model, weight, clf)

    elif model_type == "BAYES":
        import pandas as pd
        from sklearn.model_selection import train_test_split
        from sklearn.naive_bayes import MultinomialNB

        df = pd.read_csv(input_file, delimiter='\t')

        # Turn every genre into its numeric alias
        df['genero'] = df['genero'].apply(categorize)

        # Only the training part of the split is used here; the held-out 25%
        # is discarded.
        films_train, _, response_train, _ = train_test_split( df.sinopsis, df.genero, test_size = .25)

        vector = vectorization(model_type)
        films_train_count = vector.fit_transform(films_train)

        model = MultinomialNB()
        model.fit(films_train_count, response_train)
        return (model,vector)

    else:
        print(f"ERROR unknown model {model_type}")

def predict(synopsis, model, vector, clf ,type_model = 'CBOW'):
    """Predict and print the genre of one or more synopses.

    Args:
        synopsis: A single synopsis string or an iterable of them.
        model: For 'CBOW'/'SG' the word-embedding model handed to
            ``vectorization``; for 'BAYES' the fitted classifier.
        vector: For 'BAYES' the fitted vectorizer; not read for 'CBOW'/'SG'.
        clf: For 'CBOW'/'SG' the fitted classifier; not read for 'BAYES'.
        type_model: 'CBOW', 'SG' or 'BAYES'.

    Prints one "Film has a category of ..." line per synopsis. For an unknown
    ``type_model`` only an error line is printed.
    """
    if isinstance(synopsis, str):
        synopsis = [synopsis]

    if type_model in ['CBOW','SG']:
        import numpy as np

        temp_vector = vectorization(type_model, model, normalize(synopsis))

        # Only synopses with at least one known word can be averaged; the
        # others get a None prediction, which categorize(..., 1) reports as
        # the category "None".
        weight = [np.mean(np.array(weights), axis=0) for weights in temp_vector if weights]
        predicted = iter(clf.predict(weight)) if weight else iter(())
        predictions = [next(predicted) if weights else None for weights in temp_vector]

    elif type_model == "BAYES":
        # Rebuild each synopsis as a string of normalized tokens, the same
        # preprocessing the CBOW/SG branch applies before prediction.
        synopsis = [' '.join(tokens) for tokens in normalize(synopsis)]
        predictions = model.predict(vector.transform(synopsis))

    else:
        print(f"ERROR unknown model {type_model}")
        return

    for category in predictions:
        print(f"Film has a category of {categorize(category, 1)}")

Testeo con predicciones

In [14]:
# Train the naive Bayes model on the reduced corpus and classify one sample synopsis
input_file = '../data/corpus_data.txt'

## training() returns (model, vector) for "BAYES"; the other model types return an extra element: (model, vector, clf)
(model,vector) = training(input_file, "BAYES")
# The "BAYES" branch of predict() does not read clf, so None is passed
clf = None

predict('Las Aes Sadai, una poderosa fortaleza de mujeres, parecen dominar la magia por su capacidad de contactar con el Poder Único que se obtiene de la Fuente Verdadera, que hace girar la vital Rueda del Tiempo. La Época de la locura ha llegado por la contaminación de una parte de la fuente dejando un mundo arruinado y desorganizado en su forma de vida', model, vector, clf, 'BAYES')

Film has a category of Drama


In [12]:
# Train the CBOW model on the reduced corpus and classify one sample synopsis
input_file = '../data/corpus_data.txt'

## For "CBOW"/"SG", training() returns three values; here they are bound as (model, vector, clf)
# The "CBOW" branch of predict() reads model and clf; vector is passed along but not read there
(model,vector,clf) = training(input_file, "CBOW")

predict('Las Aes Sadai, una poderosa fortaleza de mujeres, parecen dominar la magia por su capacidad de contactar con el Poder Único que se obtiene de la Fuente Verdadera, que hace girar la vital Rueda del Tiempo. La Época de la locura ha llegado por la contaminación de una parte de la fuente dejando un mundo arruinado y desorganizado en su forma de vida', model, vector, clf, 'CBOW')

Film has a category of Drama
