# Generación de un corpus limitado a un umbral determinado

In [14]:
def corpus(input_file, output_file, umbral = 5):
    with open(input_file, 'r', encoding='utf-8') as file:
            lines = file.readlines()
            
    ## Con gensim no es necesario usar la función build_vocab, el constructor lo construye en base al array de palabras
    corpus_frequency = {}
    for line in lines[1:]:
        for word in line.split("\t")[0].split(" "):
            if word in corpus_frequency:
                corpus_frequency[word] += 1
                continue
            corpus_frequency[word] = 1

    # Eliminar palabras que aparezcan menos veces de la esperada
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write("sinopsis\tgenero\n")
        for line in lines[1:]:
            synopsis, genre = line.split("\t")
            file.write(f"{synopsis}\t{genre}")

In [15]:
input_file = '../data/stemming_data.txt'
outpu_file = '../data/corpus_data.txt'
corpus(input_file, outpu_file)

Preparamos una sección para tokenizar las palabras de los ejemplos de prueba que se vayan probando, y así aumentar la precisión

In [16]:
# Obtener la lista de stopwords en español

import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
import string


stop_words = set(stopwords.words('spanish'))
special_chars = [ "''", "...", "``", "<<", ">>", '""', "”", "“"]
category_alias = {}

def word_accepted(word):
    return word.lower() not in stop_words and word[0] not in string.punctuation and word[-1] not in string.punctuation and word not in special_chars

def vectorization(model_type, model = None, corpus = None):
    
    ## Vectorizamos las palabras
    if model_type in ['CBOW','SG']:
        keywords = model.wv.index_to_key
        vector = []
        tmp = []
        print
        for text in corpus:
            for word in text:
                if word in keywords:
                    tmp += [model.wv[word]]
            vector += [tmp]
            tmp = []
                
        return vector
    elif model_type == 'BAYES':
        from sklearn.feature_extraction.text import CountVectorizer
    return CountVectorizer()


def categorize(category, inverse = 0):
    if not inverse:
        if category in category_alias:
            return category_alias[category]
        if category_alias:
            category_alias[category] = max(category_alias.values()) + 1
        else:
            category_alias[category] = 0
        return category_alias[category]
    else:
        for translation, alias in category_alias.items():
            if alias == category:
                return translation
        return "None"

def normalize(text):
    import re
    import nltk
    from nltk.tokenize import word_tokenize
    from nltk.stem import SnowballStemmer
    
    stemmer = SnowballStemmer('spanish')
    
    tokenize = []
    for s in text:
        tmp = [token.lower() for token in word_tokenize(s) if word_accepted(token) and not re.search("^\s*\d+\s*$", token) ]
        tokenize.append([stemmer.stem(token) for token in nltk.word_tokenize(' '.join(tmp))])
    return tokenize

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/moffinguer/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/moffinguer/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Función que hace el entrenamiento de las palabras del corpus
### En base a los 3 tipos de modelos con los que vamos a trabajar creamos una función genérica que los abarque

Aquellos modelos de la librería de Gensim, Bolsa de palabras y Skip Gram, son entrenados con el corpus limitado.
Al necesitar vectores de palabras, que indique características de las palabras en base a valores numéricos, ya que no puede analizar cadenas como tal, requerimos de un proceso de vectorización, de esta forma y calculando la media de cada una de las propiedades de cada palabra, obtendremos un vector unidimensional por cada una las sinopsis de las peliculas.

En el modelo de Bayes, repartimos el corpus en 2 secciones de manera que usando las facilidades de la librería de SKLearn, vectorizamos las palabras del corpus de entremiento.

In [39]:
## Load a model to train
def training(input_file, model_type='CBOW'):  
    if model_type in ['CBOW','SG']:
        
        # Obtener listado de palabras por cada pelicula de los ejemplos
        corpus = []
        topic = []
        with open(input_file, 'r', encoding='utf-8') as file:
            lines = file.readlines()
        for line in lines[1:]:
            tmp = line.split("\t")
            corpus += [tmp[0].split(" ")]
            topic += [tmp[1].strip()]
        
        from gensim.models import Word2Vec
        model = Word2Vec(sentences=corpus, sg = ( 0 if model_type == 'CBOW' else 1 ), epochs=200, seed=673721 )
        vector = vectorization(model_type, model, corpus)
        
        # Calculamos una media para normalizar y quedarnos con vectores de 5 elementos por cada ejemplo
        import numpy as np
        weight = [np.mean(np.array(weights), axis=0) for weights in vector]
        
        from sklearn.svm import SVC
        from sklearn.preprocessing import StandardScaler
        from sklearn.pipeline import make_pipeline
        clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
        clf.fit(weight, [categorize(theme) for theme in topic])
        return (model, weight, clf)    
    
    elif model_type == "BAYES":
        import pandas as pd 
        df = pd.read_csv(input_file, delimiter='\t')
       
        ## Convertimos cada genero a un valor numérico
        df['genero'] = df['genero'].apply( lambda c : categorize(c) )

        ## Entrenamiento, creando vectores por las palabras
        from sklearn.model_selection import train_test_split
        films_train, films_test, response_train, response_test = train_test_split( df.sinopsis, df.genero, test_size = .25)
         
        vector = vectorization(model_type)
        films_train_count = vector.fit_transform(films_train)
        
        from sklearn.naive_bayes import MultinomialNB
        model = MultinomialNB()
        model.fit(films_train_count, response_train)
        return (model,vector)
    
    else:
        print(f"ERROR unknown model {model}")

# Función que predice el tipo de película
### En base a los 3 tipos de modelos con los que vamos a trabajar creamos una función genérica que los abarque

La idea es similar en los 3 modelos, al tomar las sinopsis y tokenizarlas, buscamos encontrar la mayor similitud sobre cada vector y una categoría

In [None]:
def predict(synopsis, model, vector, clf ,type_model = 'CBOW', expected_output = []):
    if type(synopsis) is str:
        synopsis = [synopsis]
   
    if type_model in ['CBOW','SG']:
        
        synopsis = normalize(synopsis)
        temp_vector = vectorization(type_model, model, synopsis)
       
        import numpy as np
        weight = [np.mean(np.array(weights), axis=0) for weights in temp_vector]

        predictions = clf.predict(weight)
        
    elif type_model == "BAYES":
        tmp = []
        for i in synopsis:
            tmp += [' '.join(tmp)]
        synopsis = tmp
        
        synopsis = vector.transform(synopsis)
        predictions = model.predict(synopsis)
        
    else:
        print(f"ERROR unknown model {type_model}")
        
    accuracy = 0
    for i in range(len(expected_output)):
        predict = categorize(predictions[i], 1)
        accuracy += ( predict in expected_output[i])
        print(f"Film has a category of {predict}. Expected a category of {expected_output[i]}")
        
    print(f"The model {type_model} has an accuracy of {100 * accuracy / len(expected_output)}\n")

## Testing

In [40]:
with open('../data/data.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()

examples = lines[1:int(len(lines) / 2):15]
expected_output = [ theme.split("\t")[1] for theme in examples]
examples = [ synopsis.split("\t")[0] for synopsis in examples]


print("Usamos Bayes Multinomial")
input_file = '../data/corpus_data.txt'
(model,vector) = training(input_file, "BAYES")
clf = None

predict(examples, model, vector, clf, 'BAYES', expected_output)
print("Usamos bolsa de palabras ahora:")


## Bolsa de palabras
(model, vector, clf) = training(input_file, 'CBOW')
predict(examples, model, vector, clf, 'CBOW', expected_output)


print("Usamos SG:")
## SG
(model, vector, clf) = training(input_file, 'SG')
predict(examples, model, vector, clf, 'SG', expected_output)



Usamos Bayes Multinomial
Film has a category of Drama. Expected a category of Comedia

Film has a category of Drama. Expected a category of Comedia

Film has a category of Drama. Expected a category of Suspense

Film has a category of Drama. Expected a category of Drama

Film has a category of Drama. Expected a category of Documental

Film has a category of Drama. Expected a category of Aventura

Film has a category of Drama. Expected a category of Crimen

Film has a category of Drama. Expected a category of Drama

Film has a category of Drama. Expected a category of Drama

Film has a category of Drama. Expected a category of Crimen

Film has a category of Drama. Expected a category of Acción

Film has a category of Drama. Expected a category of Drama

Film has a category of Drama. Expected a category of Acción

Film has a category of Drama. Expected a category of Crimen

Film has a category of Drama. Expected a category of Fantasía

The model BAYES has an accuracy of 26.66666666666666