In [1]:
import os
import re
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import classification_report
import string
from sklearn.calibration import LabelEncoder
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.metrics import precision_score, recall_score
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.layers import Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input, Embedding, GlobalAveragePooling1D
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import numpy as np 
import pandas as pd
from gensim.models import KeyedVectors
import gensim.downloader

In [16]:
###############
## Glove pre-trained embeddings with different dimensionalities (at least 3)
###############
glove_vectors100 = gensim.downloader.load('glove-wiki-gigaword-100')
glove_vectors200 = gensim.downloader.load('glove-wiki-gigaword-200')
glove_vectors300 = gensim.downloader.load('glove-wiki-gigaword-300')

In [2]:
###########
## CREANDO DATASET
###########
def clean_gutenberg_text(text):
    text = re.sub(r'^.*?\*\*\* START OF THE PROJECT GUTENBERG EBOOK.*?\*\*\*', '', text, flags=re.DOTALL)
    text = re.sub(r'\*\*\* END OF THE PROJECT GUTENBERG EBOOK.*?$', '', text, flags=re.DOTALL)
    authors = r'William Oberfield|James Branch Cabell|Wilhelm Raabe'
    text = re.sub(r'(?i)\b(?:' + authors + r')\b.*?(?=\n)', '', text, flags=re.DOTALL)
    text = re.sub(r'(Produced by.*?Distributed Proofreaders|Title:.*?Author:.*?Release date:.*?Language:.*?Credits:.*?Project Gutenberg Distributed Proofreaders)', '', text, flags=re.DOTALL)
    text = re.sub(r'(CONTENTS.*?THE AFTERWORD|BIBLIOGRAPHY|INDEX)', '', text, flags=re.DOTALL)
    text = re.sub(r'\n+', '\n', text)
    text = text.strip()
    
    return text

def split_into_segments(text, segment_size=200):
    words = text.split() 
    segments = [' '.join(words[i:i + segment_size]) for i in range(0, len(words), segment_size)]
    return segments

def load_books_data(base_path='books', segment_size=200):
    texts = []
    labels = []
    authors = os.listdir(base_path)

    for author in authors:
        author_path = os.path.join(base_path, author)
        for book_file in os.listdir(author_path):
            with open(os.path.join(author_path, book_file), 'r', encoding='utf-8') as f:
                book_text = f.read()
                clean_text = clean_gutenberg_text(book_text)
                segments = split_into_segments(clean_text, segment_size)
                texts.extend(segments)
                labels.extend([author] * len(segments))
    
    return texts, labels

def create_dataset(texts, authors):
    df = pd.DataFrame({'text': texts, 'author': authors})
    return df

texts, labels = load_books_data()
dataset = create_dataset(texts, labels)

In [5]:
############
## MODELOS 
############

def build_simple_model(vocab_size, embedding_size, embedding_matrix):
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=embedding_size, input_length=200, weights=[embedding_matrix], trainable=False))
    model.add(GlobalAveragePooling1D())
    model.add(Dense(64, activation='relu'))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(3, activation='softmax'))
    return model , "simple"


def build_deep_model(vocab_size, embedding_size, embedding_matrix):
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=embedding_size, input_length=200, weights=[embedding_matrix], trainable=False))
    model.add(GlobalAveragePooling1D())
    model.add(Dense(200, activation='relu'))
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(3, activation='softmax'))
    return model, "deep"

def build_batchnorm_model(vocab_size, embedding_size, embedding_matrix):
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=embedding_size, input_length=200, weights=[embedding_matrix], trainable=False))
    model.add(GlobalAveragePooling1D())
    model.add(Dense(200, activation='relu'))
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.3))
    model.add(BatchNormalization()) 
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.3))
    model.add(BatchNormalization())
    model.add(Dense(32, activation='relu'))
    model.add(Dense(3, activation='softmax'))
    return model , "batchNorm"

In [29]:
#############
## FUNCIÓN DE COMPILACIÓN
#############
def compile_and_train_model(model, X_train, y_train, X_val, y_val, size, arquitectureLabel):
  model.compile(
    loss='sparse_categorical_crossentropy', 
    optimizer='adam', 
    metrics=['accuracy']
  )
  checkpoint_callback = ModelCheckpoint(
    f'best_model_{size}_{arquitectureLabel}.keras',
    monitor='accuracy',
    save_best_only=True,
    verbose=1 
  )

  history = model.fit(
    X_train, 
    y_train, 
    validation_data=(X_val, y_val), 
    epochs=50, 
    batch_size=32,
    callbacks=[checkpoint_callback]
  )
  return history

In [30]:
#######
## EJECUCIÓN
#######

results = {}
embeddings = {'100':glove_vectors100,'200':glove_vectors200,'300':glove_vectors300}
for size,glove_vector in embeddings.items():
    size = int(size)
    word2vecModel = glove_vector
    vocab_size = len(word2vecModel.key_to_index)
    tokenizer = Tokenizer(num_words=vocab_size)
    texts, labels = load_books_data()
    dataset = create_dataset(texts, labels)

    X_train, X_test, y_train, y_test = train_test_split(
        dataset['text'], dataset['author'], test_size=0.2, random_state=42)

    X_val, X_test, y_val, y_test = train_test_split(
        X_test, y_test, test_size=0.5, random_state=42)
    
    train_counts = pd.Series(y_train).value_counts().reset_index()
    train_counts.columns = ['class', 'count']
    train_counts['set'] = 'train'

    val_counts = pd.Series(y_val).value_counts().reset_index()
    val_counts.columns = ['class', 'count']
    val_counts['set'] = 'validation'

    test_counts = pd.Series(y_test).value_counts().reset_index()
    test_counts.columns = ['class', 'count']
    test_counts['set'] = 'test'

    summary = pd.concat([train_counts, val_counts, test_counts], ignore_index=True)

    summary = summary[['set', 'class', 'count']]

    display(summary)
    
    tokenizer.fit_on_texts(X_train)
    
    encoder = LabelEncoder()
    y_train = encoder.fit_transform(y_train)
    y_val = encoder.transform(y_val)
    y_test = encoder.transform(y_test)
    
    X_train = tokenizer.texts_to_sequences(X_train)
    X_val = tokenizer.texts_to_sequences(X_val)
    X_test = tokenizer.texts_to_sequences(X_test)
    
    max_length = 200
    X_train = pad_sequences(X_train, maxlen=max_length, padding='post')
    X_val = pad_sequences(X_val, maxlen=max_length, padding='post')
    X_test = pad_sequences(X_test, maxlen=max_length, padding='post')
    
    embedding_matrix = np.zeros((vocab_size, size))
    for word, index in word2vecModel.key_to_index.items():
        embedding_matrix[index] = word2vecModel[word]
            
    for modelToTrain in [build_simple_model , build_deep_model , build_batchnorm_model ]:
        print(f"Entrenando con embeddings de tamaño: {size}")

        model , arquitectureLabel  = modelToTrain(vocab_size , size, embedding_matrix)
        history = compile_and_train_model(model, X_train, y_train, X_val, y_val, size, arquitectureLabel)
        test_loss , test_accuracy = history.model.evaluate(X_test, y_test)
        results[f"{size}_{arquitectureLabel}"] = {"accuracy": test_accuracy , "loss": test_loss}

for size, metrics in results.items():
    print(f"Results for embeddings size {size}:")
    print(f"Accuracy: {metrics['accuracy']}")


Unnamed: 0,set,class,count
0,train,James Branch Cabell,667
1,train,Washington Irving,576
2,train,William Oberfield,46
3,validation,James Branch Cabell,86
4,validation,Washington Irving,64
5,validation,William Oberfield,11
6,test,James Branch Cabell,91
7,test,Washington Irving,66
8,test,William Oberfield,5


Entrenando con embeddings de tamaño: 100




Epoch 1/50
[1m37/41[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 3ms/step - accuracy: 0.5094 - loss: 0.9180
Epoch 1: accuracy improved from -inf to 0.54151, saving model to best_model_100_simple.keras
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 39ms/step - accuracy: 0.5133 - loss: 0.9104 - val_accuracy: 0.5342 - val_loss: 0.8823
Epoch 2/50
[1m22/41[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m0s[0m 3ms/step - accuracy: 0.5855 - loss: 0.7880 
Epoch 2: accuracy improved from 0.54151 to 0.63305, saving model to best_model_100_simple.keras
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 30ms/step - accuracy: 0.6034 - loss: 0.7810 - val_accuracy: 0.7267 - val_loss: 0.8621
Epoch 3/50
[1m37/41[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 3ms/step - accuracy: 0.7696 - loss: 0.7390
Epoch 3: accuracy improved from 0.63305 to 0.72382, saving model to best_model_100_simple.keras
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s

Unnamed: 0,set,class,count
0,train,James Branch Cabell,667
1,train,Washington Irving,576
2,train,William Oberfield,46
3,validation,James Branch Cabell,86
4,validation,Washington Irving,64
5,validation,William Oberfield,11
6,test,James Branch Cabell,91
7,test,Washington Irving,66
8,test,William Oberfield,5


Entrenando con embeddings de tamaño: 200




Epoch 1/50
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.5180 - loss: 0.8985
Epoch 1: accuracy improved from -inf to 0.56012, saving model to best_model_200_simple.keras
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 81ms/step - accuracy: 0.5190 - loss: 0.8971 - val_accuracy: 0.5404 - val_loss: 0.8674
Epoch 2/50
[1m38/41[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 6ms/step - accuracy: 0.5932 - loss: 0.7662
Epoch 2: accuracy improved from 0.56012 to 0.68037, saving model to best_model_200_simple.keras
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 50ms/step - accuracy: 0.6013 - loss: 0.7650 - val_accuracy: 0.7826 - val_loss: 0.7881
Epoch 3/50
[1m34/41[0m [32m━━━━━━━━━━━━━━━━[0m[37m━━━━[0m [1m0s[0m 5ms/step - accuracy: 0.7879 - loss: 0.6753
Epoch 3: accuracy improved from 0.68037 to 0.79364, saving model to best_model_200_simple.keras
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[

Unnamed: 0,set,class,count
0,train,James Branch Cabell,667
1,train,Washington Irving,576
2,train,William Oberfield,46
3,validation,James Branch Cabell,86
4,validation,Washington Irving,64
5,validation,William Oberfield,11
6,test,James Branch Cabell,91
7,test,Washington Irving,66
8,test,William Oberfield,5


Entrenando con embeddings de tamaño: 300




Epoch 1/50
[1m36/41[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m0s[0m 6ms/step - accuracy: 0.5011 - loss: 0.9481
Epoch 1: accuracy improved from -inf to 0.52366, saving model to best_model_300_simple.keras
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 201ms/step - accuracy: 0.5036 - loss: 0.9373 - val_accuracy: 0.5342 - val_loss: 0.8634
Epoch 2/50
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.6068 - loss: 0.7889
Epoch 2: accuracy improved from 0.52366 to 0.65710, saving model to best_model_300_simple.keras
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 178ms/step - accuracy: 0.6080 - loss: 0.7883 - val_accuracy: 0.7267 - val_loss: 0.8229
Epoch 3/50
[1m31/41[0m [32m━━━━━━━━━━━━━━━[0m[37m━━━━━[0m [1m0s[0m 5ms/step - accuracy: 0.7657 - loss: 0.7076
Epoch 3: accuracy improved from 0.65710 to 0.78743, saving model to best_model_300_simple.keras
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4